Document Title | Salifort Motors - ML Modelling - Decision Tree |
Author | Rod Slater |
Version | 1.0 |
Created | 01-11-2023 |
Modified | 16-11-2023 |
Client Name | Salifort Motors |
Client Contact | Mr HR Team |
Client Email | hr@salifortmotors.it |
Client Project | HR Team Data Driven Solutions from Machine Learning Models |
ML modelling using a Decision Tree for the HR data provided by Salifort Motors. This notebook details the Decision Tree modelling process and performance comparisons.
Contents

- model_dt1 - All Features
  - GridSearch - Train data
  - Test data
  - Train data
- model_dt2 - No Departments
  - GridSearch - Train data
  - Test data
  - Train data
- Model comparison (model_dt1 / model_dt2) - confusion matrices, feature importance, PR and ROC curves
# Import packages
# Data manipulation
import numpy as np
import pandas as pd
# Data visualization
import matplotlib.pyplot as plt
import seaborn as sns
# Data modelling Imports
from xgboost import XGBClassifier, XGBRegressor, plot_importance
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
# For metrics and helpful functions
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, precision_recall_fscore_support, \
f1_score, confusion_matrix, ConfusionMatrixDisplay, classification_report, precision_recall_curve, auc
from sklearn.model_selection import learning_curve
from sklearn import metrics
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.tree import plot_tree
from sklearn.preprocessing import StandardScaler
import statsmodels.api as sm
from datetime import datetime as dt
import json
# Shap Explainer
import shap
# For saving models
import pickle
# set Pandas Display Options
pd.options.display.float_format = '{:.2f}'.format
pd.set_option('display.precision', 2)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 120)
# load_path = "/home/hass/Documents/Learning/Salifort-Motors-Capstone-Project/00-data_cleaned/" # absolute path on the local machine
load_path = "00-data_cleaned/" # Source folder for cleaned data
save_path = "/home/hass/Documents/Learning/Salifort-Motors-Capstone-Project/04-pickle-ML-models/" # destination for pickle saved models
# model_path = "/home/hass/Documents/Learning/Salifort-Motors-Capstone-Project/04-pickle-ML-models/" # absolute path on the local machine
model_path = "04-pickle-ML-models/" # path to load/save pickled models
import chime
import time
def beepr():
    # Play two bursts of three success chimes as an audible "cell finished" signal
    for x in range(2):
        for i in range(3):
            chime.success()
            time.sleep(0.25)
        time.sleep(.5)

#beepr()
def display_results():
    '''
    Load Results.csv containing the stored test scores and return them for display.
    In:
        none
    Out: pandas df of Results.csv containing precision, recall, f1, accuracy, AUC scores and per-class prediction F1 scores of the models
    '''
    model_results = pd.read_csv("Results.csv")
    model_results.drop(columns=['Unnamed: 0'], inplace=True)
    model_results = model_results.sort_values(by='AUC', ascending=False)
    return model_results
def format_GS_results(model_name: str, model_object, metric: str):
    '''
    Returns a pandas df with the F1, recall, precision, accuracy, and AUC scores
    from the GridSearch cross-validation.
    Arguments:
        model_name (string): what you want the model to be called in the output table
        model_object: a fitted GridSearchCV object
        metric (string): precision, recall, f1, accuracy, or auc
    '''
    # Map the input metric to the corresponding column name in GridSearchCV.cv_results_
    metric_dict = {'auc': 'mean_test_roc_auc',
                   'precision': 'mean_test_precision',
                   'recall': 'mean_test_recall',
                   'f1': 'mean_test_f1',
                   'accuracy': 'mean_test_accuracy'
                   }
    # Get all the results from the CV and put them in a df
    cv_results = pd.DataFrame(model_object.cv_results_)
    # Isolate the row of the df with the max(metric) score
    best_estimator_results = cv_results.iloc[cv_results[metric_dict[metric]].idxmax(), :]
    # Extract accuracy, precision, recall, f1 and AUC scores from that row
    auc = best_estimator_results.mean_test_roc_auc
    f1 = best_estimator_results.mean_test_f1
    recall = best_estimator_results.mean_test_recall
    precision = best_estimator_results.mean_test_precision
    accuracy = best_estimator_results.mean_test_accuracy
    # Create table of results
    table = pd.DataFrame({'Model': [model_name],
                          'Precision': [precision],
                          'Recall': [recall],
                          'F1': [f1],
                          'Accuracy': [accuracy],
                          'AUC': [auc]
                          })
    return table
# Get results from predict to store in comparison table
def make_results(model_name: str, model_object, X_var, y_var, y_pred_var):
    '''
    Returns a pandas df with the precision, recall, F1, accuracy, and AUC scores
    for a set of predictions.
    In:
        model_name (string): used as the row label in the output table
        model_object: the fitted ML model
        X_var, y_var, y_pred_var: the features, true labels and predicted labels used in the model
    Out: pandas df containing precision, recall, f1, accuracy, and AUC scores of the model
    '''
    # Get the classification report as a dict
    report = classification_report(y_var, y_pred_var, output_dict=True)
    # Calculate per-class precision, recall, and F1 scores
    predict_precision, predict_recall, predict_f1_score, _ = precision_recall_fscore_support(y_var, y_pred_var, average=None)
    f1_true_class = predict_f1_score[1]   # Index 1 corresponds to the "True" (left) class
    f1_false_class = predict_f1_score[0]  # Index 0 corresponds to the "False" (stayed) class
    # Extract weighted-average precision, recall, f1 and overall accuracy from the report
    f1 = report['weighted avg']['f1-score']
    recall = report['weighted avg']['recall']
    precision = report['weighted avg']['precision']
    accuracy = report['accuracy']
    auc = roc_auc_score(y_var, model_object.predict_proba(X_var)[:, 1])
    # Create table of results
    table = pd.DataFrame({'Model': model_name,
                          'Precision': precision,
                          'Recall': recall,
                          'F1': f1,
                          'Accuracy': accuracy,
                          'AUC': auc,
                          'Predict Leave': f1_true_class,
                          'Predict Stay': f1_false_class
                          },
                         index=[0]
                         )
    return table
def classification_report_summary(name: str, y_var, y_pred_var):
    '''
    Gather stats from predictions and format into a report.
    In:
        name (string): data name for the report header e.g. TEST or TRAIN
        y_var: true labels
        y_pred_var: predicted labels
    Out: display of precision, recall, f1 and accuracy scores, weighted averages, and per-class prediction F1 scores
    '''
    targetnames = ['Predicted would not leave', 'Predicted would leave']
    print("\nClassification Report : ", name)
    print(classification_report(y_var, y_pred_var, target_names=targetnames))
    print("Recall    : {:.4%}".format(recall_score(y_var, y_pred_var)))
    print("f1_score  : {:.4%}".format(f1_score(y_var, y_pred_var)))
    print("Precision : {:.4%}".format(precision_score(y_var, y_pred_var)))
    print("Accuracy  : {:.4%}".format(accuracy_score(y_var, y_pred_var)))
    report = classification_report(y_var, y_pred_var, output_dict=True)
    print()
    print('\u2500' * 35)
    print("Weighted Average")
    print('\u2500' * 35)
    print("Recall    : {:.4%}".format(report['weighted avg']['recall']))
    print("f1 Score  : {:.4%}".format(report['weighted avg']['f1-score']))
    print("Precision : {:.4%}".format(report['weighted avg']['precision']))
    print("Support   : {:.0f}".format(report['weighted avg']['support']))
    # Calculate precision, recall, and F1 score per class
    predict_precision, predict_recall, predict_f1_score, _ = precision_recall_fscore_support(y_var, y_pred_var, average=None)
    f1_true_class = predict_f1_score[1]   # Index 1 corresponds to the "True" (left) class
    f1_false_class = predict_f1_score[0]  # Index 0 corresponds to the "False" (stayed) class
    print()
    print('\u2500' * 35)
    print("Prediction F1 score")
    print('\u2500' * 35)
    print("Predict Leave : {:.4%}".format(f1_true_class))
    print("Predict Stay  : {:.4%}".format(f1_false_class))
def write_pickle(path, model_object, save_as: str):
    '''
    In:
        path: path of folder where you want to save the pickle
        model_object: a model you want to pickle
        save_as: filename for how you want to save the model
    Out: A call to pickle the model in the folder indicated
    '''
    with open(path + save_as + '.pickle', 'wb') as to_write:
        pickle.dump(model_object, to_write)

def read_pickle(path, saved_model_name: str):
    '''
    In:
        path: path to folder where you want to read from
        saved_model_name: filename of pickled model you want to read in
    Out:
        model: the pickled model
    '''
    with open(path + saved_model_name + '.pickle', 'rb') as to_read:
        model = pickle.load(to_read)
    return model
Two datasets are used for model performance comparison:

- salifort_data_FE.csv is the full dataset, feature engineered: salary encoded to ordinal, avg_mth_hrs binary encoded to overworked, and dept encoded with dummies (a sketch of this encoding follows below).
- salifort_data_FE_focus.csv is the same data with the dummy-encoded dept fields removed. Dept appears to have low correlation across the dataset and I'm curious how much the models are influenced by low-correlation features. It turns out that low-correlation features have little impact on model performance, which is no surprise really!
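For reference, a minimal sketch of the feature engineering described above, as it might be done in the data-cleaning notebook (not in this notebook); the raw frame name, the avg_mnth_hrs column name and the overwork threshold are assumptions:

# Sketch only - assumes a raw dataframe df_raw with 'salary', 'avg_mnth_hrs' and 'dept' columns
df_fe = df_raw.copy()
df_fe['salary'] = df_fe['salary'].map({'low': 0, 'medium': 1, 'high': 2})   # ordinal encoding
df_fe['overworked'] = (df_fe['avg_mnth_hrs'] > 175).astype(int)             # binary flag (175-hour threshold assumed)
df_fe = df_fe.drop(columns=['avg_mnth_hrs'])
df_fe = pd.get_dummies(df_fe, columns=['dept'])                             # dummy-encode departments as dept_*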
- model_prefix = text indicating which ML model is being used; added to the model description when saving to Results.csv
- dataset = text indicating which dataset is being used; added to the model description when saving to Results.csv
- rerun = flag identifying the first run of the model comparisons: 0 = first run, a NEW Results.csv is written with headers; 1 = continuation, results are appended to the existing file.
- refit = flag identifying whether the model needs fitting (not a big issue for a decision tree); the sketch below shows how these flags are intended to gate the run.
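A minimal sketch of the intended flag logic, using a hypothetical grid_search object and model name rather than code from this notebook:

# Sketch only - hypothetical illustration of how the rerun / refit flags gate the workflow
if rerun == 0:
    # First run: create a fresh Results.csv with headers; later cells append to it with mode='a'
    pd.DataFrame(columns=['Model', 'Precision', 'Recall', 'F1', 'Accuracy', 'AUC',
                          'Predict Leave', 'Predict Stay']).to_csv("Results.csv", index=True)

if refit == 1:
    grid_search.fit(X_train, y_train)                      # fit and pickle the GridSearch object
    write_pickle(model_path, grid_search, 'model_name')    # persist the fitted model
else:
    grid_search = read_pickle(model_path, 'model_name')    # reuse the previously pickled model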
# Load cleaned dataset into a dataframe
df1 = pd.read_csv(load_path + "data_cleaned_NoOl_FE_AllFeat.csv", index_col = False) # Feature engineering on salary, avg_mnth_hrs, dept, outliers removed
df1 = df1.sort_index(axis=1)
# model_prefix : Str = prefix for results.csv added to dataset
model_prefix = 'dt1'
# dataset : Str = dataset name for results.csv
dataset = 'ALLFeat'
# rerun : int = Flag to set 1 = append to Results.csv / 0 = Overwrite with new file
rerun = 1
refit = 0 # 0 = load the pickled model rather than refitting (not a big issue for a decision tree)
print("df1 - Feature engineering on salary, avg_mnth_hrs, dept, outliers removed\n")
# Display dataframe columns
df1.columns
df1 - Feature engineering on salary, avg_mnth_hrs, dept, outliers removed
Index(['dept_accounting', 'dept_hr', 'dept_it', 'dept_management', 'dept_marketing', 'dept_product_mng', 'dept_randd', 'dept_sales', 'dept_support', 'dept_technical', 'last_eval', 'left', 'number_project', 'overworked', 'promotion', 'salary', 'satisfaction', 'tenure'], dtype='object')
model_data = df1.copy()
# Isolate the outcome and feature variables
# Isolate the outcome variable: 'left' is a binary value where True = the employee left employment
Y = model_data['left']
# Select & Isolate the feature variables and drop the outcome variable
X = model_data.copy()
X = X.drop('left', axis = 1)
# Prepare training and test data
X_train, X_test, y_train, y_test = train_test_split(X,Y, test_size=0.25, stratify=Y, random_state=0)
# Instantiate model
dtc = DecisionTreeClassifier(random_state=0)
# Assign a dictionary of hyperparameters to search over
cv_params = {'max_depth': [4, 6, 8, None],
             'min_samples_leaf': [2, 5, 1],
             'min_samples_split': [2, 4, 6]
             }
# Assign a dictionary of scoring metrics to capture
scoring = ('accuracy', 'precision', 'recall', 'f1', 'roc_auc')
# Instantiate GridSearch
model_dt1 = GridSearchCV(dtc, cv_params, scoring=scoring, cv=4, refit='roc_auc')
%%time
print("Started / Last Run =", dt.now().strftime("%Y-%m-%d %H:%M:%S"))
model_dt1.fit(X_train, y_train)
beepr()
Started / Last Run = 2023-12-04 12:01:55
CPU times: user 5.14 s, sys: 1.16 ms, total: 5.14 s
Wall time: 7.69 s
## Write pickle
if refit == 1: # refit = 1 run the fit and save the model
    write_pickle(model_path, model_dt1, 'hr_dt1-'+'AllFeat')
# Read in pickle
if refit == 0: # refit = 0 load model, don't fit the data
    model_dt1 = read_pickle(model_path, 'hr_dt1-'+'AllFeat')
print("Start Time =", dt.now().strftime("%H:%M:%S"))
# Get the parameters of the best-performing model
print(model_prefix+' - '+dataset+' - '+'test\n')
print("Best Parameters : ", model_dt1.best_params_)
# Get the average f1 score of the best-performing model
print("Best Score : {:.4f}".format(model_dt1.best_score_))
# Get the best estimators of the parameters
print("Best Estimator : ", model_dt1.best_estimator_)
Start Time = 12:02:03
dt1 - ALLFeat - test

Best Parameters :  {'max_depth': 4, 'min_samples_leaf': 1, 'min_samples_split': 2}
Best Score : 0.9737
Best Estimator :  DecisionTreeClassifier(max_depth=4, random_state=0)
model_dt1_cv_results = format_GS_results(model_prefix+' - '+dataset+' - '+'GS train', model_dt1, 'auc')
model_dt1_cv_results.to_csv("Results.csv", mode='a', index=True, header=False)
pd.options.display.float_format = '{:.3f}'.format
#print(model_dt1_cv_results,"\n")
display_results()
| | Model | Precision | Recall | F1 | Accuracy | AUC | Predict Leave | Predict Stay |
---|---|---|---|---|---|---|---|---|
4 | dt1 - ALLFeat - GS train | 0.951 | 0.914 | 0.932 | 0.977 | 0.974 | NaN | NaN |
1 | lr1 - ALLFeat - train | 0.796 | 0.827 | 0.804 | 0.827 | 0.892 | 0.328 | 0.900 |
3 | lr2 - NOdept - train | 0.793 | 0.825 | 0.802 | 0.825 | 0.892 | 0.318 | 0.900 |
0 | lr1 - ALLFeat - test | 0.794 | 0.822 | 0.803 | 0.822 | 0.882 | 0.341 | 0.897 |
2 | lr2 - NOdept - test | 0.795 | 0.823 | 0.804 | 0.823 | 0.882 | 0.343 | 0.898 |
# Make predictions on train data for the SHAP explainer
best_model = model_dt1.best_estimator_ # Store the best model for later use
y_pred_train = best_model.predict(X_train)
explainer = shap.TreeExplainer(best_model)
# Compute the SHAP values for a subset of observations (first 100 rows to keep it fast)
#shap_values = explainer(X_train)
shap_values = explainer(X_train[:100])
shap_values = shap_values[..., 1] # keep the SHAP values for the positive ('left') class
plt.grid(True, linestyle='--', alpha=0.7)
# Plot the SHAP values
shap.plots.waterfall(shap_values[0], max_display=12)
shap.summary_plot(shap_values)
shap.plots.bar(shap_values[0])
# Make predictions on test data
model_dt1_best_model = model_dt1.best_estimator_ # Save the best parameters for later testing
y_pred_test = model_dt1_best_model.predict(X_test)
# def classification_report_summary(name:str, y_var:str, y_pred_var:str):
classification_report_summary(model_prefix+' - '+dataset+' - '+'test', y_test,y_pred_test)
Classification Report :  dt1 - ALLFeat - test
                           precision    recall  f1-score   support

Predicted would not leave       0.99      0.99      0.99      2321
    Predicted would leave       0.94      0.93      0.94       471

                 accuracy                           0.98      2792
                macro avg       0.96      0.96      0.96      2792
             weighted avg       0.98      0.98      0.98      2792

Recall    : 92.9936%
f1_score  : 93.5897%
Precision : 94.1935%
Accuracy  : 97.8510%

───────────────────────────────────
Weighted Average
───────────────────────────────────
Recall    : 97.8510%
f1 Score  : 97.8455%
Precision : 97.8416%
Support   : 2792

───────────────────────────────────
Prediction F1 score
───────────────────────────────────
Predict Leave : 93.5897%
Predict Stay  : 98.7091%
# make_results(model_name, model_object, X_var, y_var, y_pred_var)
# format prediction results into a dataframe
dt_pred_test_results = make_results(model_prefix+' - '+dataset+' - '+'test', model_dt1_best_model, X_test, y_test, y_pred_test)
# save the prediction results in Results.csv
dt_pred_test_results.to_csv("Results.csv", mode='a', index=True, header=False)
print(dt_pred_test_results) # Display results of the prediction
display_results() # Display contents of Results.csv
                  Model  Precision  Recall     F1  Accuracy    AUC  Predict Leave  Predict Stay
0  dt1 - ALLFeat - test      0.978   0.979  0.978     0.979  0.979          0.936         0.987
| | Model | Precision | Recall | F1 | Accuracy | AUC | Predict Leave | Predict Stay |
---|---|---|---|---|---|---|---|---|
5 | dt1 - ALLFeat - test | 0.978 | 0.979 | 0.978 | 0.979 | 0.979 | 0.936 | 0.987 |
4 | dt1 - ALLFeat - GS train | 0.951 | 0.914 | 0.932 | 0.977 | 0.974 | NaN | NaN |
1 | lr1 - ALLFeat - train | 0.796 | 0.827 | 0.804 | 0.827 | 0.892 | 0.328 | 0.900 |
3 | lr2 - NOdept - train | 0.793 | 0.825 | 0.802 | 0.825 | 0.892 | 0.318 | 0.900 |
0 | lr1 - ALLFeat - test | 0.794 | 0.822 | 0.803 | 0.822 | 0.882 | 0.341 | 0.897 |
2 | lr2 - NOdept - test | 0.795 | 0.823 | 0.804 | 0.823 | 0.882 | 0.343 | 0.898 |
# Make predictions on train data
print("Started / Last Run =", dt.now().strftime("%Y-%m-%d %H:%M:%S"))
model_dt1_best_model = model_dt1.best_estimator_ # Save the best parameters for later testing
y_pred_train = model_dt1_best_model.predict(X_train)
Started / Last Run = 2023-12-04 12:02:06
# def classification_report_summary(name:str, y_var:str, y_pred_var:str):
classification_report_summary(model_prefix+' - '+dataset+' - '+'train', y_train,y_pred_train)
Classification Report :  dt1 - ALLFeat - train
                           precision    recall  f1-score   support

Predicted would not leave       0.98      0.99      0.99      6964
    Predicted would leave       0.96      0.92      0.94      1411

                 accuracy                           0.98      8375
                macro avg       0.97      0.95      0.96      8375
             weighted avg       0.98      0.98      0.98      8375

Recall    : 91.5663%
f1_score  : 93.5215%
Precision : 95.5621%
Accuracy  : 97.8627%

───────────────────────────────────
Weighted Average
───────────────────────────────────
Recall    : 97.8627%
f1 Score  : 97.8444%
Precision : 97.8434%
Support   : 8375

───────────────────────────────────
Prediction F1 score
───────────────────────────────────
Predict Leave : 93.5215%
Predict Stay  : 98.7202%
# make_results(model_name, model_object, X_var, y_var, y_pred_var)
# format prediction results into a dataframe
dt_pred_test_results = make_results(model_prefix+' - '+dataset+' - '+'train', model_dt1_best_model, X_train, y_train, y_pred_train)
# Save dataframe into results.csv, no header
dt_pred_test_results.to_csv("Results.csv", mode='a', index=True, header=False) # save the prediction results in Results.csv
print(dt_pred_test_results) # Display results of the prediction
pd.options.display.float_format = '{:.3f}'.format
display_results() # Display contents of Results.csv
                   Model  Precision  Recall     F1  Accuracy    AUC  Predict Leave  Predict Stay
0  dt1 - ALLFeat - train      0.978   0.979  0.978     0.979  0.979          0.935         0.987
| | Model | Precision | Recall | F1 | Accuracy | AUC | Predict Leave | Predict Stay |
---|---|---|---|---|---|---|---|---|
5 | dt1 - ALLFeat - test | 0.978 | 0.979 | 0.978 | 0.979 | 0.979 | 0.936 | 0.987 |
6 | dt1 - ALLFeat - train | 0.978 | 0.979 | 0.978 | 0.979 | 0.979 | 0.935 | 0.987 |
4 | dt1 - ALLFeat - GS train | 0.951 | 0.914 | 0.932 | 0.977 | 0.974 | NaN | NaN |
1 | lr1 - ALLFeat - train | 0.796 | 0.827 | 0.804 | 0.827 | 0.892 | 0.328 | 0.900 |
3 | lr2 - NOdept - train | 0.793 | 0.825 | 0.802 | 0.825 | 0.892 | 0.318 | 0.900 |
0 | lr1 - ALLFeat - test | 0.794 | 0.822 | 0.803 | 0.822 | 0.882 | 0.341 | 0.897 |
2 | lr2 - NOdept - test | 0.795 | 0.823 | 0.804 | 0.823 | 0.882 | 0.343 | 0.898 |
plt.figure(figsize=(18,15))
plot_tree(model_dt1_best_model,max_depth=3, fontsize=10, feature_names=X_test.columns, class_names={0:'stayed',1:'left'}, filled=True);
plt.show()
model_dt2 - No Departments

Two datasets are used for model performance comparison:
- salifort_data_FE.csv is the full dataset, feature engineered: salary encoded to ordinal, avg_mth_hrs binary encoded to overworked, and dept encoded with dummies.
- salifort_data_FE_focus.csv is the same data with the dummy-encoded dept fields removed (see the sketch below). Dept appears to have low correlation across the dataset and I'm curious how much the models are influenced by low-correlation features. It turns out that low-correlation features have little impact on model performance, which is no surprise really!
- model_prefix = text indicating which ML model is being used; added to the model description when saving to Results.csv
- dataset = text indicating which dataset is being used; added to the model description when saving to Results.csv
- rerun = flag identifying the first run of the model comparisons: 0 = first run, a NEW Results.csv is written with headers; 1 = continuation, results are appended to the existing file.
- refit = flag identifying whether the model needs fitting (not a big issue for a decision tree).
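Since the AllFeat frame is already loaded, the NoDept variant is effectively the same frame with the dummy department columns dropped; a minimal sketch (assuming the dept_ column prefix shown earlier), useful as a sanity check against data_cleaned_NoOl_FE_NoDept.csv:

# Sketch only - derive a NoDept-style frame from the AllFeat frame by dropping the dummy dept columns
dept_cols = [c for c in df1.columns if c.startswith('dept_')]
df2_check = df1.drop(columns=dept_cols)   # columns should match data_cleaned_NoOl_FE_NoDept.csv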
df2 = pd.read_csv(load_path + "data_cleaned_NoOl_FE_NoDept.csv", index_col = False) # Feature engineering on salary, avg_mnth_hrs, outliers removed, departments removed
print("df2 - Feature engineering on salary, avg_mnth_hrs, dept REMOVED, outliers removed\n")
df2.sort_index(axis=1, inplace=True)
# model_prefix : Str = prefix for results.csv added to dataset
model_prefix = 'dt2'
# dataset : Str = dataset name for results.csv
dataset = 'NOdept'
# rerun : int = Flag to set 1 = append to Results.csv / 0 = Overwrite with new file
rerun = 1
refit = 0 # 0 = load the pickled model rather than refitting (not a big issue for a decision tree)
# Display dataframe columns
df2.columns
df2 - Feature engineering on salary, avg_mnth_hrs, dept REMOVED, outliers removed
Index(['last_eval', 'left', 'number_project', 'overworked', 'promotion', 'salary', 'satisfaction', 'tenure'], dtype='object')
model_data2 = df2.copy()
# Isolate the outcome and feature variables
# Isolate the outcome variable: 'left' is a binary value where True = the employee left employment
Y2 = model_data2['left']
# Select & Isolate the feature variables and drop the outcome variable
X2 = model_data2.copy()
X2 = X2.drop('left', axis = 1)
# Prepare training and test data
X_train2, X_test2, y_train2, y_test2 = train_test_split(X2, Y2, test_size=0.25, stratify=Y2, random_state=0)
# Instantiate model
tree2 = DecisionTreeClassifier(random_state=0)
# Assign a dictionary of hyperparameters to search over
cv_params = {'max_depth': [4, 6, 8, None],
             'min_samples_leaf': [2, 5, 1],
             'min_samples_split': [2, 4, 6]
             }
# Assign a dictionary of scoring metrics to capture
scoring = ('accuracy', 'precision', 'recall', 'f1', 'roc_auc')
# Instantiate GridSearch
model_dt2 = GridSearchCV(tree2, cv_params, scoring=scoring, cv=4, refit='roc_auc')
%%time
model_dt2.fit(X_train2, y_train2)
beepr()
CPU times: user 4.44 s, sys: 5.76 ms, total: 4.45 s
Wall time: 6.99 s
## Write pickle
if refit == 1: # refit = 1 run the fit and save the model
    write_pickle(model_path, model_dt2, 'hr_dt2-'+'NOdept')
# Read in pickle
if refit == 0: # refit = 0 load model, don't fit the data
    model_dt2 = read_pickle(model_path, 'hr_dt2-'+'NOdept')
# Get the parameters of the best-performing model
print(model_prefix+' - '+dataset+' - '+'test\n')
print("Best Parameters : ", model_dt2.best_params_)
# Get the average f1 score of the best-performing model
print("Best Score : {:.4f}".format(model_dt2.best_score_))
# Get the best estimators of the parameters
print("Best Estimator : ", model_dt2.best_estimator_)
dt2 - NOdept - test

Best Parameters :  {'max_depth': 4, 'min_samples_leaf': 1, 'min_samples_split': 2}
Best Score : 0.9737
Best Estimator :  DecisionTreeClassifier(max_depth=4, random_state=0)
model_dt2_cv_results = format_GS_results(model_prefix+' - '+dataset+' - '+'GS train', model_dt2, 'auc')
model_dt2_cv_results.to_csv("Results.csv", mode='a', index=True, header=False)
pd.options.display.float_format = '{:.3f}'.format
print(model_dt2_cv_results,"\n")
display_results()
                     Model  Precision  Recall     F1  Accuracy    AUC
0  dt2 - NOdept - GS train      0.951   0.914  0.932     0.977  0.974
| | Model | Precision | Recall | F1 | Accuracy | AUC | Predict Leave | Predict Stay |
---|---|---|---|---|---|---|---|---|
5 | dt1 - ALLFeat - test | 0.978 | 0.979 | 0.978 | 0.979 | 0.979 | 0.936 | 0.987 |
6 | dt1 - ALLFeat - train | 0.978 | 0.979 | 0.978 | 0.979 | 0.979 | 0.935 | 0.987 |
7 | dt2 - NOdept - GS train | 0.951 | 0.914 | 0.932 | 0.977 | 0.974 | NaN | NaN |
4 | dt1 - ALLFeat - GS train | 0.951 | 0.914 | 0.932 | 0.977 | 0.974 | NaN | NaN |
1 | lr1 - ALLFeat - train | 0.796 | 0.827 | 0.804 | 0.827 | 0.892 | 0.328 | 0.900 |
3 | lr2 - NOdept - train | 0.793 | 0.825 | 0.802 | 0.825 | 0.892 | 0.318 | 0.900 |
0 | lr1 - ALLFeat - test | 0.794 | 0.822 | 0.803 | 0.822 | 0.882 | 0.341 | 0.897 |
2 | lr2 - NOdept - test | 0.795 | 0.823 | 0.804 | 0.823 | 0.882 | 0.343 | 0.898 |
# Make predictions on test data
model_dt2_best_model = model_dt2.best_estimator_ # Save the best parameters for later testing
y_pred_test2 = model_dt2_best_model.predict(X_test2)
# def classification_report_summary(name:str, y_var:str, y_pred_var:str):
classification_report_summary(model_prefix+' - '+dataset+' - '+'test', y_test2,y_pred_test2)
Classification Report :  dt2 - NOdept - test
                           precision    recall  f1-score   support

Predicted would not leave       0.99      0.99      0.99      2321
    Predicted would leave       0.94      0.93      0.94       471

                 accuracy                           0.98      2792
                macro avg       0.96      0.96      0.96      2792
             weighted avg       0.98      0.98      0.98      2792

Recall    : 92.9936%
f1_score  : 93.5897%
Precision : 94.1935%
Accuracy  : 97.8510%

───────────────────────────────────
Weighted Average
───────────────────────────────────
Recall    : 97.8510%
f1 Score  : 97.8455%
Precision : 97.8416%
Support   : 2792

───────────────────────────────────
Prediction F1 score
───────────────────────────────────
Predict Leave : 93.5897%
Predict Stay  : 98.7091%
dt_pred_test_results2 = make_results(model_prefix+' - '+dataset+' - '+'test', model_dt2_best_model, X_test2, y_test2, y_pred_test2) # format prediction results into a dataframe
dt_pred_test_results2.to_csv("Results.csv", mode='a', index=True, header=False) # save the prediction results in Results.csv
print(dt_pred_test_results2) # Display results of the prediction
display_results() # Display contents of Results.csv
                 Model  Precision  Recall     F1  Accuracy    AUC  Predict Leave  Predict Stay
0  dt2 - NOdept - test      0.978   0.979  0.978     0.979  0.979          0.936         0.987
| | Model | Precision | Recall | F1 | Accuracy | AUC | Predict Leave | Predict Stay |
---|---|---|---|---|---|---|---|---|
5 | dt1 - ALLFeat - test | 0.978 | 0.979 | 0.978 | 0.979 | 0.979 | 0.936 | 0.987 |
8 | dt2 - NOdept - test | 0.978 | 0.979 | 0.978 | 0.979 | 0.979 | 0.936 | 0.987 |
6 | dt1 - ALLFeat - train | 0.978 | 0.979 | 0.978 | 0.979 | 0.979 | 0.935 | 0.987 |
7 | dt2 - NOdept - GS train | 0.951 | 0.914 | 0.932 | 0.977 | 0.974 | NaN | NaN |
4 | dt1 - ALLFeat - GS train | 0.951 | 0.914 | 0.932 | 0.977 | 0.974 | NaN | NaN |
1 | lr1 - ALLFeat - train | 0.796 | 0.827 | 0.804 | 0.827 | 0.892 | 0.328 | 0.900 |
3 | lr2 - NOdept - train | 0.793 | 0.825 | 0.802 | 0.825 | 0.892 | 0.318 | 0.900 |
0 | lr1 - ALLFeat - test | 0.794 | 0.822 | 0.803 | 0.822 | 0.882 | 0.341 | 0.897 |
2 | lr2 - NOdept - test | 0.795 | 0.823 | 0.804 | 0.823 | 0.882 | 0.343 | 0.898 |
# Make predictions on train data
print("Current Time =", dt.now().strftime("%H:%M:%S"))
model_dt2_best_model = model_dt2.best_estimator_ # Save the best model for later testing
y_pred_train2 = model_dt2_best_model.predict(X_train2)
Current Time = 12:02:17
# def classification_report_summary(name:str, y_var:str, y_pred_var:str):
classification_report_summary(model_prefix+' - '+dataset+' - '+'train', y_train2,y_pred_train2)
Classification Report :  dt2 - NOdept - train
                           precision    recall  f1-score   support

Predicted would not leave       0.98      0.99      0.99      6964
    Predicted would leave       0.96      0.92      0.94      1411

                 accuracy                           0.98      8375
                macro avg       0.97      0.95      0.96      8375
             weighted avg       0.98      0.98      0.98      8375

Recall    : 91.5663%
f1_score  : 93.5215%
Precision : 95.5621%
Accuracy  : 97.8627%

───────────────────────────────────
Weighted Average
───────────────────────────────────
Recall    : 97.8627%
f1 Score  : 97.8444%
Precision : 97.8434%
Support   : 8375

───────────────────────────────────
Prediction F1 score
───────────────────────────────────
Predict Leave : 93.5215%
Predict Stay  : 98.7202%
# make_results(model_name, model_object, X_var, y_var, y_pred_var)
dt_pred_test_results2 = make_results(model_prefix+' - '+dataset+' - '+'train', model_dt2_best_model, X_train2, y_train2, y_pred_train2) # format prediction results into a dataframe
dt_pred_test_results2.to_csv("Results.csv", mode='a', index=True, header=False) # save the prediction results in Results.csv
print(dt_pred_test_results2) # Display results of the prediction
pd.options.display.float_format = '{:.4f}'.format
display_results() # Display contents of Results.csv
                  Model  Precision  Recall     F1  Accuracy    AUC  Predict Leave  Predict Stay
0  dt2 - NOdept - train      0.978   0.979  0.978     0.979  0.979          0.935         0.987
| | Model | Precision | Recall | F1 | Accuracy | AUC | Predict Leave | Predict Stay |
---|---|---|---|---|---|---|---|---|
5 | dt1 - ALLFeat - test | 0.9784 | 0.9785 | 0.9785 | 0.9785 | 0.9791 | 0.9359 | 0.9871 |
8 | dt2 - NOdept - test | 0.9784 | 0.9785 | 0.9785 | 0.9785 | 0.9791 | 0.9359 | 0.9871 |
6 | dt1 - ALLFeat - train | 0.9784 | 0.9786 | 0.9784 | 0.9786 | 0.9789 | 0.9352 | 0.9872 |
9 | dt2 - NOdept - train | 0.9784 | 0.9786 | 0.9784 | 0.9786 | 0.9789 | 0.9352 | 0.9872 |
7 | dt2 - NOdept - GS train | 0.9506 | 0.9135 | 0.9317 | 0.9774 | 0.9737 | NaN | NaN |
4 | dt1 - ALLFeat - GS train | 0.9506 | 0.9135 | 0.9317 | 0.9774 | 0.9737 | NaN | NaN |
1 | lr1 - ALLFeat - train | 0.7960 | 0.8266 | 0.8041 | 0.8266 | 0.8923 | 0.3284 | 0.9005 |
3 | lr2 - NOdept - train | 0.7930 | 0.8248 | 0.8015 | 0.8248 | 0.8919 | 0.3180 | 0.8995 |
0 | lr1 - ALLFeat - test | 0.7943 | 0.8216 | 0.8031 | 0.8216 | 0.8819 | 0.3413 | 0.8969 |
2 | lr2 - NOdept - test | 0.7952 | 0.8227 | 0.8039 | 0.8227 | 0.8819 | 0.3426 | 0.8975 |
plt.figure(figsize=(18,15))
plot_tree(model_dt2_best_model,max_depth=3, fontsize=10, feature_names=X_test2.columns, class_names={0:'stayed',1:'left'}, filled=True);
plt.show()
model1 = model_dt1
model2 = model_dt2
model_name1 = "model_dt1"
model_name2 = "model_dt2"
model_best_model1 = model_dt1_best_model
model_best_model2 = model_dt2_best_model
There is little difference in performance between the two datasets:

- ALLFeat = complete feature-engineered dataset
- NoDept = dataset with the dummy-encoded dept fields removed

The AUC score for the Decision Tree is ~0.978. This is a significant improvement over Logistic Regression, which came in at ~0.886 but had a low "predicted to leave" score.
F1, Precision, Recall and Accuracy are also within acceptable ranges.
Predictions have also improved significantly, with the "predicted to leave" F1 score now at ~0.94.
display_results()
| | Model | Precision | Recall | F1 | Accuracy | AUC | Predict Leave | Predict Stay |
---|---|---|---|---|---|---|---|---|
5 | dt1 - ALLFeat - test | 0.9784 | 0.9785 | 0.9785 | 0.9785 | 0.9791 | 0.9359 | 0.9871 |
8 | dt2 - NOdept - test | 0.9784 | 0.9785 | 0.9785 | 0.9785 | 0.9791 | 0.9359 | 0.9871 |
6 | dt1 - ALLFeat - train | 0.9784 | 0.9786 | 0.9784 | 0.9786 | 0.9789 | 0.9352 | 0.9872 |
9 | dt2 - NOdept - train | 0.9784 | 0.9786 | 0.9784 | 0.9786 | 0.9789 | 0.9352 | 0.9872 |
7 | dt2 - NOdept - GS train | 0.9506 | 0.9135 | 0.9317 | 0.9774 | 0.9737 | NaN | NaN |
4 | dt1 - ALLFeat - GS train | 0.9506 | 0.9135 | 0.9317 | 0.9774 | 0.9737 | NaN | NaN |
1 | lr1 - ALLFeat - train | 0.7960 | 0.8266 | 0.8041 | 0.8266 | 0.8923 | 0.3284 | 0.9005 |
3 | lr2 - NOdept - train | 0.7930 | 0.8248 | 0.8015 | 0.8248 | 0.8919 | 0.3180 | 0.8995 |
0 | lr1 - ALLFeat - test | 0.7943 | 0.8216 | 0.8031 | 0.8216 | 0.8819 | 0.3413 | 0.8969 |
2 | lr2 - NOdept - test | 0.7952 | 0.8227 | 0.8039 | 0.8227 | 0.8819 | 0.3426 | 0.8975 |
# Prepare confusion matrix for dt1 test
cm_test1 = metrics.confusion_matrix(y_test, y_pred_test) # Use the optimized model
#cm_test1_percent = cm_test1 / cm_test1.sum() * 100
# Prepare confusion matrix for dt2 test
cm_test2 = metrics.confusion_matrix(y_test2, y_pred_test2) # Use the optimized model
#cm_test2_percent = cm_test2 / cm_test2.sum() * 100
#cm = confusion_matrix(y_test, y_pred_test, labels=model_lr1.classes_)
# Plot confusion matrix
#disp = ConfusionMatrixDisplay(confusion_matrix=cm,
# display_labels=model_lr1.classes_)
#disp.plot(values_format='');
fig, ax = plt.subplots(2, 2, figsize=(10,8))
# Calculate percentages for dt1 TEST
sum_by_true_class = np.sum(cm_test1, axis=1)
percentage_matrix = cm_test1 / sum_by_true_class[:, np.newaxis]
model_name = "dt1"
# Create a figure and plot the percentage confusion matrix as a heatmap
sns.heatmap(percentage_matrix, annot=True, fmt=".2%", cmap="Blues", ax = ax[0,0]) #, xticklabels=model_lr1.classes_, yticklabels=model_lr1.classes_)
ax[0,0].set_title('{} Confusion Matrix (Percentage)'.format(model_name))
ax[0,0].set_ylabel('True label')
ax[0,0].set_xlabel('Predicted label')
ax[0,0].text(0.3, 0.25, '(TN)\nTrue Stay', color='white')
ax[0,0].text(1.3, 0.25, '(FP) type 1\n False Leave', color='black')
ax[0,0].text(0.3, 1.25, '(FN) type 2\n False Stay', color='black')
ax[0,0].text(1.3, 1.25, '(TP)\nTrue Leave', color='white')
# Create a figure and plot the COUNT confusion matrix as a heatmap for dt1 TEST
sns.heatmap(cm_test1, annot=True, fmt=".0f", cmap="Blues", ax = ax[1,0])#, xticklabels=class_labels, yticklabels=class_labels)
ax[1,0].set_title('{} Confusion Matrix (Count)'.format(model_name))
ax[1,0].set_ylabel('True label')
ax[1,0].set_xlabel('Predicted label')
ax[1,0].text(0.3, 0.25, '(TN)\nTrue Stay', color='white')
ax[1,0].text(1.3, 0.25, '(FP) type 1\n False Leave', color='black')
ax[1,0].text(0.3, 1.25, '(FN) type 2\n False Stay', color='black')
ax[1,0].text(1.3, 1.25, '(TP)\nTrue Leave', color='black')
# Calculate percentages for dt2 TEST
sum_by_true_class = np.sum(cm_test2, axis=1)
percentage_matrix = cm_test2 / sum_by_true_class[:, np.newaxis]
model_name = "dt2"
sns.heatmap(percentage_matrix, annot=True, fmt=".2%", cmap="Blues", ax = ax[0,1])#, xticklabels=class_labels, yticklabels=class_labels)
ax[0,1].set_title('{} Confusion Matrix (Percentage)'.format(model_name))
ax[0,1].set_ylabel('True label')
ax[0,1].set_xlabel('Predicted label')
ax[0,1].text(0.3, 0.25, '(TN)\nTrue Stay', color='white')
ax[0,1].text(1.3, 0.25, '(FP) type 1\n False Leave', color='black')
ax[0,1].text(0.3, 1.25, '(FN) type 2\n False Stay', color='black')
ax[0,1].text(1.3, 1.25, '(TP)\nTrue Leave', color='white')
# Create a figure and plot the COUNT confusion matrix as a heatmap for dt2 TEST
sns.heatmap(cm_test2, annot=True, fmt=".0f", cmap="Blues", ax = ax[1,1])#, xticklabels=class_labels, yticklabels=class_labels)
ax[1,1].set_title('{} Confusion Matrix (Count)'.format(model_name))
ax[1,1].set_ylabel('True label')
ax[1,1].set_xlabel('Predicted label')
ax[1,1].text(0.3, 0.25, '(TN)\nTrue Stay', color='white')
ax[1,1].text(1.3, 0.25, '(FP) type 1\n False Leave', color='black')
ax[1,1].text(0.3, 1.25, '(FN) type 2\n False Stay', color='black')
ax[1,1].text(1.3, 1.25, '(TP)\nTrue Leave', color='black')
plt.tight_layout()
plt.show()
#tree2_importances = pd.DataFrame(tree2.best_estimator_.feature_importances_, columns=X.columns)
# Get feature importances (Gini importance)
feature_importance1 = model_best_model1.feature_importances_
feature_importance2 = model_best_model2.feature_importances_
# Get the names of the features
feature_names1 = X.columns   # AllFeat feature names
feature_names2 = X2.columns  # NoDept feature names
# Create a DataFrame to store feature names and their Gini importance
feature_importance_df1 = pd.DataFrame({'Feature': feature_names1, 'Importance': feature_importance1})
feature_importance_df2 = pd.DataFrame({'Feature': feature_names2, 'Importance': feature_importance2})
feature_importance_df1.sort_values(by='Importance', ascending=False,axis=0, inplace=True)
feature_importance_df2.sort_values(by='Importance', ascending=False,axis=0, inplace=True)
merged_df = pd.merge(feature_importance_df1, feature_importance_df2, on='Feature', how='left', suffixes=(' model_dt1 - AllFeat', ' model_dt2 - NoDept'))
# Print the merged DataFrame
print(merged_df)
feature_importance_df1.sort_values(by='Importance', ascending=True,axis=0, inplace=True)
feature_importance_df2.sort_values(by='Importance', ascending=True,axis=0, inplace=True)
# Plot side-by-side bar plots
plt.figure(figsize=(12, 5))
plt.subplot(1, 2, 1)
plt.barh(feature_importance_df1['Feature'], feature_importance_df1['Importance'], color='skyblue')
plt.title('Feature Importance - {} - AllFeat'.format(model_name1))
plt.xlabel('Importance')
plt.ylabel('Feature')
plt.subplot(1, 2, 2)
plt.barh(feature_importance_df2['Feature'], feature_importance_df2['Importance'], color='salmon')
plt.title('Feature Importance - {} - NoDept'.format(model_name2))
plt.xlabel('Importance')
plt.ylabel('Feature')
plt.tight_layout()
plt.show()
             Feature  Importance model_dt1 - AllFeat  Importance model_dt2 - NoDept
0       satisfaction                           0.5898                         0.5898
1             tenure                           0.1542                         0.1542
2          last_eval                           0.1475                         0.1475
3     number_project                           0.1086                         0.1086
4     dept_technical                           0.0000                            NaN
5             salary                           0.0000                         0.0000
6          promotion                           0.0000                         0.0000
7         overworked                           0.0000                         0.0000
8    dept_accounting                           0.0000                            NaN
9            dept_hr                           0.0000                            NaN
10        dept_sales                           0.0000                            NaN
11        dept_randd                           0.0000                            NaN
12  dept_product_mng                           0.0000                            NaN
13    dept_marketing                           0.0000                            NaN
14   dept_management                           0.0000                            NaN
15           dept_it                           0.0000                            NaN
16      dept_support                           0.0000                            NaN
## Prepare predictions and calculate model scores
y_prob1 = model1.predict_proba(X_test)[:, 1]
y_prob2 = model2.predict_proba(X_test2)[:, 1]
precision1, recall1, _ = precision_recall_curve(y_test, y_prob1)
precision2, recall2, _ = precision_recall_curve(y_test2, y_prob2)
# Compute area under the curve (AUC)
auc_score1 = auc(recall1, precision1)
auc_score2 = auc(recall2, precision2)
print(type(precision1))
print(type(recall1))
print(type(auc_score1))
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.float64'>
# read in previous plot data
# Logistic Regression
variables = [['auc_score1', 'model_lr1'],
['auc_score2', 'model_lr2'],
['roc_auc1', 'model_lr1'],
['roc_auc2', 'model_lr2']
]
arrays = [['precision1','model_lr1'],
['recall1', 'model_lr1'],
['precision2','model_lr2'],
['recall2', 'model_lr2'],
['fpr1', 'model_lr1'],
['tpr1', 'model_lr1'],
['fpr2', 'model_lr2'],
['tpr2', 'model_lr2'],
]
loaded_plot_vars = {}
loaded_plot_arrays = {}
for var_name, model in variables:
    #print(var_name, model)
    #var = globals()[var_name]
    with open(f'99-documentation-project/08-plot_data/{model}-{var_name}.csv', 'r') as file:
        var = f'{model}-{var_name}'
        #print(var)
        loaded_plot_vars[var] = json.load(file)

for array_name, model in arrays:
    #print(array_name, model)
    #var = globals()[var_name]
    filepath = f'99-documentation-project/08-plot_data/{model}-{array_name}.csv'
    array = f'{model}-{array_name}'
    loaded_plot_arrays[array] = pd.read_csv(filepath)
#print(loaded_plot_arrays)
#var = loaded_plot_vars.get('model_lr1-roc_auc1')
#print(var)
#array= loaded_plot_arrays.get('model_dt2-precision2')
#lr1_fpr1 = loaded_plot_arrays.get('model_lr1-fpr1')
#va = lr1_fpr1.values()
#print(type(lr1_fpr1))
#print(lr1_fpr1)
# Retrieve arrays and variable from previous model
lr1_precision1 = loaded_plot_arrays.get('model_lr1-precision1') # precision array from LR model1 = AllFeat
lr1_recall1 = loaded_plot_arrays.get('model_lr1-recall1')       # recall array from LR model1 = AllFeat
lr1_auc_score1 = loaded_plot_vars.get('model_lr1-auc_score1')   # PR AUC score from LR model1 = AllFeat
lr2_precision2 = loaded_plot_arrays.get('model_lr2-precision2') # precision array from LR model2 = NoDept
lr2_recall2 = loaded_plot_arrays.get('model_lr2-recall2')       # recall array from LR model2 = NoDept
lr2_auc_score2 = loaded_plot_vars.get('model_lr2-auc_score2')   # PR AUC score from LR model2 = NoDept
lr1_fpr1 = loaded_plot_arrays.get('model_lr1-fpr1')             # fpr array from LR model1 = AllFeat
lr1_tpr1 = loaded_plot_arrays.get('model_lr1-tpr1')             # tpr array from LR model1 = AllFeat
lr1_roc_auc1 = loaded_plot_vars.get('model_lr1-roc_auc1')       # ROC AUC score from LR model1 = AllFeat
lr1_model_name1 = arrays[0][1]                                  # model name string 'model_lr1'
lr2_fpr2 = loaded_plot_arrays.get('model_lr2-fpr2')             # fpr array from LR model2 = NoDept
lr2_tpr2 = loaded_plot_arrays.get('model_lr2-tpr2')             # tpr array from LR model2 = NoDept
lr2_roc_auc2 = loaded_plot_vars.get('model_lr2-roc_auc2')       # ROC AUC score from LR model2 = NoDept
lr2_model_name2 = arrays[3][1]                                  # model name string 'model_lr2'
#print(lr1_precision1)
#print(lr1_recall1)
# Plot the Precision / Recall curve
#plt.subplot(1, 2, 1)
plt.figure(figsize=(12, 5))
plt.plot(recall1, precision1, color='blue', label=f'{model_name1} - AUC = {auc_score1:.2f}')
plt.plot(lr1_recall1, lr1_precision1, color='purple', label=f'{lr1_model_name1} - AUC = {lr1_auc_score1:.2f}')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('{} Precision-Recall Curve'.format(model_name1))
plt.legend(loc='best')
plt.axhline(y=0.5, color='red', linestyle='--', label='Random Guess')
plt.axhline(y=1, color='green', linestyle='--', label='Perfect')
plt.axvline(x=1, color='green', linestyle='--', label='Perfect')
plt.text(0.4, 0.52, 'Random Guess', color='red')
plt.text(0.4, 0.6, 'Better', color='black')
plt.text(0.4, 0.4, 'Worse', color='black')
plt.text(0.8, 1.01, 'Perfect', color='green')
plt.show()
#plt.subplot(1, 2, 2)
plt.figure(figsize=(12, 5))
plt.plot(recall2, precision2, color='blue', label=f'{model_name2} - AUC = {auc_score2:.2f}')
plt.plot(lr2_recall2, lr2_precision2, color='purple', label=f'{lr2_model_name2} - AUC = {lr2_auc_score2:.2f}')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('{} Precision-Recall Curve'.format(model_name2))
plt.legend(loc='best')
plt.axhline(y=0.5, color='red', linestyle='--', label='Random Guess')
plt.axhline(y=1, color='green', linestyle='--', label='Perfect')
plt.axvline(x=1, color='green', linestyle='--', label='Perfect')
plt.text(0.4, 0.52, 'Random Guess', color='red')
plt.text(0.4, 0.6, 'Better', color='black')
plt.text(0.4, 0.4, 'Worse', color='black')
plt.text(0.8, 1.01, 'Perfect', color='green')
#plt.tight_layout()
plt.show()
# Compute ROC curve
fpr1, tpr1, _ = roc_curve(y_test, y_prob1) # true positive rate, false positive rate
fpr2, tpr2, _ = roc_curve(y_test2, y_prob2) # true positive rate, false positive rate
# Compute area under the curve (AUC)
roc_auc1 = auc(fpr1, tpr1)
roc_auc2 = auc(fpr2, tpr2)
#plt.subplot(1, 2, 1)
# Plot ROC curve
plt.figure(figsize=(12, 5))
plt.plot(fpr1, tpr1, color='darkorange', lw=2, label=f'{model_name1} - AUC = {roc_auc1:.2f}') # Plot from current model
plt.plot(lr1_fpr1, lr1_tpr1, color='purple', lw=2, label=f'{lr1_model_name1} - AUC = {lr1_roc_auc1:.2f}') # plot from previous LR model
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--', label='Random')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('{} Receiver Operating Characteristic (ROC) Curve'.format(model_name1))
plt.legend(loc='best')
plt.axhline(y=1, color='green', linestyle='--', label='Perfect')
plt.axvline(x=0, color='green', linestyle='--', label='Perfect')
plt.text(0.4, 0.6, 'Better', color='black')
plt.text(0.4, 0.3, 'Worse', color='black')
plt.text(0.01, 1.01, 'Perfect', color='green')
plt.show()
#plt.subplot(1, 2,2)
# Plot ROC curve
plt.figure(figsize=(12, 5))
plt.plot(fpr2, tpr2, color='darkorange', lw=2, label=f'{model_name2} - AUC = {roc_auc2:.2f}')
plt.plot(lr2_fpr2, lr2_tpr2, color='purple', lw=2, label=f'{lr2_model_name2} - AUC = {lr2_roc_auc2:.2f}')
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--', label='Random')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('{} Receiver Operating Characteristic (ROC) Curve'.format(model_name2))
plt.legend(loc='best')
plt.axhline(y=1, color='green', linestyle='--', label='Perfect')
plt.axvline(x=0, color='green', linestyle='--', label='Perfect')
plt.text(0.4, 0.6, 'Better', color='black')
plt.text(0.4, 0.3, 'Worse', color='black')
plt.text(0.01, 1.01, 'Perfect', color='green')
plt.show()
# Save model plot data
arrays = [['precision1',model_name1],
['recall1', model_name1],
['precision2',model_name2],
['recall2', model_name2],
['fpr1', model_name1],
['tpr1', model_name1],
['fpr2', model_name2],
['tpr2', model_name2],
]
variables = [['auc_score1', model_name1],
['auc_score2', model_name2],
['roc_auc1', model_name1],
['roc_auc2', model_name2]
]
# Save plot data scores (auc, roc)
for var_name, model in variables:
    # print(model+"-"+var_name )
    var = globals()[var_name]
    with open(f'99-documentation-project/08-plot_data/{model}-{var_name}.csv', 'w') as file:
        json.dump(var, file)

# Save plot data arrays (recall, precision, fpr, tpr)
for array_name, model in arrays:
    #print(model+'-'+array_name)
    var = globals()[array_name]
    df = pd.DataFrame({array_name: var})
    df.to_csv(f'99-documentation-project/08-plot_data/{model}-{array_name}.csv', index=False, header=False)
Again, there is almost no difference in performance between the two datasets (AllFeat / NoDept) when modelled with either Logistic Regression or a Decision Tree. While there is a small improvement for the Decision Tree, there is no difference in feature importance between the datasets.
The improvement we see with the Decision Tree is in the predictions, where the "predicted to leave" F1 score has increased from ~34% (Logistic Regression) to ~94%.
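The prediction improvement quoted above can be read straight out of Results.csv; a minimal sketch (assumes the file layout written by the cells above):

# Sketch only - compare the per-class 'Predict Leave' F1 scores stored in Results.csv for the test runs
results = pd.read_csv("Results.csv").drop(columns=['Unnamed: 0'])
test_rows = results[results['Model'].str.contains('test')]
print(test_rows[['Model', 'AUC', 'Predict Leave', 'Predict Stay']].sort_values('Predict Leave', ascending=False))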