Document Title | Salifort Motors - ML Modelling - XGBoost |
Author | Rod Slater |
Version | 1.0 |
Created | 01-11-2023 |
Modified | 16-11-2023 |
Client Name | Salifort Motors |
Client Contact | Mr HR Team |
Client Email | hr@salifortmotors.it |
Client Project | HR Team Data Driven Solutions from Machine Learning Models |
ML Modelling using XGBoost for HR data provided by Salifort Motors. This notebook details the XGBoost modelling process and performance comparisons.
Use the refit option flags in this code once the model has been fitted and saved once.
Contents:
- xg1 - All Features included (xg1-ALLFeat): model_xg1 - train and test data
- xg2 - No Departments (xg2-NOdept): model_xg2 - train and test data
# Import packages
# Data manipulation
import numpy as np
import pandas as pd
# Data visualization
import matplotlib.pyplot as plt
import seaborn as sns
# Set Options
pd.set_option('display.max_columns', None)
# Data modelling Imports
from xgboost import XGBClassifier, XGBRegressor, plot_importance
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
# For metrics and helpful functions
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, precision_recall_fscore_support, \
f1_score, confusion_matrix, ConfusionMatrixDisplay, classification_report, precision_recall_curve, auc
from sklearn import metrics
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.tree import plot_tree
from sklearn.preprocessing import StandardScaler
from datetime import datetime as dt
import statsmodels.api as sm
import json
# For saving models
import pickle
# Shap Explainer
import shap
# set Pandas Display Options
pd.options.display.float_format = '{:.2f}'.format
#pd.set_option('display.precision', 2)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 120)
load_path = "/home/hass/Documents/Learning/Salifort-Motors-Capstone-Project/00-data_cleaned/" # Source folder for cleaned data
#save_path = "/home/hass/Documents/Learning/Salifort-Motors-Capstone-Project/04-pickle-ML-models/" # destination for pickle saved models
model_path = "/home/hass/Documents/Learning/Salifort-Motors-Capstone-Project/04-pickle-ML-models/" # path to load/save pickled models
import chime
import time
def beepr():
for x in range(3):
for i in range(3):
chime.success()
time.sleep(0.25)
time.sleep(1)
#beepr()
def display_results():
'''
Load Results.csv containing stored test scores and return the scores for display
In: none
Out: pandas df of Results.csv containing precision, recall, f1, accuracy, and AUC scores of the models
'''
model_results = pd.read_csv("Results.csv")
model_results.drop(columns=['Unnamed: 0'], inplace=True)
model_results = model_results.sort_values(by='AUC', ascending=False)
return model_results
def format_GS_results(model_name: str, model_object: object, metric: str):
'''
Returns a pandas df with the F1, recall, precision, accuracy, and auc scores
from the GridSearch.
Arguments:
model_name (string): what you want the model to be called in the output table
model_object: a fitted GridSearchCV object
metric (string): precision, recall, f1, accuracy, or auc
'''
# Create dictionary that maps input metric to actual metric name in GridSearchCV
metric_dict = {'auc': 'mean_test_roc_auc',
'precision': 'mean_test_precision',
'recall': 'mean_test_recall',
'f1': 'mean_test_f1',
'accuracy': 'mean_test_accuracy'
}
# Get all the results from the CV and put them in a df
cv_results = pd.DataFrame(model_object.cv_results_)
# Isolate the row of the df with the max(metric) score
best_estimator_results = cv_results.iloc[cv_results[metric_dict[metric]].idxmax(), :]
# Extract Accuracy, precision, recall, and f1 score from that row
auc = round(best_estimator_results.mean_test_roc_auc,3)
f1 = round(best_estimator_results.mean_test_f1,3)
recall = round(best_estimator_results.mean_test_recall,3)
precision = round(best_estimator_results.mean_test_precision,3)
accuracy = round(best_estimator_results.mean_test_accuracy,3)
# Create table of results
table = pd.DataFrame()
table = pd.DataFrame({'Model': [model_name],
'Precision': [precision],
'Recall': [recall],
'F1': [f1],
'Accuracy': [accuracy],
'AUC': [auc]
})
return table
# Get results from model to store in comparison table
def make_results(model_name: str, model_object: object, X_var, y_var, y_pred_var):
'''
Accepts as arguments:
model_name (string) - used as the model label in Results.csv
model_object - the fitted ML model
X_var, y_var, y_pred_var - the features, true labels, and predicted labels used to score the model
Returns a pandas df with the precision, recall, F1, accuracy, and AUC scores
for the model on the supplied data, plus per-class F1 scores for leave/stay.
'''
# Get all the results from the CV and put them in a dict
report = classification_report(y_var, y_pred_var, output_dict=True)
# Calculate precision, recall, and F1 score for the "True" class
predict_precision, predict_recall, predict_f1_score, _ = precision_recall_fscore_support(y_var, y_pred_var, average=None)
f1_true_class = predict_f1_score[1] # Index 1 corresponds to the "True" class
f1_false_class = predict_f1_score[0] # Index 0 corresponds to the "False" class
# Extract accuracy, precision, recall, and f1 score from that row
f1 = report['weighted avg']['f1-score']
recall = report['weighted avg']['recall']
precision = report['weighted avg']['precision']
accuracy = report['accuracy']
auc = roc_auc_score(y_var, model_object.predict_proba(X_var)[:,1])
# Create table of results
table = pd.DataFrame({'Model': model_name,
'Precision': precision,
'Recall': recall,
'F1': f1,
'Accuracy': accuracy,
'AUC': auc,
'Predict Leave': f1_true_class,
'Predict Stay' : f1_false_class
},
index=[0]
)
return table
def classification_report_summary(name: str, y_var, y_pred_var):
'''
Gather stats from predictions
In:
name: str : title for the report header, e.g. TEST or TRAIN
y_var: true labels
y_pred_var: predicted labels
Out: printed precision, recall, f1 and accuracy scores, the weighted averages, and the per-class F1 scores for leave/stay
'''
targetnames = ['Predicted would not leave', 'Predicted would leave']
print("\nClassification Report : ", name)
print('\u2500' * 35)
print(classification_report(y_var, y_pred_var, target_names=targetnames))
print("Recall : {:.4%}".format(recall_score(y_var, y_pred_var)))
print("f1_score : {:.4%}".format(f1_score(y_var, y_pred_var)))
print("Precision : {:.4%}".format(precision_score(y_var, y_pred_var)))
print("Accuracy : {:.4%}".format(accuracy_score(y_var, y_pred_var)))
report = classification_report(y_var, y_pred_var, output_dict=True)
print()
print('\u2500' * 35)
print("Weighted Average")
print('\u2500' * 35)
print("Recall : {:.4%}".format(report['weighted avg']['recall']))
print("f1 Score : {:.4%}".format(report['weighted avg']['f1-score']))
print("Precision : {:.4%}".format(report['weighted avg']['precision']))
print("Support : {:.4%}".format(report['weighted avg']['support']))
# Calculate precision, recall, and F1 score for the "True" class
predict_precision, predict_recall, predict_f1_score, _ = precision_recall_fscore_support(y_var, y_pred_var, average=None)
f1_true_class = predict_f1_score[1] # Index 1 corresponds to the "True" class
f1_false_class = predict_f1_score[0] # Index 0 corresponds to the "False" class
print()
print('\u2500' * 35)
print("Prediction F1 score")
print('\u2500' * 35)
print("Predict Leave : {:.4%}".format(f1_true_class))
print("Support Stay : {:.4%}".format(f1_false_class))
def write_pickle(path, model_object, save_as:str):
'''
In:
path: path of folder where you want to save the pickle
model_object: a model you want to pickle
save_as: filename for how you want to save the model
Out: A call to pickle the model in the folder indicated
'''
with open(path + save_as + '.pickle', 'wb') as to_write:
pickle.dump(model_object, to_write)
def read_pickle(path, saved_model_name:str):
'''
In:
path: path to folder where you want to read from
saved_model_name: filename of pickled model you want to read in
Out:
model: the pickled model
'''
with open(path + saved_model_name + '.pickle', 'rb') as to_read:
model = pickle.load(to_read)
return model
xg1 - All Features (ALLFeat)
Two datasets are used for model performance comparison:
- salifort_data_FE.csv (THIS DATA) is the full dataset, feature engineered with salary encoded to ordinal, avg_mth_hrs binary encoded to overworked, and dept encoded with dummies.
- salifort_data_FE_focus.csv is the same data with the dummy-encoded dept fields removed.
Dept appears to have low correlation across the dataset and I'm curious how much the models are influenced by low-correlation features. It turns out that low-correlation features have little impact on model performance, which is no surprise really!
rerun = flag identifying the first run of the model comparisons. 0 = first run, a NEW Results.csv file is created with headers / 1 = continuation, results are appended to the existing file. (A sketch of this gating pattern follows below.)
dataset = text indicating which dataset is being used, added to the model description when saving to Results.csv
model_prefix = text indicating which ML model is being used, added to the model description when saving to Results.csv
refit = XGBoost fits with this data take 20-30 minutes. 1 = fit the data and pickle-save the model / 0 = don't refit, load the pickled model
fitver = suffix added to the pickled model filename to distinguish the xg1 (all features) and xg2 (focus features, no department features) datasets
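The rerun flag is described above but not exercised in the cells that follow; this is a minimal sketch (a hypothetical helper, assuming the Results.csv column layout produced by make_results) of how it could gate creating a fresh results file versus appending to an existing one.
import os
import pandas as pd

def init_results(rerun: int, path: str = "./Results.csv") -> None:
    # Hypothetical helper, not part of the original notebook cells
    # rerun = 0 : first run - write a new Results.csv containing only the header row
    # rerun = 1 : continuation - leave the existing file in place so later cells can append to it
    if rerun == 0 or not os.path.exists(path):
        cols = ['Model', 'Precision', 'Recall', 'F1', 'Accuracy', 'AUC', 'Predict Leave', 'Predict Stay']
        pd.DataFrame(columns=cols).to_csv(path, index=True)  # header row only, matching the later index=True appends

# Example usage: init_results(rerun)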
# Load cleaned dataset into a dataframe
print("Started / Last Run =", dt.now().strftime("%Y-%m-%d %H:%M:%S"),"\n")
# Feature engineering on salary, avg_mnth_hrs, dept, outliers removed
df1 = pd.read_csv(load_path + "data_cleaned_NoOl_FE_AllFeat.csv", index_col = False)
# Always sort the index
df1.sort_index(axis=1, inplace=True)
# model_prefix : Str prefix for results.csv added to dataset
model_prefix = 'xg1'
# dataset : Str = dataset save name for results.csv
dataset = 'ALLFeat'
# fitver: Str = Suffix for the pickle save filename
fitver = 'AllFeat'
# rerun : int = Flag to set 1 = append to Results.csv / 0 = Overwrite with new file
rerun = 1
# refit : int - 1 = Fit Data and Save model / 0 = load model, don't fit data
refit = 0
print("df1 - All Features - Feature engineering on salary, avg_mnth_hrs, dept. outliers removed\n")
# Display dataframe columns
print(df1.info())
Started / Last Run = 2023-12-04 12:10:43

df1 - All Features - Feature engineering on salary, avg_mnth_hrs, dept. outliers removed

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11167 entries, 0 to 11166
Data columns (total 18 columns):
 #   Column            Non-Null Count  Dtype
---  ------            --------------  -----
 0   dept_accounting   11167 non-null  bool
 1   dept_hr           11167 non-null  bool
 2   dept_it           11167 non-null  bool
 3   dept_management   11167 non-null  bool
 4   dept_marketing    11167 non-null  bool
 5   dept_product_mng  11167 non-null  bool
 6   dept_randd        11167 non-null  bool
 7   dept_sales        11167 non-null  bool
 8   dept_support      11167 non-null  bool
 9   dept_technical    11167 non-null  bool
 10  last_eval         11167 non-null  float64
 11  left              11167 non-null  float64
 12  number_project    11167 non-null  float64
 13  overworked        11167 non-null  int64
 14  promotion         11167 non-null  float64
 15  salary            11167 non-null  int64
 16  satisfaction      11167 non-null  float64
 17  tenure            11167 non-null  float64
dtypes: bool(10), float64(6), int64(2)
memory usage: 807.1 KB
None
model_data = df1.copy()
# Isolate the outcome variable
Y = model_data['left']
# Select & Isolate the feature variables and drop the outcome variable
X = model_data.copy()
X = X.drop('left', axis = 1)
#X = sm.add_constant(X)
X.columns
Index(['dept_accounting', 'dept_hr', 'dept_it', 'dept_management', 'dept_marketing', 'dept_product_mng', 'dept_randd', 'dept_sales', 'dept_support', 'dept_technical', 'last_eval', 'number_project', 'overworked', 'promotion', 'salary', 'satisfaction', 'tenure'], dtype='object')
# Prepare training and test data
X_train, X_test, y_train, y_test = train_test_split(X,Y, test_size=0.25, stratify=Y, random_state=0)
# Instantiate model
# xgb.XGBRegressor() for regression problems, xgb.XGBClassifier() for classification problems
xgc = XGBClassifier(random_state=0)
# Assign a dictionary of hyperparameters to search over
cv_params = {'learning_rate': [0.05, 0.1, 0.15],
'max_depth': [3, 4, 5, 6, 8],
'min_child_weight': [1, 3, 5, 7],
'gamma': [0.0, 0.1, 0.2],
'colsample_bytree': [0.3, 0.4]
}
# Assign a dictionary of scoring metrics to capture
scoring = ('accuracy', 'precision', 'recall', 'f1', 'roc_auc')
# Instantiate GridSearch
model_xg1 = GridSearchCV(xgc, cv_params, scoring=scoring, cv=4, refit='roc_auc')
X_train.columns
Index(['dept_accounting', 'dept_hr', 'dept_it', 'dept_management', 'dept_marketing', 'dept_product_mng', 'dept_randd', 'dept_sales', 'dept_support', 'dept_technical', 'last_eval', 'number_project', 'overworked', 'promotion', 'salary', 'satisfaction', 'tenure'], dtype='object')
%%time
print("Started / Last Run =", dt.now().strftime("%Y-%m-%d %H:%M:%S"))# Fit data
if refit == 1: # if 1, refit, we'll save the already fit model
model_xg1.fit(X_train, y_train) # --> Wall time: 21min 5s /XGBoost 20 mins
beepr()
print("Finish Time =", dt.now().strftime("%H:%M:%S"))
Started / Last Run = 2023-12-04 12:10:44
Finish Time = 12:10:44
CPU times: user 0 ns, sys: 491 µs, total: 491 µs
Wall time: 385 µs
## Write pickle
if refit == 1: # refit = 1 run the fit and save the model
write_pickle(model_path, model_xg1, 'hr_xg1-'+fitver)
# Read in pickle
if refit == 0: # refit = 0 load model, don't fit the data
model_xg1 = read_pickle(model_path, 'hr_xg1-'+fitver)
X_train.columns
Index(['dept_accounting', 'dept_hr', 'dept_it', 'dept_management', 'dept_marketing', 'dept_product_mng', 'dept_randd', 'dept_sales', 'dept_support', 'dept_technical', 'last_eval', 'number_project', 'overworked', 'promotion', 'salary', 'satisfaction', 'tenure'], dtype='object')
# Get the parameters of the best-performing model
print("Best Parameters : ", model_xg1.best_params_)
# Get the average f1 score of the best-performing model
print("Best Score : ", model_xg1.best_score_)
# Get the best estimators of the parameters
print("Best Estimator : ", model_xg1.best_estimator_)
Best Parameters :  {'colsample_bytree': 0.3, 'gamma': 0.2, 'learning_rate': 0.15, 'max_depth': 4, 'min_child_weight': 1}
Best Score :  0.9819165476477163
Best Estimator :  XGBClassifier(base_score=None, booster=None, callbacks=None, colsample_bylevel=None, colsample_bynode=None, colsample_bytree=0.3, device=None, early_stopping_rounds=None, enable_categorical=False, eval_metric=None, feature_types=None, gamma=0.2, grow_policy=None, importance_type=None, interaction_constraints=None, learning_rate=0.15, max_bin=None, max_cat_threshold=None, max_cat_to_onehot=None, max_delta_step=None, max_depth=4, max_leaves=None, min_child_weight=1, missing=nan, monotone_constraints=None, multi_strategy=None, n_estimators=None, n_jobs=None, num_parallel_tree=None, random_state=0, ...)
pd.options.display.float_format = '{:.3f}'.format
model_xg1_results = format_GS_results(model_prefix+' - '+dataset+' - '+'GS train', model_xg1, 'auc')
model_xg1_results.to_csv("./Results.csv", mode='a', index=True, header=False)
pd.options.display.float_format = '{:.4f}'.format
print(model_xg1_results)
display_results()
                      Model  Precision  Recall      F1  Accuracy     AUC
0  xg1 - ALLFeat - GS train     0.9570  0.9100  0.9330    0.9780  0.9820
 | Model | Precision | Recall | F1 | Accuracy | AUC | Predict Leave | Predict Stay |
---|---|---|---|---|---|---|---|---|
10 | xg1 - ALLFeat - GS train | 0.9570 | 0.9100 | 0.9330 | 0.9780 | 0.9820 | NaN | NaN |
5 | dt1 - ALLFeat - test | 0.9784 | 0.9785 | 0.9785 | 0.9785 | 0.9791 | 0.9359 | 0.9871 |
8 | dt2 - NOdept - test | 0.9784 | 0.9785 | 0.9785 | 0.9785 | 0.9791 | 0.9359 | 0.9871 |
6 | dt1 - ALLFeat - train | 0.9784 | 0.9786 | 0.9784 | 0.9786 | 0.9789 | 0.9352 | 0.9872 |
9 | dt2 - NOdept - train | 0.9784 | 0.9786 | 0.9784 | 0.9786 | 0.9789 | 0.9352 | 0.9872 |
7 | dt2 - NOdept - GS train | 0.9506 | 0.9135 | 0.9317 | 0.9774 | 0.9737 | NaN | NaN |
4 | dt1 - ALLFeat - GS train | 0.9506 | 0.9135 | 0.9317 | 0.9774 | 0.9737 | NaN | NaN |
1 | lr1 - ALLFeat - train | 0.7960 | 0.8266 | 0.8041 | 0.8266 | 0.8923 | 0.3284 | 0.9005 |
3 | lr2 - NOdept - train | 0.7930 | 0.8248 | 0.8015 | 0.8248 | 0.8919 | 0.3180 | 0.8995 |
0 | lr1 - ALLFeat - test | 0.7943 | 0.8216 | 0.8031 | 0.8216 | 0.8819 | 0.3413 | 0.8969 |
2 | lr2 - NOdept - test | 0.7952 | 0.8227 | 0.8039 | 0.8227 | 0.8819 | 0.3426 | 0.8975 |
# Make predictions on test data
xg1_best_model = model_xg1.best_estimator_ # Store best model parameters for later testing
y_pred_test = xg1_best_model.predict(X_test)
y_pred_train= xg1_best_model.predict(X_train) # Used for confusion matrix plots later
classification_report_summary(model_prefix+' - '+dataset+' - '+'test', y_test, y_pred_test)
Classification Report :  xg1 - ALLFeat - test
───────────────────────────────────
                           precision    recall  f1-score   support

Predicted would not leave       0.98      0.99      0.99      2321
    Predicted would leave       0.95      0.93      0.94       471

                 accuracy                           0.98      2792
                macro avg       0.97      0.96      0.96      2792
             weighted avg       0.98      0.98      0.98      2792

Recall : 92.5690%
f1_score : 93.7634%
Precision : 94.9891%
Accuracy : 97.9226%

───────────────────────────────────
Weighted Average
───────────────────────────────────
Recall : 97.9226%
f1 Score : 97.9119%
Precision : 97.9075%
Support : 279200.0000%

───────────────────────────────────
Prediction F1 score
───────────────────────────────────
Predict Leave : 93.7634%
Support Stay : 98.7538%
model_xg1_results = make_results(model_prefix+' - '+dataset+' - '+'test', xg1_best_model, X_test, y_test, y_pred_test)
model_xg1_results.to_csv("./Results.csv", mode='a', index=True, header=False)
print(model_xg1_results)
print()
display_results()
                  Model  Precision  Recall      F1  Accuracy     AUC  Predict Leave  Predict Stay
0  xg1 - ALLFeat - test     0.9791  0.9792  0.9791    0.9792  0.9860         0.9376        0.9875
 | Model | Precision | Recall | F1 | Accuracy | AUC | Predict Leave | Predict Stay |
---|---|---|---|---|---|---|---|---|
11 | xg1 - ALLFeat - test | 0.9791 | 0.9792 | 0.9791 | 0.9792 | 0.9860 | 0.9376 | 0.9875 |
10 | xg1 - ALLFeat - GS train | 0.9570 | 0.9100 | 0.9330 | 0.9780 | 0.9820 | NaN | NaN |
5 | dt1 - ALLFeat - test | 0.9784 | 0.9785 | 0.9785 | 0.9785 | 0.9791 | 0.9359 | 0.9871 |
8 | dt2 - NOdept - test | 0.9784 | 0.9785 | 0.9785 | 0.9785 | 0.9791 | 0.9359 | 0.9871 |
6 | dt1 - ALLFeat - train | 0.9784 | 0.9786 | 0.9784 | 0.9786 | 0.9789 | 0.9352 | 0.9872 |
9 | dt2 - NOdept - train | 0.9784 | 0.9786 | 0.9784 | 0.9786 | 0.9789 | 0.9352 | 0.9872 |
7 | dt2 - NOdept - GS train | 0.9506 | 0.9135 | 0.9317 | 0.9774 | 0.9737 | NaN | NaN |
4 | dt1 - ALLFeat - GS train | 0.9506 | 0.9135 | 0.9317 | 0.9774 | 0.9737 | NaN | NaN |
1 | lr1 - ALLFeat - train | 0.7960 | 0.8266 | 0.8041 | 0.8266 | 0.8923 | 0.3284 | 0.9005 |
3 | lr2 - NOdept - train | 0.7930 | 0.8248 | 0.8015 | 0.8248 | 0.8919 | 0.3180 | 0.8995 |
0 | lr1 - ALLFeat - test | 0.7943 | 0.8216 | 0.8031 | 0.8216 | 0.8819 | 0.3413 | 0.8969 |
2 | lr2 - NOdept - test | 0.7952 | 0.8227 | 0.8039 | 0.8227 | 0.8819 | 0.3426 | 0.8975 |
# Make predictions on test data
xg1_best_model = model_xg1.best_estimator_ # Store best model parameters for later testing
y_pred_test = xg1_best_model.predict(X_test)
y_pred_train= xg1_best_model.predict(X_train) # Used for confusion matrix plots later
classification_report_summary(model_prefix+' - '+dataset+' - '+'train', y_train, y_pred_train)
Classification Report :  xg1 - ALLFeat - train
───────────────────────────────────
                           precision    recall  f1-score   support

Predicted would not leave       0.98      0.99      0.99      6964
    Predicted would leave       0.96      0.91      0.94      1411

                 accuracy                           0.98      8375
                macro avg       0.97      0.95      0.96      8375
             weighted avg       0.98      0.98      0.98      8375

Recall : 91.4954%
f1_score : 93.8568%
Precision : 96.3433%
Accuracy : 97.9821%

───────────────────────────────────
Weighted Average
───────────────────────────────────
Recall : 97.9821%
f1 Score : 97.9612%
Precision : 97.9656%
Support : 837500.0000%

───────────────────────────────────
Prediction F1 score
───────────────────────────────────
Predict Leave : 93.8568%
Support Stay : 98.7928%
model_xg1_results = make_results(model_prefix+' - '+dataset+' - '+'train', xg1_best_model, X_train, y_train, y_pred_train)
model_xg1_results.to_csv("./Results.csv", mode='a', index=True, header=False)
print(model_xg1_results)
print()
display_results()
                   Model  Precision  Recall      F1  Accuracy     AUC  Predict Leave  Predict Stay
0  xg1 - ALLFeat - train     0.9797  0.9798  0.9796    0.9798  0.9870         0.9386        0.9879
 | Model | Precision | Recall | F1 | Accuracy | AUC | Predict Leave | Predict Stay |
---|---|---|---|---|---|---|---|---|
12 | xg1 - ALLFeat - train | 0.9797 | 0.9798 | 0.9796 | 0.9798 | 0.9870 | 0.9386 | 0.9879 |
11 | xg1 - ALLFeat - test | 0.9791 | 0.9792 | 0.9791 | 0.9792 | 0.9860 | 0.9376 | 0.9875 |
10 | xg1 - ALLFeat - GS train | 0.9570 | 0.9100 | 0.9330 | 0.9780 | 0.9820 | NaN | NaN |
5 | dt1 - ALLFeat - test | 0.9784 | 0.9785 | 0.9785 | 0.9785 | 0.9791 | 0.9359 | 0.9871 |
8 | dt2 - NOdept - test | 0.9784 | 0.9785 | 0.9785 | 0.9785 | 0.9791 | 0.9359 | 0.9871 |
6 | dt1 - ALLFeat - train | 0.9784 | 0.9786 | 0.9784 | 0.9786 | 0.9789 | 0.9352 | 0.9872 |
9 | dt2 - NOdept - train | 0.9784 | 0.9786 | 0.9784 | 0.9786 | 0.9789 | 0.9352 | 0.9872 |
7 | dt2 - NOdept - GS train | 0.9506 | 0.9135 | 0.9317 | 0.9774 | 0.9737 | NaN | NaN |
4 | dt1 - ALLFeat - GS train | 0.9506 | 0.9135 | 0.9317 | 0.9774 | 0.9737 | NaN | NaN |
1 | lr1 - ALLFeat - train | 0.7960 | 0.8266 | 0.8041 | 0.8266 | 0.8923 | 0.3284 | 0.9005 |
3 | lr2 - NOdept - train | 0.7930 | 0.8248 | 0.8015 | 0.8248 | 0.8919 | 0.3180 | 0.8995 |
0 | lr1 - ALLFeat - test | 0.7943 | 0.8216 | 0.8031 | 0.8216 | 0.8819 | 0.3413 | 0.8969 |
2 | lr2 - NOdept - test | 0.7952 | 0.8227 | 0.8039 | 0.8227 | 0.8819 | 0.3426 | 0.8975 |
# Make predictions on train data (used for the SHAP analysis below)
xg1_best_model = model_xg1.best_estimator_ # Store best model parameters for later testing
y_pred_train = xg1_best_model.predict(X_train)
explainer = shap.TreeExplainer(xg1_best_model)
# Compute the SHAP values for a set of observations
shap_values = explainer(X_train)
plt.grid(True, linestyle='--', alpha=0.7)
# Plot the SHAP values
shap.plots.waterfall(shap_values[0])
plt.grid(True, linestyle='--', alpha=0.7)
shap.summary_plot(shap_values, X_train)
plt.grid(True, linestyle='--', alpha=0.7)
shap.plots.bar(shap_values[0])
[12:10:47] WARNING: /workspace/src/c_api/c_api.cc:1240: Saving into deprecated binary model format, please consider using `json` or `ubj`. Model format will default to JSON in XGBoost 2.2 if not specified.
explainer = shap.TreeExplainer(xg1_best_model)
# Calculate SHAP values for the test set
shap_values = explainer.shap_values(X_test)
# Plot SHAP summary plot
shap.summary_plot(shap_values, X_test, feature_names=X_test.columns)
# Add a title to the plot
plt.title('SHAP Summary Plot')
# Display the plot
plt.show()
[12:10:53] WARNING: /workspace/src/c_api/c_api.cc:1240: Saving into deprecated binary model format, please consider using `json` or `ubj`. Model format will default to JSON in XGBoost 2.2 if not specified.
xg2 - No Departments (NOdept)
Two datasets are used for model performance comparison:
- salifort_data_FE.csv is the full dataset, feature engineered with salary encoded to ordinal, avg_mth_hrs binary encoded to overworked, and dept encoded with dummies.
- salifort_data_FE_focus.csv (THIS DATA) is the same data with the dummy-encoded dept fields removed (see the sketch after this list).
Dept appears to have low correlation across the dataset and I'm curious how much the models are influenced by low-correlation features. It turns out that low-correlation features have little impact on model performance, which is no surprise really!
rerun = flag identifying the first run of the model comparisons. 0 = first run, a NEW Results.csv file is created with headers / 1 = continuation, results are appended to the existing file.
dataset = text indicating which dataset is being used, added to the model description when saving to Results.csv
model_prefix = text indicating which ML model is being used, added to the model description when saving to Results.csv
refit = fits with this data take 20-30 minutes. 1 = fit the data and pickle-save the model / 0 = don't refit, load the pickled model
fitver = suffix added to the pickled model filename to distinguish the xg1 (all features) and xg2 (focus features, no department features) datasets
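For reference, a minimal sketch (an assumption about the upstream preprocessing notebook, not code that runs here) of how the dummy-encoded dept_* columns could be dropped from the ALLFeat file to produce this NoDept dataset:
import pandas as pd

# Hypothetical illustration of how the NoDept file may have been derived upstream
all_feat = pd.read_csv(load_path + "data_cleaned_NoOl_FE_AllFeat.csv", index_col=False)
dept_cols = [c for c in all_feat.columns if c.startswith('dept_')]  # dummy-encoded department columns
no_dept = all_feat.drop(columns=dept_cols)  # remove the low-correlation department features
# no_dept.to_csv(load_path + "data_cleaned_NoOl_FE_NoDept.csv", index=False)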
# Load cleaned dataset into a dataframe
print("Started / Last Run =", dt.now().strftime("%Y-%m-%d %H:%M:%S"),"\n")
# Feature engineering on salary, avg_mnth_hrs, outliers removed, departments removed
df2 = pd.read_csv(load_path + "data_cleaned_NoOl_FE_NoDept.csv", index_col = False)
# Always sort the index
df2.sort_index(axis=1, inplace=True)
# model_prefix : Str prefix for results.csv added to dataset
model_prefix = 'xg2'
# dataset : Str = dataset save name for results.csv
dataset = 'NOdept'
# fitver : Str = Suffix for the pickle save filename
fitver = 'NOdept'
# rerun : int = Flag to set 1 = append to Results.csv / 0 = Overwrite with new file
rerun = 1
# refit : int - 1 = Fit Data and Save model / 0 = load model, don't fit data
refit = 0 # 1 = Fit Data and Save model / 0 = load model, don't fit data
print("df2 - FOCUS FEATURES - Feature engineering on salary, avg_mnth_hrs, dept REMOVED, outliers removed\n")
# Display dataframe columns
print(df2.info())
Started / Last Run = 2023-12-04 12:10:55

df2 - FOCUS FEATURES - Feature engineering on salary, avg_mnth_hrs, dept REMOVED, outliers removed

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11167 entries, 0 to 11166
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype
---  ------          --------------  -----
 0   last_eval       11167 non-null  float64
 1   left            11167 non-null  float64
 2   number_project  11167 non-null  float64
 3   overworked      11167 non-null  int64
 4   promotion       11167 non-null  float64
 5   salary          11167 non-null  int64
 6   satisfaction    11167 non-null  float64
 7   tenure          11167 non-null  float64
dtypes: float64(6), int64(2)
memory usage: 698.1 KB
None
model_data2 = df2.copy()
# Isolate the outcome variable
Y2 = model_data2['left']
# Select & Isolate the feature variables and drop the outcome variable
X2 = model_data2.copy()
X2 = X2.drop('left', axis = 1)
#X = sm.add_constant(X)
X2.columns
Index(['last_eval', 'number_project', 'overworked', 'promotion', 'salary', 'satisfaction', 'tenure'], dtype='object')
# Prepare training and test data
X_train2, X_test2, y_train2, y_test2 = train_test_split(X2, Y2, test_size=0.25, stratify=Y2, random_state=0)
# Instantiate model
# xgb.XGBRegressor() for regression problems, xgb.XGBClassifier() for classification problems
xgc = XGBClassifier(random_state=0)
# Assign a dictionary of hyperparameters to search over
cv_params = {'learning_rate': [0.05, 0.1, 0.15],
'max_depth': [3, 4, 5, 6, 8],
'min_child_weight': [1, 3, 5, 7],
'gamma': [0.0, 0.1, 0.2],
'colsample_bytree': [0.3, 0.4]
}
# Assign a dictionary of scoring metrics to capture
scoring = ('accuracy', 'precision', 'recall', 'f1', 'roc_auc')
# Instantiate GridSearch
model_xg2 = GridSearchCV(xgc, cv_params, scoring=scoring, cv=4, refit='roc_auc')
%%time
print("Started / Last Run =", dt.now().strftime("%Y-%m-%d %H:%M:%S"))
# Fit data
if refit == 1:
model_xg2.fit(X_train2, y_train2) # --> Wall time: 18-21 mins
beepr()
print("Finish Time =", dt.now().strftime("%H:%M:%S"))
Started / Last Run = 2023-12-04 12:10:56
Finish Time = 12:10:56
CPU times: user 593 µs, sys: 133 µs, total: 726 µs
Wall time: 851 µs
## Write pickle
if refit == 1: # refit = 1 run the fit and save the model
write_pickle(model_path, model_xg2, 'hr_xg2-'+fitver)
# Read in pickle
if refit == 0: # refit = 0 load model, don't fit the data
model_xg2 = read_pickle(model_path, 'hr_xg2-'+fitver)
# Get the parameters of the best-performing model
print("Best Parameters : ", model_xg2.best_params_)
# Get the average f1 score of the best-performing model
print("Best Score : ", model_xg2.best_score_)
# Get the best estimators of the parameters
print("Best Estimator : ", model_xg2.best_estimator_)
Best Parameters :  {'colsample_bytree': 0.3, 'gamma': 0.2, 'learning_rate': 0.15, 'max_depth': 4, 'min_child_weight': 1}
Best Score :  0.9806005736725114
Best Estimator :  XGBClassifier(base_score=None, booster=None, callbacks=None, colsample_bylevel=None, colsample_bynode=None, colsample_bytree=0.3, device=None, early_stopping_rounds=None, enable_categorical=False, eval_metric=None, feature_types=None, gamma=0.2, grow_policy=None, importance_type=None, interaction_constraints=None, learning_rate=0.15, max_bin=None, max_cat_threshold=None, max_cat_to_onehot=None, max_delta_step=None, max_depth=4, max_leaves=None, min_child_weight=1, missing=nan, monotone_constraints=None, multi_strategy=None, n_estimators=None, n_jobs=None, num_parallel_tree=None, random_state=0, ...)
pd.options.display.float_format = '{:.3f}'.format
model_xg2_results = format_GS_results(model_prefix+' - '+dataset+' - '+'GS train', model_xg2, 'auc')
model_xg2_results.to_csv("./Results.csv", mode='a', index=True, header=False)
pd.options.display.float_format = '{:.3f}'.format
print(model_xg2_results)
display_results()
                     Model  Precision  Recall     F1  Accuracy    AUC
0  xg2 - NOdept - GS train      0.960   0.913  0.936     0.979  0.981
 | Model | Precision | Recall | F1 | Accuracy | AUC | Predict Leave | Predict Stay |
---|---|---|---|---|---|---|---|---|
12 | xg1 - ALLFeat - train | 0.980 | 0.980 | 0.980 | 0.980 | 0.987 | 0.939 | 0.988 |
11 | xg1 - ALLFeat - test | 0.979 | 0.979 | 0.979 | 0.979 | 0.986 | 0.938 | 0.988 |
10 | xg1 - ALLFeat - GS train | 0.957 | 0.910 | 0.933 | 0.978 | 0.982 | NaN | NaN |
13 | xg2 - NOdept - GS train | 0.960 | 0.913 | 0.936 | 0.979 | 0.981 | NaN | NaN |
5 | dt1 - ALLFeat - test | 0.978 | 0.979 | 0.978 | 0.979 | 0.979 | 0.936 | 0.987 |
8 | dt2 - NOdept - test | 0.978 | 0.979 | 0.978 | 0.979 | 0.979 | 0.936 | 0.987 |
6 | dt1 - ALLFeat - train | 0.978 | 0.979 | 0.978 | 0.979 | 0.979 | 0.935 | 0.987 |
9 | dt2 - NOdept - train | 0.978 | 0.979 | 0.978 | 0.979 | 0.979 | 0.935 | 0.987 |
7 | dt2 - NOdept - GS train | 0.951 | 0.914 | 0.932 | 0.977 | 0.974 | NaN | NaN |
4 | dt1 - ALLFeat - GS train | 0.951 | 0.914 | 0.932 | 0.977 | 0.974 | NaN | NaN |
1 | lr1 - ALLFeat - train | 0.796 | 0.827 | 0.804 | 0.827 | 0.892 | 0.328 | 0.900 |
3 | lr2 - NOdept - train | 0.793 | 0.825 | 0.802 | 0.825 | 0.892 | 0.318 | 0.900 |
0 | lr1 - ALLFeat - test | 0.794 | 0.822 | 0.803 | 0.822 | 0.882 | 0.341 | 0.897 |
2 | lr2 - NOdept - test | 0.795 | 0.823 | 0.804 | 0.823 | 0.882 | 0.343 | 0.898 |
# Make predictions on train data (used for the SHAP analysis below)
xg2_best_model = model_xg2.best_estimator_ # Store best model parameters for later testing
y_pred_train2 = xg2_best_model.predict(X_train2)
explainer = shap.TreeExplainer(xg2_best_model)
# Compute the SHAP values for a set of observations
shap_values = explainer(X_train2)
plt.grid(True, linestyle='--', alpha=0.7)
# Plot the SHAP values
shap.plots.waterfall(shap_values[0])
shap.summary_plot(shap_values, X_train2)
shap.plots.bar(shap_values[0])
[12:10:56] WARNING: /workspace/src/c_api/c_api.cc:1240: Saving into deprecated binary model format, please consider using `json` or `ubj`. Model format will default to JSON in XGBoost 2.2 if not specified.
# Make predictions on test data
# Store best model parameters for later testing
xg2_best_model = model_xg2.best_estimator_
y_pred_test2 = xg2_best_model.predict(X_test2)
y_pred_train2 = xg2_best_model.predict(X_train2)
classification_report_summary(model_prefix+' - '+dataset+' - '+'test', y_test2, y_pred_test2)
Classification Report :  xg2 - NOdept - test
───────────────────────────────────
                           precision    recall  f1-score   support

Predicted would not leave       0.99      0.99      0.99      2321
    Predicted would leave       0.96      0.93      0.94       471

                 accuracy                           0.98      2792
                macro avg       0.97      0.96      0.97      2792
             weighted avg       0.98      0.98      0.98      2792

Recall : 92.7813%
f1_score : 94.1810%
Precision : 95.6236%
Accuracy : 98.0659%

───────────────────────────────────
Weighted Average
───────────────────────────────────
Recall : 98.0659%
f1 Score : 98.0542%
Precision : 98.0513%
Support : 279200.0000%

───────────────────────────────────
Prediction F1 score
───────────────────────────────────
Predict Leave : 94.1810%
Support Stay : 98.8402%
model_xg2_results = make_results(model_prefix+' - '+dataset+' - '+'train', xg2_best_model, X_train2, y_train2, y_pred_train2)
model_xg2_results.to_csv("./Results.csv", mode='a', index=True, header=False)
model_xg2_results = make_results(model_prefix+' - '+dataset+' - '+'test', xg2_best_model, X_test2, y_test2, y_pred_test2)
model_xg2_results.to_csv("./Results.csv", mode='a', index=True, header=False)
print(model_xg2_results)
print()
display_results()
                 Model  Precision  Recall     F1  Accuracy    AUC  Predict Leave  Predict Stay
0  xg2 - NOdept - test      0.981   0.981  0.981     0.981  0.983          0.942         0.988
 | Model | Precision | Recall | F1 | Accuracy | AUC | Predict Leave | Predict Stay |
---|---|---|---|---|---|---|---|---|
12 | xg1 - ALLFeat - train | 0.980 | 0.980 | 0.980 | 0.980 | 0.987 | 0.939 | 0.988 |
11 | xg1 - ALLFeat - test | 0.979 | 0.979 | 0.979 | 0.979 | 0.986 | 0.938 | 0.988 |
14 | xg2 - NOdept - train | 0.980 | 0.980 | 0.980 | 0.980 | 0.985 | 0.940 | 0.988 |
15 | xg2 - NOdept - test | 0.981 | 0.981 | 0.981 | 0.981 | 0.983 | 0.942 | 0.988 |
10 | xg1 - ALLFeat - GS train | 0.957 | 0.910 | 0.933 | 0.978 | 0.982 | NaN | NaN |
13 | xg2 - NOdept - GS train | 0.960 | 0.913 | 0.936 | 0.979 | 0.981 | NaN | NaN |
5 | dt1 - ALLFeat - test | 0.978 | 0.979 | 0.978 | 0.979 | 0.979 | 0.936 | 0.987 |
8 | dt2 - NOdept - test | 0.978 | 0.979 | 0.978 | 0.979 | 0.979 | 0.936 | 0.987 |
6 | dt1 - ALLFeat - train | 0.978 | 0.979 | 0.978 | 0.979 | 0.979 | 0.935 | 0.987 |
9 | dt2 - NOdept - train | 0.978 | 0.979 | 0.978 | 0.979 | 0.979 | 0.935 | 0.987 |
7 | dt2 - NOdept - GS train | 0.951 | 0.914 | 0.932 | 0.977 | 0.974 | NaN | NaN |
4 | dt1 - ALLFeat - GS train | 0.951 | 0.914 | 0.932 | 0.977 | 0.974 | NaN | NaN |
1 | lr1 - ALLFeat - train | 0.796 | 0.827 | 0.804 | 0.827 | 0.892 | 0.328 | 0.900 |
3 | lr2 - NOdept - train | 0.793 | 0.825 | 0.802 | 0.825 | 0.892 | 0.318 | 0.900 |
0 | lr1 - ALLFeat - test | 0.794 | 0.822 | 0.803 | 0.822 | 0.882 | 0.341 | 0.897 |
2 | lr2 - NOdept - test | 0.795 | 0.823 | 0.804 | 0.823 | 0.882 | 0.343 | 0.898 |
# Make predictions on test data
xg2_best_model = model_xg2.best_estimator_ # Store best model parameters for later testing
y_pred_test2 = xg2_best_model.predict(X_test2)
explainer = shap.TreeExplainer(xg2_best_model)
# Compute the SHAP values for a set of observations
shap_values = explainer(X_test2)
plt.grid(True, linestyle='--', alpha=0.7)
# Plot the SHAP values
shap.plots.waterfall(shap_values[0])
shap.summary_plot(shap_values, X_test2)
shap.plots.bar(shap_values[0])
[12:10:59] WARNING: /workspace/src/c_api/c_api.cc:1240: Saving into deprecated binary model format, please consider using `json` or `ubj`. Model format will default to JSON in XGBoost 2.2 if not specified.
# Make predictions on train data
# Store best model parameters for later testing
xg2_best_model = model_xg2.best_estimator_
y_pred_train2 = xg2_best_model.predict(X_train2)
#y_pred_train2 = xg2_best_model.predict(X_train2)
classification_report_summary(model_prefix+' - '+dataset+' - '+'train', y_train2, y_pred_train2)
Classification Report :  xg2 - NOdept - train
───────────────────────────────────
                           precision    recall  f1-score   support

Predicted would not leave       0.98      0.99      0.99      6964
    Predicted would leave       0.96      0.92      0.94      1411

                 accuracy                           0.98      8375
                macro avg       0.97      0.96      0.96      8375
             weighted avg       0.98      0.98      0.98      8375

Recall : 91.7080%
f1_score : 94.0407%
Precision : 96.4952%
Accuracy : 98.0418%

───────────────────────────────────
Weighted Average
───────────────────────────────────
Recall : 98.0418%
f1 Score : 98.0218%
Precision : 98.0264%
Support : 837500.0000%

───────────────────────────────────
Prediction F1 score
───────────────────────────────────
Predict Leave : 94.0407%
Support Stay : 98.8284%
model_xg2_results = make_results(model_prefix+' - '+dataset+' - '+'train', xg2_best_model, X_train2, y_train2, y_pred_train2)
model_xg2_results.to_csv("./Results.csv", mode='a', index=True, header=False)
model_xg2_results = make_results(model_prefix+' - '+dataset+' - '+'test', xg2_best_model, X_test2, y_test2, y_pred_test2)
model_xg2_results.to_csv("./Results.csv", mode='a', index=True, header=False)
print(model_xg2_results)
print()
display_results()
                 Model  Precision  Recall     F1  Accuracy    AUC  Predict Leave  Predict Stay
0  xg2 - NOdept - test      0.981   0.981  0.981     0.981  0.983          0.942         0.988
 | Model | Precision | Recall | F1 | Accuracy | AUC | Predict Leave | Predict Stay |
---|---|---|---|---|---|---|---|---|
12 | xg1 - ALLFeat - train | 0.980 | 0.980 | 0.980 | 0.980 | 0.987 | 0.939 | 0.988 |
11 | xg1 - ALLFeat - test | 0.979 | 0.979 | 0.979 | 0.979 | 0.986 | 0.938 | 0.988 |
16 | xg2 - NOdept - train | 0.980 | 0.980 | 0.980 | 0.980 | 0.985 | 0.940 | 0.988 |
14 | xg2 - NOdept - train | 0.980 | 0.980 | 0.980 | 0.980 | 0.985 | 0.940 | 0.988 |
17 | xg2 - NOdept - test | 0.981 | 0.981 | 0.981 | 0.981 | 0.983 | 0.942 | 0.988 |
15 | xg2 - NOdept - test | 0.981 | 0.981 | 0.981 | 0.981 | 0.983 | 0.942 | 0.988 |
10 | xg1 - ALLFeat - GS train | 0.957 | 0.910 | 0.933 | 0.978 | 0.982 | NaN | NaN |
13 | xg2 - NOdept - GS train | 0.960 | 0.913 | 0.936 | 0.979 | 0.981 | NaN | NaN |
5 | dt1 - ALLFeat - test | 0.978 | 0.979 | 0.978 | 0.979 | 0.979 | 0.936 | 0.987 |
8 | dt2 - NOdept - test | 0.978 | 0.979 | 0.978 | 0.979 | 0.979 | 0.936 | 0.987 |
9 | dt2 - NOdept - train | 0.978 | 0.979 | 0.978 | 0.979 | 0.979 | 0.935 | 0.987 |
6 | dt1 - ALLFeat - train | 0.978 | 0.979 | 0.978 | 0.979 | 0.979 | 0.935 | 0.987 |
7 | dt2 - NOdept - GS train | 0.951 | 0.914 | 0.932 | 0.977 | 0.974 | NaN | NaN |
4 | dt1 - ALLFeat - GS train | 0.951 | 0.914 | 0.932 | 0.977 | 0.974 | NaN | NaN |
1 | lr1 - ALLFeat - train | 0.796 | 0.827 | 0.804 | 0.827 | 0.892 | 0.328 | 0.900 |
3 | lr2 - NOdept - train | 0.793 | 0.825 | 0.802 | 0.825 | 0.892 | 0.318 | 0.900 |
0 | lr1 - ALLFeat - test | 0.794 | 0.822 | 0.803 | 0.822 | 0.882 | 0.341 | 0.897 |
2 | lr2 - NOdept - test | 0.795 | 0.823 | 0.804 | 0.823 | 0.882 | 0.343 | 0.898 |
# Set parameters for following cells
model1 = model_xg1
model2 = model_xg2
model_name1 = "model_xg1"
model_name2 = "model_xg2"
model_best_model1 = xg1_best_model
model_best_model2 = xg2_best_model
There is little difference in performance between the two datasets.
ALLFeat = complete feature-engineered dataset
NoDept = dataset with the dummy-encoded dept fields removed
display_results()
 | Model | Precision | Recall | F1 | Accuracy | AUC | Predict Leave | Predict Stay |
---|---|---|---|---|---|---|---|---|
12 | xg1 - ALLFeat - train | 0.980 | 0.980 | 0.980 | 0.980 | 0.987 | 0.939 | 0.988 |
11 | xg1 - ALLFeat - test | 0.979 | 0.979 | 0.979 | 0.979 | 0.986 | 0.938 | 0.988 |
16 | xg2 - NOdept - train | 0.980 | 0.980 | 0.980 | 0.980 | 0.985 | 0.940 | 0.988 |
14 | xg2 - NOdept - train | 0.980 | 0.980 | 0.980 | 0.980 | 0.985 | 0.940 | 0.988 |
17 | xg2 - NOdept - test | 0.981 | 0.981 | 0.981 | 0.981 | 0.983 | 0.942 | 0.988 |
15 | xg2 - NOdept - test | 0.981 | 0.981 | 0.981 | 0.981 | 0.983 | 0.942 | 0.988 |
10 | xg1 - ALLFeat - GS train | 0.957 | 0.910 | 0.933 | 0.978 | 0.982 | NaN | NaN |
13 | xg2 - NOdept - GS train | 0.960 | 0.913 | 0.936 | 0.979 | 0.981 | NaN | NaN |
5 | dt1 - ALLFeat - test | 0.978 | 0.979 | 0.978 | 0.979 | 0.979 | 0.936 | 0.987 |
8 | dt2 - NOdept - test | 0.978 | 0.979 | 0.978 | 0.979 | 0.979 | 0.936 | 0.987 |
9 | dt2 - NOdept - train | 0.978 | 0.979 | 0.978 | 0.979 | 0.979 | 0.935 | 0.987 |
6 | dt1 - ALLFeat - train | 0.978 | 0.979 | 0.978 | 0.979 | 0.979 | 0.935 | 0.987 |
7 | dt2 - NOdept - GS train | 0.951 | 0.914 | 0.932 | 0.977 | 0.974 | NaN | NaN |
4 | dt1 - ALLFeat - GS train | 0.951 | 0.914 | 0.932 | 0.977 | 0.974 | NaN | NaN |
1 | lr1 - ALLFeat - train | 0.796 | 0.827 | 0.804 | 0.827 | 0.892 | 0.328 | 0.900 |
3 | lr2 - NOdept - train | 0.793 | 0.825 | 0.802 | 0.825 | 0.892 | 0.318 | 0.900 |
0 | lr1 - ALLFeat - test | 0.794 | 0.822 | 0.803 | 0.822 | 0.882 | 0.341 | 0.897 |
2 | lr2 - NOdept - test | 0.795 | 0.823 | 0.804 | 0.823 | 0.882 | 0.343 | 0.898 |
# Prepare confusion matrix for model 1 (xg1) test data
cm_test1 = metrics.confusion_matrix(y_test, y_pred_test) # Use the optimized model
#cm_test1_percent = cm_test1 / cm_test1.sum() * 100
# Prepare confusion matrix for model 2 (xg2) test data
cm_test2 = metrics.confusion_matrix(y_test2, y_pred_test2) # Use the optimized model
#cm_test2_percent = cm_test2 / cm_test2.sum() * 100
#cm = confusion_matrix(y_test, y_pred_test, labels=model_lr1.classes_)
# Plot confusion matrix
#disp = ConfusionMatrixDisplay(confusion_matrix=cm,
# display_labels=model_lr1.classes_)
#disp.plot(values_format='');
fig, ax = plt.subplots(2, 2, figsize=(10,8))
# Calculate percentages for TEST
sum_by_true_class = np.sum(cm_test1, axis=1)
percentage_matrix = cm_test1 / sum_by_true_class[:, np.newaxis]
# Create a figure and plot the percentage confusion matrix as a heatmap
sns.heatmap(percentage_matrix, annot=True, fmt=".2%", cmap="Blues", ax = ax[0,0]) #, xticklabels=model_lr1.classes_, yticklabels=model_lr1.classes_)
ax[0,0].set_title('{} Confusion Matrix (Percentage)'.format(model_name1))
ax[0,0].set_ylabel('True label')
ax[0,0].set_xlabel('Predicted label')
ax[0,0].text(0.3, 0.25, '(TN)\nTrue Stay', color='white')
ax[0,0].text(1.3, 0.25, '(FP) type 1\n False Leave', color='black')
ax[0,0].text(0.3, 1.25, '(FN) type 2\n False Stay', color='black')
ax[0,0].text(1.3, 1.25, '(TP)\nTrue Leave', color='white')
# Create a figure and plot the COUNT confusion matrix as a heatmap for TEST
sns.heatmap(cm_test1, annot=True, fmt=".0f", cmap="Blues", ax = ax[1,0])#, xticklabels=class_labels, yticklabels=class_labels)
ax[1,0].set_title('{} Confusion Matrix (Count)'.format(model_name1))
ax[1,0].set_ylabel('True label')
ax[1,0].set_xlabel('Predicted label')
ax[1,0].text(0.3, 0.25, '(TN)\nTrue Stay', color='white')
ax[1,0].text(1.3, 0.25, '(FP) type 1\n False Leave', color='black')
ax[1,0].text(0.3, 1.25, '(FN) type 2\n False Stay', color='black')
ax[1,0].text(1.3, 1.25, '(TP)\nTrue Leave', color='black')
# Calculate percentages for model 2 (xg2) TEST
sum_by_true_class = np.sum(cm_test2, axis=1)
percentage_matrix = cm_test2 / sum_by_true_class[:, np.newaxis]
sns.heatmap(percentage_matrix, annot=True, fmt=".2%", cmap="Blues", ax = ax[0,1])#, xticklabels=class_labels, yticklabels=class_labels)
ax[0,1].set_title('{} Confusion Matrix (Percentage)'.format(model_name2))
ax[0,1].set_ylabel('True label')
ax[0,1].set_xlabel('Predicted label')
ax[0,1].text(0.3, 0.25, '(TN)\nTrue Stay', color='white')
ax[0,1].text(1.3, 0.25, '(FP) type 1\n False Leave', color='black')
ax[0,1].text(0.3, 1.25, '(FN) type 2\n False Stay', color='black')
ax[0,1].text(1.3, 1.25, '(TP)\nTrue Leave', color='white')
# Create a figure and plot the COUNT confusion matrix as a heatmap for model 2 (xg2) TEST
sns.heatmap(cm_test2, annot=True, fmt=".0f", cmap="Blues", ax = ax[1,1])#, xticklabels=class_labels, yticklabels=class_labels)
ax[1,1].set_title('{} Confusion Matrix (Count)'.format(model_name2))
ax[1,1].set_ylabel('True label')
ax[1,1].set_xlabel('Predicted label')
ax[1,1].text(0.3, 0.25, '(TN)\nTrue Stay', color='white')
ax[1,1].text(1.3, 0.25, '(FP) type 1\n False Leave', color='black')
ax[1,1].text(0.3, 1.25, '(FN) type 2\n False Stay', color='black')
ax[1,1].text(1.3, 1.25, '(TP)\nTrue Leave', color='black')
plt.tight_layout()
plt.show()
#tree2_importances = pd.DataFrame(tree2.best_estimator_.feature_importances_, columns=X.columns)
# Get feature importances (Gini importance)
feature_importance1 = model_best_model1.feature_importances_
feature_importance2 = model_best_model2.feature_importances_
# Get the names of the features
feature_names1 = X.columns # Feature names for model 1 (AllFeat)
feature_names2 = X2.columns # Feature names for model 2 (NoDept)
# Create a DataFrame to store feature names and their Gini importance
feature_importance_df1 = pd.DataFrame({'Feature': feature_names1, 'Importance': feature_importance1})
feature_importance_df2 = pd.DataFrame({'Feature': feature_names2, 'Importance': feature_importance2})
feature_importance_df1.sort_values(by='Importance', ascending=False,axis=0, inplace=True)
feature_importance_df2.sort_values(by='Importance', ascending=False,axis=0, inplace=True)
merged_df = pd.merge(feature_importance_df1, feature_importance_df2, on='Feature', how='left', suffixes = (' model_xg1 - AllFeat', ' model_xg2 - NoDept'))
# Print the merged DataFrame
print(merged_df)
feature_importance_df1.sort_values(by='Importance', ascending=True,axis=0, inplace=True)
feature_importance_df2.sort_values(by='Importance', ascending=True,axis=0, inplace=True)
# Plot side-by-side bar plots
plt.figure(figsize=(12, 5))
plt.subplot(1, 2, 1)
plt.barh(feature_importance_df1['Feature'], feature_importance_df1['Importance'], color='skyblue')
plt.title('Feature Importance - {} - AllFeat'.format(model_name1))
plt.xlabel('Importance')
plt.ylabel('Feature')
plt.subplot(1, 2, 2)
plt.barh(feature_importance_df2['Feature'], feature_importance_df2['Importance'], color='salmon')
plt.title('Feature Importance - {} - NoDept'.format(model_name2))
plt.xlabel('Importance')
plt.ylabel('Feature')
plt.tight_layout()
plt.show()
              Feature  Importance model_xg1 - AllFeat  Importance model_xg2 - NoDept
0      number_project                            0.314                           0.229
1              tenure                            0.281                           0.296
2        satisfaction                            0.152                           0.233
3           last_eval                            0.102                           0.115
4          overworked                            0.100                           0.105
5              salary                            0.020                           0.017
6           promotion                            0.005                           0.005
7     dept_management                            0.005                             NaN
8          dept_sales                            0.004                             NaN
9          dept_randd                            0.004                             NaN
10    dept_accounting                            0.003                             NaN
11     dept_technical                            0.002                             NaN
12       dept_support                            0.002                             NaN
13            dept_it                            0.002                             NaN
14   dept_product_mng                            0.002                             NaN
15     dept_marketing                            0.002                             NaN
16            dept_hr                            0.001                             NaN
# read in previous plot data
# Logistic Regression
variables = [['auc_score1', 'model_lr1'],
['roc_auc1', 'model_lr1'],
['auc_score2', 'model_lr2'],
['roc_auc2', 'model_lr2'],
['auc_score1', 'model_dt1'],
['roc_auc1', 'model_dt1'],
['auc_score2', 'model_dt2'],
['roc_auc2', 'model_dt2']
]
arrays = [['precision1','model_lr1'],
['recall1', 'model_lr1'],
['fpr1', 'model_lr1'],
['tpr1', 'model_lr1'],
['precision2','model_lr2'],
['recall2', 'model_lr2'],
['fpr2', 'model_lr2'],
['tpr2', 'model_lr2'],
['precision1','model_dt1'],
['recall1', 'model_dt1'],
['fpr1', 'model_dt1'],
['tpr1', 'model_dt1'],
['precision2','model_dt2'],
['recall2', 'model_dt2'],
['fpr2', 'model_dt2'],
['tpr2', 'model_dt2']
]
loaded_plot_vars = {}
loaded_plot_arrays = {}
for var_name, model in variables:
print(model+'-'+var_name)
#var = globals()[var_name]
with open(f'99-documentation-project/08-plot_data/{model}-{var_name}.csv', 'r') as file:
var = f'{model}-{var_name}'
print(var)
loaded_plot_vars[var] = json.load(file)
for array_name, model in arrays:
#print(array_name, model)
#var = globals()[var_name]
filepath = f'99-documentation-project/08-plot_data/{model}-{array_name}.csv'
array = f'{model}-{array_name}'
loaded_plot_arrays[array] = pd.read_csv(filepath)
#print(loaded_plot_arrays)
#var = loaded_plot_vars.get('model_lr1-roc_auc1')
#print(var)
#array= loaded_plot_arrays.get('model_dt2-precision2')
#lr1_fpr1 = loaded_plot_arrays.get('model_lr1-fpr1')
#va = lr1_fpr1.values()
#print(type(lr1_fpr1))
#print(lr1_fpr1)
# Retrieve arrays and variable from previous model
lr1_precision1 = loaded_plot_arrays.get('model_lr1-precision1') # retrieve precision1 array from model_lr1 = AllFeat
lr1_recall1 = loaded_plot_arrays.get('model_lr1-recall1') # retrieve recall1 array from model_lr1 = AllFeat
lr1_auc_score1 = loaded_plot_vars.get('model_lr1-auc_score1') # retrieve PR AUC score from model_lr1 = AllFeat
lr2_precision2 = loaded_plot_arrays.get('model_lr2-precision2') # retrieve precision2 array from model_lr2 = NoDept
lr2_recall2 = loaded_plot_arrays.get('model_lr2-recall2') # retrieve recall2 array from model_lr2 = NoDept
lr2_auc_score2 = loaded_plot_vars.get('model_lr2-auc_score2') # retrieve PR AUC score from model_lr2 = NoDept
lr1_fpr1 = loaded_plot_arrays.get('model_lr1-fpr1') # retrieve fpr1 array from model_lr1 = AllFeat
lr1_tpr1 = loaded_plot_arrays.get('model_lr1-tpr1') # retrieve tpr1 array from model_lr1 = AllFeat
lr1_roc_auc1 = loaded_plot_vars.get('model_lr1-roc_auc1') # retrieve ROC AUC score from model_lr1 = AllFeat
lr1_model_name1 = arrays[0][1] # Retrieve model name for lr1
lr2_fpr2 = loaded_plot_arrays.get('model_lr2-fpr2') # retrieve fpr2 array from model_lr2 = NoDept
lr2_tpr2 = loaded_plot_arrays.get('model_lr2-tpr2') # retrieve tpr2 array from model_lr2 = NoDept
lr2_roc_auc2 = loaded_plot_vars.get('model_lr2-roc_auc2') # retrieve ROC AUC score from model_lr2 = NoDept
lr2_model_name2 = arrays[4][1] # Retrieve model name for lr2
dt1_precision1 = loaded_plot_arrays.get('model_dt1-precision1') # retrieve precision1 array from model_dt1 = AllFeat
dt1_recall1 = loaded_plot_arrays.get('model_dt1-recall1') # retrieve recall1 array from model_dt1 = AllFeat
dt1_auc_score1 = loaded_plot_vars.get('model_dt1-auc_score1') # retrieve PR AUC score from model_dt1 = AllFeat
dt2_precision2 = loaded_plot_arrays.get('model_dt2-precision2') # retrieve precision2 array from model_dt2 = NoDept
dt2_recall2 = loaded_plot_arrays.get('model_dt2-recall2') # retrieve recall2 array from model_dt2 = NoDept
dt2_auc_score2 = loaded_plot_vars.get('model_dt2-auc_score2') # retrieve PR AUC score from model_dt2 = NoDept
dt1_fpr1 = loaded_plot_arrays.get('model_dt1-fpr1') # retrieve fpr1 array from model_dt1 = AllFeat
dt1_tpr1 = loaded_plot_arrays.get('model_dt1-tpr1') # retrieve tpr1 array from model_dt1 = AllFeat
dt1_roc_auc1 = loaded_plot_vars.get('model_dt1-roc_auc1') # retrieve ROC AUC score from model_dt1 = AllFeat
dt1_model_name1 = arrays[8][1] # Retrieve model name for dt1
dt2_fpr2 = loaded_plot_arrays.get('model_dt2-fpr2') # retrieve fpr2 array from model_dt2 = NoDept
dt2_tpr2 = loaded_plot_arrays.get('model_dt2-tpr2') # retrieve tpr2 array from model_dt2 = NoDept
dt2_roc_auc2 = loaded_plot_vars.get('model_dt2-roc_auc2') # retrieve ROC AUC score from model_dt2 = NoDept
dt2_model_name2 = arrays[12][1] # Retrieve model name for dt2
print(lr1_model_name1)
print(lr2_model_name2)
print(dt1_model_name1)
print(dt2_model_name2)
model_lr1-auc_score1
model_lr1-auc_score1
model_lr1-roc_auc1
model_lr1-roc_auc1
model_lr2-auc_score2
model_lr2-auc_score2
model_lr2-roc_auc2
model_lr2-roc_auc2
model_dt1-auc_score1
model_dt1-auc_score1
model_dt1-roc_auc1
model_dt1-roc_auc1
model_dt2-auc_score2
model_dt2-auc_score2
model_dt2-roc_auc2
model_dt2-roc_auc2
model_lr1
model_lr2
model_dt1
model_dt2
y_prob1 = model1.predict_proba(X_test)[:, 1]
y_prob2 = model2.predict_proba(X_test2)[:, 1]
precision1, recall1, _ = precision_recall_curve(y_test, y_prob1)
precision2, recall2, _ = precision_recall_curve(y_test2, y_prob2)
# Compute area under the curve (AUC)
auc_score1 = auc(recall1, precision1)
auc_score2 = auc(recall2, precision2)
#plt.subplot(1, 2, 1)
plt.figure(figsize=(12, 5))
plt.plot(recall1, precision1, color='blue', label=f'{model_name1} - AUC = {auc_score1:.2f}')
plt.plot(lr1_recall1, lr1_precision1, color='purple', label=f'{lr1_model_name1} - AUC = {lr1_auc_score1:.2f}')
plt.plot(dt1_recall1, dt1_precision1, color='green', label=f'{dt1_model_name1} - AUC = {dt1_auc_score1:.2f}')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('{} Precision-Recall Curve'.format(model_name1))
plt.legend(loc='best')
plt.axhline(y=0.5, color='red', linestyle='--', label='Random Guess')
plt.axhline(y=1, color='green', linestyle='--', label='Perfect')
plt.axvline(x=1, color='green', linestyle='--', label='Perfect')
plt.text(0.4, 0.52, 'Random Guess', color='red')
plt.text(0.4, 0.6, 'Better', color='black')
plt.text(0.4, 0.4, 'Worse', color='black')
plt.text(0.8, 1.01, 'Perfect', color='green')
plt.show()
#plt.subplot(1, 2, 2)
plt.figure(figsize=(12, 5))
plt.plot(recall2, precision2, color='blue', label=f'{model_name2} - AUC = {auc_score2:.2f}')
plt.plot(lr2_recall2, lr2_precision2, color='purple', label=f'{lr2_model_name2} - AUC = {lr2_auc_score2:.2f}')
plt.plot(dt2_recall2, dt2_precision2, color='green', label=f'{dt2_model_name2} - AUC = {dt2_auc_score2:.2f}')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('{} Precision-Recall Curve'.format(model_name2))
plt.legend(loc='best')
plt.axhline(y=0.5, color='red', linestyle='--', label='Random Guess')
plt.axhline(y=1, color='green', linestyle='--', label='Perfect')
plt.axvline(x=1, color='green', linestyle='--', label='Perfect')
plt.text(0.4, 0.52, 'Random Guess', color='red')
plt.text(0.4, 0.6, 'Better', color='black')
plt.text(0.4, 0.4, 'Worse', color='black')
plt.text(0.8, 1.01, 'Perfect', color='green')
#plt.tight_layout()
plt.show()
# Compute ROC curve
fpr1, tpr1, _ = roc_curve(y_test, y_prob1)
fpr2, tpr2, _ = roc_curve(y_test2, y_prob2)
# Compute area under the curve (AUC)
roc_auc1 = auc(fpr1, tpr1)
roc_auc2 = auc(fpr2, tpr2)
#plt.subplot(1, 2, 1)
# Plot ROC curve
plt.figure(figsize=(12, 5))
plt.plot(fpr1, tpr1, color='darkorange', lw=2, label=f'{model_name1} - AUC = {roc_auc1:.2f}')
plt.plot(lr1_fpr1, lr1_tpr1, color='purple', lw=2, label=f'{lr1_model_name1} - AUC = {lr1_roc_auc1:.2f}')
plt.plot(dt1_fpr1, dt1_tpr1, color='green', lw=2, label=f'{dt1_model_name1} - AUC = {dt1_roc_auc1:.2f}')
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--', label='Random')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('{} Receiver Operating Characteristic (ROC) Curve'.format(model_name1))
plt.legend(loc='best')
plt.axhline(y=1, color='green', linestyle='--', label='Perfect')
plt.axvline(x=0, color='green', linestyle='--', label='Perfect')
plt.text(0.4, 0.6, 'Better', color='black')
plt.text(0.4, 0.3, 'Worse', color='black')
plt.text(0.01, 1.01, 'Perfect', color='green')
plt.show()
#plt.subplot(1, 2,2)
# Plot ROC curve
plt.figure(figsize=(12, 5))
plt.plot(fpr2, tpr2, color='darkorange', lw=2, label=f'{model_name2} - AUC = {roc_auc2:.2f}')
plt.plot(lr2_fpr2, lr2_tpr2, color='purple', lw=2, label=f'{lr2_model_name2} - AUC = {lr2_roc_auc2:.2f}')
plt.plot(dt2_fpr2, dt2_tpr2, color='green', lw=2, label=f'{dt2_model_name2} - AUC = {dt2_roc_auc2:.2f}')
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--', label='Random')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('{} Receiver Operating Characteristic (ROC) Curve'.format(model_name2))
plt.legend(loc='best')
plt.axhline(y=1, color='green', linestyle='--', label='Perfect')
plt.axvline(x=0, color='green', linestyle='--', label='Perfect')
plt.text(0.4, 0.6, 'Better', color='black')
plt.text(0.4, 0.3, 'Worse', color='black')
plt.text(0.01, 1.01, 'Perfect', color='green')
plt.show()
# Save model plot data
arrays = [['precision1',model_name1],
['recall1', model_name1],
['precision2',model_name2],
['recall2', model_name2],
['fpr1', model_name1],
['tpr1', model_name1],
['fpr2', model_name2],
['tpr2', model_name2],
]
variables = [['auc_score1', model_name1],
['auc_score2', model_name2],
['roc_auc1', model_name1],
['roc_auc2', model_name2]
]
# Save plot data scores (auc, roc)
for var_name, model in variables:
print(model+"-"+var_name )
var = globals()[var_name]
with open(f'99-documentation-project/08-plot_data/{model}-{var_name}.csv', 'w') as file:
json.dump(var, file)
# Save plot data arrays (recall, precision, fpr, tpr)
for array_name, model in arrays:
print(model+'-'+array_name)
var = globals()[array_name]
df = pd.DataFrame({array_name: var})
df.to_csv(f'99-documentation-project/08-plot_data/{model}-{array_name}.csv', index=False, header=False)
model_xg1-auc_score1
model_xg2-auc_score2
model_xg1-roc_auc1
model_xg2-roc_auc2
model_xg1-precision1
model_xg1-recall1
model_xg2-precision2
model_xg2-recall2
model_xg1-fpr1
model_xg1-tpr1
model_xg2-fpr2
model_xg2-tpr2
XGBoost is the best-performing model!
Precision-Recall AUC is 0.97 / 0.96 (xg1 / xg2).
ROC AUC is 0.99 / 0.98.
Assuming the default 50% probability threshold, the model correctly predicts whether an employee will leave about 98% of the time (test accuracy 0.979-0.981).
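As an illustration of the 50% threshold mentioned above, here is a minimal sketch (assuming the fitted xg1_best_model and the X_test / y_test split from earlier in this notebook) of how the probability threshold could be applied explicitly, and adjusted if HR preferred to flag more potential leavers:
from sklearn.metrics import accuracy_score, precision_score, recall_score

# Probability of the positive class ('left') for each employee in the test set
proba_leave = xg1_best_model.predict_proba(X_test)[:, 1]

# 0.50 is the default threshold used implicitly by .predict()
y_pred_default = (proba_leave >= 0.50).astype(int)
print("Accuracy  @ 0.50 :", round(accuracy_score(y_test, y_pred_default), 4))
print("Recall    @ 0.50 :", round(recall_score(y_test, y_pred_default), 4))

# A lower, hypothetical threshold flags more potential leavers, trading precision for recall
y_pred_035 = (proba_leave >= 0.35).astype(int)
print("Recall    @ 0.35 :", round(recall_score(y_test, y_pred_035), 4))
print("Precision @ 0.35 :", round(precision_score(y_test, y_pred_035), 4))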