# Import packages

# Data manipulation
import numpy as np
import pandas as pd

# Data visualization
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import seaborn as sns

# Set Options
# Show every column when a dataframe is displayed (None removes the column limit).
# The same option is set again with the other display options further down.
pd.set_option('display.max_columns', None)

# Data modelling Imports
from xgboost import XGBClassifier, XGBRegressor, plot_importance

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# For metrics and helpful functions
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, precision_recall_fscore_support, \
f1_score, confusion_matrix, ConfusionMatrixDisplay, classification_report
from sklearn import metrics
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.tree import plot_tree
from sklearn.preprocessing import StandardScaler
from datetime import datetime as dt

import statsmodels.api as sm

# For saving models
import pickle

import shap

# Pandas display options for the whole notebook:
#   - wrap printed frames at 120 characters,
#   - never hide columns,
#   - render floats with two decimal places (done with a formatter rather
#     than the 'display.precision' option).
pd.set_option('display.width', 120)
pd.set_option('display.max_columns', None)
pd.set_option('display.float_format', '{:.2f}'.format)

# Add a little colour

class color:
    '''
    ANSI escape sequences for decorating printed text.

    Prefix a string with one of the codes and finish it with END to switch
    the decoration off again, e.g. color.BOLD + "text" + color.END.
    '''

    # Foreground colours
    PURPLE = '\033[95m'
    CYAN = '\033[96m'
    DARKCYAN = '\033[36m'
    BLUE = '\033[94m'
    GREEN = '\033[92m'
    YELLOW = '\033[93m'
    RED = '\033[91m'

    # Text attributes
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'

    # Reset: turns every colour/attribute off
    END = '\033[0m'

# Folder locations used by the notebook.
# Both are absolute, machine-specific paths and must keep their trailing "/":
# file names are appended to them by plain string concatenation.
load_path = "/home/hass/Documents/Learning/Salifort-Motors-Capstone-Project/00-data_cleaned/" # Folder the cleaned CSV datasets are read from
#save_path = "/home/hass/Documents/Learning/Salifort-Motors-Capstone-Project/04-pickle-ML-models/" # destination for pickle saved models

model_path = "/home/hass/Documents/Learning/Salifort-Motors-Capstone-Project/04-pickle-ML-models/" # Folder the pickled models are loaded from / saved to

def display_results(path="Results.csv"):
    '''
    Load the CSV of stored model scores and return it ranked by AUC.

    In:
        path: CSV file to read; defaults to "Results.csv" in the current
              working directory.
    Out:
        pandas df of the CSV contents (precision, recall, f1, accuracy and
        AUC scores of the models), sorted by 'AUC' from highest to lowest,
        with the 'Unnamed: 0' column removed when it is present
    '''
    model_results = pd.read_csv(path)

    # pandas names a header-less first column 'Unnamed: 0' (presumably the
    # index written when the results were saved). Drop it if it exists, but
    # do not fail on a file that was saved without it.
    model_results = model_results.drop(columns=['Unnamed: 0'], errors='ignore')

    return model_results.sort_values(by='AUC', ascending=False)

def write_pickle(path, model_object, save_as:str):
    '''
    Serialise a model to "<path><save_as>.pickle".

    In:
        path:         folder to save into; it is joined to the file name by
                      plain concatenation, so it must end with a path separator
        model_object: the model to pickle
        save_as:      file name to save under, without the '.pickle' extension

    Out: nothing is returned; the pickle file is written to disk
    '''
    target_file = ''.join((path, save_as, '.pickle'))

    with open(target_file, 'wb') as handle:
        pickle.dump(model_object, handle)
        
def read_pickle(path, saved_model_name:str):
    '''
    Load a pickled model from "<path><saved_model_name>.pickle".

    In:
        path:             folder to read from; it is joined to the file name
                          by plain concatenation, so it must end with a path
                          separator
        saved_model_name: file name of the pickled model, without the
                          '.pickle' extension

    Out:
        the unpickled model object

    Note: unpickling runs code stored in the file, so only load pickles
    from a trusted location.
    '''
    source_file = ''.join((path, saved_model_name, '.pickle'))

    with open(source_file, 'rb') as handle:
        return pickle.load(handle)

# Record when this notebook was started / last run
print("Started / Last Run =", dt.now().strftime("%Y-%m-%d %H:%M:%S"),"\n")

# Load the cleaned datasets; columns are put in alphabetical order as each
# file is read.

# Feature engineering on salary, avg_mnth_hrs, dept, outliers removed, all features
AllFeat_df = pd.read_csv(load_path + "data_cleaned_NoOl_FE_AllFeat.csv", index_col=False).sort_index(axis=1)

# Feature engineering - departments removed
NoDept_df = pd.read_csv(load_path + "data_cleaned_NoOl_FE_NoDept.csv", index_col=False).sort_index(axis=1)

Started / Last Run = 2023-12-01 09:25:03

# The three lists below are parallel: position k in each list describes the
# same model, so they must be kept the same length and in the same order.

# File names (without the '.pickle' extension) of the saved models to load
model_list = ['hr_dt1-AllFeat', 'hr_dt2-NOdept', 
              'hr_rf1-AllFeat','hr_rf2-NOdept', 
              'hr_xg1-AllFeat','hr_xg2-NOdept']

# Names of the dataframes to pair with each model. These are stored as
# strings and are looked up by name (via globals()) when the models are evaluated.
data_list  = ['AllFeat_df', 'NoDept_df',
              'AllFeat_df', 'NoDept_df',
              'AllFeat_df', 'NoDept_df']

# Human-readable titles for each model, used for the printed headings and the shap plots
model_name = ['Decision Tree - All Features', 'Decision Tree - No Departments', 
              'Random Forest - All Features','Random Forest- No Departments', 
              'XGBoost - All Features', 'XGBoost - No Departments']

import warnings

# Silence library warnings so they do not clutter the per-model output below.
warnings.filterwarnings("ignore")

# For every saved model: rebuild the train/test split, score the model on the
# test set, print the per-class F1 scores and draw a SHAP summary plot.
# model_list, data_list and model_name are parallel lists, so walk them in lockstep.
for model_file, data_name, model_title in zip(model_list, data_list, model_name):
    # data_list holds dataframe names, so resolve the name to the module-level dataframe
    model_data = globals()[data_name].copy()

    # Isolate the outcome variable
    y = model_data['left']

    # The feature variables are every column except the outcome variable
    X = model_data.drop('left', axis=1)

    # Prepare training and test data (only the test part is used in this loop).
    # NOTE(review): presumably the same split settings used when the models were
    # trained, so X_test is unseen data - confirm against the training notebooks.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, stratify=y, random_state=0)

    model = read_pickle(model_path, model_file)
    print(color.BOLD + model_title + " - " + model_file + color.END)
    # The pickled object exposes best_estimator_ (presumably a fitted GridSearchCV)
    best_model = model.best_estimator_

    y_pred_test = best_model.predict(X_test)

    # Calculate SHAP values for every row of the test set
    explainer = shap.TreeExplainer(best_model)
    shap_values = explainer.shap_values(X_test)
    # NOTE(review): depending on the shap version and model type, shap_values may
    # hold one set of values per class; if so, select the positive class (e.g.
    # shap_values[..., 1]) before plotting - confirm with the installed version.

    # average=None returns one score per class, ordered by class label
    predict_precision, predict_recall, predict_f1_score, _ = precision_recall_fscore_support(
        y_test, y_pred_test, average=None)
    f1_true_class = round(predict_f1_score[1], 3)   # Index 1 corresponds to the "True" class (left)
    f1_false_class = round(predict_f1_score[0], 3)  # Index 0 corresponds to the "False" class (stayed)

    print("Predict will leave (True)  : {}".format(f1_true_class))
    # Label corrected from "(True)": this line reports the "False" (stay) class
    print("Predict will stay (False)  : {}\n".format(f1_false_class))

    # Plot the SHAP values for the whole test set
    plt.title(model_title)
    plt.grid(True, linestyle='--', alpha=0.7)
    shap.summary_plot(shap_values, X_test)

Decision Tree - All Features - hr_dt1-AllFeat
Predict will leave (True)  : 0.936
Predict will stay (True)  : 0.987

Decision Tree - No Departments - hr_dt2-NOdept
Predict will leave (True)  : 0.936
Predict will stay (True)  : 0.987

Random Forest - All Features - hr_rf1-AllFeat
Predict will leave (True)  : 0.943
Predict will stay (True)  : 0.989

Random Forest- No Departments - hr_rf2-NOdept
Predict will leave (True)  : 0.943
Predict will stay (True)  : 0.989

XGBoost - All Features - hr_xg1-AllFeat
Predict will leave (True)  : 0.938
Predict will stay (True)  : 0.988

XGBoost - No Departments - hr_xg2-NOdept
Predict will leave (True)  : 0.942
Predict will stay (True)  : 0.988

# From here on show floats with four decimal places
pd.set_option('display.float_format', '{:.4f}'.format)

# Full table of stored scores
results_df = display_results()
results_df

# Only the rows whose model label contains 'test'
results_test_df = results_df.loc[results_df['Model'].str.contains('test')]
results_test_df

# Bar chart comparing the AUC of each row in results_test_df
plt.figure(figsize=(10, 6))
# hue='AUC' colours the bars by AUC value, so the legend lists the AUC scores
sns.barplot(x='Model', y='AUC', hue='AUC', data=results_test_df, palette='colorblind')
plt.xlabel('Model')
plt.ylabel('AUC')
plt.title('Model Comparison test data')
plt.ylim(0.75, 1)  # Limit the y-axis to the range 0.75 to 1
plt.legend(title='AUC Score')
plt.xticks(rotation='vertical')  # Vertical labels keep the model names readable
plt.show()

import warnings
pd.options.display.float_format = '{:.4f}'.format

# Silence library warnings so they do not clutter the per-model output below.
warnings.filterwarnings("ignore")

# For every saved model: rebuild the train/test split, score the model on the
# training set and print the per-class F1 scores as percentages.
# model_list, data_list and model_name are parallel lists, so walk them in
# lockstep instead of relying on a hard-coded count of models.
for model_file, data_name, model_title in zip(model_list, data_list, model_name):
    # data_list holds dataframe names, so resolve the name to the module-level dataframe
    model_data = globals()[data_name].copy()

    # Isolate the outcome variable
    y = model_data['left']

    # The feature variables are every column except the outcome variable
    X = model_data.drop('left', axis=1)

    # Prepare training and test data (only the training part is used in this loop)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, stratify=y, random_state=0)

    model = read_pickle(model_path, model_file)
    print(color.BOLD + model_title + " - " + model_file + color.END)
    # The pickled object exposes best_estimator_ (presumably a fitted GridSearchCV)
    best_model = model.best_estimator_

    # Predictions on the training data
    y_pred_train = best_model.predict(X_train)

    # average=None returns one score per class, ordered by class label
    predict_precision, predict_recall, predict_f1_score, _ = precision_recall_fscore_support(
        y_train, y_pred_train, average=None)
    f1_true_class = predict_f1_score[1]   # Index 1 corresponds to the "True" class (left)
    f1_false_class = predict_f1_score[0]  # Index 0 corresponds to the "False" class (stayed)

    print("Predict will leave (True)  : {:.4%}".format(f1_true_class))
    print("Predict will stay (False)  : {:.4%}\n".format(f1_false_class))

Decision Tree - All Features - hr_dt1-AllFeat
Predict will leave (True)  : 93.5215%
Predict will stay (False)  : 98.7202%

Decision Tree - No Departments - hr_dt2-NOdept
Predict will leave (True)  : 93.5215%
Predict will stay (False)  : 98.7202%

Random Forest - All Features - hr_rf1-AllFeat
Predict will leave (True)  : 93.9029%
Predict will stay (False)  : 98.8081%

Random Forest- No Departments - hr_rf2-NOdept
Predict will leave (True)  : 93.9029%
Predict will stay (False)  : 98.8081%

XGBoost - All Features - hr_xg1-AllFeat
Predict will leave (True)  : 93.8568%
Predict will stay (False)  : 98.7928%

XGBoost - No Departments - hr_xg2-NOdept
Predict will leave (True)  : 94.0407%
Predict will stay (False)  : 98.8284%

from PIL import Image
# Saved precision-recall and ROC curve images, listed in the order they are shown
plots = ['./99-documentation-project/08-plot_data/plot-pr-curve1.png',
         './99-documentation-project/08-plot_data/plot-pr-curve2.png',
         './99-documentation-project/08-plot_data/plot-roc-curve1.png',
         './99-documentation-project/08-plot_data/plot-roc-curve2.png']

# NOTE(review): image_path is not used by the loop below - confirm it is still needed
image_path = './99-documentation-project/08-plot_data/plot-pr-curve1.png'

# Open each image and render it in the notebook output.
# display() is not imported here; presumably it is the built-in provided by
# IPython/Jupyter, so this cell only works when run inside a notebook.
for plot in plots:
    img = Image.open(plot)
    display(img)

Document Title	Salifort Motors - ML Modelling - Random Forest
Author	Rod Slater
Version	1.0
Created	01-11-2023
Modified	16-11-2023

Client Name	Salifort Motors
Client Contact	Mr HR Team
Client Email	hr@salifortmotors.it
Client Project	HR Team Data Driven Solutions from Machine Learning Models

	Model	Precision	Recall	F1	Accuracy	AUC	Predict Leave	Predict Stay
12	rf1 - ALLFeat - train	0.9799	0.9801	0.9798	0.9801	0.9874	0.9390	0.9881
15	rf2 - NOdept - train	0.9799	0.9801	0.9798	0.9801	0.9873	0.9390	0.9881
18	xg1 - ALLFeat - train	0.9797	0.9798	0.9796	0.9798	0.9870	0.9386	0.9879
17	xg1 - ALLFeat - test	0.9791	0.9792	0.9791	0.9792	0.9860	0.9376	0.9875
22	xg2 - NOdept - train	0.9803	0.9804	0.9802	0.9804	0.9854	0.9404	0.9883
20	xg2 - NOdept - train	0.9803	0.9804	0.9802	0.9804	0.9854	0.9404	0.9883
14	rf2 - NOdept - test	0.9809	0.9810	0.9809	0.9810	0.9833	0.9426	0.9886
21	xg2 - NOdept - test	0.9805	0.9807	0.9805	0.9807	0.9833	0.9418	0.9884
23	xg2 - NOdept - test	0.9805	0.9807	0.9805	0.9807	0.9833	0.9418	0.9884
11	rf1 - ALLFeat - test	0.9809	0.9810	0.9809	0.9810	0.9832	0.9426	0.9886
16	xg1 - ALLFeat - GS train	0.9570	0.9100	0.9330	0.9780	0.9820	NaN	NaN
19	xg2 - NOdept - GS train	0.9600	0.9130	0.9360	0.9790	0.9810	NaN	NaN
13	rf2 - NOdept - GS train	0.9639	0.9086	0.9354	0.9789	0.9808	NaN	NaN
10	rf1 - ALLFeat - GS train	0.9638	0.9079	0.9350	0.9787	0.9802	NaN	NaN
8	dt2 - NOdept - test	0.9784	0.9785	0.9785	0.9785	0.9791	0.9359	0.9871
5	dt1 - ALLFeat - test	0.9784	0.9785	0.9785	0.9785	0.9791	0.9359	0.9871
9	dt2 - NOdept - train	0.9784	0.9786	0.9784	0.9786	0.9789	0.9352	0.9872
6	dt1 - ALLFeat - train	0.9784	0.9786	0.9784	0.9786	0.9789	0.9352	0.9872
7	dt2 - NOdept - GS train	0.9506	0.9135	0.9317	0.9774	0.9737	NaN	NaN
4	dt1 - ALLFeat - GS train	0.9506	0.9135	0.9317	0.9774	0.9737	NaN	NaN
1	lr1 - ALLFeat - train	0.7737	0.8130	0.7860	0.8130	0.8867	0.2578	0.8930
3	lr2 - NOdept - train	0.7702	0.8111	0.7833	0.8111	0.8860	0.2467	0.8920
2	lr2 - NOdept - test	0.7670	0.8062	0.7806	0.8062	0.8783	0.2476	0.8888
0	lr1 - ALLFeat - test	0.7760	0.8112	0.7878	0.8112	0.8781	0.2771	0.8915

	Model	Precision	Recall	F1	Accuracy	AUC	Predict Leave	Predict Stay
17	xg1 - ALLFeat - test	0.9791	0.9792	0.9791	0.9792	0.9860	0.9376	0.9875
14	rf2 - NOdept - test	0.9809	0.9810	0.9809	0.9810	0.9833	0.9426	0.9886
21	xg2 - NOdept - test	0.9805	0.9807	0.9805	0.9807	0.9833	0.9418	0.9884
23	xg2 - NOdept - test	0.9805	0.9807	0.9805	0.9807	0.9833	0.9418	0.9884
11	rf1 - ALLFeat - test	0.9809	0.9810	0.9809	0.9810	0.9832	0.9426	0.9886
8	dt2 - NOdept - test	0.9784	0.9785	0.9785	0.9785	0.9791	0.9359	0.9871
5	dt1 - ALLFeat - test	0.9784	0.9785	0.9785	0.9785	0.9791	0.9359	0.9871
2	lr2 - NOdept - test	0.7670	0.8062	0.7806	0.8062	0.8783	0.2476	0.8888
0	lr1 - ALLFeat - test	0.7760	0.8112	0.7878	0.8112	0.8781	0.2771	0.8915

Salifort Motors - ML Models Comparisons ¶

Document Information ¶

Client Details ¶

Document Overview ¶

Notes ¶

Table of contents ¶

Initialise Notebook ¶

Import Packages ¶

Set Pandas Options ¶

Initialise Notebook Options ¶

Define Functions ¶

display_results() - Function to retrieve scores from Results.csv and display them ¶

Read / Write Pickle Function ¶

Model Performance ¶

Import data ¶

Code flags:¶

Prepare data for SHAP Plots ¶

Shap Plots for feature importance ¶

Get previous models results data ¶

Comparing ROC and Precision-Recall across models¶

Conclusion ¶

Table of contents¶

Comparing ROC and Precision-Recall across models¶

Table of contents ¶