Document Title | Salifort Motors - ML Modelling - Decision Tree |
Author | Rod Slater |
Version | 1.0 |
Created | 01-11-2023 |
Modified | 16-11-2023 |
Client Name | Salifort Motors |
Client Contact | Mr HR Team |
Client Email | hr@salifortmotors.it |
Client Project | HR Team Data Driven Solutions from Machine Learning Models |
ML modelling using a Decision Tree for the HR data provided by Salifort Motors. This notebook details the Decision Tree modelling process and performance comparisons.
Contents

- model_dt1 - All Features
  - GridSearch - Train data
  - Test data
  - Train data
- model_dt2 - No Departments
  - GridSearch - Train data
  - Test data
  - Train data
- Model comparison (model_dt1 / model_dt2) - confusion matrices, feature importance, PR and ROC curves
# Import packages
# Data manipulation
import numpy as np
import pandas as pd
# Data visualization
import matplotlib.pyplot as plt
import seaborn as sns
# Data modelling Imports
from xgboost import XGBClassifier, XGBRegressor, plot_importance
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
# For metrics and helpful functions
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, precision_recall_fscore_support, \
f1_score, confusion_matrix, ConfusionMatrixDisplay, classification_report, precision_recall_curve, auc
from sklearn.model_selection import learning_curve
from sklearn import metrics
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.tree import plot_tree
from sklearn.preprocessing import StandardScaler
import statsmodels.api as sm
from datetime import datetime as dt
import json
# Shap Explainer
import shap
# For saving models
import pickle
# set Pandas Display Options
pd.options.display.float_format = '{:.2f}'.format
pd.set_option('display.precision', 2)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 120)
# load_path = "/home/hass/Documents/Learning/Salifort-Motors-Capstone-Project/00-data_cleaned/" # absolute path on the local machine
load_path = "00-data_cleaned/" # Source folder for cleaned data
save_path = "/home/hass/Documents/Learning/Salifort-Motors-Capstone-Project/04-pickle-ML-models/" # destination for pickle saved models
# model_path = "/home/hass/Documents/Learning/Salifort-Motors-Capstone-Project/04-pickle-ML-models/" # absolute path on the local machine
model_path = "04-pickle-ML-models/" # path to load/save pickled models
import chime
import time
def beepr():
    # Play two bursts of three success chimes as an audible "cell finished" signal
    for x in range(2):
        for i in range(3):
            chime.success()
            time.sleep(0.25)
        time.sleep(.5)

#beepr()
def display_results():
    '''
    Load Results.csv containing the stored test scores and return them for display.
    In:
        none
    Out: pandas df of Results.csv containing precision, recall, f1, accuracy, AUC scores and per-class prediction F1 scores of the models
    '''
    model_results = pd.read_csv("Results.csv")
    model_results.drop(columns=['Unnamed: 0'], inplace=True)
    model_results = model_results.sort_values(by='AUC', ascending=False)
    return model_results
def format_GS_results(model_name: str, model_object, metric: str):
    '''
    Returns a pandas df with the F1, recall, precision, accuracy, and AUC scores
    from the GridSearch cross-validation.
    Arguments:
        model_name (string): what you want the model to be called in the output table
        model_object: a fitted GridSearchCV object
        metric (string): precision, recall, f1, accuracy, or auc
    '''
    # Map the input metric to the corresponding column name in GridSearchCV.cv_results_
    metric_dict = {'auc': 'mean_test_roc_auc',
                   'precision': 'mean_test_precision',
                   'recall': 'mean_test_recall',
                   'f1': 'mean_test_f1',
                   'accuracy': 'mean_test_accuracy'
                   }
    # Get all the results from the CV and put them in a df
    cv_results = pd.DataFrame(model_object.cv_results_)
    # Isolate the row of the df with the max(metric) score
    best_estimator_results = cv_results.iloc[cv_results[metric_dict[metric]].idxmax(), :]
    # Extract accuracy, precision, recall, f1 and AUC scores from that row
    auc = best_estimator_results.mean_test_roc_auc
    f1 = best_estimator_results.mean_test_f1
    recall = best_estimator_results.mean_test_recall
    precision = best_estimator_results.mean_test_precision
    accuracy = best_estimator_results.mean_test_accuracy
    # Create table of results
    table = pd.DataFrame({'Model': [model_name],
                          'Precision': [precision],
                          'Recall': [recall],
                          'F1': [f1],
                          'Accuracy': [accuracy],
                          'AUC': [auc]
                          })
    return table
# Get results from predict to store in comparison table
def make_results(model_name: str, model_object, X_var, y_var, y_pred_var):
    '''
    Returns a pandas df with the precision, recall, F1, accuracy, and AUC scores
    for a set of predictions.
    In:
        model_name (string): used as the row label in the output table
        model_object: the fitted ML model
        X_var, y_var, y_pred_var: the features, true labels and predicted labels used in the model
    Out: pandas df containing precision, recall, f1, accuracy, and AUC scores of the model
    '''
    # Get the classification report as a dict
    report = classification_report(y_var, y_pred_var, output_dict=True)
    # Calculate per-class precision, recall, and F1 scores
    predict_precision, predict_recall, predict_f1_score, _ = precision_recall_fscore_support(y_var, y_pred_var, average=None)
    f1_true_class = predict_f1_score[1]   # Index 1 corresponds to the "True" (left) class
    f1_false_class = predict_f1_score[0]  # Index 0 corresponds to the "False" (stayed) class
    # Extract weighted-average precision, recall, f1 and overall accuracy from the report
    f1 = report['weighted avg']['f1-score']
    recall = report['weighted avg']['recall']
    precision = report['weighted avg']['precision']
    accuracy = report['accuracy']
    auc = roc_auc_score(y_var, model_object.predict_proba(X_var)[:, 1])
    # Create table of results
    table = pd.DataFrame({'Model': model_name,
                          'Precision': precision,
                          'Recall': recall,
                          'F1': f1,
                          'Accuracy': accuracy,
                          'AUC': auc,
                          'Predict Leave': f1_true_class,
                          'Predict Stay': f1_false_class
                          },
                         index=[0]
                         )
    return table
def classification_report_summary(name: str, y_var, y_pred_var):
    '''
    Gather stats from predictions and format into a report.
    In:
        name (string): data name for the report header e.g. TEST or TRAIN
        y_var: true labels
        y_pred_var: predicted labels
    Out: display of precision, recall, f1 and accuracy scores, weighted averages, and per-class prediction F1 scores
    '''
    targetnames = ['Predicted would not leave', 'Predicted would leave']
    print("\nClassification Report : ", name)
    print(classification_report(y_var, y_pred_var, target_names=targetnames))
    print("Recall    : {:.4%}".format(recall_score(y_var, y_pred_var)))
    print("f1_score  : {:.4%}".format(f1_score(y_var, y_pred_var)))
    print("Precision : {:.4%}".format(precision_score(y_var, y_pred_var)))
    print("Accuracy  : {:.4%}".format(accuracy_score(y_var, y_pred_var)))
    report = classification_report(y_var, y_pred_var, output_dict=True)
    print()
    print('\u2500' * 35)
    print("Weighted Average")
    print('\u2500' * 35)
    print("Recall    : {:.4%}".format(report['weighted avg']['recall']))
    print("f1 Score  : {:.4%}".format(report['weighted avg']['f1-score']))
    print("Precision : {:.4%}".format(report['weighted avg']['precision']))
    print("Support   : {:.0f}".format(report['weighted avg']['support']))
    # Calculate precision, recall, and F1 score per class
    predict_precision, predict_recall, predict_f1_score, _ = precision_recall_fscore_support(y_var, y_pred_var, average=None)
    f1_true_class = predict_f1_score[1]   # Index 1 corresponds to the "True" (left) class
    f1_false_class = predict_f1_score[0]  # Index 0 corresponds to the "False" (stayed) class
    print()
    print('\u2500' * 35)
    print("Prediction F1 score")
    print('\u2500' * 35)
    print("Predict Leave : {:.4%}".format(f1_true_class))
    print("Predict Stay  : {:.4%}".format(f1_false_class))
def write_pickle(path, model_object, save_as: str):
    '''
    In:
        path: path of folder where you want to save the pickle
        model_object: a model you want to pickle
        save_as: filename for how you want to save the model
    Out: A call to pickle the model in the folder indicated
    '''
    with open(path + save_as + '.pickle', 'wb') as to_write:
        pickle.dump(model_object, to_write)

def read_pickle(path, saved_model_name: str):
    '''
    In:
        path: path to folder where you want to read from
        saved_model_name: filename of pickled model you want to read in
    Out:
        model: the pickled model
    '''
    with open(path + saved_model_name + '.pickle', 'rb') as to_read:
        model = pickle.load(to_read)
    return model
Two datasets are used for model performance comparison:

- salifort_data_FE.csv is the full dataset, feature engineered: salary encoded to ordinal, avg_mth_hrs binary encoded to overworked, and dept encoded with dummies (a sketch of this encoding follows below).
- salifort_data_FE_focus.csv is the same data with the dummy-encoded dept fields removed. Dept appears to have low correlation across the dataset and I'm curious how much the models are influenced by low-correlation features. It turns out that low-correlation features have little impact on model performance, which is no surprise really!
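For reference, a minimal sketch of the feature engineering described above, as it might be done in the data-cleaning notebook (not in this notebook); the raw frame name, the avg_mnth_hrs column name and the overwork threshold are assumptions:

# Sketch only - assumes a raw dataframe df_raw with 'salary', 'avg_mnth_hrs' and 'dept' columns
df_fe = df_raw.copy()
df_fe['salary'] = df_fe['salary'].map({'low': 0, 'medium': 1, 'high': 2})   # ordinal encoding
df_fe['overworked'] = (df_fe['avg_mnth_hrs'] > 175).astype(int)             # binary flag (175-hour threshold assumed)
df_fe = df_fe.drop(columns=['avg_mnth_hrs'])
df_fe = pd.get_dummies(df_fe, columns=['dept'])                             # dummy-encode departments as dept_*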
- model_prefix = text indicating which ML model is being used; added to the model description when saving to Results.csv
- dataset = text indicating which dataset is being used; added to the model description when saving to Results.csv
- rerun = flag identifying the first run of the model comparisons: 0 = first run, a NEW Results.csv is written with headers; 1 = continuation, results are appended to the existing file.
- refit = flag identifying whether the model needs fitting (not a big issue for a decision tree); the sketch below shows how these flags are intended to gate the run.
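A minimal sketch of the intended flag logic, using a hypothetical grid_search object and model name rather than code from this notebook:

# Sketch only - hypothetical illustration of how the rerun / refit flags gate the workflow
if rerun == 0:
    # First run: create a fresh Results.csv with headers; later cells append to it with mode='a'
    pd.DataFrame(columns=['Model', 'Precision', 'Recall', 'F1', 'Accuracy', 'AUC',
                          'Predict Leave', 'Predict Stay']).to_csv("Results.csv", index=True)

if refit == 1:
    grid_search.fit(X_train, y_train)                      # fit and pickle the GridSearch object
    write_pickle(model_path, grid_search, 'model_name')    # persist the fitted model
else:
    grid_search = read_pickle(model_path, 'model_name')    # reuse the previously pickled model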
# Load cleaned dataset into a dataframe
df1 = pd.read_csv(load_path + "data_cleaned_NoOl_FE_AllFeat.csv", index_col = False) # Feature engineering on salary, avg_mnth_hrs, dept, outliers removed
df1 = df1.sort_index(axis=1)
# model_prefix : Str = prefix for results.csv added to dataset
model_prefix = 'dt1'
# dataset : Str = dataset name for results.csv
dataset = 'ALLFeat'
# rerun : int = Flag to set 1 = append to Results.csv / 0 = Overwrite with new file
rerun = 1
refit = 0 # 0 = load the pickled model rather than refitting (not a big issue for a decision tree)
print("df1 - Feature engineering on salary, avg_mnth_hrs, dept, outliers removed\n")
# Display dataframe columns
df1.columns
df1 - Feature engineering on salary, avg_mnth_hrs, dept, outliers removed
Index(['dept_accounting', 'dept_hr', 'dept_it', 'dept_management', 'dept_marketing', 'dept_product_mng', 'dept_randd', 'dept_sales', 'dept_support', 'dept_technical', 'last_eval', 'left', 'number_project', 'overworked', 'promotion', 'salary', 'satisfaction', 'tenure'], dtype='object')
model_data = df1.copy()
# Isolate the outcome and feature variables
# Isolate the outcome variable: 'left' is a binary value where True = the employee left employment
Y = model_data['left']
# Select & Isolate the feature variables and drop the outcome variable
X = model_data.copy()
X = X.drop('left', axis = 1)
# Prepare training and test data
X_train, X_test, y_train, y_test = train_test_split(X,Y, test_size=0.25, stratify=Y, random_state=0)
# Instantiate model
dtc = DecisionTreeClassifier(random_state=0)
# Assign a dictionary of hyperparameters to search over
cv_params = {'max_depth': [4, 6, 8, None],
             'min_samples_leaf': [2, 5, 1],
             'min_samples_split': [2, 4, 6]
             }
# Assign a dictionary of scoring metrics to capture
scoring = ('accuracy', 'precision', 'recall', 'f1', 'roc_auc')
# Instantiate GridSearch
model_dt1 = GridSearchCV(dtc, cv_params, scoring=scoring, cv=4, refit='roc_auc')
%%time
print("Started / Last Run =", dt.now().strftime("%Y-%m-%d %H:%M:%S"))
model_dt1.fit(X_train, y_train)
beepr()
Started / Last Run = 2023-12-04 12:01:55
CPU times: user 5.14 s, sys: 1.16 ms, total: 5.14 s
Wall time: 7.69 s
## Write pickle
if refit == 1: # refit = 1 run the fit and save the model
    write_pickle(model_path, model_dt1, 'hr_dt1-'+'AllFeat')
# Read in pickle
if refit == 0: # refit = 0 load model, don't fit the data
    model_dt1 = read_pickle(model_path, 'hr_dt1-'+'AllFeat')
print("Start Time =", dt.now().strftime("%H:%M:%S"))
# Get the parameters of the best-performing model
print(model_prefix+' - '+dataset+' - '+'test\n')
print("Best Parameters : ", model_dt1.best_params_)
# Get the average f1 score of the best-performing model
print("Best Score : {:.4f}".format(model_dt1.best_score_))
# Get the best estimators of the parameters
print("Best Estimator : ", model_dt1.best_estimator_)
Start Time = 12:02:03
dt1 - ALLFeat - test

Best Parameters :  {'max_depth': 4, 'min_samples_leaf': 1, 'min_samples_split': 2}
Best Score : 0.9737
Best Estimator :  DecisionTreeClassifier(max_depth=4, random_state=0)
model_dt1_cv_results = format_GS_results(model_prefix+' - '+dataset+' - '+'GS train', model_dt1, 'auc')
model_dt1_cv_results.to_csv("Results.csv", mode='a', index=True, header=False)
pd.options.display.float_format = '{:.3f}'.format
#print(model_dt1_cv_results,"\n")
display_results()
| | Model | Precision | Recall | F1 | Accuracy | AUC | Predict Leave | Predict Stay |
---|---|---|---|---|---|---|---|---|
4 | dt1 - ALLFeat - GS train | 0.951 | 0.914 | 0.932 | 0.977 | 0.974 | NaN | NaN |
1 | lr1 - ALLFeat - train | 0.796 | 0.827 | 0.804 | 0.827 | 0.892 | 0.328 | 0.900 |
3 | lr2 - NOdept - train | 0.793 | 0.825 | 0.802 | 0.825 | 0.892 | 0.318 | 0.900 |
0 | lr1 - ALLFeat - test | 0.794 | 0.822 | 0.803 | 0.822 | 0.882 | 0.341 | 0.897 |
2 | lr2 - NOdept - test | 0.795 | 0.823 | 0.804 | 0.823 | 0.882 | 0.343 | 0.898 |
# Make predictions on train data for the SHAP explainer
best_model = model_dt1.best_estimator_ # Store the best model for later use
y_pred_train = best_model.predict(X_train)
explainer = shap.TreeExplainer(best_model)
# Compute the SHAP values for a subset of observations (first 100 rows to keep it fast)
#shap_values = explainer(X_train)
shap_values = explainer(X_train[:100])
shap_values = shap_values[..., 1] # keep the SHAP values for the positive ('left') class
plt.grid(True, linestyle='--', alpha=0.7)
# Plot the SHAP values
shap.plots.waterfall(shap_values[0], max_display=12)
shap.summary_plot(shap_values)
shap.plots.bar(shap_values[0])
# Make predictions on test data
model_dt1_best_model = model_dt1.best_estimator_ # Save the best parameters for later testing
y_pred_test = model_dt1_best_model.predict(X_test)
# def classification_report_summary(name:str, y_var:str, y_pred_var:str):
classification_report_summary(model_prefix+' - '+dataset+' - '+'test', y_test,y_pred_test)
Classification Report :  dt1 - ALLFeat - test
                           precision    recall  f1-score   support

Predicted would not leave       0.99      0.99      0.99      2321
    Predicted would leave       0.94      0.93      0.94       471

                 accuracy                           0.98      2792
                macro avg       0.96      0.96      0.96      2792
             weighted avg       0.98      0.98      0.98      2792

Recall    : 92.9936%
f1_score  : 93.5897%
Precision : 94.1935%
Accuracy  : 97.8510%

───────────────────────────────────
Weighted Average
───────────────────────────────────
Recall    : 97.8510%
f1 Score  : 97.8455%
Precision : 97.8416%
Support   : 2792

───────────────────────────────────
Prediction F1 score
───────────────────────────────────
Predict Leave : 93.5897%
Predict Stay  : 98.7091%
# make_results(model_name, model_object, X_var, y_var, y_pred_var)
# format prediction results into a dataframe
dt_pred_test_results = make_results(model_prefix+' - '+dataset+' - '+'test', model_dt1_best_model, X_test, y_test, y_pred_test)
# save the prediction results in Results.csv
dt_pred_test_results.to_csv("Results.csv", mode='a', index=True, header=False)
print(dt_pred_test_results) # Display results of the prediction
display_results() # Display contents of Results.csv
                  Model  Precision  Recall     F1  Accuracy    AUC  Predict Leave  Predict Stay
0  dt1 - ALLFeat - test      0.978   0.979  0.978     0.979  0.979          0.936         0.987
| | Model | Precision | Recall | F1 | Accuracy | AUC | Predict Leave | Predict Stay |
---|---|---|---|---|---|---|---|---|
5 | dt1 - ALLFeat - test | 0.978 | 0.979 | 0.978 | 0.979 | 0.979 | 0.936 | 0.987 |
4 | dt1 - ALLFeat - GS train | 0.951 | 0.914 | 0.932 | 0.977 | 0.974 | NaN | NaN |
1 | lr1 - ALLFeat - train | 0.796 | 0.827 | 0.804 | 0.827 | 0.892 | 0.328 | 0.900 |
3 | lr2 - NOdept - train | 0.793 | 0.825 | 0.802 | 0.825 | 0.892 | 0.318 | 0.900 |
0 | lr1 - ALLFeat - test | 0.794 | 0.822 | 0.803 | 0.822 | 0.882 | 0.341 | 0.897 |
2 | lr2 - NOdept - test | 0.795 | 0.823 | 0.804 | 0.823 | 0.882 | 0.343 | 0.898 |
# Make predictions on train data
print("Started / Last Run =", dt.now().strftime("%Y-%m-%d %H:%M:%S"))
model_dt1_best_model = model_dt1.best_estimator_ # Save the best parameters for later testing
y_pred_train = model_dt1_best_model.predict(X_train)
Started / Last Run = 2023-12-04 12:02:06
# def classification_report_summary(name:str, y_var:str, y_pred_var:str):
classification_report_summary(model_prefix+' - '+dataset+' - '+'train', y_train,y_pred_train)
Classification Report :  dt1 - ALLFeat - train
                           precision    recall  f1-score   support

Predicted would not leave       0.98      0.99      0.99      6964
    Predicted would leave       0.96      0.92      0.94      1411

                 accuracy                           0.98      8375
                macro avg       0.97      0.95      0.96      8375
             weighted avg       0.98      0.98      0.98      8375

Recall    : 91.5663%
f1_score  : 93.5215%
Precision : 95.5621%
Accuracy  : 97.8627%

───────────────────────────────────
Weighted Average
───────────────────────────────────
Recall    : 97.8627%
f1 Score  : 97.8444%
Precision : 97.8434%
Support   : 8375

───────────────────────────────────
Prediction F1 score
───────────────────────────────────
Predict Leave : 93.5215%
Predict Stay  : 98.7202%
# make_results(model_name, model_object, X_var, y_var, y_pred_var)
# format prediction results into a dataframe
dt_pred_test_results = make_results(model_prefix+' - '+dataset+' - '+'train', model_dt1_best_model, X_train, y_train, y_pred_train)
# Save dataframe into results.csv, no header
dt_pred_test_results.to_csv("Results.csv", mode='a', index=True, header=False) # save the prediction results in Results.csv
print(dt_pred_test_results) # Display results of the prediction
pd.options.display.float_format = '{:.3f}'.format
display_results() # Display contents of Results.csv
                   Model  Precision  Recall     F1  Accuracy    AUC  Predict Leave  Predict Stay
0  dt1 - ALLFeat - train      0.978   0.979  0.978     0.979  0.979          0.935         0.987
| | Model | Precision | Recall | F1 | Accuracy | AUC | Predict Leave | Predict Stay |
---|---|---|---|---|---|---|---|---|
5 | dt1 - ALLFeat - test | 0.978 | 0.979 | 0.978 | 0.979 | 0.979 | 0.936 | 0.987 |
6 | dt1 - ALLFeat - train | 0.978 | 0.979 | 0.978 | 0.979 | 0.979 | 0.935 | 0.987 |
4 | dt1 - ALLFeat - GS train | 0.951 | 0.914 | 0.932 | 0.977 | 0.974 | NaN | NaN |
1 | lr1 - ALLFeat - train | 0.796 | 0.827 | 0.804 | 0.827 | 0.892 | 0.328 | 0.900 |
3 | lr2 - NOdept - train | 0.793 | 0.825 | 0.802 | 0.825 | 0.892 | 0.318 | 0.900 |
0 | lr1 - ALLFeat - test | 0.794 | 0.822 | 0.803 | 0.822 | 0.882 | 0.341 | 0.897 |
2 | lr2 - NOdept - test | 0.795 | 0.823 | 0.804 | 0.823 | 0.882 | 0.343 | 0.898 |
plt.figure(figsize=(18,15))
plot_tree(model_dt1_best_model,max_depth=3, fontsize=10, feature_names=X_test.columns, class_names={0:'stayed',1:'left'}, filled=True);
plt.show()
model_dt2 - No Departments

Two datasets are used for model performance comparison:
- salifort_data_FE.csv is the full dataset, feature engineered: salary encoded to ordinal, avg_mth_hrs binary encoded to overworked, and dept encoded with dummies.
- salifort_data_FE_focus.csv is the same data with the dummy-encoded dept fields removed (see the sketch below). Dept appears to have low correlation across the dataset and I'm curious how much the models are influenced by low-correlation features. It turns out that low-correlation features have little impact on model performance, which is no surprise really!
- model_prefix = text indicating which ML model is being used; added to the model description when saving to Results.csv
- dataset = text indicating which dataset is being used; added to the model description when saving to Results.csv
- rerun = flag identifying the first run of the model comparisons: 0 = first run, a NEW Results.csv is written with headers; 1 = continuation, results are appended to the existing file.
- refit = flag identifying whether the model needs fitting (not a big issue for a decision tree).
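Since the AllFeat frame is already loaded, the NoDept variant is effectively the same frame with the dummy department columns dropped; a minimal sketch (assuming the dept_ column prefix shown earlier), useful as a sanity check against data_cleaned_NoOl_FE_NoDept.csv:

# Sketch only - derive a NoDept-style frame from the AllFeat frame by dropping the dummy dept columns
dept_cols = [c for c in df1.columns if c.startswith('dept_')]
df2_check = df1.drop(columns=dept_cols)   # columns should match data_cleaned_NoOl_FE_NoDept.csv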
df2 = pd.read_csv(load_path + "data_cleaned_NoOl_FE_NoDept.csv", index_col = False) # Feature engineering on salary, avg_mnth_hrs, outliers removed, departments removed
print("df2 - Feature engineering on salary, avg_mnth_hrs, dept REMOVED, outliers removed\n")
df2.sort_index(axis=1, inplace=True)
# model_prefix : Str = prefix for results.csv added to dataset
model_prefix = 'dt2'
# dataset : Str = dataset name for results.csv
dataset = 'NOdept'
# rerun : int = Flag to set 1 = append to Results.csv / 0 = Overwrite with new file
rerun = 1
refit = 0 # 0 = load the pickled model rather than refitting (not a big issue for a decision tree)
# Display dataframe columns
df2.columns
df2 - Feature engineering on salary, avg_mnth_hrs, dept REMOVED, outliers removed
Index(['last_eval', 'left', 'number_project', 'overworked', 'promotion', 'salary', 'satisfaction', 'tenure'], dtype='object')
model_data2 = df2.copy()
# Isolate the outcome and feature variables
# Isolate the outcome variable: 'left' is a binary value where True = the employee left employment
Y2 = model_data2['left']
# Select & Isolate the feature variables and drop the outcome variable
X2 = model_data2.copy()
X2 = X2.drop('left', axis = 1)
# Prepare training and test data
X_train2, X_test2, y_train2, y_test2 = train_test_split(X2, Y2, test_size=0.25, stratify=Y2, random_state=0)
# Instantiate model
tree2 = DecisionTreeClassifier(random_state=0)
# Assign a dictionary of hyperparameters to search over
cv_params = {'max_depth': [4, 6, 8, None],
             'min_samples_leaf': [2, 5, 1],
             'min_samples_split': [2, 4, 6]
             }
# Assign a dictionary of scoring metrics to capture
scoring = ('accuracy', 'precision', 'recall', 'f1', 'roc_auc')
# Instantiate GridSearch
model_dt2 = GridSearchCV(tree2, cv_params, scoring=scoring, cv=4, refit='roc_auc')
%%time
model_dt2.fit(X_train2, y_train2)
beepr()
CPU times: user 4.44 s, sys: 5.76 ms, total: 4.45 s
Wall time: 6.99 s
## Write pickle
if refit == 1: # refit = 1 run the fit and save the model
    write_pickle(model_path, model_dt2, 'hr_dt2-'+'NOdept')
# Read in pickle
if refit == 0: # refit = 0 load model, don't fit the data
    model_dt2 = read_pickle(model_path, 'hr_dt2-'+'NOdept')
# Get the parameters of the best-performing model
print(model_prefix+' - '+dataset+' - '+'test\n')
print("Best Parameters : ", model_dt2.best_params_)
# Get the average f1 score of the best-performing model
print("Best Score : {:.4f}".format(model_dt2.best_score_))
# Get the best estimators of the parameters
print("Best Estimator : ", model_dt2.best_estimator_)
dt2 - NOdept - test

Best Parameters :  {'max_depth': 4, 'min_samples_leaf': 1, 'min_samples_split': 2}
Best Score : 0.9737
Best Estimator :  DecisionTreeClassifier(max_depth=4, random_state=0)
model_dt2_cv_results = format_GS_results(model_prefix+' - '+dataset+' - '+'GS train', model_dt2, 'auc')
model_dt2_cv_results.to_csv("Results.csv", mode='a', index=True, header=False)
pd.options.display.float_format = '{:.3f}'.format
print(model_dt2_cv_results,"\n")
display_results()
                     Model  Precision  Recall     F1  Accuracy    AUC
0  dt2 - NOdept - GS train      0.951   0.914  0.932     0.977  0.974
| | Model | Precision | Recall | F1 | Accuracy | AUC | Predict Leave | Predict Stay |
---|---|---|---|---|---|---|---|---|
5 | dt1 - ALLFeat - test | 0.978 | 0.979 | 0.978 | 0.979 | 0.979 | 0.936 | 0.987 |
6 | dt1 - ALLFeat - train | 0.978 | 0.979 | 0.978 | 0.979 | 0.979 | 0.935 | 0.987 |
7 | dt2 - NOdept - GS train | 0.951 | 0.914 | 0.932 | 0.977 | 0.974 | NaN | NaN |
4 | dt1 - ALLFeat - GS train | 0.951 | 0.914 | 0.932 | 0.977 | 0.974 | NaN | NaN |
1 | lr1 - ALLFeat - train | 0.796 | 0.827 | 0.804 | 0.827 | 0.892 | 0.328 | 0.900 |
3 | lr2 - NOdept - train | 0.793 | 0.825 | 0.802 | 0.825 | 0.892 | 0.318 | 0.900 |
0 | lr1 - ALLFeat - test | 0.794 | 0.822 | 0.803 | 0.822 | 0.882 | 0.341 | 0.897 |
2 | lr2 - NOdept - test | 0.795 | 0.823 | 0.804 | 0.823 | 0.882 | 0.343 | 0.898 |
# Make predictions on test data
model_dt2_best_model = model_dt2.best_estimator_ # Save the best parameters for later testing
y_pred_test2 = model_dt2_best_model.predict(X_test2)
# def classification_report_summary(name:str, y_var:str, y_pred_var:str):
classification_report_summary(model_prefix+' - '+dataset+' - '+'test', y_test2,y_pred_test2)
Classification Report :  dt2 - NOdept - test
                           precision    recall  f1-score   support

Predicted would not leave       0.99      0.99      0.99      2321
    Predicted would leave       0.94      0.93      0.94       471

                 accuracy                           0.98      2792
                macro avg       0.96      0.96      0.96      2792
             weighted avg       0.98      0.98      0.98      2792

Recall    : 92.9936%
f1_score  : 93.5897%
Precision : 94.1935%
Accuracy  : 97.8510%

───────────────────────────────────
Weighted Average
───────────────────────────────────
Recall    : 97.8510%
f1 Score  : 97.8455%
Precision : 97.8416%
Support   : 2792

───────────────────────────────────
Prediction F1 score
───────────────────────────────────
Predict Leave : 93.5897%
Predict Stay  : 98.7091%
dt_pred_test_results2 = make_results(model_prefix+' - '+dataset+' - '+'test', model_dt2_best_model, X_test2, y_test2, y_pred_test2) # format prediction results into a dataframe
dt_pred_test_results2.to_csv("Results.csv", mode='a', index=True, header=False) # save the prediction results in Results.csv
print(dt_pred_test_results2) # Display results of the prediction
display_results() # Display contents of Results.csv
                 Model  Precision  Recall     F1  Accuracy    AUC  Predict Leave  Predict Stay
0  dt2 - NOdept - test      0.978   0.979  0.978     0.979  0.979          0.936         0.987
| | Model | Precision | Recall | F1 | Accuracy | AUC | Predict Leave | Predict Stay |
---|---|---|---|---|---|---|---|---|
5 | dt1 - ALLFeat - test | 0.978 | 0.979 | 0.978 | 0.979 | 0.979 | 0.936 | 0.987 |
8 | dt2 - NOdept - test | 0.978 | 0.979 | 0.978 | 0.979 | 0.979 | 0.936 | 0.987 |
6 | dt1 - ALLFeat - train | 0.978 | 0.979 | 0.978 | 0.979 | 0.979 | 0.935 | 0.987 |
7 | dt2 - NOdept - GS train | 0.951 | 0.914 | 0.932 | 0.977 | 0.974 | NaN | NaN |
4 | dt1 - ALLFeat - GS train | 0.951 | 0.914 | 0.932 | 0.977 | 0.974 | NaN | NaN |
1 | lr1 - ALLFeat - train | 0.796 | 0.827 | 0.804 | 0.827 | 0.892 | 0.328 | 0.900 |
3 | lr2 - NOdept - train | 0.793 | 0.825 | 0.802 | 0.825 | 0.892 | 0.318 | 0.900 |
0 | lr1 - ALLFeat - test | 0.794 | 0.822 | 0.803 | 0.822 | 0.882 | 0.341 | 0.897 |
2 | lr2 - NOdept - test | 0.795 | 0.823 | 0.804 | 0.823 | 0.882 | 0.343 | 0.898 |
# Make predictions on train data
print("Current Time =", dt.now().strftime("%H:%M:%S"))
model_dt2_best_model = model_dt2.best_estimator_ # Save the best model for later testing
y_pred_train2 = model_dt2_best_model.predict(X_train2)
Current Time = 12:02:17
# def classification_report_summary(name:str, y_var:str, y_pred_var:str):
classification_report_summary(model_prefix+' - '+dataset+' - '+'train', y_train2,y_pred_train2)
Classification Report :  dt2 - NOdept - train
                           precision    recall  f1-score   support

Predicted would not leave       0.98      0.99      0.99      6964
    Predicted would leave       0.96      0.92      0.94      1411

                 accuracy                           0.98      8375
                macro avg       0.97      0.95      0.96      8375
             weighted avg       0.98      0.98      0.98      8375

Recall    : 91.5663%
f1_score  : 93.5215%
Precision : 95.5621%
Accuracy  : 97.8627%

───────────────────────────────────
Weighted Average
───────────────────────────────────
Recall    : 97.8627%
f1 Score  : 97.8444%
Precision : 97.8434%
Support   : 8375

───────────────────────────────────
Prediction F1 score
───────────────────────────────────
Predict Leave : 93.5215%
Predict Stay  : 98.7202%
# make_results(model_name, model_object, X_var, y_var, y_pred_var)
dt_pred_test_results2 = make_results(model_prefix+' - '+dataset+' - '+'train', model_dt2_best_model, X_train2, y_train2, y_pred_train2) # format prediction results into a dataframe
dt_pred_test_results2.to_csv("Results.csv", mode='a', index=True, header=False) # save the prediction results in Results.csv
print(dt_pred_test_results2) # Display results of the prediction
pd.options.display.float_format = '{:.4f}'.format
display_results() # Display contents of Results.csv
                  Model  Precision  Recall     F1  Accuracy    AUC  Predict Leave  Predict Stay
0  dt2 - NOdept - train      0.978   0.979  0.978     0.979  0.979          0.935         0.987
| | Model | Precision | Recall | F1 | Accuracy | AUC | Predict Leave | Predict Stay |
---|---|---|---|---|---|---|---|---|
5 | dt1 - ALLFeat - test | 0.9784 | 0.9785 | 0.9785 | 0.9785 | 0.9791 | 0.9359 | 0.9871 |
8 | dt2 - NOdept - test | 0.9784 | 0.9785 | 0.9785 | 0.9785 | 0.9791 | 0.9359 | 0.9871 |
6 | dt1 - ALLFeat - train | 0.9784 | 0.9786 | 0.9784 | 0.9786 | 0.9789 | 0.9352 | 0.9872 |
9 | dt2 - NOdept - train | 0.9784 | 0.9786 | 0.9784 | 0.9786 | 0.9789 | 0.9352 | 0.9872 |
7 | dt2 - NOdept - GS train | 0.9506 | 0.9135 | 0.9317 | 0.9774 | 0.9737 | NaN | NaN |
4 | dt1 - ALLFeat - GS train | 0.9506 | 0.9135 | 0.9317 | 0.9774 | 0.9737 | NaN | NaN |
1 | lr1 - ALLFeat - train | 0.7960 | 0.8266 | 0.8041 | 0.8266 | 0.8923 | 0.3284 | 0.9005 |
3 | lr2 - NOdept - train | 0.7930 | 0.8248 | 0.8015 | 0.8248 | 0.8919 | 0.3180 | 0.8995 |
0 | lr1 - ALLFeat - test | 0.7943 | 0.8216 | 0.8031 | 0.8216 | 0.8819 | 0.3413 | 0.8969 |
2 | lr2 - NOdept - test | 0.7952 | 0.8227 | 0.8039 | 0.8227 | 0.8819 | 0.3426 | 0.8975 |
plt.figure(figsize=(18,15))
plot_tree(model_dt2_best_model,max_depth=3, fontsize=10, feature_names=X_test2.columns, class_names={0:'stayed',1:'left'}, filled=True);
plt.show()
model1 = model_dt1
model2 = model_dt2
model_name1 = "model_dt1"
model_name2 = "model_dt2"
model_best_model1 = model_dt1_best_model
model_best_model2 = model_dt2_best_model
There is little difference in performance between the two datasets:

- ALLFeat = complete feature-engineered dataset
- NoDept = dataset with the dummy-encoded dept fields removed

The AUC score for the Decision Tree is ~0.978. This is a significant improvement over Logistic Regression, which came in at ~0.886 but had a low "predicted to leave" score.
F1, Precision, Recall and Accuracy are also within acceptable ranges.
Predictions have also improved significantly, with the "predicted to leave" F1 score now at ~0.94.
display_results()
| | Model | Precision | Recall | F1 | Accuracy | AUC | Predict Leave | Predict Stay |
---|---|---|---|---|---|---|---|---|
5 | dt1 - ALLFeat - test | 0.9784 | 0.9785 | 0.9785 | 0.9785 | 0.9791 | 0.9359 | 0.9871 |
8 | dt2 - NOdept - test | 0.9784 | 0.9785 | 0.9785 | 0.9785 | 0.9791 | 0.9359 | 0.9871 |
6 | dt1 - ALLFeat - train | 0.9784 | 0.9786 | 0.9784 | 0.9786 | 0.9789 | 0.9352 | 0.9872 |
9 | dt2 - NOdept - train | 0.9784 | 0.9786 | 0.9784 | 0.9786 | 0.9789 | 0.9352 | 0.9872 |
7 | dt2 - NOdept - GS train | 0.9506 | 0.9135 | 0.9317 | 0.9774 | 0.9737 | NaN | NaN |
4 | dt1 - ALLFeat - GS train | 0.9506 | 0.9135 | 0.9317 | 0.9774 | 0.9737 | NaN | NaN |
1 | lr1 - ALLFeat - train | 0.7960 | 0.8266 | 0.8041 | 0.8266 | 0.8923 | 0.3284 | 0.9005 |
3 | lr2 - NOdept - train | 0.7930 | 0.8248 | 0.8015 | 0.8248 | 0.8919 | 0.3180 | 0.8995 |
0 | lr1 - ALLFeat - test | 0.7943 | 0.8216 | 0.8031 | 0.8216 | 0.8819 | 0.3413 | 0.8969 |
2 | lr2 - NOdept - test | 0.7952 | 0.8227 | 0.8039 | 0.8227 | 0.8819 | 0.3426 | 0.8975 |
# Prepare confusion matrix for dt1 test
cm_test1 = metrics.confusion_matrix(y_test, y_pred_test) # Use the optimized model
#cm_test1_percent = cm_test1 / cm_test1.sum() * 100
# Prepare confusion matrix for dt2 test
cm_test2 = metrics.confusion_matrix(y_test2, y_pred_test2) # Use the optimized model
#cm_test2_percent = cm_test2 / cm_test2.sum() * 100
#cm = confusion_matrix(y_test, y_pred_test, labels=model_lr1.classes_)
# Plot confusion matrix
#disp = ConfusionMatrixDisplay(confusion_matrix=cm,
# display_labels=model_lr1.classes_)
#disp.plot(values_format='');
fig, ax = plt.subplots(2, 2, figsize=(10,8))
# Calculate percentages for dt1 TEST
sum_by_true_class = np.sum(cm_test1, axis=1)
percentage_matrix = cm_test1 / sum_by_true_class[:, np.newaxis]
model_name = "dt1"
# Create a figure and plot the percentage confusion matrix as a heatmap
sns.heatmap(percentage_matrix, annot=True, fmt=".2%", cmap="Blues", ax = ax[0,0]) #, xticklabels=model_lr1.classes_, yticklabels=model_lr1.classes_)
ax[0,0].set_title('{} Confusion Matrix (Percentage)'.format(model_name))
ax[0,0].set_ylabel('True label')
ax[0,0].set_xlabel('Predicted label')
ax[0,0].text(0.3, 0.25, '(TN)\nTrue Stay', color='white')
ax[0,0].text(1.3, 0.25, '(FP) type 1\n False Leave', color='black')
ax[0,0].text(0.3, 1.25, '(FN) type 2\n False Stay', color='black')
ax[0,0].text(1.3, 1.25, '(TP)\nTrue Leave', color='white')
# Create a figure and plot the COUNT confusion matrix as a heatmap for dt1 TEST
sns.heatmap(cm_test1, annot=True, fmt=".0f", cmap="Blues", ax = ax[1,0])#, xticklabels=class_labels, yticklabels=class_labels)
ax[1,0].set_title('{} Confusion Matrix (Count)'.format(model_name))
ax[1,0].set_ylabel('True label')
ax[1,0].set_xlabel('Predicted label')
ax[1,0].text(0.3, 0.25, '(TN)\nTrue Stay', color='white')
ax[1,0].text(1.3, 0.25, '(FP) type 1\n False Leave', color='black')
ax[1,0].text(0.3, 1.25, '(FN) type 2\n False Stay', color='black')
ax[1,0].text(1.3, 1.25, '(TP)\nTrue Leave', color='black')
# Calculate percentages for dt2 TEST
sum_by_true_class = np.sum(cm_test2, axis=1)
percentage_matrix = cm_test2 / sum_by_true_class[:, np.newaxis]
model_name = "dt2"
sns.heatmap(percentage_matrix, annot=True, fmt=".2%", cmap="Blues", ax = ax[0,1])#, xticklabels=class_labels, yticklabels=class_labels)
ax[0,1].set_title('{} Confusion Matrix (Percentage)'.format(model_name))
ax[0,1].set_ylabel('True label')
ax[0,1].set_xlabel('Predicted label')
ax[0,1].text(0.3, 0.25, '(TN)\nTrue Stay', color='white')
ax[0,1].text(1.3, 0.25, '(FP) type 1\n False Leave', color='black')
ax[0,1].text(0.3, 1.25, '(FN) type 2\n False Stay', color='black')
ax[0,1].text(1.3, 1.25, '(TP)\nTrue Leave', color='white')
# Create a figure and plot the COUNT confusion matrix as a heatmap for dt2 TEST
sns.heatmap(cm_test2, annot=True, fmt=".0f", cmap="Blues", ax = ax[1,1])#, xticklabels=class_labels, yticklabels=class_labels)
ax[1,1].set_title('{} Confusion Matrix (Count)'.format(model_name))
ax[1,1].set_ylabel('True label')
ax[1,1].set_xlabel('Predicted label')
ax[1,1].text(0.3, 0.25, '(TN)\nTrue Stay', color='white')
ax[1,1].text(1.3, 0.25, '(FP) type 1\n False Leave', color='black')
ax[1,1].text(0.3, 1.25, '(FN) type 2\n False Stay', color='black')
ax[1,1].text(1.3, 1.25, '(TP)\nTrue Leave', color='black')
plt.tight_layout()
plt.show()
#tree2_importances = pd.DataFrame(tree2.best_estimator_.feature_importances_, columns=X.columns)
# Get feature importances (Gini importance)
feature_importance1 = model_best_model1.feature_importances_
feature_importance2 = model_best_model2.feature_importances_
# Get the names of the features
feature_names1 = X.columns   # AllFeat feature names
feature_names2 = X2.columns  # NoDept feature names
# Create a DataFrame to store feature names and their Gini importance
feature_importance_df1 = pd.DataFrame({'Feature': feature_names1, 'Importance': feature_importance1})
feature_importance_df2 = pd.DataFrame({'Feature': feature_names2, 'Importance': feature_importance2})
feature_importance_df1.sort_values(by='Importance', ascending=False,axis=0, inplace=True)
feature_importance_df2.sort_values(by='Importance', ascending=False,axis=0, inplace=True)
merged_df = pd.merge(feature_importance_df1, feature_importance_df2, on='Feature', how='left', suffixes=(' model_dt1 - AllFeat', ' model_dt2 - NoDept'))
# Print the merged DataFrame
print(merged_df)
feature_importance_df1.sort_values(by='Importance', ascending=True,axis=0, inplace=True)
feature_importance_df2.sort_values(by='Importance', ascending=True,axis=0, inplace=True)
# Plot side-by-side bar plots
plt.figure(figsize=(12, 5))
plt.subplot(1, 2, 1)
plt.barh(feature_importance_df1['Feature'], feature_importance_df1['Importance'], color='skyblue')
plt.title('Feature Importance - {} - AllFeat'.format(model_name1))
plt.xlabel('Importance')
plt.ylabel('Feature')
plt.subplot(1, 2, 2)
plt.barh(feature_importance_df2['Feature'], feature_importance_df2['Importance'], color='salmon')
plt.title('Feature Importance - {} - NoDept'.format(model_name2))
plt.xlabel('Importance')
plt.ylabel('Feature')
plt.tight_layout()
plt.show()
             Feature  Importance model_dt1 - AllFeat  Importance model_dt2 - NoDept
0       satisfaction                           0.5898                         0.5898
1             tenure                           0.1542                         0.1542
2          last_eval                           0.1475                         0.1475
3     number_project                           0.1086                         0.1086
4     dept_technical                           0.0000                            NaN
5             salary                           0.0000                         0.0000
6          promotion                           0.0000                         0.0000
7         overworked                           0.0000                         0.0000
8    dept_accounting                           0.0000                            NaN
9            dept_hr                           0.0000                            NaN
10        dept_sales                           0.0000                            NaN
11        dept_randd                           0.0000                            NaN
12  dept_product_mng                           0.0000                            NaN
13    dept_marketing                           0.0000                            NaN
14   dept_management                           0.0000                            NaN
15           dept_it                           0.0000                            NaN
16      dept_support                           0.0000                            NaN
## Prepare predictions and calculate model scores
y_prob1 = model1.predict_proba(X_test)[:, 1]
y_prob2 = model2.predict_proba(X_test2)[:, 1]
precision1, recall1, _ = precision_recall_curve(y_test, y_prob1)
precision2, recall2, _ = precision_recall_curve(y_test2, y_prob2)
# Compute area under the curve (AUC)
auc_score1 = auc(recall1, precision1)
auc_score2 = auc(recall2, precision2)
print(type(precision1))
print(type(recall1))
print(type(auc_score1))
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.float64'>
# read in previous plot data
# Logistic Regression
variables = [['auc_score1', 'model_lr1'],
['auc_score2', 'model_lr2'],
['roc_auc1', 'model_lr1'],
['roc_auc2', 'model_lr2']
]
arrays = [['precision1','model_lr1'],
['recall1', 'model_lr1'],
['precision2','model_lr2'],
['recall2', 'model_lr2'],
['fpr1', 'model_lr1'],
['tpr1', 'model_lr1'],
['fpr2', 'model_lr2'],
['tpr2', 'model_lr2'],
]
loaded_plot_vars = {}
loaded_plot_arrays = {}
for var_name, model in variables:
    #print(var_name, model)
    #var = globals()[var_name]
    with open(f'99-documentation-project/08-plot_data/{model}-{var_name}.csv', 'r') as file:
        var = f'{model}-{var_name}'
        #print(var)
        loaded_plot_vars[var] = json.load(file)

for array_name, model in arrays:
    #print(array_name, model)
    #var = globals()[var_name]
    filepath = f'99-documentation-project/08-plot_data/{model}-{array_name}.csv'
    array = f'{model}-{array_name}'
    loaded_plot_arrays[array] = pd.read_csv(filepath)
#print(loaded_plot_arrays)
#var = loaded_plot_vars.get('model_lr1-roc_auc1')
#print(var)
#array= loaded_plot_arrays.get('model_dt2-precision2')
#lr1_fpr1 = loaded_plot_arrays.get('model_lr1-fpr1')
#va = lr1_fpr1.values()
#print(type(lr1_fpr1))
#print(lr1_fpr1)
# Retrieve arrays and variable from previous model
lr1_precision1 = loaded_plot_arrays.get('model_lr1-precision1') # precision array from LR model1 = AllFeat
lr1_recall1 = loaded_plot_arrays.get('model_lr1-recall1')       # recall array from LR model1 = AllFeat
lr1_auc_score1 = loaded_plot_vars.get('model_lr1-auc_score1')   # PR AUC score from LR model1 = AllFeat
lr2_precision2 = loaded_plot_arrays.get('model_lr2-precision2') # precision array from LR model2 = NoDept
lr2_recall2 = loaded_plot_arrays.get('model_lr2-recall2')       # recall array from LR model2 = NoDept
lr2_auc_score2 = loaded_plot_vars.get('model_lr2-auc_score2')   # PR AUC score from LR model2 = NoDept
lr1_fpr1 = loaded_plot_arrays.get('model_lr1-fpr1')             # fpr array from LR model1 = AllFeat
lr1_tpr1 = loaded_plot_arrays.get('model_lr1-tpr1')             # tpr array from LR model1 = AllFeat
lr1_roc_auc1 = loaded_plot_vars.get('model_lr1-roc_auc1')       # ROC AUC score from LR model1 = AllFeat
lr1_model_name1 = arrays[0][1]                                  # model name string 'model_lr1'
lr2_fpr2 = loaded_plot_arrays.get('model_lr2-fpr2')             # fpr array from LR model2 = NoDept
lr2_tpr2 = loaded_plot_arrays.get('model_lr2-tpr2')             # tpr array from LR model2 = NoDept
lr2_roc_auc2 = loaded_plot_vars.get('model_lr2-roc_auc2')       # ROC AUC score from LR model2 = NoDept
lr2_model_name2 = arrays[3][1]                                  # model name string 'model_lr2'
#print(lr1_precision1)
#print(lr1_recall1)
# Plot the Precision / Recall curve
#plt.subplot(1, 2, 1)
plt.figure(figsize=(12, 5))
plt.plot(recall1, precision1, color='blue', label=f'{model_name1} - AUC = {auc_score1:.2f}')
plt.plot(lr1_recall1, lr1_precision1, color='purple', label=f'{lr1_model_name1} - AUC = {lr1_auc_score1:.2f}')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('{} Precision-Recall Curve'.format(model_name1))
plt.legend(loc='best')
plt.axhline(y=0.5, color='red', linestyle='--', label='Random Guess')
plt.axhline(y=1, color='green', linestyle='--', label='Perfect')
plt.axvline(x=1, color='green', linestyle='--', label='Perfect')
plt.text(0.4, 0.52, 'Random Guess', color='red')
plt.text(0.4, 0.6, 'Better', color='black')
plt.text(0.4, 0.4, 'Worse', color='black')
plt.text(0.8, 1.01, 'Perfect', color='green')
plt.show()
#plt.subplot(1, 2, 2)
plt.figure(figsize=(12, 5))
plt.plot(recall2, precision2, color='blue', label=f'{model_name2} - AUC = {auc_score2:.2f}')
plt.plot(lr2_recall2, lr2_precision2, color='purple', label=f'{lr2_model_name2} - AUC = {lr2_auc_score2:.2f}')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('{} Precision-Recall Curve'.format(model_name2))
plt.legend(loc='best')
plt.axhline(y=0.5, color='red', linestyle='--', label='Random Guess')
plt.axhline(y=1, color='green', linestyle='--', label='Perfect')
plt.axvline(x=1, color='green', linestyle='--', label='Perfect')
plt.text(0.4, 0.52, 'Random Guess', color='red')
plt.text(0.4, 0.6, 'Better', color='black')
plt.text(0.4, 0.4, 'Worse', color='black')
plt.text(0.8, 1.01, 'Perfect', color='green')
#plt.tight_layout()
plt.show()
# Compute ROC curve
fpr1, tpr1, _ = roc_curve(y_test, y_prob1) # true positive rate, false positive rate
fpr2, tpr2, _ = roc_curve(y_test2, y_prob2) # true positive rate, false positive rate
# Compute area under the curve (AUC)
roc_auc1 = auc(fpr1, tpr1)
roc_auc2 = auc(fpr2, tpr2)
#plt.subplot(1, 2, 1)
# Plot ROC curve
plt.figure(figsize=(12, 5))
plt.plot(fpr1, tpr1, color='darkorange', lw=2, label=f'{model_name1} - AUC = {roc_auc1:.2f}') # Plot from current model
plt.plot(lr1_fpr1, lr1_tpr1, color='purple', lw=2, label=f'{lr1_model_name1} - AUC = {lr1_roc_auc1:.2f}') # plot from previous LR model
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--', label='Random')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('{} Receiver Operating Characteristic (ROC) Curve'.format(model_name1))
plt.legend(loc='best')
plt.axhline(y=1, color='green', linestyle='--', label='Perfect')
plt.axvline(x=0, color='green', linestyle='--', label='Perfect')
plt.text(0.4, 0.6, 'Better', color='black')
plt.text(0.4, 0.3, 'Worse', color='black')
plt.text(0.01, 1.01, 'Perfect', color='green')
plt.show()
#plt.subplot(1, 2,2)
# Plot ROC curve
plt.figure(figsize=(12, 5))
plt.plot(fpr2, tpr2, color='darkorange', lw=2, label=f'{model_name2} - AUC = {roc_auc2:.2f}')
plt.plot(lr2_fpr2, lr2_tpr2, color='purple', lw=2, label=f'{lr2_model_name2} - AUC = {lr2_roc_auc2:.2f}')
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--', label='Random')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('{} Receiver Operating Characteristic (ROC) Curve'.format(model_name2))
plt.legend(loc='best')
plt.axhline(y=1, color='green', linestyle='--', label='Perfect')
plt.axvline(x=0, color='green', linestyle='--', label='Perfect')
plt.text(0.4, 0.6, 'Better', color='black')
plt.text(0.4, 0.3, 'Worse', color='black')
plt.text(0.01, 1.01, 'Perfect', color='green')
plt.show()
# Save model plot data
arrays = [['precision1',model_name1],
['recall1', model_name1],
['precision2',model_name2],
['recall2', model_name2],
['fpr1', model_name1],
['tpr1', model_name1],
['fpr2', model_name2],
['tpr2', model_name2],
]
variables = [['auc_score1', model_name1],
['auc_score2', model_name2],
['roc_auc1', model_name1],
['roc_auc2', model_name2]
]
# Save plot data scores (auc, roc)
for var_name, model in variables:
    # print(model+"-"+var_name )
    var = globals()[var_name]
    with open(f'99-documentation-project/08-plot_data/{model}-{var_name}.csv', 'w') as file:
        json.dump(var, file)

# Save plot data arrays (recall, precision, fpr, tpr)
for array_name, model in arrays:
    #print(model+'-'+array_name)
    var = globals()[array_name]
    df = pd.DataFrame({array_name: var})
    df.to_csv(f'99-documentation-project/08-plot_data/{model}-{array_name}.csv', index=False, header=False)
Again, there is almost no difference in performance between the two datasets (AllFeat / NoDept) when modelled with either Logistic Regression or a Decision Tree. While there is a small improvement for the Decision Tree, there is no difference in feature importance between the datasets.
The improvement we see with the Decision Tree is in the predictions, where the "predicted to leave" F1 score has increased from ~34% (Logistic Regression) to ~94%.
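The prediction improvement quoted above can be read straight out of Results.csv; a minimal sketch (assumes the file layout written by the cells above):

# Sketch only - compare the per-class 'Predict Leave' F1 scores stored in Results.csv for the test runs
results = pd.read_csv("Results.csv").drop(columns=['Unnamed: 0'])
test_rows = results[results['Model'].str.contains('test')]
print(test_rows[['Model', 'AUC', 'Predict Leave', 'Predict Stay']].sort_values('Predict Leave', ascending=False))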