Document Title | Salifort Motors - ML Modelling - XGBoost |
Author | Rod Slater |
Version | 1.0 |
Created | 01-11-2023 |
Modified | 16-11-2023 |
Client Name | Salifort Motors |
Client Contact | Mr HR Team |
Client Email | hr@salifortmotors.it |
Client Project | HR Team Data Driven Solutions from Machine Learning Models |
ML Modelling using XGBoost for HR data provided by Salifort Motors. This notebook details the XGBoost modelling process and performance comparisons.
Use the refit option flags in this code once the model has been fitted and saved once.
Contents:
- xg1 - All Features included (xg1-ALLFeat): model_xg1 - train and test data
- xg2 - No Departments (xg2-NOdept): model_xg2 - train and test data
# Import packages
# Data manipulation
import numpy as np
import pandas as pd
# Data visualization
import matplotlib.pyplot as plt
import seaborn as sns
# Set Options
pd.set_option('display.max_columns', None)
# Data modelling Imports
from xgboost import XGBClassifier, XGBRegressor, plot_importance
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
# For metrics and helpful functions
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, precision_recall_fscore_support, \
f1_score, confusion_matrix, ConfusionMatrixDisplay, classification_report, precision_recall_curve, auc
from sklearn import metrics
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.tree import plot_tree
from sklearn.preprocessing import StandardScaler
from datetime import datetime as dt
import statsmodels.api as sm
import json
# For saving models
import pickle
# Shap Explainer
import shap
# set Pandas Display Options
pd.options.display.float_format = '{:.2f}'.format
#pd.set_option('display.precision', 2)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 120)
load_path = "/home/hass/Documents/Learning/Salifort-Motors-Capstone-Project/00-data_cleaned/" # Source folder for cleaned data
#save_path = "/home/hass/Documents/Learning/Salifort-Motors-Capstone-Project/04-pickle-ML-models/" # destination for pickle saved models
model_path = "/home/hass/Documents/Learning/Salifort-Motors-Capstone-Project/04-pickle-ML-models/" # path to load/save pickled models
import chime
import time
def beepr():
for x in range(3):
for i in range(3):
chime.success()
time.sleep(0.25)
time.sleep(1)
#beepr()
def display_results():
'''
Load Results.csv containing stored test scores and return the scores for display
In: none
Out: pandas df of Results.csv containing precision, recall, f1, accuracy, and AUC scores of the models
'''
model_results = pd.read_csv("Results.csv")
model_results.drop(columns=['Unnamed: 0'], inplace=True)
model_results = model_results.sort_values(by='AUC', ascending=False)
return model_results
def format_GS_results(model_name: str, model_object: object, metric: str):
'''
Returns a pandas df with the F1, recall, precision, accuracy, and auc scores
from the GridSearch.
Arguments:
model_name (string): what you want the model to be called in the output table
model_object: a fitted GridSearchCV object
metric (string): precision, recall, f1, accuracy, or auc
'''
# Create dictionary that maps input metric to actual metric name in GridSearchCV
metric_dict = {'auc': 'mean_test_roc_auc',
'precision': 'mean_test_precision',
'recall': 'mean_test_recall',
'f1': 'mean_test_f1',
'accuracy': 'mean_test_accuracy'
}
# Get all the results from the CV and put them in a df
cv_results = pd.DataFrame(model_object.cv_results_)
# Isolate the row of the df with the max(metric) score
best_estimator_results = cv_results.iloc[cv_results[metric_dict[metric]].idxmax(), :]
# Extract Accuracy, precision, recall, and f1 score from that row
auc = round(best_estimator_results.mean_test_roc_auc,3)
f1 = round(best_estimator_results.mean_test_f1,3)
recall = round(best_estimator_results.mean_test_recall,3)
precision = round(best_estimator_results.mean_test_precision,3)
accuracy = round(best_estimator_results.mean_test_accuracy,3)
# Create table of results
table = pd.DataFrame()
table = pd.DataFrame({'Model': [model_name],
'Precision': [precision],
'Recall': [recall],
'F1': [f1],
'Accuracy': [accuracy],
'AUC': [auc]
})
return table
# Get results from model to store in comparison table
def make_results(model_name: str, model_object: object, X_var, y_var, y_pred_var):
'''
Accepts as arguments:
model_name (string) - used as the model label in Results.csv
model_object - the fitted ML model
X_var, y_var, y_pred_var - the features, true labels, and predicted labels used to score the model
Returns a pandas df with the precision, recall, F1, accuracy, and AUC scores
for the model on the supplied data, plus per-class F1 scores for leave/stay.
'''
# Get all the results from the CV and put them in a dict
report = classification_report(y_var, y_pred_var, output_dict=True)
# Calculate precision, recall, and F1 score for the "True" class
predict_precision, predict_recall, predict_f1_score, _ = precision_recall_fscore_support(y_var, y_pred_var, average=None)
f1_true_class = predict_f1_score[1] # Index 1 corresponds to the "True" class
f1_false_class = predict_f1_score[0] # Index 0 corresponds to the "False" class
# Extract accuracy, precision, recall, and f1 score from that row
f1 = report['weighted avg']['f1-score']
recall = report['weighted avg']['recall']
precision = report['weighted avg']['precision']
accuracy = report['accuracy']
auc = roc_auc_score(y_var, model_object.predict_proba(X_var)[:,1])
# Create table of results
table = pd.DataFrame({'Model': model_name,
'Precision': precision,
'Recall': recall,
'F1': f1,
'Accuracy': accuracy,
'AUC': auc,
'Predict Leave': f1_true_class,
'Predict Stay' : f1_false_class
},
index=[0]
)
return table
def classification_report_summary(name: str, y_var, y_pred_var):
'''
Gather stats from predictions
In:
name: str : title for the report header, e.g. TEST or TRAIN
y_var: true labels
y_pred_var: predicted labels
Out: printed precision, recall, f1 and accuracy scores, the weighted averages, and the per-class F1 scores for leave/stay
'''
targetnames = ['Predicted would not leave', 'Predicted would leave']
print("\nClassification Report : ", name)
print('\u2500' * 35)
print(classification_report(y_var, y_pred_var, target_names=targetnames))
print("Recall : {:.4%}".format(recall_score(y_var, y_pred_var)))
print("f1_score : {:.4%}".format(f1_score(y_var, y_pred_var)))
print("Precision : {:.4%}".format(precision_score(y_var, y_pred_var)))
print("Accuracy : {:.4%}".format(accuracy_score(y_var, y_pred_var)))
report = classification_report(y_var, y_pred_var, output_dict=True)
print()
print('\u2500' * 35)
print("Weighted Average")
print('\u2500' * 35)
print("Recall : {:.4%}".format(report['weighted avg']['recall']))
print("f1 Score : {:.4%}".format(report['weighted avg']['f1-score']))
print("Precision : {:.4%}".format(report['weighted avg']['precision']))
print("Support : {:.4%}".format(report['weighted avg']['support']))
# Calculate precision, recall, and F1 score for the "True" class
predict_precision, predict_recall, predict_f1_score, _ = precision_recall_fscore_support(y_var, y_pred_var, average=None)
f1_true_class = predict_f1_score[1] # Index 1 corresponds to the "True" class
f1_false_class = predict_f1_score[0] # Index 0 corresponds to the "False" class
print()
print('\u2500' * 35)
print("Prediction F1 score")
print('\u2500' * 35)
print("Predict Leave : {:.4%}".format(f1_true_class))
print("Support Stay : {:.4%}".format(f1_false_class))
def write_pickle(path, model_object, save_as:str):
'''
In:
path: path of folder where you want to save the pickle
model_object: a model you want to pickle
save_as: filename for how you want to save the model
Out: A call to pickle the model in the folder indicated
'''
with open(path + save_as + '.pickle', 'wb') as to_write:
pickle.dump(model_object, to_write)
def read_pickle(path, saved_model_name:str):
'''
In:
path: path to folder where you want to read from
saved_model_name: filename of pickled model you want to read in
Out:
model: the pickled model
'''
with open(path + saved_model_name + '.pickle', 'rb') as to_read:
model = pickle.load(to_read)
return model
xg1 - All Features (ALLFeat)
Two datasets are used for model performance comparison:
- salifort_data_FE.csv (THIS DATA) is the full dataset, feature engineered with salary encoded to ordinal, avg_mth_hrs binary encoded to overworked, and dept encoded with dummies.
- salifort_data_FE_focus.csv is the same data with the dummy-encoded dept fields removed.
Dept appears to have low correlation across the dataset and I'm curious how much the models are influenced by low-correlation features. It turns out that low-correlation features have little impact on model performance, which is no surprise really!
rerun = flag identifying the first run of the model comparisons. 0 = first run, a NEW Results.csv file is created with headers / 1 = continuation, results are appended to the existing file. (A sketch of this gating pattern follows below.)
dataset = text indicating which dataset is being used, added to the model description when saving to Results.csv
model_prefix = text indicating which ML model is being used, added to the model description when saving to Results.csv
refit = XGBoost fits with this data take 20-30 minutes. 1 = fit the data and pickle-save the model / 0 = don't refit, load the pickled model
fitver = suffix added to the pickled model filename to distinguish the xg1 (all features) and xg2 (focus features, no department features) datasets
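The rerun flag is described above but not exercised in the cells that follow; this is a minimal sketch (a hypothetical helper, assuming the Results.csv column layout produced by make_results) of how it could gate creating a fresh results file versus appending to an existing one.
import os
import pandas as pd

def init_results(rerun: int, path: str = "./Results.csv") -> None:
    # Hypothetical helper, not part of the original notebook cells
    # rerun = 0 : first run - write a new Results.csv containing only the header row
    # rerun = 1 : continuation - leave the existing file in place so later cells can append to it
    if rerun == 0 or not os.path.exists(path):
        cols = ['Model', 'Precision', 'Recall', 'F1', 'Accuracy', 'AUC', 'Predict Leave', 'Predict Stay']
        pd.DataFrame(columns=cols).to_csv(path, index=True)  # header row only, matching the later index=True appends

# Example usage: init_results(rerun)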
# Load cleaned dataset into a dataframe
print("Started / Last Run =", dt.now().strftime("%Y-%m-%d %H:%M:%S"),"\n")
# Feature engineering on salary, avg_mnth_hrs, dept, outliers removed
df1 = pd.read_csv(load_path + "data_cleaned_NoOl_FE_AllFeat.csv", index_col = False)
# Always sort the index
df1.sort_index(axis=1, inplace=True)
# model_prefix : Str prefix for results.csv added to dataset
model_prefix = 'xg1'
# dataset : Str = dataset save name for results.csv
dataset = 'ALLFeat'
# fitver: Str = Suffix for the pickle save filename
fitver = 'AllFeat'
# rerun : int = Flag to set 1 = append to Results.csv / 0 = Overwrite with new file
rerun = 1
# refit : int - 1 = Fit Data and Save model / 0 = load model, don't fit data
refit = 0
print("df1 - All Features - Feature engineering on salary, avg_mnth_hrs, dept. outliers removed\n")
# Display dataframe columns
print(df1.info())
Started / Last Run = 2023-12-04 12:10:43

df1 - All Features - Feature engineering on salary, avg_mnth_hrs, dept. outliers removed

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11167 entries, 0 to 11166
Data columns (total 18 columns):
 #   Column            Non-Null Count  Dtype
---  ------            --------------  -----
 0   dept_accounting   11167 non-null  bool
 1   dept_hr           11167 non-null  bool
 2   dept_it           11167 non-null  bool
 3   dept_management   11167 non-null  bool
 4   dept_marketing    11167 non-null  bool
 5   dept_product_mng  11167 non-null  bool
 6   dept_randd        11167 non-null  bool
 7   dept_sales        11167 non-null  bool
 8   dept_support      11167 non-null  bool
 9   dept_technical    11167 non-null  bool
 10  last_eval         11167 non-null  float64
 11  left              11167 non-null  float64
 12  number_project    11167 non-null  float64
 13  overworked        11167 non-null  int64
 14  promotion         11167 non-null  float64
 15  salary            11167 non-null  int64
 16  satisfaction      11167 non-null  float64
 17  tenure            11167 non-null  float64
dtypes: bool(10), float64(6), int64(2)
memory usage: 807.1 KB
None
model_data = df1.copy()
# Isolate the outcome variable
Y = model_data['left']
# Select & Isolate the feature variables and drop the outcome variable
X = model_data.copy()
X = X.drop('left', axis = 1)
#X = sm.add_constant(X)
X.columns
Index(['dept_accounting', 'dept_hr', 'dept_it', 'dept_management', 'dept_marketing', 'dept_product_mng', 'dept_randd', 'dept_sales', 'dept_support', 'dept_technical', 'last_eval', 'number_project', 'overworked', 'promotion', 'salary', 'satisfaction', 'tenure'], dtype='object')
# Prepare training and test data
X_train, X_test, y_train, y_test = train_test_split(X,Y, test_size=0.25, stratify=Y, random_state=0)
# Instantiate model
# xgb.XGBRegressor() for regression problems, xgb.XGBClassifier() for classification problems
xgc = XGBClassifier(random_state=0)
# Assign a dictionary of hyperparameters to search over
cv_params = {'learning_rate': [0.05, 0.1, 0.15],
'max_depth': [3, 4, 5, 6, 8],
'min_child_weight': [1, 3, 5, 7],
'gamma': [0.0, 0.1, 0.2],
'colsample_bytree': [0.3, 0.4]
}
# Assign a dictionary of scoring metrics to capture
scoring = ('accuracy', 'precision', 'recall', 'f1', 'roc_auc')
# Instantiate GridSearch
model_xg1 = GridSearchCV(xgc, cv_params, scoring=scoring, cv=4, refit='roc_auc')
X_train.columns
Index(['dept_accounting', 'dept_hr', 'dept_it', 'dept_management', 'dept_marketing', 'dept_product_mng', 'dept_randd', 'dept_sales', 'dept_support', 'dept_technical', 'last_eval', 'number_project', 'overworked', 'promotion', 'salary', 'satisfaction', 'tenure'], dtype='object')
%%time
print("Started / Last Run =", dt.now().strftime("%Y-%m-%d %H:%M:%S"))# Fit data
if refit == 1: # if 1, refit, we'll save the already fit model
model_xg1.fit(X_train, y_train) # --> Wall time: 21min 5s /XGBoost 20 mins
beepr()
print("Finish Time =", dt.now().strftime("%H:%M:%S"))
Started / Last Run = 2023-12-04 12:10:44
Finish Time = 12:10:44
CPU times: user 0 ns, sys: 491 µs, total: 491 µs
Wall time: 385 µs
## Write pickle
if refit == 1: # refit = 1 run the fit and save the model
write_pickle(model_path, model_xg1, 'hr_xg1-'+fitver)
# Read in pickle
if refit == 0: # refit = 0 load model, don't fit the data
model_xg1 = read_pickle(model_path, 'hr_xg1-'+fitver)
X_train.columns
Index(['dept_accounting', 'dept_hr', 'dept_it', 'dept_management', 'dept_marketing', 'dept_product_mng', 'dept_randd', 'dept_sales', 'dept_support', 'dept_technical', 'last_eval', 'number_project', 'overworked', 'promotion', 'salary', 'satisfaction', 'tenure'], dtype='object')
# Get the parameters of the best-performing model
print("Best Parameters : ", model_xg1.best_params_)
# Get the average f1 score of the best-performing model
print("Best Score : ", model_xg1.best_score_)
# Get the best estimators of the parameters
print("Best Estimator : ", model_xg1.best_estimator_)
Best Parameters :  {'colsample_bytree': 0.3, 'gamma': 0.2, 'learning_rate': 0.15, 'max_depth': 4, 'min_child_weight': 1}
Best Score :  0.9819165476477163
Best Estimator :  XGBClassifier(base_score=None, booster=None, callbacks=None, colsample_bylevel=None, colsample_bynode=None, colsample_bytree=0.3, device=None, early_stopping_rounds=None, enable_categorical=False, eval_metric=None, feature_types=None, gamma=0.2, grow_policy=None, importance_type=None, interaction_constraints=None, learning_rate=0.15, max_bin=None, max_cat_threshold=None, max_cat_to_onehot=None, max_delta_step=None, max_depth=4, max_leaves=None, min_child_weight=1, missing=nan, monotone_constraints=None, multi_strategy=None, n_estimators=None, n_jobs=None, num_parallel_tree=None, random_state=0, ...)
pd.options.display.float_format = '{:.3f}'.format
model_xg1_results = format_GS_results(model_prefix+' - '+dataset+' - '+'GS train', model_xg1, 'auc')
model_xg1_results.to_csv("./Results.csv", mode='a', index=True, header=False)
pd.options.display.float_format = '{:.4f}'.format
print(model_xg1_results)
display_results()
                      Model  Precision  Recall      F1  Accuracy     AUC
0  xg1 - ALLFeat - GS train     0.9570  0.9100  0.9330    0.9780  0.9820
 | Model | Precision | Recall | F1 | Accuracy | AUC | Predict Leave | Predict Stay |
---|---|---|---|---|---|---|---|---|
10 | xg1 - ALLFeat - GS train | 0.9570 | 0.9100 | 0.9330 | 0.9780 | 0.9820 | NaN | NaN |
5 | dt1 - ALLFeat - test | 0.9784 | 0.9785 | 0.9785 | 0.9785 | 0.9791 | 0.9359 | 0.9871 |
8 | dt2 - NOdept - test | 0.9784 | 0.9785 | 0.9785 | 0.9785 | 0.9791 | 0.9359 | 0.9871 |
6 | dt1 - ALLFeat - train | 0.9784 | 0.9786 | 0.9784 | 0.9786 | 0.9789 | 0.9352 | 0.9872 |
9 | dt2 - NOdept - train | 0.9784 | 0.9786 | 0.9784 | 0.9786 | 0.9789 | 0.9352 | 0.9872 |
7 | dt2 - NOdept - GS train | 0.9506 | 0.9135 | 0.9317 | 0.9774 | 0.9737 | NaN | NaN |
4 | dt1 - ALLFeat - GS train | 0.9506 | 0.9135 | 0.9317 | 0.9774 | 0.9737 | NaN | NaN |
1 | lr1 - ALLFeat - train | 0.7960 | 0.8266 | 0.8041 | 0.8266 | 0.8923 | 0.3284 | 0.9005 |
3 | lr2 - NOdept - train | 0.7930 | 0.8248 | 0.8015 | 0.8248 | 0.8919 | 0.3180 | 0.8995 |
0 | lr1 - ALLFeat - test | 0.7943 | 0.8216 | 0.8031 | 0.8216 | 0.8819 | 0.3413 | 0.8969 |
2 | lr2 - NOdept - test | 0.7952 | 0.8227 | 0.8039 | 0.8227 | 0.8819 | 0.3426 | 0.8975 |
# Make predictions on test data
xg1_best_model = model_xg1.best_estimator_ # Store best model parameters for later testing
y_pred_test = xg1_best_model.predict(X_test)
y_pred_train= xg1_best_model.predict(X_train) # Used for confusion matrix plots later
classification_report_summary(model_prefix+' - '+dataset+' - '+'test', y_test, y_pred_test)
Classification Report :  xg1 - ALLFeat - test
───────────────────────────────────
                           precision    recall  f1-score   support

Predicted would not leave       0.98      0.99      0.99      2321
    Predicted would leave       0.95      0.93      0.94       471

                 accuracy                           0.98      2792
                macro avg       0.97      0.96      0.96      2792
             weighted avg       0.98      0.98      0.98      2792

Recall : 92.5690%
f1_score : 93.7634%
Precision : 94.9891%
Accuracy : 97.9226%

───────────────────────────────────
Weighted Average
───────────────────────────────────
Recall : 97.9226%
f1 Score : 97.9119%
Precision : 97.9075%
Support : 279200.0000%

───────────────────────────────────
Prediction F1 score
───────────────────────────────────
Predict Leave : 93.7634%
Support Stay : 98.7538%
model_xg1_results = make_results(model_prefix+' - '+dataset+' - '+'test', xg1_best_model, X_test, y_test, y_pred_test)
model_xg1_results.to_csv("./Results.csv", mode='a', index=True, header=False)
print(model_xg1_results)
print()
display_results()
                  Model  Precision  Recall      F1  Accuracy     AUC  Predict Leave  Predict Stay
0  xg1 - ALLFeat - test     0.9791  0.9792  0.9791    0.9792  0.9860         0.9376        0.9875
 | Model | Precision | Recall | F1 | Accuracy | AUC | Predict Leave | Predict Stay |
---|---|---|---|---|---|---|---|---|
11 | xg1 - ALLFeat - test | 0.9791 | 0.9792 | 0.9791 | 0.9792 | 0.9860 | 0.9376 | 0.9875 |
10 | xg1 - ALLFeat - GS train | 0.9570 | 0.9100 | 0.9330 | 0.9780 | 0.9820 | NaN | NaN |
5 | dt1 - ALLFeat - test | 0.9784 | 0.9785 | 0.9785 | 0.9785 | 0.9791 | 0.9359 | 0.9871 |
8 | dt2 - NOdept - test | 0.9784 | 0.9785 | 0.9785 | 0.9785 | 0.9791 | 0.9359 | 0.9871 |
6 | dt1 - ALLFeat - train | 0.9784 | 0.9786 | 0.9784 | 0.9786 | 0.9789 | 0.9352 | 0.9872 |
9 | dt2 - NOdept - train | 0.9784 | 0.9786 | 0.9784 | 0.9786 | 0.9789 | 0.9352 | 0.9872 |
7 | dt2 - NOdept - GS train | 0.9506 | 0.9135 | 0.9317 | 0.9774 | 0.9737 | NaN | NaN |
4 | dt1 - ALLFeat - GS train | 0.9506 | 0.9135 | 0.9317 | 0.9774 | 0.9737 | NaN | NaN |
1 | lr1 - ALLFeat - train | 0.7960 | 0.8266 | 0.8041 | 0.8266 | 0.8923 | 0.3284 | 0.9005 |
3 | lr2 - NOdept - train | 0.7930 | 0.8248 | 0.8015 | 0.8248 | 0.8919 | 0.3180 | 0.8995 |
0 | lr1 - ALLFeat - test | 0.7943 | 0.8216 | 0.8031 | 0.8216 | 0.8819 | 0.3413 | 0.8969 |
2 | lr2 - NOdept - test | 0.7952 | 0.8227 | 0.8039 | 0.8227 | 0.8819 | 0.3426 | 0.8975 |
# Make predictions on test data
xg1_best_model = model_xg1.best_estimator_ # Store best model parameters for later testing
y_pred_test = xg1_best_model.predict(X_test)
y_pred_train= xg1_best_model.predict(X_train) # Used for confusion matrix plots later
classification_report_summary(model_prefix+' - '+dataset+' - '+'train', y_train, y_pred_train)
Classification Report :  xg1 - ALLFeat - train
───────────────────────────────────
                           precision    recall  f1-score   support

Predicted would not leave       0.98      0.99      0.99      6964
    Predicted would leave       0.96      0.91      0.94      1411

                 accuracy                           0.98      8375
                macro avg       0.97      0.95      0.96      8375
             weighted avg       0.98      0.98      0.98      8375

Recall : 91.4954%
f1_score : 93.8568%
Precision : 96.3433%
Accuracy : 97.9821%

───────────────────────────────────
Weighted Average
───────────────────────────────────
Recall : 97.9821%
f1 Score : 97.9612%
Precision : 97.9656%
Support : 837500.0000%

───────────────────────────────────
Prediction F1 score
───────────────────────────────────
Predict Leave : 93.8568%
Support Stay : 98.7928%
model_xg1_results = make_results(model_prefix+' - '+dataset+' - '+'train', xg1_best_model, X_train, y_train, y_pred_train)
model_xg1_results.to_csv("./Results.csv", mode='a', index=True, header=False)
print(model_xg1_results)
print()
display_results()
                   Model  Precision  Recall      F1  Accuracy     AUC  Predict Leave  Predict Stay
0  xg1 - ALLFeat - train     0.9797  0.9798  0.9796    0.9798  0.9870         0.9386        0.9879
 | Model | Precision | Recall | F1 | Accuracy | AUC | Predict Leave | Predict Stay |
---|---|---|---|---|---|---|---|---|
12 | xg1 - ALLFeat - train | 0.9797 | 0.9798 | 0.9796 | 0.9798 | 0.9870 | 0.9386 | 0.9879 |
11 | xg1 - ALLFeat - test | 0.9791 | 0.9792 | 0.9791 | 0.9792 | 0.9860 | 0.9376 | 0.9875 |
10 | xg1 - ALLFeat - GS train | 0.9570 | 0.9100 | 0.9330 | 0.9780 | 0.9820 | NaN | NaN |
5 | dt1 - ALLFeat - test | 0.9784 | 0.9785 | 0.9785 | 0.9785 | 0.9791 | 0.9359 | 0.9871 |
8 | dt2 - NOdept - test | 0.9784 | 0.9785 | 0.9785 | 0.9785 | 0.9791 | 0.9359 | 0.9871 |
6 | dt1 - ALLFeat - train | 0.9784 | 0.9786 | 0.9784 | 0.9786 | 0.9789 | 0.9352 | 0.9872 |
9 | dt2 - NOdept - train | 0.9784 | 0.9786 | 0.9784 | 0.9786 | 0.9789 | 0.9352 | 0.9872 |
7 | dt2 - NOdept - GS train | 0.9506 | 0.9135 | 0.9317 | 0.9774 | 0.9737 | NaN | NaN |
4 | dt1 - ALLFeat - GS train | 0.9506 | 0.9135 | 0.9317 | 0.9774 | 0.9737 | NaN | NaN |
1 | lr1 - ALLFeat - train | 0.7960 | 0.8266 | 0.8041 | 0.8266 | 0.8923 | 0.3284 | 0.9005 |
3 | lr2 - NOdept - train | 0.7930 | 0.8248 | 0.8015 | 0.8248 | 0.8919 | 0.3180 | 0.8995 |
0 | lr1 - ALLFeat - test | 0.7943 | 0.8216 | 0.8031 | 0.8216 | 0.8819 | 0.3413 | 0.8969 |
2 | lr2 - NOdept - test | 0.7952 | 0.8227 | 0.8039 | 0.8227 | 0.8819 | 0.3426 | 0.8975 |
# Make predictions on train data (used for the SHAP analysis below)
xg1_best_model = model_xg1.best_estimator_ # Store best model parameters for later testing
y_pred_train = xg1_best_model.predict(X_train)
explainer = shap.TreeExplainer(xg1_best_model)
# Compute the SHAP values for a set of observations
shap_values = explainer(X_train)
plt.grid(True, linestyle='--', alpha=0.7)
# Plot the SHAP values
shap.plots.waterfall(shap_values[0])
plt.grid(True, linestyle='--', alpha=0.7)
shap.summary_plot(shap_values, X_train)
plt.grid(True, linestyle='--', alpha=0.7)
shap.plots.bar(shap_values[0])
[12:10:47] WARNING: /workspace/src/c_api/c_api.cc:1240: Saving into deprecated binary model format, please consider using `json` or `ubj`. Model format will default to JSON in XGBoost 2.2 if not specified.
explainer = shap.TreeExplainer(xg1_best_model)
# Calculate SHAP values for the test set
shap_values = explainer.shap_values(X_test)
# Plot SHAP summary plot
shap.summary_plot(shap_values, X_test, feature_names=X_test.columns)
# Add a title to the plot
plt.title('SHAP Summary Plot')
# Display the plot
plt.show()
[12:10:53] WARNING: /workspace/src/c_api/c_api.cc:1240: Saving into deprecated binary model format, please consider using `json` or `ubj`. Model format will default to JSON in XGBoost 2.2 if not specified.
xg2 - No Departments (NOdept)
Two datasets are used for model performance comparison:
- salifort_data_FE.csv is the full dataset, feature engineered with salary encoded to ordinal, avg_mth_hrs binary encoded to overworked, and dept encoded with dummies.
- salifort_data_FE_focus.csv (THIS DATA) is the same data with the dummy-encoded dept fields removed (see the sketch after this list).
Dept appears to have low correlation across the dataset and I'm curious how much the models are influenced by low-correlation features. It turns out that low-correlation features have little impact on model performance, which is no surprise really!
rerun = flag identifying the first run of the model comparisons. 0 = first run, a NEW Results.csv file is created with headers / 1 = continuation, results are appended to the existing file.
dataset = text indicating which dataset is being used, added to the model description when saving to Results.csv
model_prefix = text indicating which ML model is being used, added to the model description when saving to Results.csv
refit = fits with this data take 20-30 minutes. 1 = fit the data and pickle-save the model / 0 = don't refit, load the pickled model
fitver = suffix added to the pickled model filename to distinguish the xg1 (all features) and xg2 (focus features, no department features) datasets
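For reference, a minimal sketch (an assumption about the upstream preprocessing notebook, not code that runs here) of how the dummy-encoded dept_* columns could be dropped from the ALLFeat file to produce this NoDept dataset:
import pandas as pd

# Hypothetical illustration of how the NoDept file may have been derived upstream
all_feat = pd.read_csv(load_path + "data_cleaned_NoOl_FE_AllFeat.csv", index_col=False)
dept_cols = [c for c in all_feat.columns if c.startswith('dept_')]  # dummy-encoded department columns
no_dept = all_feat.drop(columns=dept_cols)  # remove the low-correlation department features
# no_dept.to_csv(load_path + "data_cleaned_NoOl_FE_NoDept.csv", index=False)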
# Load cleaned dataset into a dataframe
print("Started / Last Run =", dt.now().strftime("%Y-%m-%d %H:%M:%S"),"\n")
# Feature engineering on salary, avg_mnth_hrs, outliers removed, departments removed
df2 = pd.read_csv(load_path + "data_cleaned_NoOl_FE_NoDept.csv", index_col = False)
# Always sort the index
df2.sort_index(axis=1, inplace=True)
# model_prefix : Str prefix for results.csv added to dataset
model_prefix = 'xg2'
# dataset : Str = dataset save name for results.csv
dataset = 'NOdept'
# fitver : Str = Suffix for the pickle save filename
fitver = 'NOdept'
# rerun : int = Flag to set 1 = append to Results.csv / 0 = Overwrite with new file
rerun = 1
# refit : int - 1 = Fit Data and Save model / 0 = load model, don't fit data
refit = 0 # 1 = Fit Data and Save model / 0 = load model, don't fit data
print("df2 - FOCUS FEATURES - Feature engineering on salary, avg_mnth_hrs, dept REMOVED, outliers removed\n")
# Display dataframe columns
print(df2.info())
Started / Last Run = 2023-12-04 12:10:55

df2 - FOCUS FEATURES - Feature engineering on salary, avg_mnth_hrs, dept REMOVED, outliers removed

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11167 entries, 0 to 11166
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype
---  ------          --------------  -----
 0   last_eval       11167 non-null  float64
 1   left            11167 non-null  float64
 2   number_project  11167 non-null  float64
 3   overworked      11167 non-null  int64
 4   promotion       11167 non-null  float64
 5   salary          11167 non-null  int64
 6   satisfaction    11167 non-null  float64
 7   tenure          11167 non-null  float64
dtypes: float64(6), int64(2)
memory usage: 698.1 KB
None
model_data2 = df2.copy()
# Isolate the outcome variable
Y2 = model_data2['left']
# Select & Isolate the feature variables and drop the outcome variable
X2 = model_data2.copy()
X2 = X2.drop('left', axis = 1)
#X = sm.add_constant(X)
X2.columns
Index(['last_eval', 'number_project', 'overworked', 'promotion', 'salary', 'satisfaction', 'tenure'], dtype='object')
# Prepare training and test data
X_train2, X_test2, y_train2, y_test2 = train_test_split(X2, Y2, test_size=0.25, stratify=Y2, random_state=0)
# Instantiate model
# xgb.XGBRegressor() for regression problems, xgb.XGBClassifier() for classification problems
xgc = XGBClassifier(random_state=0)
# Assign a dictionary of hyperparameters to search over
cv_params = {'learning_rate': [0.05, 0.1, 0.15],
'max_depth': [3, 4, 5, 6, 8],
'min_child_weight': [1, 3, 5, 7],
'gamma': [0.0, 0.1, 0.2],
'colsample_bytree': [0.3, 0.4]
}
# Assign a dictionary of scoring metrics to capture
scoring = ('accuracy', 'precision', 'recall', 'f1', 'roc_auc')
# Instantiate GridSearch
model_xg2 = GridSearchCV(xgc, cv_params, scoring=scoring, cv=4, refit='roc_auc')
%%time
print("Started / Last Run =", dt.now().strftime("%Y-%m-%d %H:%M:%S"))
# Fit data
if refit == 1:
model_xg2.fit(X_train2, y_train2) # --> Wall time: 18-21 mins
beepr()
print("Finish Time =", dt.now().strftime("%H:%M:%S"))
Started / Last Run = 2023-12-04 12:10:56
Finish Time = 12:10:56
CPU times: user 593 µs, sys: 133 µs, total: 726 µs
Wall time: 851 µs
## Write pickle
if refit == 1: # refit = 1 run the fit and save the model
write_pickle(model_path, model_xg2, 'hr_xg2-'+fitver)
# Read in pickle
if refit == 0: # refit = 0 load model, don't fit the data
model_xg2 = read_pickle(model_path, 'hr_xg2-'+fitver)
# Get the parameters of the best-performing model
print("Best Parameters : ", model_xg2.best_params_)
# Get the average f1 score of the best-performing model
print("Best Score : ", model_xg2.best_score_)
# Get the best estimators of the parameters
print("Best Estimator : ", model_xg2.best_estimator_)
Best Parameters :  {'colsample_bytree': 0.3, 'gamma': 0.2, 'learning_rate': 0.15, 'max_depth': 4, 'min_child_weight': 1}
Best Score :  0.9806005736725114
Best Estimator :  XGBClassifier(base_score=None, booster=None, callbacks=None, colsample_bylevel=None, colsample_bynode=None, colsample_bytree=0.3, device=None, early_stopping_rounds=None, enable_categorical=False, eval_metric=None, feature_types=None, gamma=0.2, grow_policy=None, importance_type=None, interaction_constraints=None, learning_rate=0.15, max_bin=None, max_cat_threshold=None, max_cat_to_onehot=None, max_delta_step=None, max_depth=4, max_leaves=None, min_child_weight=1, missing=nan, monotone_constraints=None, multi_strategy=None, n_estimators=None, n_jobs=None, num_parallel_tree=None, random_state=0, ...)
pd.options.display.float_format = '{:.3f}'.format
model_xg2_results = format_GS_results(model_prefix+' - '+dataset+' - '+'GS train', model_xg2, 'auc')
model_xg2_results.to_csv("./Results.csv", mode='a', index=True, header=False)
pd.options.display.float_format = '{:.3f}'.format
print(model_xg2_results)
display_results()
                     Model  Precision  Recall     F1  Accuracy    AUC
0  xg2 - NOdept - GS train      0.960   0.913  0.936     0.979  0.981
 | Model | Precision | Recall | F1 | Accuracy | AUC | Predict Leave | Predict Stay |
---|---|---|---|---|---|---|---|---|
12 | xg1 - ALLFeat - train | 0.980 | 0.980 | 0.980 | 0.980 | 0.987 | 0.939 | 0.988 |
11 | xg1 - ALLFeat - test | 0.979 | 0.979 | 0.979 | 0.979 | 0.986 | 0.938 | 0.988 |
10 | xg1 - ALLFeat - GS train | 0.957 | 0.910 | 0.933 | 0.978 | 0.982 | NaN | NaN |
13 | xg2 - NOdept - GS train | 0.960 | 0.913 | 0.936 | 0.979 | 0.981 | NaN | NaN |
5 | dt1 - ALLFeat - test | 0.978 | 0.979 | 0.978 | 0.979 | 0.979 | 0.936 | 0.987 |
8 | dt2 - NOdept - test | 0.978 | 0.979 | 0.978 | 0.979 | 0.979 | 0.936 | 0.987 |
6 | dt1 - ALLFeat - train | 0.978 | 0.979 | 0.978 | 0.979 | 0.979 | 0.935 | 0.987 |
9 | dt2 - NOdept - train | 0.978 | 0.979 | 0.978 | 0.979 | 0.979 | 0.935 | 0.987 |
7 | dt2 - NOdept - GS train | 0.951 | 0.914 | 0.932 | 0.977 | 0.974 | NaN | NaN |
4 | dt1 - ALLFeat - GS train | 0.951 | 0.914 | 0.932 | 0.977 | 0.974 | NaN | NaN |
1 | lr1 - ALLFeat - train | 0.796 | 0.827 | 0.804 | 0.827 | 0.892 | 0.328 | 0.900 |
3 | lr2 - NOdept - train | 0.793 | 0.825 | 0.802 | 0.825 | 0.892 | 0.318 | 0.900 |
0 | lr1 - ALLFeat - test | 0.794 | 0.822 | 0.803 | 0.822 | 0.882 | 0.341 | 0.897 |
2 | lr2 - NOdept - test | 0.795 | 0.823 | 0.804 | 0.823 | 0.882 | 0.343 | 0.898 |
# Make predictions on train data (used for the SHAP analysis below)
xg2_best_model = model_xg2.best_estimator_ # Store best model parameters for later testing
y_pred_train2 = xg2_best_model.predict(X_train2)
explainer = shap.TreeExplainer(xg2_best_model)
# Compute the SHAP values for a set of observations
shap_values = explainer(X_train2)
plt.grid(True, linestyle='--', alpha=0.7)
# Plot the SHAP values
shap.plots.waterfall(shap_values[0])
shap.summary_plot(shap_values, X_train2)
shap.plots.bar(shap_values[0])
[12:10:56] WARNING: /workspace/src/c_api/c_api.cc:1240: Saving into deprecated binary model format, please consider using `json` or `ubj`. Model format will default to JSON in XGBoost 2.2 if not specified.
# Make predictions on test data
# Store best model parameters for later testing
xg2_best_model = model_xg2.best_estimator_
y_pred_test2 = xg2_best_model.predict(X_test2)
y_pred_train2 = xg2_best_model.predict(X_train2)
classification_report_summary(model_prefix+' - '+dataset+' - '+'test', y_test2, y_pred_test2)
Classification Report :  xg2 - NOdept - test
───────────────────────────────────
                           precision    recall  f1-score   support

Predicted would not leave       0.99      0.99      0.99      2321
    Predicted would leave       0.96      0.93      0.94       471

                 accuracy                           0.98      2792
                macro avg       0.97      0.96      0.97      2792
             weighted avg       0.98      0.98      0.98      2792

Recall : 92.7813%
f1_score : 94.1810%
Precision : 95.6236%
Accuracy : 98.0659%

───────────────────────────────────
Weighted Average
───────────────────────────────────
Recall : 98.0659%
f1 Score : 98.0542%
Precision : 98.0513%
Support : 279200.0000%

───────────────────────────────────
Prediction F1 score
───────────────────────────────────
Predict Leave : 94.1810%
Support Stay : 98.8402%
model_xg2_results = make_results(model_prefix+' - '+dataset+' - '+'train', xg2_best_model, X_train2, y_train2, y_pred_train2)
model_xg2_results.to_csv("./Results.csv", mode='a', index=True, header=False)
model_xg2_results = make_results(model_prefix+' - '+dataset+' - '+'test', xg2_best_model, X_test2, y_test2, y_pred_test2)
model_xg2_results.to_csv("./Results.csv", mode='a', index=True, header=False)
print(model_xg2_results)
print()
display_results()
                 Model  Precision  Recall     F1  Accuracy    AUC  Predict Leave  Predict Stay
0  xg2 - NOdept - test      0.981   0.981  0.981     0.981  0.983          0.942         0.988
 | Model | Precision | Recall | F1 | Accuracy | AUC | Predict Leave | Predict Stay |
---|---|---|---|---|---|---|---|---|
12 | xg1 - ALLFeat - train | 0.980 | 0.980 | 0.980 | 0.980 | 0.987 | 0.939 | 0.988 |
11 | xg1 - ALLFeat - test | 0.979 | 0.979 | 0.979 | 0.979 | 0.986 | 0.938 | 0.988 |
14 | xg2 - NOdept - train | 0.980 | 0.980 | 0.980 | 0.980 | 0.985 | 0.940 | 0.988 |
15 | xg2 - NOdept - test | 0.981 | 0.981 | 0.981 | 0.981 | 0.983 | 0.942 | 0.988 |
10 | xg1 - ALLFeat - GS train | 0.957 | 0.910 | 0.933 | 0.978 | 0.982 | NaN | NaN |
13 | xg2 - NOdept - GS train | 0.960 | 0.913 | 0.936 | 0.979 | 0.981 | NaN | NaN |
5 | dt1 - ALLFeat - test | 0.978 | 0.979 | 0.978 | 0.979 | 0.979 | 0.936 | 0.987 |
8 | dt2 - NOdept - test | 0.978 | 0.979 | 0.978 | 0.979 | 0.979 | 0.936 | 0.987 |
6 | dt1 - ALLFeat - train | 0.978 | 0.979 | 0.978 | 0.979 | 0.979 | 0.935 | 0.987 |
9 | dt2 - NOdept - train | 0.978 | 0.979 | 0.978 | 0.979 | 0.979 | 0.935 | 0.987 |
7 | dt2 - NOdept - GS train | 0.951 | 0.914 | 0.932 | 0.977 | 0.974 | NaN | NaN |
4 | dt1 - ALLFeat - GS train | 0.951 | 0.914 | 0.932 | 0.977 | 0.974 | NaN | NaN |
1 | lr1 - ALLFeat - train | 0.796 | 0.827 | 0.804 | 0.827 | 0.892 | 0.328 | 0.900 |
3 | lr2 - NOdept - train | 0.793 | 0.825 | 0.802 | 0.825 | 0.892 | 0.318 | 0.900 |
0 | lr1 - ALLFeat - test | 0.794 | 0.822 | 0.803 | 0.822 | 0.882 | 0.341 | 0.897 |
2 | lr2 - NOdept - test | 0.795 | 0.823 | 0.804 | 0.823 | 0.882 | 0.343 | 0.898 |
# Make predictions on test data
xg2_best_model = model_xg2.best_estimator_ # Store best model parameters for later testing
y_pred_test2 = xg2_best_model.predict(X_test2)
explainer = shap.TreeExplainer(xg2_best_model)
# Compute the SHAP values for a set of observations
shap_values = explainer(X_test2)
plt.grid(True, linestyle='--', alpha=0.7)
# Plot the SHAP values
shap.plots.waterfall(shap_values[0])
shap.summary_plot(shap_values, X_test2)
shap.plots.bar(shap_values[0])
[12:10:59] WARNING: /workspace/src/c_api/c_api.cc:1240: Saving into deprecated binary model format, please consider using `json` or `ubj`. Model format will default to JSON in XGBoost 2.2 if not specified.
# Make predictions on train data
# Store best model parameters for later testing
xg2_best_model = model_xg2.best_estimator_
y_pred_train2 = xg2_best_model.predict(X_train2)
#y_pred_train2 = xg2_best_model.predict(X_train2)
classification_report_summary(model_prefix+' - '+dataset+' - '+'train', y_train2, y_pred_train2)
Classification Report :  xg2 - NOdept - train
───────────────────────────────────
                           precision    recall  f1-score   support

Predicted would not leave       0.98      0.99      0.99      6964
    Predicted would leave       0.96      0.92      0.94      1411

                 accuracy                           0.98      8375
                macro avg       0.97      0.96      0.96      8375
             weighted avg       0.98      0.98      0.98      8375

Recall : 91.7080%
f1_score : 94.0407%
Precision : 96.4952%
Accuracy : 98.0418%

───────────────────────────────────
Weighted Average
───────────────────────────────────
Recall : 98.0418%
f1 Score : 98.0218%
Precision : 98.0264%
Support : 837500.0000%

───────────────────────────────────
Prediction F1 score
───────────────────────────────────
Predict Leave : 94.0407%
Support Stay : 98.8284%
model_xg2_results = make_results(model_prefix+' - '+dataset+' - '+'train', xg2_best_model, X_train2, y_train2, y_pred_train2)
model_xg2_results.to_csv("./Results.csv", mode='a', index=True, header=False)
model_xg2_results = make_results(model_prefix+' - '+dataset+' - '+'test', xg2_best_model, X_test2, y_test2, y_pred_test2)
model_xg2_results.to_csv("./Results.csv", mode='a', index=True, header=False)
print(model_xg2_results)
print()
display_results()
                 Model  Precision  Recall     F1  Accuracy    AUC  Predict Leave  Predict Stay
0  xg2 - NOdept - test      0.981   0.981  0.981     0.981  0.983          0.942         0.988
 | Model | Precision | Recall | F1 | Accuracy | AUC | Predict Leave | Predict Stay |
---|---|---|---|---|---|---|---|---|
12 | xg1 - ALLFeat - train | 0.980 | 0.980 | 0.980 | 0.980 | 0.987 | 0.939 | 0.988 |
11 | xg1 - ALLFeat - test | 0.979 | 0.979 | 0.979 | 0.979 | 0.986 | 0.938 | 0.988 |
16 | xg2 - NOdept - train | 0.980 | 0.980 | 0.980 | 0.980 | 0.985 | 0.940 | 0.988 |
14 | xg2 - NOdept - train | 0.980 | 0.980 | 0.980 | 0.980 | 0.985 | 0.940 | 0.988 |
17 | xg2 - NOdept - test | 0.981 | 0.981 | 0.981 | 0.981 | 0.983 | 0.942 | 0.988 |
15 | xg2 - NOdept - test | 0.981 | 0.981 | 0.981 | 0.981 | 0.983 | 0.942 | 0.988 |
10 | xg1 - ALLFeat - GS train | 0.957 | 0.910 | 0.933 | 0.978 | 0.982 | NaN | NaN |
13 | xg2 - NOdept - GS train | 0.960 | 0.913 | 0.936 | 0.979 | 0.981 | NaN | NaN |
5 | dt1 - ALLFeat - test | 0.978 | 0.979 | 0.978 | 0.979 | 0.979 | 0.936 | 0.987 |
8 | dt2 - NOdept - test | 0.978 | 0.979 | 0.978 | 0.979 | 0.979 | 0.936 | 0.987 |
9 | dt2 - NOdept - train | 0.978 | 0.979 | 0.978 | 0.979 | 0.979 | 0.935 | 0.987 |
6 | dt1 - ALLFeat - train | 0.978 | 0.979 | 0.978 | 0.979 | 0.979 | 0.935 | 0.987 |
7 | dt2 - NOdept - GS train | 0.951 | 0.914 | 0.932 | 0.977 | 0.974 | NaN | NaN |
4 | dt1 - ALLFeat - GS train | 0.951 | 0.914 | 0.932 | 0.977 | 0.974 | NaN | NaN |
1 | lr1 - ALLFeat - train | 0.796 | 0.827 | 0.804 | 0.827 | 0.892 | 0.328 | 0.900 |
3 | lr2 - NOdept - train | 0.793 | 0.825 | 0.802 | 0.825 | 0.892 | 0.318 | 0.900 |
0 | lr1 - ALLFeat - test | 0.794 | 0.822 | 0.803 | 0.822 | 0.882 | 0.341 | 0.897 |
2 | lr2 - NOdept - test | 0.795 | 0.823 | 0.804 | 0.823 | 0.882 | 0.343 | 0.898 |
# Set parameters for following cells
model1 = model_xg1
model2 = model_xg2
model_name1 = "model_xg1"
model_name2 = "model_xg2"
model_best_model1 = xg1_best_model
model_best_model2 = xg2_best_model
There is little difference in performance between the two datasets.
ALLFeat = complete feature-engineered dataset
NoDept = dataset with the dummy-encoded dept fields removed
display_results()
 | Model | Precision | Recall | F1 | Accuracy | AUC | Predict Leave | Predict Stay |
---|---|---|---|---|---|---|---|---|
12 | xg1 - ALLFeat - train | 0.980 | 0.980 | 0.980 | 0.980 | 0.987 | 0.939 | 0.988 |
11 | xg1 - ALLFeat - test | 0.979 | 0.979 | 0.979 | 0.979 | 0.986 | 0.938 | 0.988 |
16 | xg2 - NOdept - train | 0.980 | 0.980 | 0.980 | 0.980 | 0.985 | 0.940 | 0.988 |
14 | xg2 - NOdept - train | 0.980 | 0.980 | 0.980 | 0.980 | 0.985 | 0.940 | 0.988 |
17 | xg2 - NOdept - test | 0.981 | 0.981 | 0.981 | 0.981 | 0.983 | 0.942 | 0.988 |
15 | xg2 - NOdept - test | 0.981 | 0.981 | 0.981 | 0.981 | 0.983 | 0.942 | 0.988 |
10 | xg1 - ALLFeat - GS train | 0.957 | 0.910 | 0.933 | 0.978 | 0.982 | NaN | NaN |
13 | xg2 - NOdept - GS train | 0.960 | 0.913 | 0.936 | 0.979 | 0.981 | NaN | NaN |
5 | dt1 - ALLFeat - test | 0.978 | 0.979 | 0.978 | 0.979 | 0.979 | 0.936 | 0.987 |
8 | dt2 - NOdept - test | 0.978 | 0.979 | 0.978 | 0.979 | 0.979 | 0.936 | 0.987 |
9 | dt2 - NOdept - train | 0.978 | 0.979 | 0.978 | 0.979 | 0.979 | 0.935 | 0.987 |
6 | dt1 - ALLFeat - train | 0.978 | 0.979 | 0.978 | 0.979 | 0.979 | 0.935 | 0.987 |
7 | dt2 - NOdept - GS train | 0.951 | 0.914 | 0.932 | 0.977 | 0.974 | NaN | NaN |
4 | dt1 - ALLFeat - GS train | 0.951 | 0.914 | 0.932 | 0.977 | 0.974 | NaN | NaN |
1 | lr1 - ALLFeat - train | 0.796 | 0.827 | 0.804 | 0.827 | 0.892 | 0.328 | 0.900 |
3 | lr2 - NOdept - train | 0.793 | 0.825 | 0.802 | 0.825 | 0.892 | 0.318 | 0.900 |
0 | lr1 - ALLFeat - test | 0.794 | 0.822 | 0.803 | 0.822 | 0.882 | 0.341 | 0.897 |
2 | lr2 - NOdept - test | 0.795 | 0.823 | 0.804 | 0.823 | 0.882 | 0.343 | 0.898 |
# Prepare confusion matrix for model 1 (xg1) test data
cm_test1 = metrics.confusion_matrix(y_test, y_pred_test) # Use the optimized model
#cm_test1_percent = cm_test1 / cm_test1.sum() * 100
# Prepare confusion matrix for model 2 (xg2) test data
cm_test2 = metrics.confusion_matrix(y_test2, y_pred_test2) # Use the optimized model
#cm_test2_percent = cm_test2 / cm_test2.sum() * 100
#cm = confusion_matrix(y_test, y_pred_test, labels=model_lr1.classes_)
# Plot confusion matrix
#disp = ConfusionMatrixDisplay(confusion_matrix=cm,
# display_labels=model_lr1.classes_)
#disp.plot(values_format='');
fig, ax = plt.subplots(2, 2, figsize=(10,8))
# Calculate percentages for TEST
sum_by_true_class = np.sum(cm_test1, axis=1)
percentage_matrix = cm_test1 / sum_by_true_class[:, np.newaxis]
# Create a figure and plot the percentage confusion matrix as a heatmap
sns.heatmap(percentage_matrix, annot=True, fmt=".2%", cmap="Blues", ax = ax[0,0]) #, xticklabels=model_lr1.classes_, yticklabels=model_lr1.classes_)
ax[0,0].set_title('{} Confusion Matrix (Percentage)'.format(model_name1))
ax[0,0].set_ylabel('True label')
ax[0,0].set_xlabel('Predicted label')
ax[0,0].text(0.3, 0.25, '(TN)\nTrue Stay', color='white')
ax[0,0].text(1.3, 0.25, '(FP) type 1\n False Leave', color='black')
ax[0,0].text(0.3, 1.25, '(FN) type 2\n False Stay', color='black')
ax[0,0].text(1.3, 1.25, '(TP)\nTrue Leave', color='white')
# Create a figure and plot the COUNT confusion matrix as a heatmap for TEST
sns.heatmap(cm_test1, annot=True, fmt=".0f", cmap="Blues", ax = ax[1,0])#, xticklabels=class_labels, yticklabels=class_labels)
ax[1,0].set_title('{} Confusion Matrix (Count)'.format(model_name1))
ax[1,0].set_ylabel('True label')
ax[1,0].set_xlabel('Predicted label')
ax[1,0].text(0.3, 0.25, '(TN)\nTrue Stay', color='white')
ax[1,0].text(1.3, 0.25, '(FP) type 1\n False Leave', color='black')
ax[1,0].text(0.3, 1.25, '(FN) type 2\n False Stay', color='black')
ax[1,0].text(1.3, 1.25, '(TP)\nTrue Leave', color='black')
# Calculate percentages for model 2 (xg2) TEST
sum_by_true_class = np.sum(cm_test2, axis=1)
percentage_matrix = cm_test2 / sum_by_true_class[:, np.newaxis]
sns.heatmap(percentage_matrix, annot=True, fmt=".2%", cmap="Blues", ax = ax[0,1])#, xticklabels=class_labels, yticklabels=class_labels)
ax[0,1].set_title('{} Confusion Matrix (Percentage)'.format(model_name2))
ax[0,1].set_ylabel('True label')
ax[0,1].set_xlabel('Predicted label')
ax[0,1].text(0.3, 0.25, '(TN)\nTrue Stay', color='white')
ax[0,1].text(1.3, 0.25, '(FP) type 1\n False Leave', color='black')
ax[0,1].text(0.3, 1.25, '(FN) type 2\n False Stay', color='black')
ax[0,1].text(1.3, 1.25, '(TP)\nTrue Leave', color='white')
# Create a figure and plot the COUNT confusion matrix as a heatmap for model 2 (xg2) TEST
sns.heatmap(cm_test2, annot=True, fmt=".0f", cmap="Blues", ax = ax[1,1])#, xticklabels=class_labels, yticklabels=class_labels)
ax[1,1].set_title('{} Confusion Matrix (Count)'.format(model_name2))
ax[1,1].set_ylabel('True label')
ax[1,1].set_xlabel('Predicted label')
ax[1,1].text(0.3, 0.25, '(TN)\nTrue Stay', color='white')
ax[1,1].text(1.3, 0.25, '(FP) type 1\n False Leave', color='black')
ax[1,1].text(0.3, 1.25, '(FN) type 2\n False Stay', color='black')
ax[1,1].text(1.3, 1.25, '(TP)\nTrue Leave', color='black')
plt.tight_layout()
plt.show()
#tree2_importances = pd.DataFrame(tree2.best_estimator_.feature_importances_, columns=X.columns)
# Get feature importances (Gini importance)
feature_importance1 = model_best_model1.feature_importances_
feature_importance2 = model_best_model2.feature_importances_
# Get the names of the features
feature_names1 = X.columns # Feature names for model 1 (AllFeat)
feature_names2 = X2.columns # Feature names for model 2 (NoDept)
# Create a DataFrame to store feature names and their Gini importance
feature_importance_df1 = pd.DataFrame({'Feature': feature_names1, 'Importance': feature_importance1})
feature_importance_df2 = pd.DataFrame({'Feature': feature_names2, 'Importance': feature_importance2})
feature_importance_df1.sort_values(by='Importance', ascending=False,axis=0, inplace=True)
feature_importance_df2.sort_values(by='Importance', ascending=False,axis=0, inplace=True)
merged_df = pd.merge(feature_importance_df1, feature_importance_df2, on='Feature', how='left', suffixes = (' model_xg1 - AllFeat', ' model_xg2 - NoDept'))
# Print the merged DataFrame
print(merged_df)
feature_importance_df1.sort_values(by='Importance', ascending=True,axis=0, inplace=True)
feature_importance_df2.sort_values(by='Importance', ascending=True,axis=0, inplace=True)
# Plot side-by-side bar plots
plt.figure(figsize=(12, 5))
plt.subplot(1, 2, 1)
plt.barh(feature_importance_df1['Feature'], feature_importance_df1['Importance'], color='skyblue')
plt.title('Feature Importance - {} - AllFeat'.format(model_name1))
plt.xlabel('Importance')
plt.ylabel('Feature')
plt.subplot(1, 2, 2)
plt.barh(feature_importance_df2['Feature'], feature_importance_df2['Importance'], color='salmon')
plt.title('Feature Importance - {} - NoDept'.format(model_name2))
plt.xlabel('Importance')
plt.ylabel('Feature')
plt.tight_layout()
plt.show()
              Feature  Importance model_xg1 - AllFeat  Importance model_xg2 - NoDept
0      number_project                            0.314                           0.229
1              tenure                            0.281                           0.296
2        satisfaction                            0.152                           0.233
3           last_eval                            0.102                           0.115
4          overworked                            0.100                           0.105
5              salary                            0.020                           0.017
6           promotion                            0.005                           0.005
7     dept_management                            0.005                             NaN
8          dept_sales                            0.004                             NaN
9          dept_randd                            0.004                             NaN
10    dept_accounting                            0.003                             NaN
11     dept_technical                            0.002                             NaN
12       dept_support                            0.002                             NaN
13            dept_it                            0.002                             NaN
14   dept_product_mng                            0.002                             NaN
15     dept_marketing                            0.002                             NaN
16            dept_hr                            0.001                             NaN
# read in previous plot data
# Logistic Regression
variables = [['auc_score1', 'model_lr1'],
['roc_auc1', 'model_lr1'],
['auc_score2', 'model_lr2'],
['roc_auc2', 'model_lr2'],
['auc_score1', 'model_dt1'],
['roc_auc1', 'model_dt1'],
['auc_score2', 'model_dt2'],
['roc_auc2', 'model_dt2']
]
arrays = [['precision1','model_lr1'],
['recall1', 'model_lr1'],
['fpr1', 'model_lr1'],
['tpr1', 'model_lr1'],
['precision2','model_lr2'],
['recall2', 'model_lr2'],
['fpr2', 'model_lr2'],
['tpr2', 'model_lr2'],
['precision1','model_dt1'],
['recall1', 'model_dt1'],
['fpr1', 'model_dt1'],
['tpr1', 'model_dt1'],
['precision2','model_dt2'],
['recall2', 'model_dt2'],
['fpr2', 'model_dt2'],
['tpr2', 'model_dt2']
]
loaded_plot_vars = {}
loaded_plot_arrays = {}
for var_name, model in variables:
print(model+'-'+var_name)
#var = globals()[var_name]
with open(f'99-documentation-project/08-plot_data/{model}-{var_name}.csv', 'r') as file:
var = f'{model}-{var_name}'
print(var)
loaded_plot_vars[var] = json.load(file)
for array_name, model in arrays:
#print(array_name, model)
#var = globals()[var_name]
filepath = f'99-documentation-project/08-plot_data/{model}-{array_name}.csv'
array = f'{model}-{array_name}'
loaded_plot_arrays[array] = pd.read_csv(filepath)
#print(loaded_plot_arrays)
#var = loaded_plot_vars.get('model_lr1-roc_auc1')
#print(var)
#array= loaded_plot_arrays.get('model_dt2-precision2')
#lr1_fpr1 = loaded_plot_arrays.get('model_lr1-fpr1')
#va = lr1_fpr1.values()
#print(type(lr1_fpr1))
#print(lr1_fpr1)
# Retrieve arrays and variable from previous model
lr1_precision1 = loaded_plot_arrays.get('model_lr1-precision1') # retrieve precision1 array from model_lr1 = AllFeat
lr1_recall1 = loaded_plot_arrays.get('model_lr1-recall1') # retrieve recall1 array from model_lr1 = AllFeat
lr1_auc_score1 = loaded_plot_vars.get('model_lr1-auc_score1') # retrieve PR AUC score from model_lr1 = AllFeat
lr2_precision2 = loaded_plot_arrays.get('model_lr2-precision2') # retrieve precision2 array from model_lr2 = NoDept
lr2_recall2 = loaded_plot_arrays.get('model_lr2-recall2') # retrieve recall2 array from model_lr2 = NoDept
lr2_auc_score2 = loaded_plot_vars.get('model_lr2-auc_score2') # retrieve PR AUC score from model_lr2 = NoDept
lr1_fpr1 = loaded_plot_arrays.get('model_lr1-fpr1') # retrieve fpr1 array from model_lr1 = AllFeat
lr1_tpr1 = loaded_plot_arrays.get('model_lr1-tpr1') # retrieve tpr1 array from model_lr1 = AllFeat
lr1_roc_auc1 = loaded_plot_vars.get('model_lr1-roc_auc1') # retrieve ROC AUC score from model_lr1 = AllFeat
lr1_model_name1 = arrays[0][1] # Retrieve model name for lr1
lr2_fpr2 = loaded_plot_arrays.get('model_lr2-fpr2') # retrieve fpr2 array from model_lr2 = NoDept
lr2_tpr2 = loaded_plot_arrays.get('model_lr2-tpr2') # retrieve tpr2 array from model_lr2 = NoDept
lr2_roc_auc2 = loaded_plot_vars.get('model_lr2-roc_auc2') # retrieve ROC AUC score from model_lr2 = NoDept
lr2_model_name2 = arrays[4][1] # Retrieve model name for lr2
dt1_precision1 = loaded_plot_arrays.get('model_dt1-precision1') # retrieve precision1 array from model_dt1 = AllFeat
dt1_recall1 = loaded_plot_arrays.get('model_dt1-recall1') # retrieve recall1 array from model_dt1 = AllFeat
dt1_auc_score1 = loaded_plot_vars.get('model_dt1-auc_score1') # retrieve PR AUC score from model_dt1 = AllFeat
dt2_precision2 = loaded_plot_arrays.get('model_dt2-precision2') # retrieve precision2 array from model_dt2 = NoDept
dt2_recall2 = loaded_plot_arrays.get('model_dt2-recall2') # retrieve recall2 array from model_dt2 = NoDept
dt2_auc_score2 = loaded_plot_vars.get('model_dt2-auc_score2') # retrieve PR AUC score from model_dt2 = NoDept
dt1_fpr1 = loaded_plot_arrays.get('model_dt1-fpr1') # retrieve fpr1 array from model_dt1 = AllFeat
dt1_tpr1 = loaded_plot_arrays.get('model_dt1-tpr1') # retrieve tpr1 array from model_dt1 = AllFeat
dt1_roc_auc1 = loaded_plot_vars.get('model_dt1-roc_auc1') # retrieve ROC AUC score from model_dt1 = AllFeat
dt1_model_name1 = arrays[8][1] # Retrieve model name for dt1
dt2_fpr2 = loaded_plot_arrays.get('model_dt2-fpr2') # retrieve fpr2 array from model_dt2 = NoDept
dt2_tpr2 = loaded_plot_arrays.get('model_dt2-tpr2') # retrieve tpr2 array from model_dt2 = NoDept
dt2_roc_auc2 = loaded_plot_vars.get('model_dt2-roc_auc2') # retrieve ROC AUC score from model_dt2 = NoDept
dt2_model_name2 = arrays[12][1] # Retrieve model name for dt2
print(lr1_model_name1)
print(lr2_model_name2)
print(dt1_model_name1)
print(dt2_model_name2)
model_lr1-auc_score1
model_lr1-auc_score1
model_lr1-roc_auc1
model_lr1-roc_auc1
model_lr2-auc_score2
model_lr2-auc_score2
model_lr2-roc_auc2
model_lr2-roc_auc2
model_dt1-auc_score1
model_dt1-auc_score1
model_dt1-roc_auc1
model_dt1-roc_auc1
model_dt2-auc_score2
model_dt2-auc_score2
model_dt2-roc_auc2
model_dt2-roc_auc2
model_lr1
model_lr2
model_dt1
model_dt2
y_prob1 = model1.predict_proba(X_test)[:, 1]
y_prob2 = model2.predict_proba(X_test2)[:, 1]
precision1, recall1, _ = precision_recall_curve(y_test, y_prob1)
precision2, recall2, _ = precision_recall_curve(y_test2, y_prob2)
# Compute area under the curve (AUC)
auc_score1 = auc(recall1, precision1)
auc_score2 = auc(recall2, precision2)
#plt.subplot(1, 2, 1)
plt.figure(figsize=(12, 5))
plt.plot(recall1, precision1, color='blue', label=f'{model_name1} - AUC = {auc_score1:.2f}')
plt.plot(lr1_recall1, lr1_precision1, color='purple', label=f'{lr1_model_name1} - AUC = {lr1_auc_score1:.2f}')
plt.plot(dt1_recall1, dt1_precision1, color='green', label=f'{dt1_model_name1} - AUC = {dt1_auc_score1:.2f}')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('{} Precision-Recall Curve'.format(model_name1))
plt.legend(loc='best')
plt.axhline(y=0.5, color='red', linestyle='--', label='Random Guess')
plt.axhline(y=1, color='green', linestyle='--', label='Perfect')
plt.axvline(x=1, color='green', linestyle='--', label='Perfect')
plt.text(0.4, 0.52, 'Random Guess', color='red')
plt.text(0.4, 0.6, 'Better', color='black')
plt.text(0.4, 0.4, 'Worse', color='black')
plt.text(0.8, 1.01, 'Perfect', color='green')
plt.show()
#plt.subplot(1, 2, 2)
plt.figure(figsize=(12, 5))
plt.plot(recall2, precision2, color='blue', label=f'{model_name2} - AUC = {auc_score2:.2f}')
plt.plot(lr2_recall2, lr2_precision2, color='purple', label=f'{lr2_model_name2} - AUC = {lr2_auc_score2:.2f}')
plt.plot(dt2_recall2, dt2_precision2, color='green', label=f'{dt2_model_name2} - AUC = {dt2_auc_score2:.2f}')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('{} Precision-Recall Curve'.format(model_name2))
plt.legend(loc='best')
plt.axhline(y=0.5, color='red', linestyle='--', label='Random Guess')
plt.axhline(y=1, color='green', linestyle='--', label='Perfect')
plt.axvline(x=1, color='green', linestyle='--', label='Perfect')
plt.text(0.4, 0.52, 'Random Guess', color='red')
plt.text(0.4, 0.6, 'Better', color='black')
plt.text(0.4, 0.4, 'Worse', color='black')
plt.text(0.8, 1.01, 'Perfect', color='green')
#plt.tight_layout()
plt.show()
# Compute ROC curve
fpr1, tpr1, _ = roc_curve(y_test, y_prob1)
fpr2, tpr2, _ = roc_curve(y_test2, y_prob2)
# Compute area under the curve (AUC)
roc_auc1 = auc(fpr1, tpr1)
roc_auc2 = auc(fpr2, tpr2)
#plt.subplot(1, 2, 1)
# Plot ROC curve
plt.figure(figsize=(12, 5))
plt.plot(fpr1, tpr1, color='darkorange', lw=2, label=f'{model_name1} - AUC = {roc_auc1:.2f}')
plt.plot(lr1_fpr1, lr1_tpr1, color='purple', lw=2, label=f'{lr1_model_name1} - AUC = {lr1_roc_auc1:.2f}')
plt.plot(dt1_fpr1, dt1_tpr1, color='green', lw=2, label=f'{dt1_model_name1} - AUC = {dt1_roc_auc1:.2f}')
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--', label='Random')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('{} Receiver Operating Characteristic (ROC) Curve'.format(model_name1))
plt.legend(loc='best')
plt.axhline(y=1, color='green', linestyle='--', label='Perfect')
plt.axvline(x=0, color='green', linestyle='--', label='Perfect')
plt.text(0.4, 0.6, 'Better', color='black')
plt.text(0.4, 0.3, 'Worse', color='black')
plt.text(0.01, 1.01, 'Perfect', color='green')
plt.show()
#plt.subplot(1, 2,2)
# Plot ROC curve
plt.figure(figsize=(12, 5))
plt.plot(fpr2, tpr2, color='darkorange', lw=2, label=f'{model_name2} - AUC = {roc_auc2:.2f}')
plt.plot(lr2_fpr2, lr2_tpr2, color='purple', lw=2, label=f'{lr2_model_name2} - AUC = {lr2_roc_auc2:.2f}')
plt.plot(dt2_fpr2, dt2_tpr2, color='green', lw=2, label=f'{dt2_model_name2} - AUC = {dt2_roc_auc2:.2f}')
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--', label='Random')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('{} Receiver Operating Characteristic (ROC) Curve'.format(model_name2))
plt.legend(loc='best')
plt.axhline(y=1, color='green', linestyle='--', label='Perfect')
plt.axvline(x=0, color='green', linestyle='--', label='Perfect')
plt.text(0.4, 0.6, 'Better', color='black')
plt.text(0.4, 0.3, 'Worse', color='black')
plt.text(0.01, 1.01, 'Perfect', color='green')
plt.show()
# Save model plot data
arrays = [['precision1',model_name1],
['recall1', model_name1],
['precision2',model_name2],
['recall2', model_name2],
['fpr1', model_name1],
['tpr1', model_name1],
['fpr2', model_name2],
['tpr2', model_name2],
]
variables = [['auc_score1', model_name1],
['auc_score2', model_name2],
['roc_auc1', model_name1],
['roc_auc2', model_name2]
]
# Save plot data scores (auc, roc)
for var_name, model in variables:
print(model+"-"+var_name )
var = globals()[var_name]
with open(f'99-documentation-project/08-plot_data/{model}-{var_name}.csv', 'w') as file:
json.dump(var, file)
# Save plot data arrays (recall, precision, fpr, tpr)
for array_name, model in arrays:
print(model+'-'+array_name)
var = globals()[array_name]
df = pd.DataFrame({array_name: var})
df.to_csv(f'99-documentation-project/08-plot_data/{model}-{array_name}.csv', index=False, header=False)
model_xg1-auc_score1
model_xg2-auc_score2
model_xg1-roc_auc1
model_xg2-roc_auc2
model_xg1-precision1
model_xg1-recall1
model_xg2-precision2
model_xg2-recall2
model_xg1-fpr1
model_xg1-tpr1
model_xg2-fpr2
model_xg2-tpr2
XGBoost is the best-performing model!
Precision-Recall AUC is 0.97 / 0.96 (xg1 / xg2).
ROC AUC is 0.99 / 0.98.
Assuming the default 50% probability threshold, the model correctly predicts whether an employee will leave about 98% of the time (test accuracy 0.979-0.981).
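As an illustration of the 50% threshold mentioned above, here is a minimal sketch (assuming the fitted xg1_best_model and the X_test / y_test split from earlier in this notebook) of how the probability threshold could be applied explicitly, and adjusted if HR preferred to flag more potential leavers:
from sklearn.metrics import accuracy_score, precision_score, recall_score

# Probability of the positive class ('left') for each employee in the test set
proba_leave = xg1_best_model.predict_proba(X_test)[:, 1]

# 0.50 is the default threshold used implicitly by .predict()
y_pred_default = (proba_leave >= 0.50).astype(int)
print("Accuracy  @ 0.50 :", round(accuracy_score(y_test, y_pred_default), 4))
print("Recall    @ 0.50 :", round(recall_score(y_test, y_pred_default), 4))

# A lower, hypothetical threshold flags more potential leavers, trading precision for recall
y_pred_035 = (proba_leave >= 0.35).astype(int)
print("Recall    @ 0.35 :", round(recall_score(y_test, y_pred_035), 4))
print("Precision @ 0.35 :", round(precision_score(y_test, y_pred_035), 4))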