
Salifort Motors - ML Modelling - Logistic Regression ¶

Document Information ¶

Document Title Salifort Motors - ML Modelling - Logistic Regression
Author Rod Slater
Version 1.0
Created 01-11-2023
Modified 16-11-2023

Client Details¶

Client Name Salifort Motors
Client Contact Mr HR Team
Client Email hr@salifortmotors.it
Client Project HR Team Data-Driven Solutions from Machine Learning Models

Document Overview¶

ML Modelling using Logistic Regression for HR data provided by Salifort Motors. This notebook details the Logistic Regression modelling process and performance comparisons.

Table of contents¶

  • Salifort Motors - ML Modelling - Logistic Regression
    • Document Information
    • Client Details
    • Document Overview
  • Logistic Regression:
    • Notes
    • Initialise Notebook
      • Import Packages
      • Initialise Notebook Options
      • Set Pandas Options
    • Define Functions
      • make_results(model_name: str, model_object: object, X_var, y_var, y_pred_var)
      • display_results() - Function to retrieve scores from Results.csv and display them
      • classification_report_summary(name: str, y_var, y_pred_var)
  • Construct Models - Logistic Regression
    • Construct Model - Logistic Regression 1
      • Import data - model_lr1
        • Code flags:
      • Prepare data for modelling - model_lr1
      • Isolate Outcome and Feature Variables - model_lr1
      • Train Test split data - model_lr1
      • Apply StandardScaler to feature variables - model_lr1
      • Instantiate Model - model_lr1
      • Fit Data for model_lr1 - Train data
      • Make Predictions - model_lr1 - Test data
      • Classification Report - model_lr1 - Test data
      • Prepare and Save Results - model_lr1 - Test data
    • Performance Results & Comparisons - model_lr1 - Test data
      • Display Results - model_lr1 - Test Data
      • Make Predictions - model_lr1 - Train data
      • Classification Report - model_lr1 - Train data
      • Prepare and Save Results - model_lr1 - Train data
    • Performance Results & Comparisons - model_lr1 - Train data
      • Display Results - model_lr1 - Train data
    • Construct Model - Logistic Regression 2
      • Import data - model_lr2
        • Code flags:
      • Prepare data for modelling - model_lr2
      • Isolate Outcome and Feature Variables - model_lr2
      • Train Test split data - model_lr2
      • Apply StandardScaler to feature variables - model_lr2
      • Instantiate Model - model_lr2
      • Fit Data for model_lr2 - Train data
      • Make Predictions - model_lr2 - Test data
      • Classification Report - model_lr2 - Test data
      • Prepare and Save Results - model_lr2 - Test data
    • Performance Results & Comparisons - model_lr2 - Test data
      • Display Results - model_lr2 - Test Data
      • Make Predictions - model_lr2 - Train data
      • Classification Report - model_lr2 - Train data
      • Prepare and Save Results - model_lr2 - Train data
    • Performance Results & Comparisons - model_lr2 - Train data
      • Display Results - model_lr2 - Train data
  • Review Results
    • Dataset comparison - Logistic Regression
    • Model performance
      • Confusion Matrix Heatmaps - model_lr1 & model_lr2
      • Feature Importance
    • Precision / Recall Curve
    • ROC Curve Plot
    • Conclusions
    • Conclusion, dataset comparison - Logistic Regression.

Logistic Regression:¶

For the purposes of self-education, I'll model this on two datasets: first with all features, and second with the dept features removed.

Notes¶

This model runs very quickly, so there is no real need to pickle it.

Initialise Notebook¶

Import Packages¶

In [88]:
# Import packages

# Data manipulation
import numpy as np
import pandas as pd

# Data visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Set Options
pd.set_option('display.max_columns', None)

# Data modelling Imports
from xgboost import XGBClassifier, XGBRegressor, plot_importance

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# For metrics and helpful functions
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, classification_report
from sklearn import metrics
from sklearn.metrics import roc_auc_score, roc_curve, precision_recall_fscore_support, precision_recall_curve, auc
from sklearn.tree import plot_tree
from sklearn.preprocessing import StandardScaler

import statsmodels.api as sm

from datetime import datetime as dt

import json

# For saving models
import pickle

Initialise Notebook Options¶

Set Pandas Options¶

In [89]:
# set Pandas Display Options
pd.options.display.float_format = '{:.2f}'.format
pd.set_option('display.precision', 2)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 120)
In [90]:
# Source folder for cleaned data (absolute path kept for reference; the relative path below overrides it)
#load_path = "/home/hass/Documents/Learning/Salifort-Motors-Capstone-Project/00-data_cleaned/"
load_path = "./00-data_cleaned/"
# Destination for pickle-saved models
save_path = "/home/hass/Documents/Learning/Salifort-Motors-Capstone-Project/04-pickle-ML-models/"
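
save_path is defined for consistency with the other model notebooks but is not used here (as noted above, this model trains quickly and is not pickled). For reference, a minimal sketch of how a fitted model could be saved and reloaded, assuming a fitted model object such as model_lr1:

# Sketch only (not executed in this notebook): persist a fitted model
with open(save_path + "model_lr1.pickle", "wb") as f:
    pickle.dump(model_lr1, f)

# ...and load it back later
with open(save_path + "model_lr1.pickle", "rb") as f:
    model_lr1 = pickle.load(f)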

Define Functions¶

make_results(model_name: str, model_object: object, X_var, y_var, y_pred_var)¶

In [91]:
# Get results from Logistic Regression to store in comparison table
def make_results(model_name: str, model_object: object, X_var, y_var, y_pred_var):
    '''
    Returns a pandas df with the precision, recall, F1, accuracy, and AUC
    scores for the supplied model and predictions.

    In: 
        model_name (str):     How you want your model to be named in the output table
        model_object:         The fitted model object
        X_var:                numpy array of X data (used for predict_proba when computing AUC)
        y_var:                numpy array of true y values
        y_pred_var:           numpy array of predicted y values

    Out: pandas df containing precision, recall, F1, accuracy, and AUC scores of the model
    '''

    # Build a classification report dict from the true and predicted values
    report = classification_report(y_var, y_pred_var,  output_dict=True)

    # Calculate per-class precision, recall, and F1 score
    predict_precision, predict_recall, predict_f1_score, _ = precision_recall_fscore_support(y_var, y_pred_var, average=None)
    f1_true_class = predict_f1_score[1]   # Index 1 corresponds to the "True" (leave) class
    f1_false_class = predict_f1_score[0]  # Index 0 corresponds to the "False" (stay) class


    # Extract weighted-average precision, recall, and F1, plus accuracy, from the report
    f1 = report['weighted avg']['f1-score']
    recall = report['weighted avg']['recall']
    precision = report['weighted avg']['precision']
    accuracy = report['accuracy']
    auc = roc_auc_score(y_var, model_object.predict_proba(X_var)[:,1])
    # Create table of results
    table = pd.DataFrame({'Model': model_name,
                          'Precision': precision,
                          'Recall': recall,
                          'F1': f1,
                          'Accuracy': accuracy,
                          'AUC': auc,
                          'Predict Leave': f1_true_class,
                          'Predict Stay' : f1_false_class
                                          
                        },
                        index=[0]
                       )
  
    return table

display_results() - Function to retrieve scores from Results.csv and display them¶

In [92]:
def display_results():
    '''
    Load Results.csv containing stored test scores; return the scores for display

    In: 
        none

    Out: pandas df of Results.csv containing precision, recall, f1, accuracy, and AUC scores of the models
    '''
    model_results = pd.read_csv("Results.csv")
    # Drop the unnamed index column written by to_csv(index=True)
    model_results.drop(columns=['Unnamed: 0'], inplace=True)
    model_results = model_results.sort_values(by='AUC', ascending=False)
    
    return model_results

classification_report_summary(name: str, y_var, y_pred_var)¶

In [93]:
def classification_report_summary(name: str, y_var, y_pred_var):
    '''
    Gather stats from predictions

    In: 
        name (str)  : Test data name for the report header, e.g. TEST or TRAIN
        y_var       : true y values
        y_pred_var  : predicted y values

    Out: Display of precision, recall, f1, accuracy, and AUC scores of the models, Weighted Average and Prediction f1 score for true/false
    '''
    targetnames = ['Predicted would not leave', 'Predicted would leave']

  
    
    print("\nClassification Report : ", name)
    print(classification_report(y_var, y_pred_var, target_names=targetnames))
    
    print("Recall        : {:.4%}".format(recall_score(y_var, y_pred_var)))
    print("f1_score      : {:.4%}".format(f1_score(y_var, y_pred_var)))
    print("Precision     : {:.4%}".format(precision_score(y_var, y_pred_var)))
    print("Accuracy      : {:.4%}".format(accuracy_score(y_var, y_pred_var)))
    
    report = classification_report(y_var, y_pred_var,  output_dict=True)
    
    print()
    print('\u2500' * 35) 
    print("Weighted Average")
    print('\u2500' * 35) 
    
    print("Recall        : {:.4%}".format(report['weighted avg']['recall']))
    print("f1 Score      : {:.4%}".format(report['weighted avg']['f1-score']))
    print("Precision     : {:.4%}".format(report['weighted avg']['precision']))
    print("Support       : {:.0f}".format(report['weighted avg']['support']))
    
      # Calculate precision, recall, and F1 score for the "True" class
    predict_precision, predict_recall, predict_f1_score, _ = precision_recall_fscore_support(y_var, y_pred_var, average=None)
    f1_true_class = predict_f1_score[1]  # Index 1 corresponds to the "True" class
    f1_false_class = predict_f1_score[0]  # Index 0 corresponds to the "False" class
    
    print()
    print('\u2500' * 35) 
    print("Prediction F1 score")
    print('\u2500' * 35) 
    print("Predict Leave : {:.4%}".format(f1_true_class))
    print("Support Stay  : {:.4%}".format(f1_false_class))

Construct Models - Logistic Regression¶

  • Feature Engineered
  • Outliers Removed
  • All Features

Construct Model - Logistic Regression 1¶

This model includes all of the features retained after Feature Engineering was completed, 17 features in total.

Import data - model_lr1¶

Two datasets are used for model performance comparison:

  • salifort_data_FE.csv is the full dataset, feature engineered with salary encoded to ordinal, avg_mnth_hrs binary encoded to overworked, and dept encoded with dummies
  • salifort_data_FE_focus.csv is the same data with the dummy-encoded dept fields removed.

Dept appears to have low correlation across the dataset, and I'm curious how much the models are influenced by low-correlation features. It turns out that low-correlation features have little impact on model performance, which is no surprise really! A quick check is sketched below.
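
As a sanity check, the correlation of each dummy-encoded dept column with the outcome can be computed directly. A minimal sketch, assuming df1 already holds the dummy-encoded dataset produced below:

# Correlation of each dept dummy with the outcome variable 'left'
dept_cols = [c for c in df1.columns if c.startswith('dept_')]
print(df1[dept_cols].astype(float).corrwith(df1['left']).sort_values())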

Code flags:¶

rerun = flag identifying the first run of the model comparisons. 0 = first run, a NEW Results.csv file is created with headers / 1 = continuation, results are appended to the existing file.

dataset = text indicating which dataset is being used; added to the model description when saving to Results.csv.

model_prefix = text indicating which ML model is being used; added to the model description when saving to Results.csv.

In [94]:
# Preview the available cleaned datasets; each read overwrites df1 (the working dataset is loaded in the next cell)
df1 = pd.read_csv(load_path + "data_cleaned_NoOl_NoFE_AllFeat.csv", index_col = False) 

print("\ndata_cleaned_NoOl_NoFE_AllFeat.csv")
print(df1.shape)
print(df1.head(1))

df1 = pd.read_csv(load_path + "data_cleaned_Ol_NoFE_AllFeat.csv", index_col = False) # includes outliers
print("\ndata_cleaned_Ol_NoFE_AllFeat.csv")
print(df1.shape)
print(df1.head(1))

df1 = pd.read_csv(load_path + "data_cleaned_NoOl_FE_AllFeat.csv", index_col = False) 
print("\ndata_cleaned_NoOl_FE_AllFeat.csv")
print(df1.shape)
print(df1.head(1))
df1 = pd.read_csv(load_path + "data_cleaned_NoOl_FE_NoDept.csv", index_col = False) 
print("\ndata_cleaned_NoOl_FE_NoDept.csv")
print(df1.shape)
print(df1.head(1))

# model_prefix : Str = prefix for results.csv added to dataset
model_prefix      = 'lr1'

# dataset : Str = dataset name for results.csv
dataset           = 'ALLFeat'

# rerun : int 1 = append to the Results.csv file with no headers / 0 = Write new file with headers
rerun             = 0

print("df1 - Feature engineering on salary, avg_mnth_hrs, dept, outliers removed\n")

# Display dataframe columns
print(df1.shape)
data_cleaned_NoOl_NoFE_AllFeat.csv
(11167, 10)
   satisfaction  last_eval  number_project  avg_mnth_hrs  tenure  accident  left  promotion   dept salary
0          0.38       0.53            2.00        157.00    3.00      0.00  1.00       0.00  sales    low

data_cleaned_Ol_NoFE_AllFeat.csv
(11991, 10)
   satisfaction  last_eval  number_project  avg_mnth_hrs  tenure  accident  left  promotion   dept salary
0          0.38       0.53               2           157       3         0     1          0  sales    low

data_cleaned_NoOl_FE_AllFeat.csv
(11167, 18)
   satisfaction  last_eval  number_project  tenure  left  promotion  salary  dept_accounting  dept_hr  dept_it  \
0          0.38       0.53            2.00    3.00  1.00       0.00       0            False    False    False   

   dept_management  dept_marketing  dept_product_mng  dept_randd  dept_sales  dept_support  dept_technical  overworked  
0            False           False             False       False        True         False           False           0  

data_cleaned_NoOl_FE_NoDept.csv
(11167, 8)
   satisfaction  last_eval  number_project  overworked  tenure  left  promotion  salary
0          0.38       0.53            2.00           0    3.00  1.00       0.00       0
df1 - Feature engineering on salary, avg_mnth_hrs, dept, outliers removed

(11167, 8)
In [95]:
# Load cleaned dataset into a dataframe
print("Started // Last Run =", dt.now().strftime("%Y-%m-%d %H:%M:%S"),"\n")

# data_cleaned_NoOl_NoFE_AllFeat.csv = cleaned data, duplicates and outliers removed, no feature engineering (salary and dept are encoded in the next cell)
df1 = pd.read_csv(load_path + "data_cleaned_NoOl_NoFE_AllFeat.csv", index_col = False) 
Started // Last Run = 2023-12-04 12:00:15 

In [96]:
# A little feature engineering: ordinal-encode salary, one-hot encode dept

df1['salary'] = (
    df1['salary'].astype('category')
    .cat.set_categories(['low', 'medium', 'high'])
    .cat.codes
)

# One Hot Encode dept
df1 = pd.get_dummies(df1, columns = ['dept'])
df1.dtypes
Out[96]:
satisfaction        float64
last_eval           float64
number_project      float64
avg_mnth_hrs        float64
tenure              float64
accident            float64
left                float64
promotion           float64
salary                 int8
dept_accounting        bool
dept_hr                bool
dept_it                bool
dept_management        bool
dept_marketing         bool
dept_product_mng       bool
dept_randd             bool
dept_sales             bool
dept_support           bool
dept_technical         bool
dtype: object

Prepare data for modelling - model_lr1¶

In [97]:
model_data = df1.copy() # copy df to df used for modelling

Isolate Outcome and Feature Variables - model_lr1¶

In [98]:
# Logistic Regression model

# Save X and Y data into variables
Y = model_data['left'] # Isolate the outcome variable
X = model_data.copy()
X = X.drop('left', axis = 1) # Isolate the feature variables, drop the outcome variable left
#X = sm.add_constant(X)

Train Test split data - model_lr1¶

In [99]:
# Split Test/Train Data
X_train, X_test, y_train, y_test = train_test_split(X,Y, test_size=0.25, stratify=Y, random_state=42)

Apply StandardScalar to feature variables - model_lr1¶

Standardize features by removing the mean and scaling to unit variance. Scaling is generally needed for scale-sensitive estimators such as this regularised logistic regression; tree-based models do not require it.

The standard score of a sample x is calculated as:

$$ z = \frac{(x - u)}{s}$$

where u is the mean of the training samples or zero if with_mean=False, and s is the standard deviation of the training samples or one if with_std=False.

Centering and scaling happen independently on each feature by computing the relevant statistics on the samples in the training set. Mean and standard deviation are then stored to be used on later data using transform.

Standardisation of a dataset is a common requirement for many machine learning estimators: they might behave badly if the individual features do not more or less look like standard normally distributed data (e.g. Gaussian with 0 mean and unit variance).
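
As a quick illustration (my own check, not part of the modelling pipeline, using the np and StandardScaler imports above), the scaler's transform reproduces the z formula, using the mean and standard deviation learned from the data it was fitted on:

# Sketch: verify StandardScaler matches (x - u) / s computed manually
sample = np.array([[1.0], [2.0], [3.0], [4.0]])
sc = StandardScaler().fit(sample)                  # learns u = 2.5, s ~ 1.118
manual = (sample - sample.mean()) / sample.std()   # numpy's default std (ddof=0) matches sklearn
print(np.allclose(sc.transform(sample), manual))   # True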

In [100]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

Instantiate Model - model_lr1¶

In [101]:
# Instantiate the Logistic Regression model
model_lr1 = LogisticRegression()

Fit Data for model_lr1 - Train data¶

In [102]:
model_lr1.fit(X_train, y_train) # Training data for model_lr1
Out[102]:
LogisticRegression()

Make Predictions - model_lr1 - Test data¶

In [103]:
# Make Predictions using Logistic Regression
y_pred_test = model_lr1.predict(X_test)

y_pred_test
Out[103]:
array([0., 0., 0., ..., 0., 0., 0.])

Classification Report - model_lr1 - Test data¶

In [104]:
classification_report_summary(model_prefix+' - '+dataset+' - '+'TEST', y_test, y_pred_test)
Classification Report :  lr1 - ALLFeat - TEST
                           precision    recall  f1-score   support

Predicted would not leave       0.86      0.93      0.90      2321
    Predicted would leave       0.45      0.27      0.34       471

                 accuracy                           0.82      2792
                macro avg       0.66      0.60      0.62      2792
             weighted avg       0.79      0.82      0.80      2792

Recall        : 27.3885%
f1_score      : 34.1270%
Precision     : 45.2632%
Accuracy      : 82.1633%

───────────────────────────────────
Weighted Average
───────────────────────────────────
Recall        : 82.1633%
f1 Score      : 80.3127%
Precision     : 79.4256%
Support       : 2792

───────────────────────────────────
Prediction F1 score
───────────────────────────────────
Predict Leave : 34.1270%
Predict Stay  : 89.6852%

Prepare and Save Results - model_lr1 - Test data¶

In [105]:
result_table = make_results(model_prefix+' - '+dataset+' - '+'test',model_lr1, X_test, y_test, y_pred_test)

# If the model is being run for the first time, create a new Results.csv file
if rerun == 0:

    # First save to Results.csv, no mode set, write headers, write new file
    result_table.to_csv("Results.csv", index=True, header=True) 

# If the model is being RERUN with a new data, APPEND to existing Results.csv
elif rerun == 1 : 
    
    # APPEND save to Results.csv, don't write headers, APPEND new file
    result_table.to_csv("Results.csv", index=True, mode='a', header=False) 

Performance Results & Comparisons - model_lr1 - Test data¶

Display Results - model_lr1 - Test Data¶

In [106]:
print(result_table)
print()
display_results()
                  Model  Precision  Recall   F1  Accuracy  AUC  Predict Leave  Predict Stay
0  lr1 - ALLFeat - test       0.79    0.82 0.80      0.82 0.88           0.34          0.90

Out[106]:
                  Model  Precision  Recall   F1  Accuracy  AUC  Predict Leave  Predict Stay
0  lr1 - ALLFeat - test       0.79    0.82 0.80      0.82 0.88           0.34          0.90

Make Predictions - model_lr1 - Train data¶

Use the already fitted model_lr1 to predict on the training data as a check for over- or under-fitting.

In [107]:
# Make Predictions using Logistic Regression

y_pred_train = model_lr1.predict(X_train) # worth a look!

Classification Report - model_lr1 - Train data¶

In [108]:
classification_report_summary(model_prefix+' - '+dataset+' - '+'TRAIN', y_train, y_pred_train)
Classification Report :  lr1 - ALLFeat - TRAIN
                           precision    recall  f1-score   support

Predicted would not leave       0.86      0.94      0.90      6964
    Predicted would leave       0.47      0.25      0.33      1411

                 accuracy                           0.83      8375
                macro avg       0.67      0.60      0.61      8375
             weighted avg       0.80      0.83      0.80      8375

Recall        : 25.1595%
f1_score      : 32.8400%
Precision     : 47.2703%
Accuracy      : 82.6627%

───────────────────────────────────
Weighted Average
───────────────────────────────────
Recall        : 82.6627%
f1 Score      : 80.4086%
Precision     : 79.5988%
Support       : 8375

───────────────────────────────────
Prediction F1 score
───────────────────────────────────
Predict Leave : 32.8400%
Predict Stay  : 90.0466%

Prepare and Save Results - model_lr1 - Train data¶

In [109]:
result_table = make_results(model_prefix+' - '+dataset+' - '+'train',model_lr1, X_train, y_train, y_pred_train)
result_table.to_csv("Results.csv", index=True, mode='a', header=False) # Append to existing Results.csv file, mode = 'a', no headers

Performance Results & Comparisons - model_lr1 - Train data¶

Display Results - model_lr1 - Train data¶

In [110]:
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 120)
pd.options.display.float_format = '{:.3f}'.format
print(result_table)
print()
display_results()
                   Model  Precision  Recall    F1  Accuracy   AUC  Predict Leave  Predict Stay
0  lr1 - ALLFeat - train      0.796   0.827 0.804     0.827 0.892          0.328         0.900

Out[110]:
                   Model  Precision  Recall    F1  Accuracy   AUC  Predict Leave  Predict Stay
1  lr1 - ALLFeat - train      0.796   0.827 0.804     0.827 0.892          0.328         0.900
0   lr1 - ALLFeat - test      0.794   0.822 0.803     0.822 0.882          0.341         0.897

Construct Model - Logistic Regression 2¶

This model excludes the department features.

Import data - model_lr2¶

The same two datasets described for model_lr1 apply; for this model, rather than loading a separate file, the dept dummy columns are simply dropped from df1 (see the cell below).

Code flags:¶

rerun, dataset, and model_prefix behave as described for model_lr1. Here rerun = 1, so results are appended to the existing Results.csv rather than overwriting it.

In [111]:
df1.columns
Out[111]:
Index(['satisfaction', 'last_eval', 'number_project', 'avg_mnth_hrs', 'tenure', 'accident', 'left', 'promotion',
       'salary', 'dept_accounting', 'dept_hr', 'dept_it', 'dept_management', 'dept_marketing', 'dept_product_mng',
       'dept_randd', 'dept_sales', 'dept_support', 'dept_technical'],
      dtype='object')
In [112]:
#df2 = pd.read_csv(load_path + "data_cleaned_NoOl_FE_NoDept.csv", index_col = False) # Feature engineering on salary, avg_mnth_hrs, outliers removed, departments removed

df2 = df1[['satisfaction', 'last_eval', 'number_project', 'avg_mnth_hrs', 'tenure', 'accident', 'left', 'promotion',
           'salary']]

# prefix for results.csv added to dataset
model_prefix         = 'lr2'

# dataset name for results.csv
dataset              = 'NOdept'

# 1 = append to the Results.csv file with no headers / 0 = Write new file with headers
rerun                = 1

print("df2 - Feature engineering on salary, avg_mnth_hrs, dept REMOVED, outliers removed\n")
# Display dataframe columns
print(df2.info())
df2 - Feature engineering on salary, avg_mnth_hrs, dept REMOVED, outliers removed

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11167 entries, 0 to 11166
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   satisfaction    11167 non-null  float64
 1   last_eval       11167 non-null  float64
 2   number_project  11167 non-null  float64
 3   avg_mnth_hrs    11167 non-null  float64
 4   tenure          11167 non-null  float64
 5   accident        11167 non-null  float64
 6   left            11167 non-null  float64
 7   promotion       11167 non-null  float64
 8   salary          11167 non-null  int8   
dtypes: float64(8), int8(1)
memory usage: 709.0 KB
None

Prepare data for modelling - model_lr2¶

In [113]:
model_data2 = df2.copy() # copy df to df used for modelling

Isolate Outcome and Feature Variables - model_lr2¶

In [114]:
# Logistic Regression model

# Save X and Y data into variables
Y2 = model_data2['left'] # Isolate the outcome variable
X2 = model_data2.copy()
X2 = X2.drop('left', axis = 1) # Isolate the feature variables, drop the outcome variable left
#X = sm.add_constant(X)

Train Test split data - model_lr2¶

In [115]:
# Split Test/Train Data (stratify on the matching outcome variable Y2; Y2 is identical to Y here, so the split is unchanged)
X_train2, X_test2, y_train2, y_test2 = train_test_split(X2, Y2, test_size=0.25, stratify=Y2, random_state=42)

Apply StandardScalar to feature variables - model_lr2¶

Standardize features by removing the mean and scaling to unit variance, as for model_lr1.

In [116]:
scaler = StandardScaler()
X_train2 = scaler.fit_transform(X_train2)
X_test2 = scaler.transform(X_test2)

Instantiate Model - model_lr2¶

In [117]:
# Instantiate the Logistic Regression model
model_lr2 = LogisticRegression()

Fit Data for model_lr2 - Train data¶

In [118]:
model_lr2.fit(X_train2, y_train2) # Training data for model_lr2
Out[118]:
LogisticRegression()

Make Predictions - model_lr2 - Test data¶

In [119]:
# Make Predictions using Logistic Regression
y_pred_test2 = model_lr2.predict(X_test2)

Classification Report - model_lr2 - Test data¶

In [120]:
classification_report_summary(model_prefix+' - '+dataset+' - '+'TEST', y_test2, y_pred_test2)
Classification Report :  lr2 - NOdept - TEST
                           precision    recall  f1-score   support

Predicted would not leave       0.86      0.93      0.90      2321
    Predicted would leave       0.46      0.27      0.34       471

                 accuracy                           0.82      2792
                macro avg       0.66      0.60      0.62      2792
             weighted avg       0.80      0.82      0.80      2792

Recall        : 27.3885%
f1_score      : 34.2629%
Precision     : 45.7447%
Accuracy      : 82.2708%

───────────────────────────────────
Weighted Average
───────────────────────────────────
Recall        : 82.2708%
f1 Score      : 80.3926%
Precision     : 79.5204%
Support       : 2792

───────────────────────────────────
Prediction F1 score
───────────────────────────────────
Predict Leave : 34.2629%
Predict Stay  : 89.7537%

Prepare and Save Results - model_lr2 - Test data¶

In [121]:
result_table = make_results(model_prefix+' - '+dataset+' - '+'test',model_lr2, X_test2, y_test2, y_pred_test2)

# If the model is being run for the first time, create a new Results.csv file
if rerun == 0:

    # First save to Results.csv, no mode set, write headers, write new file
    result_table.to_csv("Results.csv", index=True, header=True) 

# If the model is being RERUN with a new data, APPEND to existing Results.csv
elif rerun == 1 : 
    
    # APPEND save to Results.csv, don't write headers, APPEND new file
    result_table.to_csv("Results.csv", index=True, mode='a', header=False) 

Performance Results & Comparisons - model_lr2 - Test data¶

Display Results - model_lr2 - Test Data¶

In [122]:
print(result_table)
print()
display_results()
                 Model  Precision  Recall    F1  Accuracy   AUC  Predict Leave  Predict Stay
0  lr2 - NOdept - test      0.795   0.823 0.804     0.823 0.882          0.343         0.898

Out[122]:
                   Model  Precision  Recall    F1  Accuracy   AUC  Predict Leave  Predict Stay
1  lr1 - ALLFeat - train      0.796   0.827 0.804     0.827 0.892          0.328         0.900
0   lr1 - ALLFeat - test      0.794   0.822 0.803     0.822 0.882          0.341         0.897
2    lr2 - NOdept - test      0.795   0.823 0.804     0.823 0.882          0.343         0.898

Make Predictions - model_lr2 - Train data¶

Use the already fitted model_lr2 to predict on the training data as a check for over- or under-fitting.

In [123]:
# Make Predictions using Logistic Regression

y_pred_train2 = model_lr2.predict(X_train2) # worth a look!

Classification Report - model_lr2 - Train data¶

In [124]:
classification_report_summary(model_prefix+' - '+dataset+' - '+'TRAIN', y_train2, y_pred_train2)
Classification Report :  lr2 - NOdept - TRAIN
                           precision    recall  f1-score   support

Predicted would not leave       0.86      0.94      0.90      6964
    Predicted would leave       0.46      0.24      0.32      1411

                 accuracy                           0.82      8375
                macro avg       0.66      0.59      0.61      8375
             weighted avg       0.79      0.82      0.80      8375

Recall        : 24.2381%
f1_score      : 31.7992%
Precision     : 46.2162%
Accuracy      : 82.4836%

───────────────────────────────────
Weighted Average
───────────────────────────────────
Recall        : 82.4836%
f1 Score      : 80.1540%
Precision     : 79.2962%
Support       : 8375

───────────────────────────────────
Prediction F1 score
───────────────────────────────────
Predict Leave : 31.7992%
Predict Stay  : 89.9514%

Prepare and Save Results - model_lr2 - Train data¶

In [125]:
result_table = make_results(model_prefix+' - '+dataset+' - '+'train',model_lr2, X_train2, y_train2, y_pred_train2)
result_table.to_csv("Results.csv", index=True, mode='a', header=False) # Append to existing Results.csv file, mode = 'a', no headers

Performance Results & Comparisons - model_lr2 - Train data¶

Display Results - model_lr2 - Train data¶

In [126]:
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 120)
pd.options.display.float_format = '{:.4f}'.format
print(result_table)
print()
display_results()
                  Model  Precision  Recall     F1  Accuracy    AUC  Predict Leave  Predict Stay
0  lr2 - NOdept - train     0.7930  0.8248 0.8015    0.8248 0.8919         0.3180        0.8995

Out[126]:
                   Model  Precision  Recall     F1  Accuracy    AUC  Predict Leave  Predict Stay
1  lr1 - ALLFeat - train     0.7960  0.8266 0.8041    0.8266 0.8923         0.3284        0.9005
3   lr2 - NOdept - train     0.7930  0.8248 0.8015    0.8248 0.8919         0.3180        0.8995
0   lr1 - ALLFeat - test     0.7943  0.8216 0.8031    0.8216 0.8819         0.3413        0.8969
2    lr2 - NOdept - test     0.7952  0.8227 0.8039    0.8227 0.8819         0.3426        0.8975

Review Results¶

Dataset comparison - Logistic Regression¶

There is little difference in performance between the two datasets:

  • ALLFeat = complete feature-engineered dataset
  • NoDept = dataset with the dummy-encoded dept columns removed

Model performance¶

In [127]:
# Set parameters for following cells

model1 = model_lr1
model2 = model_lr2


model_name1 = "model_lr1"
model_name2 = "model_lr2"

# model_best_model1 = N/A not grid searched
# model_best_model2 = N/A not grid searched
In [128]:
display_results()
Out[128]:
                   Model  Precision  Recall     F1  Accuracy    AUC  Predict Leave  Predict Stay
1  lr1 - ALLFeat - train     0.7960  0.8266 0.8041    0.8266 0.8923         0.3284        0.9005
3   lr2 - NOdept - train     0.7930  0.8248 0.8015    0.8248 0.8919         0.3180        0.8995
0   lr1 - ALLFeat - test     0.7943  0.8216 0.8031    0.8216 0.8819         0.3413        0.8969
2    lr2 - NOdept - test     0.7952  0.8227 0.8039    0.8227 0.8819         0.3426        0.8975

Confusion Matrix Heatmaps - model_lr1 & model_lr2¶

In [129]:
# Prepare confusion matrix for lr1 test
cm_test1 = metrics.confusion_matrix(y_test, y_pred_test)
#cm_test1_percent = cm_test1 / cm_test1.sum() * 100

# Prepare confusion matrix for lr2 test
cm_test2 = metrics.confusion_matrix(y_test2, y_pred_test2)
#cm_test2_percent = cm_test2 / cm_test2.sum() * 100

#cm = confusion_matrix(y_test, y_pred_test, labels=model_lr1.classes_)

# Plot confusion matrix
#disp = ConfusionMatrixDisplay(confusion_matrix=cm,
#                             display_labels=model_lr1.classes_)
#disp.plot(values_format='');

fig, ax = plt.subplots(2, 2, figsize=(10,8))

# Calculate percentages for lr1 test (normalise each row by its true-class total)
sum_by_true_class = np.sum(cm_test1, axis=1)
percentage_matrix = cm_test1 / sum_by_true_class[:, np.newaxis]
model_name = "lr1 - AllFeat - Test"
# Create a figure and plot the percentage confusion matrix as a heatmap

sns.heatmap(percentage_matrix, annot=True, fmt=".2%", cmap="Blues", ax = ax[0,0]) #, xticklabels=model_lr1.classes_, yticklabels=model_lr1.classes_)
ax[0,0].set_title('{} Confusion Matrix (Percentage)'.format(model_name))
ax[0,0].set_ylabel('True label')
ax[0,0].set_xlabel('Predicted label')
ax[0,0].text(0.3, 0.25, '(TN)\nTrue Stay', color='white')
ax[0,0].text(1.3, 0.25, '(FP) type 1\n False Leave', color='black')
ax[0,0].text(0.3, 1.25, '(FN) type 2\n False Stay', color='white')
ax[0,0].text(1.3, 1.25, '(TP)\nTrue Leave', color='black')



# Plot the COUNT confusion matrix as a heatmap for lr1 test
sns.heatmap(cm_test1, annot=True, fmt=".0f", cmap="Blues", ax = ax[1,0])#, xticklabels=class_labels, yticklabels=class_labels)
ax[1,0].set_title('{} Confusion Matrix (Count)'.format(model_name))
ax[1,0].set_ylabel('True label')
ax[1,0].set_xlabel('Predicted label')
ax[1,0].text(0.3, 0.25, '(TN)\nTrue Stay', color='white')
ax[1,0].text(1.3, 0.25, '(FP) type 1\n False Leave', color='black')
ax[1,0].text(0.3, 1.25, '(FN) type 2\n False Stay', color='black')
ax[1,0].text(1.3, 1.25, '(TP)\nTrue Leave', color='black')



# Calculate percentages for lr2 test
sum_by_true_class = np.sum(cm_test2, axis=1)
percentage_matrix = cm_test2 / sum_by_true_class[:, np.newaxis]
model_name = "lr2 - NoDept - Test"

sns.heatmap(percentage_matrix, annot=True, fmt=".2%", cmap="Blues", ax = ax[0,1])#, xticklabels=class_labels, yticklabels=class_labels)
ax[0,1].set_title('{} Confusion Matrix (Percentage)'.format(model_name))
ax[0,1].set_ylabel('True label')
ax[0,1].set_xlabel('Predicted label')
ax[0,1].text(0.3, 0.25, '(TN)\nTrue Stay', color='white')
ax[0,1].text(1.3, 0.25, '(FP) type 1\n False Leave', color='black')
ax[0,1].text(0.3, 1.25, '(FN) type 2\n False Stay', color='white')
ax[0,1].text(1.3, 1.25, '(TP)\nTrue Leave', color='black')


# Plot the COUNT confusion matrix as a heatmap for lr2 test
sns.heatmap(cm_test2, annot=True, fmt=".0f", cmap="Blues", ax = ax[1,1])#, xticklabels=class_labels, yticklabels=class_labels)
ax[1,1].set_title('{}  Confusion Matrix (Count)'.format(model_name))
ax[1,1].set_ylabel('True label')
ax[1,1].set_xlabel('Predicted label')
ax[1,1].text(0.3, 0.25, '(TN)\nTrue Stay', color='white')
ax[1,1].text(1.3, 0.25, '(FP) type 1\n False Leave', color='black')
ax[1,1].text(0.3, 1.25, '(FN) type 2\n False Stay', color='black')
ax[1,1].text(1.3, 1.25, '(TP)\nTrue Leave', color='black')


plt.tight_layout()
plt.show()
[Figure: confusion matrix heatmaps (percentage and count) for model_lr1 - AllFeat and model_lr2 - NoDept, test data]

Feature Importance¶

In [130]:
# Get Feature Importance function, returns a dataframe of features 

def get_feature_importance(model, feature_names, model_name):
    ''' Return a dataframe of features sorted by "importance". For logistic regression
        this is the raw coefficient on the standardised features; the sign gives the
        direction of the effect. '''
    feature_importance = model.coef_[0]
    sorted_indices = feature_importance.argsort()[::-1]
    sorted_feature_names = [feature_names[i] for i in sorted_indices]
    sorted_importance = feature_importance[sorted_indices]

    feature_importance_df = pd.DataFrame({
        'Feature': sorted_feature_names,
        'Importance': sorted_importance
    })

    return feature_importance_df

# Get feature importance for the first model
feature_importance_1 = get_feature_importance(model_lr1, X.columns, "LogisticalRegression model_lr1 - All Feat")

# Get feature importance for the second model
feature_importance_2 = get_feature_importance(model_lr2, X2.columns, "LogisticalRegression model_lr2 - NoDept")

merged_df = pd.merge(feature_importance_1, feature_importance_2, on='Feature', how='left', suffixes = (' model_lr1 - AllFeat', ' model_lr2 - NoDept'))

# Print the merged DataFrame
print(merged_df)


# Plot side-by-side bar plots
plt.figure(figsize=(12, 5))

plt.subplot(1, 2, 1)
plt.barh(feature_importance_1['Feature'], feature_importance_1['Importance'], color='skyblue')
plt.title('Feature Importance - model_lr1 - AllFeat')
plt.xlabel('Importance')
plt.ylabel('Feature')

plt.subplot(1, 2, 2)
plt.barh(feature_importance_2['Feature'], feature_importance_2['Importance'], color='salmon')
plt.title('Feature Importance - model_lr2 - NoDept')
plt.xlabel('Importance')
plt.ylabel('Feature')

plt.tight_layout()
plt.show()
             Feature  Importance model_lr1 - AllFeat  Importance model_lr2 - NoDept
0             tenure                         0.9972                        0.9950
1       avg_mnth_hrs                         0.1764                        0.1736
2         dept_sales                         0.0364                           NaN
3       dept_support                         0.0304                           NaN
4     dept_technical                         0.0289                           NaN
5            dept_hr                         0.0115                           NaN
6          last_eval                        -0.0081                       -0.0096
7   dept_product_mng                        -0.0115                           NaN
8            dept_it                        -0.0122                           NaN
9     dept_marketing                        -0.0126                           NaN
10   dept_management                        -0.0281                           NaN
11   dept_accounting                        -0.0535                           NaN
12        dept_randd                        -0.0648                           NaN
13         promotion                        -0.1910                       -0.1970
14            salary                        -0.3332                       -0.3367
15          accident                        -0.5551                       -0.5574
16    number_project                        -0.5710                       -0.5675
17      satisfaction                        -1.0622                       -1.0588
[Figure: side-by-side horizontal bar charts of feature importance for model_lr1 - AllFeat and model_lr2 - NoDept]
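
Because these "importances" are logistic-regression coefficients on standardised features, exponentiating them gives an odds ratio per one-standard-deviation increase in each feature, which can be easier to read. A short sketch, assuming feature_importance_1 from the cell above:

# Odds ratio per 1-SD increase in each (standardised) feature
fi = feature_importance_1.copy()
fi['Odds Ratio'] = np.exp(fi['Importance'])
print(fi)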

Precision / Recall Curve¶

In [136]:
y_prob1 = model_lr1.predict_proba(X_test)[:, 1]
y_prob2 = model_lr2.predict_proba(X_test2)[:, 1]

precision1, recall1, _ = precision_recall_curve(y_test, y_prob1)
precision2, recall2, _ = precision_recall_curve(y_test2, y_prob2)

# Compute area under the curve (AUC)
auc_score1 = auc(recall1, precision1)
auc_score2 = auc(recall2, precision2)

#plt.subplot(1, 2, 1)
plt.figure(figsize=(12, 5))
plt.plot(recall1, precision1, color='blue', label=f'{model_name1} - AUC = {auc_score1:.2f}')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title(f'Precision-Recall Curve - {model_name1}')
plt.legend(loc='best')
plt.axhline(y=0.5, color='red', linestyle='--', label='Random Guess')
plt.axhline(y=1, color='green', linestyle='--', label='Perfect')
plt.axvline(x=1, color='green', linestyle='--', label='Perfect')
plt.text(0.2, 0.52, 'Random Guess', color='red')
plt.text(0.4, 0.8, 'Better', color='black')
plt.text(0.4, 0.4, 'Worse', color='black')
plt.text(0.8, 1.01, 'Perfect', color='green')
#plt.savefig('plot-prc-curve1.png')
plt.show()

#plt.subplot(1, 2, 2)
plt.figure(figsize=(12, 5))
plt.plot(recall2, precision2, color='blue', label=f'{model_name2} - AUC = {auc_score2:.2f}')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title(f'Precision-Recall Curve - {model_name2}')
plt.legend(loc='best')
plt.axhline(y=0.5, color='red', linestyle='--', label='Random Guess')
plt.axhline(y=1, color='green', linestyle='--', label='Perfect')
plt.axvline(x=1, color='green', linestyle='--', label='Perfect')
plt.text(0.2, 0.52, 'Random Guess', color='red')
plt.text(0.4, 0.8, 'Better', color='black')
plt.text(0.4, 0.4, 'Worse', color='black')
plt.text(0.8, 1.01, 'Perfect', color='green')

#plt.tight_layout()
plt.savefig('plot-prc-curve2.png')  # save before show(); show() finalises the figure, so saving afterwards writes an empty canvas
plt.show()
[Figures: precision-recall curves for model_lr1 and model_lr2, with Random Guess and Perfect reference lines]

ROC Curve Plot¶

In [135]:
# Compute ROC curve
fpr1, tpr1, _ = roc_curve(y_test, y_prob1)
fpr2, tpr2, _ = roc_curve(y_test2, y_prob2)

# Compute area under the curve (AUC)
roc_auc1 = auc(fpr1, tpr1)
roc_auc2 = auc(fpr2, tpr2)


#plt.subplot(1, 2, 1)

plt.figure(figsize=(12, 5))
# Plot ROC curve

plt.plot(fpr1, tpr1, color='darkorange', lw=2, label=f'{model_name1} - AUC = {roc_auc1:.2f}')
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--', label='Random')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title(f'Receiver Operating Characteristic Curve (ROC)  - {model_name1}')
plt.legend(loc='best')
plt.axhline(y=1, color='green', linestyle='--', label='Perfect')
plt.axvline(x=0, color='green', linestyle='--', label='Perfect')
plt.text(0.4, 0.6, 'Better', color='black')
plt.text(0.4, 0.3, 'Worse', color='black')
plt.text(0.01, 1.01, 'Perfect', color='green')
plt.show()

#plt.subplot(1, 2,2)
plt.figure(figsize=(12, 5))
# Plot ROC curve
plt.plot(fpr2, tpr2, color='darkorange', lw=2, label=f'{model_name2} AUC = {roc_auc2:.2f}')
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--', label='Random')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title(f'Receiver Operating Characteristic (ROC) Curve - {model_name2}')
plt.legend(loc='best')
plt.axhline(y=1, color='green', linestyle='--', label='Perfect')
plt.axvline(x=0, color='green', linestyle='--', label='Perfect')
plt.text(0.4, 0.6, 'Better', color='black')
plt.text(0.4, 0.3, 'Worse', color='black')
plt.text(0.01, 1.01, 'Perfect', color='green')
plt.show()
[Figures: ROC curves for model_lr1 and model_lr2, with the random-guess diagonal for reference]
In [137]:
# Save model plot data

arrays = [['precision1',model_name1],
         ['recall1', model_name1],
         ['precision2',model_name2],
         ['recall2', model_name2],
         ['fpr1', model_name1],
         ['tpr1', model_name1],
         ['fpr2', model_name2],
         ['tpr2', model_name2],
        ]

variables = [['auc_score1', model_name1],
             ['auc_score2', model_name2],
             ['roc_auc1', model_name1],
             ['roc_auc2', model_name2]
             ]

# Save plot data scalars (PR AUC and ROC AUC) as single JSON values (written with a .csv extension for consistency with the array files)
for var_name, model in variables:
    #print(var_name, model)
    var = globals()[var_name]
    with open(f'99-documentation-project/08-plot_data/{model}-{var_name}.csv', 'w') as file:
        json.dump(var, file)

# Save plot data arrays (recall, precision, fpr, tpr)
for array_name, model in arrays:
    #print(array_name, model)
    var = globals()[array_name]
    df = pd.DataFrame({array_name: var})
    df.to_csv(f'99-documentation-project/08-plot_data/{model}-{array_name}.csv', index=False, header=False)
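
For reference, a minimal sketch of how a downstream notebook might read these files back, assuming the same paths and names:

# Sketch: reload a saved scalar (JSON) and a saved array (CSV)
with open(f'99-documentation-project/08-plot_data/{model_name1}-roc_auc1.csv') as file:
    roc_auc1_loaded = json.load(file)
fpr1_loaded = pd.read_csv(f'99-documentation-project/08-plot_data/{model_name1}-fpr1.csv', header=None)[0].values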

Conclusions¶

The precision-recall AUC is very low, making predictions of staff leaving unreliable: the model would be great at predicting who is going to stay, but that's not what the client is looking for.

While the ROC AUC at 0.88 looks OK, I think alternative models may perform better. One lever worth trying before switching models is the decision threshold, sketched below.
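
As a possible follow-up (a suggestion, not run here): LogisticRegression.predict uses a default 0.5 probability threshold, and lowering it trades precision for recall on the leaver class. A minimal sketch, assuming model_lr1, X_test, and y_test are still in scope (the threshold value is illustrative):

# Sketch: lower the decision threshold to catch more leavers
threshold = 0.3
y_prob = model_lr1.predict_proba(X_test)[:, 1]
y_pred_low = (y_prob >= threshold).astype(int)
print(classification_report(y_test, y_pred_low, target_names=['stay', 'leave']))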

Conclusion, dataset comparison - Logistic Regression.¶

There is almost no difference in performance between the two datasets (AllFeat vs NoDept) when modelled with Logistic Regression: the dept features that showed low correlation in the correlation matrix carried little weight in the final models, which differ little on the key metrics. However, the current Logistic Regression models are better at predicting who will stay than at the actual goal of predicting who will leave.