| Document Title | Salifort Motors Feature Engineering |
| --- | --- |
| Author | Rod Slater |
| Version | 1.0 |
| Created | 01-11-2023 |
| Modified | 16-11-2023 |
| Client Name | Salifort Motors |
| Client Contact | Mr HR Team |
| Client Email | hr@salifortmotors.it |
| Client Project | HR Team Data Driven Solutions from Machine Learning Models |
Feature engineering for the HR data provided by Salifort Motors. This notebook details the feature-engineering steps carried out on the cleaned data.
# Import packages
# Data manipulation
import numpy as np
import pandas as pd
# Data visualization
import matplotlib.pyplot as plt
import seaborn as sns
# Data modelling imports
from xgboost import XGBClassifier, XGBRegressor, plot_importance
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
# For metrics and helpful functions
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score,\
f1_score, confusion_matrix, ConfusionMatrixDisplay, classification_report
from sklearn import metrics
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.tree import plot_tree
from sklearn.preprocessing import StandardScaler
import statsmodels.api as sm
# For saving models
import pickle
# set Pandas Display Options
pd.options.display.float_format = '{:.2f}'.format
pd.set_option('display.precision', 2)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 120)
load_path = "/home/hass/Documents/Learning/Salifort-Motors-Capstone-Project/00-data_cleaned/"
save_path = "/home/hass/Documents/Learning/Salifort-Motors-Capstone-Project/00-data_cleaned/"
# Load cleaned dataset, with outliers removed, into a dataframe
df1 = pd.read_csv(load_path + "data_cleaned_NoOl_NoFE_AllFeat.csv")
df1.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11167 entries, 0 to 11166
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype
---  ------          --------------  -----
 0   satisfaction    11167 non-null  float64
 1   last_eval       11167 non-null  float64
 2   number_project  11167 non-null  float64
 3   avg_mnth_hrs    11167 non-null  float64
 4   tenure          11167 non-null  float64
 5   accident        11167 non-null  float64
 6   left            11167 non-null  float64
 7   promotion       11167 non-null  float64
 8   dept            11167 non-null  object
 9   salary          11167 non-null  object
dtypes: float64(8), object(2)
memory usage: 872.6+ KB
# Summary statistics for all columns, numeric and categorical
summary = df1.describe(include='all')
# Note: df1.info() prints its report and returns None, so it cannot be
# concatenated with the describe() output; call it separately instead
df1.info()
summary
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11167 entries, 0 to 11166
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype
---  ------          --------------  -----
 0   satisfaction    11167 non-null  float64
 1   last_eval       11167 non-null  float64
 2   number_project  11167 non-null  float64
 3   avg_mnth_hrs    11167 non-null  float64
 4   tenure          11167 non-null  float64
 5   accident        11167 non-null  float64
 6   left            11167 non-null  float64
 7   promotion       11167 non-null  float64
 8   dept            11167 non-null  object
 9   salary          11167 non-null  object
dtypes: float64(8), object(2)
memory usage: 872.6+ KB
|  | satisfaction | last_eval | number_project | avg_mnth_hrs | tenure | accident | left | promotion | dept | salary |
| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
| count | 11167.00 | 11167.00 | 11167.00 | 11167.00 | 11167.00 | 11167.00 | 11167.00 | 11167.00 | 11167 | 11167 |
| unique | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 10 | 3 |
| top | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | sales | low |
| freq | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 2997 | 5429 |
| mean | 0.63 | 0.72 | 3.79 | 200.14 | 3.11 | 0.15 | 0.17 | 0.01 | NaN | NaN |
| std | 0.24 | 0.17 | 1.16 | 48.56 | 0.90 | 0.36 | 0.37 | 0.12 | NaN | NaN |
| min | 0.09 | 0.36 | 2.00 | 96.00 | 2.00 | 0.00 | 0.00 | 0.00 | NaN | NaN |
| 25% | 0.49 | 0.57 | 3.00 | 157.00 | 2.00 | 0.00 | 0.00 | 0.00 | NaN | NaN |
| 50% | 0.66 | 0.72 | 4.00 | 199.00 | 3.00 | 0.00 | 0.00 | 0.00 | NaN | NaN |
| 75% | 0.82 | 0.86 | 5.00 | 243.00 | 4.00 | 0.00 | 0.00 | 0.00 | NaN | NaN |
| max | 1.00 | 1.00 | 7.00 | 310.00 | 5.00 | 1.00 | 1.00 | 1.00 | NaN | NaN |
df1.dtypes
satisfaction      float64
last_eval         float64
number_project    float64
avg_mnth_hrs      float64
tenure            float64
accident          float64
left              float64
promotion         float64
dept               object
salary             object
dtype: object
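Note that the binary flags (`accident`, `left`, `promotion`) load from the CSV as float64. That is harmless for the models used here, but if compact integer flags were preferred, a minimal sketch (not applied in this notebook) would be:

# Optional sketch, not applied here: the 0/1 flags round-trip through
# CSV as float64 and could be downcast to compact integers
flags_as_int = df1[['accident', 'left', 'promotion']].astype('int8')
print(flags_as_int.dtypes)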
df1.isna().sum()
satisfaction      0
last_eval         0
number_project    0
avg_mnth_hrs      0
tenure            0
accident          0
left              0
promotion         0
dept              0
salary            0
dtype: int64
df1.duplicated().sum()
0
print(df1['salary'].unique())
print(df1['dept'].unique())
['low' 'medium' 'high']
['sales' 'accounting' 'hr' 'technical' 'support' 'management' 'it' 'product_mng' 'marketing' 'randd']
# Encode salary as an ordinal numeric rather than with get_dummies.
# The data is ordinal and we want to preserve that; dummy encoding would
# add three additional dimensions, and we want to avoid high-dimensional models.
df1['salary'] = (
df1['salary'].astype('category')
.cat.set_categories(['low', 'medium', 'high'])
.cat.codes
)
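As a quick sanity check on the mapping (a minimal sketch; the counts should match the label frequencies in the summary above, e.g. `low` = 5429):

# Verify the ordinal codes: low -> 0, medium -> 1, high -> 2
print(df1['salary'].value_counts().sort_index())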
# One Hot Encode dept
df1 = pd.get_dummies(df1, columns = ['dept'])
df1.dtypes
satisfaction        float64
last_eval           float64
number_project      float64
avg_mnth_hrs        float64
tenure              float64
accident            float64
left                float64
promotion           float64
salary                 int8
dept_accounting        bool
dept_hr                bool
dept_it                bool
dept_management        bool
dept_marketing         bool
dept_product_mng       bool
dept_randd             bool
dept_sales             bool
dept_support           bool
dept_technical         bool
dtype: object
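If dimensionality were also a concern for `dept`, `pd.get_dummies` supports dropping one reference level per feature. A self-contained illustration (not applied to `df1`):

# Illustration only: drop_first=True keeps k-1 dummies for k levels,
# avoiding perfectly collinear columns (useful for linear models)
demo = pd.DataFrame({'dept': ['sales', 'hr', 'it']})
print(pd.get_dummies(demo, columns=['dept'], drop_first=True))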
## avg_mnth_hrs to binary overworked
See the EDA notes; the overworked threshold is set to 175 hours per month.
pd.options.display.float_format = '{:.2f}'.format
# Create `overworked` column. For now, it's identical to average monthly hours.
df1['overworked'] = df1['avg_mnth_hrs']
# Inspect mean, max and min average monthly hours values
print('Max hours :', df1['overworked'].max())
print('Min hours : ', df1['overworked'].min())
print('Mean hours :', round(df1['overworked'].mean(),2))
Max hours : 310.0
Min hours :  96.0
Mean hours : 200.14
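The 175-hour cutoff comes from the EDA notebook; while `avg_mnth_hrs` is still present, a minimal sketch to sanity-check it against attrition might look like:

# Compare attrition rates on either side of the assumed 175-hour threshold
above = df1.loc[df1['avg_mnth_hrs'] > 175, 'left'].mean()
below = df1.loc[df1['avg_mnth_hrs'] <= 175, 'left'].mean()
print(f'Attrition > 175 hrs: {above:.2%}; <= 175 hrs: {below:.2%}')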
# Define `overworked` as working > 175 hrs/month
df1['overworked'] = (df1['overworked'] > 175).astype(int)
# Display first few rows of new column
df1.head()
|  | satisfaction | last_eval | number_project | avg_mnth_hrs | tenure | accident | left | promotion | salary | dept_accounting | dept_hr | dept_it | dept_management | dept_marketing | dept_product_mng | dept_randd | dept_sales | dept_support | dept_technical | overworked |
| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
| 0 | 0.38 | 0.53 | 2.00 | 157.00 | 3.00 | 0.00 | 1.00 | 0.00 | 0 | False | False | False | False | False | False | False | True | False | False | 0 |
| 1 | 0.11 | 0.88 | 7.00 | 272.00 | 4.00 | 0.00 | 1.00 | 0.00 | 1 | False | False | False | False | False | False | False | True | False | False | 1 |
| 2 | 0.72 | 0.87 | 5.00 | 223.00 | 5.00 | 0.00 | 1.00 | 0.00 | 0 | False | False | False | False | False | False | False | True | False | False | 1 |
| 3 | 0.37 | 0.52 | 2.00 | 159.00 | 3.00 | 0.00 | 1.00 | 0.00 | 0 | False | False | False | False | False | False | False | True | False | False | 0 |
| 4 | 0.41 | 0.50 | 2.00 | 153.00 | 3.00 | 0.00 | 1.00 | 0.00 | 0 | False | False | False | False | False | False | False | True | False | False | 0 |
# Drop the `avg_mnth_hrs` column, now replaced by `overworked`
df1 = df1.drop('avg_mnth_hrs', axis=1)
# Display first few rows of resulting dataframe
df1.head()
|  | satisfaction | last_eval | number_project | tenure | accident | left | promotion | salary | dept_accounting | dept_hr | dept_it | dept_management | dept_marketing | dept_product_mng | dept_randd | dept_sales | dept_support | dept_technical | overworked |
| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
| 0 | 0.38 | 0.53 | 2.00 | 3.00 | 0.00 | 1.00 | 0.00 | 0 | False | False | False | False | False | False | False | True | False | False | 0 |
| 1 | 0.11 | 0.88 | 7.00 | 4.00 | 0.00 | 1.00 | 0.00 | 1 | False | False | False | False | False | False | False | True | False | False | 1 |
| 2 | 0.72 | 0.87 | 5.00 | 5.00 | 0.00 | 1.00 | 0.00 | 0 | False | False | False | False | False | False | False | True | False | False | 1 |
| 3 | 0.37 | 0.52 | 2.00 | 3.00 | 0.00 | 1.00 | 0.00 | 0 | False | False | False | False | False | False | False | True | False | False | 0 |
| 4 | 0.41 | 0.50 | 2.00 | 3.00 | 0.00 | 1.00 | 0.00 | 0 | False | False | False | False | False | False | False | True | False | False | 0 |
## Correlation Heatmap
df_corr = df1.corr() #.sort_values(by = 'left', ascending = False)
plt.figure(figsize = (12,12))
sns.heatmap(df_corr, annot=True, annot_kws={"size": 7}, fmt='.1%', cmap='YlOrRd')
(Figure: correlation heatmap of all features)
## Focus on features with higher correlation values
df_corr = df1[['satisfaction', 'last_eval', 'number_project', 'overworked',
'tenure', 'left', 'accident','promotion', 'salary']]
df_corr = df_corr.corr() #.sort_values(by = 'left', ascending = False)
plt.figure(figsize = (12,6))
sns.heatmap(df_corr, annot=True, annot_kws={"size": 8}, fmt='.2%', cmap='YlOrRd')
(Figure: correlation heatmap of the selected features)
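A complementary view is to rank features by their correlation with the target directly; a short sketch:

# Rank features by absolute correlation with `left`
corr_left = df1.corr(numeric_only=True)['left'].drop('left')
print(corr_left.reindex(corr_left.abs().sort_values(ascending=False).index))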
## accident
The accident rate seems very high compared to the national average (see EDA); this requires further investigation.
# Drop the `accident` column in place
df1.drop(['accident'], axis=1, inplace=True)
# Display the remaining columns
df1.columns
Index(['satisfaction', 'last_eval', 'number_project', 'tenure', 'left', 'promotion', 'salary', 'dept_accounting', 'dept_hr', 'dept_it', 'dept_management', 'dept_marketing', 'dept_product_mng', 'dept_randd', 'dept_sales', 'dept_support', 'dept_technical', 'overworked'], dtype='object')
# Required for the XGBoost modelling
df1.to_csv(save_path+'data_cleaned_NoOl_FE_AllFeat.csv', index=False) # Feature Engineered on avg_mnth_hrs, salary, dept
df1.columns
Index(['satisfaction', 'last_eval', 'number_project', 'tenure', 'left', 'promotion', 'salary', 'dept_accounting', 'dept_hr', 'dept_it', 'dept_management', 'dept_marketing', 'dept_product_mng', 'dept_randd', 'dept_sales', 'dept_support', 'dept_technical', 'overworked'], dtype='object')
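A quick round-trip check (a minimal sketch) confirms the saved file reloads with the expected shape:

# Reload the saved CSV and confirm it matches the in-memory frame
check = pd.read_csv(save_path + 'data_cleaned_NoOl_FE_AllFeat.csv')
assert check.shape == df1.shape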
Remove the department columns from the model data so we can compare how the models perform without them (`accident` has already been dropped above).
df2 = df1[['satisfaction', 'last_eval', 'number_project', 'overworked', 'tenure', 'left', 'promotion', 'salary']]
This excludes the department dummies, which appear to have low correlation with `left`.
# Feature Engineered on avg_mnth_hrs, salary.
# dept REMOVED
df2.to_csv(save_path + 'data_cleaned_NoOl_FE_NoDept.csv', index = False)
df2.columns
Index(['satisfaction', 'last_eval', 'number_project', 'overworked', 'tenure', 'left', 'promotion', 'salary'], dtype='object')
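Both CSVs now feed the modelling notebooks. As a sketch of the intended downstream use (the `test_size` and `random_state` here are placeholders; the modelling notebooks define the real split):

# Placeholder train/test split on the department-free dataset
df_model = pd.read_csv(save_path + 'data_cleaned_NoOl_FE_NoDept.csv')
X = df_model.drop('left', axis=1)
y = df_model['left']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, stratify=y, random_state=42)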