| Document Title | Salifort Motors Feature Engineering |
| --- | --- |
| Author | Rod Slater |
| Version | 1.0 |
| Created | 01-11-2023 |
| Modified | 16-11-2023 |
| Client Name | Salifort Motors |
| Client Contact | Mr HR Team |
| Client Email | hr@salifortmotors.it |
| Client Project | HR Team Data Driven Solutions from Machine Learning Models |
Feature engineering for the HR data provided by Salifort Motors. This notebook details the feature-engineering steps carried out on the cleaned data.
# Import packages
# Data manipulation
import numpy as np
import pandas as pd
# Data visualization
import matplotlib.pyplot as plt
import seaborn as sns
# Data modelling imports
from xgboost import XGBClassifier, XGBRegressor, plot_importance
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
# For metrics and helpful functions
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score,\
f1_score, confusion_matrix, ConfusionMatrixDisplay, classification_report
from sklearn import metrics
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.tree import plot_tree
from sklearn.preprocessing import StandardScaler
import statsmodels.api as sm
# For saving models
import pickle
# set Pandas Display Options
pd.options.display.float_format = '{:.2f}'.format
pd.set_option('display.precision', 2)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 120)
load_path = "/home/hass/Documents/Learning/Salifort-Motors-Capstone-Project/00-data_cleaned/"
save_path = "/home/hass/Documents/Learning/Salifort-Motors-Capstone-Project/00-data_cleaned/"
# Load cleaned dataset, with outliers removed, into a dataframe
df1 = pd.read_csv(load_path + "data_cleaned_NoOl_NoFE_AllFeat.csv")
df1.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11167 entries, 0 to 11166
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype
---  ------          --------------  -----
 0   satisfaction    11167 non-null  float64
 1   last_eval       11167 non-null  float64
 2   number_project  11167 non-null  float64
 3   avg_mnth_hrs    11167 non-null  float64
 4   tenure          11167 non-null  float64
 5   accident        11167 non-null  float64
 6   left            11167 non-null  float64
 7   promotion       11167 non-null  float64
 8   dept            11167 non-null  object
 9   salary          11167 non-null  object
dtypes: float64(8), object(2)
memory usage: 872.6+ KB
# Summary statistics for all columns, numeric and categorical
summary = df1.describe(include='all')
# Note: df1.info() prints its report and returns None, so it cannot be
# concatenated with the describe() output; call it separately instead
df1.info()
summary
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11167 entries, 0 to 11166
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype
---  ------          --------------  -----
 0   satisfaction    11167 non-null  float64
 1   last_eval       11167 non-null  float64
 2   number_project  11167 non-null  float64
 3   avg_mnth_hrs    11167 non-null  float64
 4   tenure          11167 non-null  float64
 5   accident        11167 non-null  float64
 6   left            11167 non-null  float64
 7   promotion       11167 non-null  float64
 8   dept            11167 non-null  object
 9   salary          11167 non-null  object
dtypes: float64(8), object(2)
memory usage: 872.6+ KB
|  | satisfaction | last_eval | number_project | avg_mnth_hrs | tenure | accident | left | promotion | dept | salary |
| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
| count | 11167.00 | 11167.00 | 11167.00 | 11167.00 | 11167.00 | 11167.00 | 11167.00 | 11167.00 | 11167 | 11167 |
| unique | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 10 | 3 |
| top | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | sales | low |
| freq | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 2997 | 5429 |
| mean | 0.63 | 0.72 | 3.79 | 200.14 | 3.11 | 0.15 | 0.17 | 0.01 | NaN | NaN |
| std | 0.24 | 0.17 | 1.16 | 48.56 | 0.90 | 0.36 | 0.37 | 0.12 | NaN | NaN |
| min | 0.09 | 0.36 | 2.00 | 96.00 | 2.00 | 0.00 | 0.00 | 0.00 | NaN | NaN |
| 25% | 0.49 | 0.57 | 3.00 | 157.00 | 2.00 | 0.00 | 0.00 | 0.00 | NaN | NaN |
| 50% | 0.66 | 0.72 | 4.00 | 199.00 | 3.00 | 0.00 | 0.00 | 0.00 | NaN | NaN |
| 75% | 0.82 | 0.86 | 5.00 | 243.00 | 4.00 | 0.00 | 0.00 | 0.00 | NaN | NaN |
| max | 1.00 | 1.00 | 7.00 | 310.00 | 5.00 | 1.00 | 1.00 | 1.00 | NaN | NaN |
df1.dtypes
satisfaction      float64
last_eval         float64
number_project    float64
avg_mnth_hrs      float64
tenure            float64
accident          float64
left              float64
promotion         float64
dept               object
salary             object
dtype: object
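Note that the binary flags (`accident`, `left`, `promotion`) load from the CSV as float64. That is harmless for the models used here, but if compact integer flags were preferred, a minimal sketch (not applied in this notebook) would be:

# Optional sketch, not applied here: the 0/1 flags round-trip through
# CSV as float64 and could be downcast to compact integers
flags_as_int = df1[['accident', 'left', 'promotion']].astype('int8')
print(flags_as_int.dtypes)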
df1.isna().sum()
satisfaction      0
last_eval         0
number_project    0
avg_mnth_hrs      0
tenure            0
accident          0
left              0
promotion         0
dept              0
salary            0
dtype: int64
df1.duplicated().sum()
0
print(df1['salary'].unique())
print(df1['dept'].unique())
['low' 'medium' 'high']
['sales' 'accounting' 'hr' 'technical' 'support' 'management' 'it' 'product_mng' 'marketing' 'randd']
# Encode salary as an ordinal numeric rather than with get_dummies.
# The data is ordinal and we want to preserve that; dummy encoding would
# add three additional dimensions, and we want to avoid high-dimensional models.
df1['salary'] = (
df1['salary'].astype('category')
.cat.set_categories(['low', 'medium', 'high'])
.cat.codes
)
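As a quick sanity check on the mapping (a minimal sketch; the counts should match the label frequencies in the summary above, e.g. `low` = 5429):

# Verify the ordinal codes: low -> 0, medium -> 1, high -> 2
print(df1['salary'].value_counts().sort_index())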
# One Hot Encode dept
df1 = pd.get_dummies(df1, columns = ['dept'])
df1.dtypes
satisfaction        float64
last_eval           float64
number_project      float64
avg_mnth_hrs        float64
tenure              float64
accident            float64
left                float64
promotion           float64
salary                 int8
dept_accounting        bool
dept_hr                bool
dept_it                bool
dept_management        bool
dept_marketing         bool
dept_product_mng       bool
dept_randd             bool
dept_sales             bool
dept_support           bool
dept_technical         bool
dtype: object
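If dimensionality were also a concern for `dept`, `pd.get_dummies` supports dropping one reference level per feature. A self-contained illustration (not applied to `df1`):

# Illustration only: drop_first=True keeps k-1 dummies for k levels,
# avoiding perfectly collinear columns (useful for linear models)
demo = pd.DataFrame({'dept': ['sales', 'hr', 'it']})
print(pd.get_dummies(demo, columns=['dept'], drop_first=True))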
## avg_mnth_hrs to binary overworked
See the EDA notes; the overworked threshold is set to 175 hours per month.
pd.options.display.float_format = '{:.2f}'.format
# Create `overworked` column. For now, it's identical to average monthly hours.
df1['overworked'] = df1['avg_mnth_hrs']
# Inspect mean, max and min average monthly hours values
print('Max hours :', df1['overworked'].max())
print('Min hours : ', df1['overworked'].min())
print('Mean hours :', round(df1['overworked'].mean(),2))
Max hours : 310.0
Min hours :  96.0
Mean hours : 200.14
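The 175-hour cutoff comes from the EDA notebook; while `avg_mnth_hrs` is still present, a minimal sketch to sanity-check it against attrition might look like:

# Compare attrition rates on either side of the assumed 175-hour threshold
above = df1.loc[df1['avg_mnth_hrs'] > 175, 'left'].mean()
below = df1.loc[df1['avg_mnth_hrs'] <= 175, 'left'].mean()
print(f'Attrition > 175 hrs: {above:.2%}; <= 175 hrs: {below:.2%}')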
# Define `overworked` as working > 175 hrs/month
df1['overworked'] = (df1['overworked'] > 175).astype(int)
# Display first few rows of new column
df1.head()
|  | satisfaction | last_eval | number_project | avg_mnth_hrs | tenure | accident | left | promotion | salary | dept_accounting | dept_hr | dept_it | dept_management | dept_marketing | dept_product_mng | dept_randd | dept_sales | dept_support | dept_technical | overworked |
| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
| 0 | 0.38 | 0.53 | 2.00 | 157.00 | 3.00 | 0.00 | 1.00 | 0.00 | 0 | False | False | False | False | False | False | False | True | False | False | 0 |
| 1 | 0.11 | 0.88 | 7.00 | 272.00 | 4.00 | 0.00 | 1.00 | 0.00 | 1 | False | False | False | False | False | False | False | True | False | False | 1 |
| 2 | 0.72 | 0.87 | 5.00 | 223.00 | 5.00 | 0.00 | 1.00 | 0.00 | 0 | False | False | False | False | False | False | False | True | False | False | 1 |
| 3 | 0.37 | 0.52 | 2.00 | 159.00 | 3.00 | 0.00 | 1.00 | 0.00 | 0 | False | False | False | False | False | False | False | True | False | False | 0 |
| 4 | 0.41 | 0.50 | 2.00 | 153.00 | 3.00 | 0.00 | 1.00 | 0.00 | 0 | False | False | False | False | False | False | False | True | False | False | 0 |
# Drop the `avg_mnth_hrs` column, now replaced by `overworked`
df1 = df1.drop('avg_mnth_hrs', axis=1)
# Display first few rows of resulting dataframe
df1.head()
|  | satisfaction | last_eval | number_project | tenure | accident | left | promotion | salary | dept_accounting | dept_hr | dept_it | dept_management | dept_marketing | dept_product_mng | dept_randd | dept_sales | dept_support | dept_technical | overworked |
| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
| 0 | 0.38 | 0.53 | 2.00 | 3.00 | 0.00 | 1.00 | 0.00 | 0 | False | False | False | False | False | False | False | True | False | False | 0 |
| 1 | 0.11 | 0.88 | 7.00 | 4.00 | 0.00 | 1.00 | 0.00 | 1 | False | False | False | False | False | False | False | True | False | False | 1 |
| 2 | 0.72 | 0.87 | 5.00 | 5.00 | 0.00 | 1.00 | 0.00 | 0 | False | False | False | False | False | False | False | True | False | False | 1 |
| 3 | 0.37 | 0.52 | 2.00 | 3.00 | 0.00 | 1.00 | 0.00 | 0 | False | False | False | False | False | False | False | True | False | False | 0 |
| 4 | 0.41 | 0.50 | 2.00 | 3.00 | 0.00 | 1.00 | 0.00 | 0 | False | False | False | False | False | False | False | True | False | False | 0 |
## Correlation Heatmap
df_corr = df1.corr() #.sort_values(by = 'left', ascending = False)
plt.figure(figsize = (12,12))
sns.heatmap(df_corr, annot=True, annot_kws={"size": 7}, fmt='.1%', cmap='YlOrRd')
(Figure: correlation heatmap of all features)
## Focus on features with higher correlation values
df_corr = df1[['satisfaction', 'last_eval', 'number_project', 'overworked',
'tenure', 'left', 'accident','promotion', 'salary']]
df_corr = df_corr.corr() #.sort_values(by = 'left', ascending = False)
plt.figure(figsize = (12,6))
sns.heatmap(df_corr, annot=True, annot_kws={"size": 8}, fmt='.2%', cmap='YlOrRd')
(Figure: correlation heatmap of the selected features)
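A complementary view is to rank features by their correlation with the target directly; a short sketch:

# Rank features by absolute correlation with `left`
corr_left = df1.corr(numeric_only=True)['left'].drop('left')
print(corr_left.reindex(corr_left.abs().sort_values(ascending=False).index))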
## accident
The accident rate seems very high compared to the national average (see EDA); this requires further investigation.
# Drop the `accident` column in place
df1.drop(['accident'], axis=1, inplace=True)
# Display the remaining columns
df1.columns
Index(['satisfaction', 'last_eval', 'number_project', 'tenure', 'left', 'promotion', 'salary', 'dept_accounting', 'dept_hr', 'dept_it', 'dept_management', 'dept_marketing', 'dept_product_mng', 'dept_randd', 'dept_sales', 'dept_support', 'dept_technical', 'overworked'], dtype='object')
# Required for the XGBoost modelling
df1.to_csv(save_path+'data_cleaned_NoOl_FE_AllFeat.csv', index=False) # Feature Engineered on avg_mnth_hrs, salary, dept
df1.columns
Index(['satisfaction', 'last_eval', 'number_project', 'tenure', 'left', 'promotion', 'salary', 'dept_accounting', 'dept_hr', 'dept_it', 'dept_management', 'dept_marketing', 'dept_product_mng', 'dept_randd', 'dept_sales', 'dept_support', 'dept_technical', 'overworked'], dtype='object')
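A quick round-trip check (a minimal sketch) confirms the saved file reloads with the expected shape:

# Reload the saved CSV and confirm it matches the in-memory frame
check = pd.read_csv(save_path + 'data_cleaned_NoOl_FE_AllFeat.csv')
assert check.shape == df1.shape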
Remove the department columns from the model data so we can compare how the models perform without them (`accident` has already been dropped above).
df2 = df1[['satisfaction', 'last_eval', 'number_project', 'overworked', 'tenure', 'left', 'promotion', 'salary']]
This excludes the department dummies, which appear to have low correlation with `left`.
# Feature Engineered on avg_mnth_hrs, salary.
# dept REMOVED
df2.to_csv(save_path + 'data_cleaned_NoOl_FE_NoDept.csv', index = False)
df2.columns
Index(['satisfaction', 'last_eval', 'number_project', 'overworked', 'tenure', 'left', 'promotion', 'salary'], dtype='object')
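Both CSVs now feed the modelling notebooks. As a sketch of the intended downstream use (the `test_size` and `random_state` here are placeholders; the modelling notebooks define the real split):

# Placeholder train/test split on the department-free dataset
df_model = pd.read_csv(save_path + 'data_cleaned_NoOl_FE_NoDept.csv')
X = df_model.drop('left', axis=1)
y = df_model['left']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, stratify=y, random_state=42)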