Document Title | Salifort Motors EDA Analysis |
Author | Rod Slater |
Version | 1.0 |
Created | 01-11-2023 |
Modified | 16-11-2023 |
Client Name | Salifort Motors |
Client Contact | Mr HR Team |
Client Email | hr@salifortmotors.it |
Client Project | HR Team Data Driven Solutions from Machine Learning Models |
EDA Data Analysis for HR Data provided by Salifort Motors. This notebook details the data and variable analysis of the provided data along with observations, conclusions and suggestions for features to include in the ML models.
last_eval
number_project
avg_mnth_hrs
tenure
satisfaction
accident
promotion
salary
dept
# Import packages
# Data manipulation
import numpy as np
import pandas as pd
# Data visualization
import matplotlib.pyplot as plt
import seaborn as sns
# Data modelling Imports
from xgboost import XGBClassifier, XGBRegressor, plot_importance
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
# For metrics and helpful functions
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score,\
f1_score, confusion_matrix, ConfusionMatrixDisplay, classification_report
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.tree import plot_tree
from scipy import stats
# For saving models
import pickle
# Add a little colour
class color:
PURPLE = '\033[95m'
CYAN = '\033[96m'
DARKCYAN = '\033[36m'
BLUE = '\033[94m'
GREEN = '\033[92m'
YELLOW = '\033[93m'
RED = '\033[91m'
BOLD = '\033[1m'
UNDERLINE = '\033[4m'
END = '\033[0m'
def correlation_single(dataframe: object, var: str):
"""Return a single row from the correlation plot.
in : dataframe: Object = The dataframe to EDA
var: String = variable to summarise
out : correlation heatmap plot
"""
df_corr = dataframe.corr().sort_values(by = var, axis = 1, ascending = False) # create a correlation dataframe
corr_index = df_corr.index.get_loc(var) +1 # Get the row with the data we want
plt.figure(figsize=(10, 1))
sns.heatmap(df_corr.iloc[corr_index-1:corr_index,:20], annot=True, annot_kws={"size": 10},
fmt='.0%', cmap='YlOrRd', cbar = False) # do plot 'magic'
return plt
def eda_single(dataframe: object, var: str, decplace:int):
"""Return a df summary of the EDA variable.
in : dataframe: Object = The dataframe to EDA
var: String = variable to summarise
decplace: int = Number of decimal place for pandas.options.display.float_format
out : dataframe summary of the variable
"""
float_option = '{:.' + str(decplace) + 'f}'
pd.options.display.float_format = float_option.format #2 decimal places, floating point
# Create separate DataFrames for 'left = 0' and 'left = 1'
left_0_df = dataframe[dataframe['left'] == 0]
left_1_df = dataframe[dataframe['left'] == 1]
# Calculate the count of rows within each category for both groups
left_0_count = left_0_df.groupby(left_0_df[var], observed=False).size().fillna(0)
left_1_count = left_1_df.groupby(left_1_df[var], observed=False).size().fillna(0)
# Calculate the total count for each category
total_count = left_0_count + left_1_count
# Calculate the percentage of the column total where 'left = 1' or 'left = 0' for each category
left_0_percentage = ((left_0_count / total_count) * 100)#.fillna(100)#.replace([np.inf, -np.inf], 100)
left_1_percentage = ((left_1_count / total_count) * 100)#.fillna(100).replace([np.inf, -np.inf], 100)
# Combine the results into a summary table
summary_table = pd.DataFrame({
'Stayed Count': left_0_count,
'Left Count': left_1_count,
'Total Count': left_0_count+left_1_count,
'Total Perc of Total' : (left_1_count+left_0_count)/(left_1_count+left_0_count).sum()*100,
'Stayed Perc of Stayed Total' : (left_0_count/left_0_count.sum()*100),
'Left Perc of Left Total' : (left_1_count/left_1_count.sum()*100),
'Stayed %': left_0_percentage,
'Left %': left_1_percentage
})
summary_table = summary_table.fillna(0)
summary_vars = ['Stayed Count','Left Count', 'Total Count', 'Total Perc of Total',
'Stayed Perc of Stayed Total', 'Left Perc of Left Total']# , 'Stayed %','Left %']
summary_table.loc['Total'] = summary_table[summary_vars].sum()
return summary_table
def eda_dual(dataframe: object, var1: str, var2: str, decplace: int):
"""Return a df summary of the EDA variable.
in : dataframe: Object = The dataframe to EDA
var1: String = variable to summarise
var2: String = second variable
decplace: int = Number of decimal place for pandas.options.display.float_format
out : binned dataframe summary of the variable
"""
float_option = '{:.' + str(decplace) + 'f}'
pd.options.display.float_format = float_option.format #2 decimal places, floating point
# Create separate DataFrames for 'left = 0' and 'left = 1'
left_0_df = dataframe[dataframe['left'] == 0]
left_1_df = dataframe[dataframe['left'] == 1]
# Calculate the count of rows within each category for both groups
left_0_count = left_0_df.groupby([left_0_df[var1], left_0_df[var2]], observed=False).size().fillna(0)
left_1_count = left_1_df.groupby([left_1_df[var1], left_1_df[var2]], observed=False).size().fillna(0)
# Calculate the total count for each category
total_count = left_0_count + left_1_count
# Calculate the percentage of the column total where 'left = 1' or 'left = 0' for each category
left_0_percentage = ((left_0_count / total_count) * 100)
left_1_percentage = ((left_1_count / total_count) * 100)
# Combine the results into a summary table
summary_table = pd.DataFrame({
'Stayed Count': left_0_count,
'Left Count': left_1_count,
'Total Count': left_0_count+left_1_count,
'Total Perc of Total' : (left_1_count+left_0_count)/(left_1_count+left_0_count).sum()*100,
'Stayed Perc of Stayed Total' : (left_0_count/left_0_count.sum()*100),
'Left Perc of Left Total' : (left_1_count/left_1_count.sum()*100),
'Stayed %': left_0_percentage,
'Left %': left_1_percentage
})
summary_table = summary_table.fillna(0)
summary_table.index.name = "dave"
summary_vars = ['Stayed Count','Left Count', 'Total Count', 'Total Perc of Total',
'Stayed Perc of Stayed Total', 'Left Perc of Left Total']# , 'Stayed %','Left %']
summary_table.loc['Total'] = summary_table[summary_vars].sum()
return summary_table
def eda_single_bins(dataframe: object, var: str, bins: list, labels: list, decplace: int):
"""Return a BINNED df summary of the EDA variable.
in : dataframe: Object = The dataframe to EDA
: var: String = variable to summarise
: bins: List = list object of the bin groupings e.g. [0, 0.33, 0.5, 0.7, 1]
: labels: List = list object of the bin labels e.g. ['low', 'medium', 'high']
: decplace: int = Number of decimal place for pandas.options.display.float_format
out : binned dataframe summary of the variable
"""
float_option = '{:.' + str(decplace) + 'f}' # construct option
pd.options.display.float_format = float_option.format
# Define the bins for categorising satisfaction scores, for EDA only.
# bins = [0, 1, 2, 3, 4, 5, 6, 7]
# labels = ['1', '2','3','4', '5', '6', '7']
# Create separate DataFrames for 'left = 0' and 'left = 1'
left_0_df = dataframe[dataframe['left'] == 0]
left_1_df = dataframe[dataframe['left'] == 1]
# Calculate the count of rows within each category for both groups
left_0_count = left_0_df.groupby(pd.cut(left_0_df[var], bins=bins, labels=labels),observed=False).size()
left_1_count = left_1_df.groupby(pd.cut(left_1_df[var], bins=bins, labels=labels),observed=False).size()
# Calculate the total count for each category
total_count = left_0_count + left_1_count
# Calculate the percentage of the column total where 'left = 1' or 'left = 0' for each category
left_0_percentage = (left_0_count / total_count) * 100
left_1_percentage = (left_1_count / total_count) * 100
# Combine the results into a summary table
summary_table = pd.DataFrame({
'Stayed Count': left_0_count,
'Left Count': left_1_count,
'Total Count': left_0_count+left_1_count,
'Total Perc of Total' : (left_1_count+left_0_count)/(left_1_count+left_0_count).sum()*100,
'Stayed Perc of Stayed Total' : (left_0_count/left_0_count.sum()*100),
'Left Perc of Left Total' : (left_1_count/left_1_count.sum()*100),
'Stayed %': left_0_percentage,
'Left %': left_1_percentage
})
summary_vars = ['Stayed Count','Left Count', 'Total Count', 'Total Perc of Total',
'Stayed Perc of Stayed Total', 'Left Perc of Left Total']# , 'Stayed %','Left %']
summary_table.loc['Total'] = summary_table[summary_vars].sum()
return summary_table
def quadplot(dataframe:object, var:str, annotate:bool):
"""Quad plot 2x2 histogram, heatmap, barplot, violinplot,
in : dataframe : Object = Dataframe to analyse
: var : Variable = variable to summarise
: annotate : Bool = Annotate the heatmap (if lots of data select False) True / False
out : 4 x plots of the variable.
"""
# Quad plot 2x2 histogram, heatmap, barplot, violinplot,
# Create a figure with two subplots (histogram and scatterplot)
fig, axes = plt.subplots(2, 2, figsize=(12, 12))
sns.set(style="darkgrid")
binary_labels=['stayed','left']
# Plot a histogram of the continuous variable
sns.histplot(data=dataframe, x=var, hue='left', bins=3,
element='step', common_norm=False, ax=axes[0,0], palette='colorblind')
#histplot.legend(title='left', labels=['left', 'stayed'])
axes[0,0].set_title(f'Histogram of {var} by left')
axes[0,0].set_xlabel(f'{var}')
axes[0,0].set_ylabel('Frequency')
axes[0,0].legend(loc='upper center', labels=['left', 'stayed'],
bbox_to_anchor=(0.5, 0.5), fancybox=True, shadow=True, ncol=2) # title='left',
# Create a pivot table for the heatmap
heatmap_data = dataframe.pivot_table(index='left', columns=var, aggfunc='size', fill_value=0)
sns.set(style="darkgrid")
# Create a heatmap
plt.figure(figsize=(8, 5))
sns.heatmap(heatmap_data, cmap='Oranges', annot=annotate, vmin=min(heatmap_data.min()),
vmax=max(heatmap_data.max()), fmt=".1f",
cbar_kws={'label': 'Values'} , cbar=False, ax=axes[0,1], yticklabels=binary_labels)
axes[0,1].set_title(f'Heatmap of {var} by left')
axes[0,1].set_xlabel(var)
# axes[0,1].set_ylabel('left')
# Create a scatterplot
plt.figure(figsize=(8, 5))
sns.set(style="darkgrid")
sns.boxplot(x='left', y=var, data=dataframe, hue='left', ax=axes[1, 0], palette='colorblind')
#boxplot.legend(title='left', labels=['left', 'stayed'])
axes[1, 0].set_title(f'Box Plot of {var} by left')
axes[1, 0].set_xlabel('left')
axes[1, 0].set_ylabel(var)
axes[1, 0].set_xticks([0,1],['stayed','left'])
# Create a violin plot
sns.set(style="darkgrid")
sns.violinplot(x='left', y=var, data=dataframe, hue='left', ax=axes[1, 1], palette='colorblind')
axes[1, 1].set_title(f'Violin Plot of {var} by left')
axes[1, 1].set_xlabel('left')
axes[1, 1].set_ylabel(f'{var}')
axes[1, 1].set_xticks([0,1],['stayed','left'])
# Adjust plot spacing
fig.subplots_adjust(hspace=0.3, wspace=0.25)
# Show the combined plot
return plt
#returned_plot = quadplot(df1, 'last_eval', False)
#returned_plot.show()
def scatplot_single(dataframe: object, x_var: str, y_var: str):
"""Create scatterplot of two variables,
in : dataframe : Object = Dataframe to analyse
: x_var : Str = First variable to plot on x-axis
: y_var : Str = Second variable to plot on y-axis
out : single scatterplot
"""
dataframe.sort_values(by='left', ascending = False)
sns.set(style="darkgrid")
# Create scatterplot of two variables
plt.figure(figsize=(10, 6))
sns.scatterplot(data=dataframe, x=x_var, y=y_var, hue='left', alpha=1, palette='colorblind')
plt.title(f'{x_var} by {y_var}', fontsize='14');
def scatplot_multi(dataframe: object, var1: str):
"""Create scatterplots of one variable against all variables in dataframe,
in : dataframe : Object = Dataframe to analyse
: var1: Variable = Variable to plot on y-axis
out : single scatterplot
"""
column_list = ['last_eval','number_project','avg_mnth_hrs','promotion','tenure','satisfaction', 'salary', 'accident']
column_list.remove(var1)
#column_list
df = dataframe.copy() # .sort_values(by='number_project', ascending=True)
# Calculate the number of subplots needed based on the number of continuous variables (6)
num_subplots = len(column_list)
# Create a subplot grid with the appropriate number of columns
num_columns = 2 # Adjust this based on layout preference
num_rows = (num_subplots + num_columns - 1) // num_columns # Calculate the number of rows
fig, axes = plt.subplots(num_rows, num_columns, figsize=(18, 18))#, sharey = True) # Set plot sizes
fig.subplots_adjust(hspace=0.5, wspace=0.2)
sns.set(style="darkgrid")
# Flatten the axes if there's only one row
if num_rows == 1:
axes = [axes]
# Loop through the continuous variables
for i, var in enumerate(column_list):
# Determine the current subplot's row and column
row = i // num_columns
col = i % num_columns # Modulus / get the remainder
# Create a histogram in the current subplot
sns.scatterplot(data=df, x=var, y = var1, hue='left', alpha=0.19, ax=axes[row][col], palette='colorblind')
#axes[row][col].set_title(f'Histogram for {var}')
axes[row][col].set_xlabel(var)
axes[row][col].set_ylabel(var1)
#axes[row][col].legend(loc ='upper center', bbox_to_anchor=(0.6,-0.5), fancybox=True, shadow=True, ncol = 2,
title='left', labels=['Yes', 'No'])
axes[row][col].tick_params(axis='x', labelrotation=45)
# Adjust spacing
plt.tight_layout()
# set Pandas Display Options
pd.options.display.float_format = '{:.2f}'.format
pd.set_option('display.precision', 2)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 120)
load_path = "/home/hass/Documents/Learning/Salifort-Motors-Capstone-Project/00-data_cleaned/"
save_path = "/home/hass/Documents/Learning/Salifort-Motors-Capstone-Project/00-data_cleaned/"
Begin by understanding how many employees left and what percentage of all employees this figure represents.
Data from :
Dataset has been cleaned :
data_cleaned_Ol_NoFE_AllFeat
- includes outliers at 11991 rowsdata_cleaned_NoOl_NoFE_AllFeat
- Outliers have been removed at 11167 rows this is mainly for the Logistical Regression model.# Load dataset into a dataframe
df1 = pd.read_csv(load_path + "data_cleaned_Ol_NoFE_AllFeat.csv") # 11991 rows, all data including outliers
#df1 = pd.read_csv(load_path + "data_cleaned_NoOl_NoFE_AllFeat.csv") # 11167 rows, no outliers (mostly from tenure)
# Display first few rows of the dataframe
df1.head()
satisfaction | last_eval | number_project | avg_mnth_hrs | tenure | accident | left | promotion | dept | salary | |
---|---|---|---|---|---|---|---|---|---|---|
0 | 0.38 | 0.53 | 2 | 157 | 3 | 0 | 1 | 0 | sales | low |
1 | 0.80 | 0.86 | 5 | 262 | 6 | 0 | 1 | 0 | sales | medium |
2 | 0.11 | 0.88 | 7 | 272 | 4 | 0 | 1 | 0 | sales | medium |
3 | 0.72 | 0.87 | 5 | 223 | 5 | 0 | 1 | 0 | sales | low |
4 | 0.37 | 0.52 | 2 | 159 | 3 | 0 | 1 | 0 | sales | low |
pd.options.display.float_format = '{:.2f}'.format
counts = df1['left'].value_counts()
perc = df1['left'].value_counts(normalize=True)*100
summary_table = pd.DataFrame({"Counts " : counts,
"Percentage " : perc})
summary_table
Counts | Percentage | |
---|---|---|
left | ||
0 | 10000 | 83.40 |
1 | 1991 | 16.60 |
# set Pandas Display Options
pd.options.display.float_format = '{:.2f}'.format
pd.set_option('display.precision', 2)
df1.describe(include='all')
satisfaction | last_eval | number_project | avg_mnth_hrs | tenure | accident | left | promotion | dept | salary | |
---|---|---|---|---|---|---|---|---|---|---|
count | 11991.00 | 11991.00 | 11991.00 | 11991.00 | 11991.00 | 11991.00 | 11991.00 | 11991.00 | 11991 | 11991 |
unique | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 10 | 3 |
top | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | sales | low |
freq | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 3239 | 5740 |
mean | 0.63 | 0.72 | 3.80 | 200.47 | 3.36 | 0.15 | 0.17 | 0.02 | NaN | NaN |
std | 0.24 | 0.17 | 1.16 | 48.73 | 1.33 | 0.36 | 0.37 | 0.13 | NaN | NaN |
min | 0.09 | 0.36 | 2.00 | 96.00 | 2.00 | 0.00 | 0.00 | 0.00 | NaN | NaN |
25% | 0.48 | 0.57 | 3.00 | 157.00 | 3.00 | 0.00 | 0.00 | 0.00 | NaN | NaN |
50% | 0.66 | 0.72 | 4.00 | 200.00 | 3.00 | 0.00 | 0.00 | 0.00 | NaN | NaN |
75% | 0.82 | 0.86 | 5.00 | 243.00 | 4.00 | 0.00 | 0.00 | 0.00 | NaN | NaN |
max | 1.00 | 1.00 | 7.00 | 310.00 | 10.00 | 1.00 | 1.00 | 1.00 | NaN | NaN |
salary
and dept
are categorical and will need encoding.
df1.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 11991 entries, 0 to 11990 Data columns (total 10 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 satisfaction 11991 non-null float64 1 last_eval 11991 non-null float64 2 number_project 11991 non-null int64 3 avg_mnth_hrs 11991 non-null int64 4 tenure 11991 non-null int64 5 accident 11991 non-null int64 6 left 11991 non-null int64 7 promotion 11991 non-null int64 8 dept 11991 non-null object 9 salary 11991 non-null object dtypes: float64(2), int64(6), object(2) memory usage: 936.9+ KB
df1.columns
Index(['satisfaction', 'last_eval', 'number_project', 'avg_mnth_hrs', 'tenure', 'accident', 'left', 'promotion', 'dept', 'salary'], dtype='object')
# Copy df1 to model_data as eda dataframe
# copy df1 to original_data - has dept in columns, required for some plots and analysis
original_data = df1.copy()
# copy df1 to full_data which is Feature Engineered. We'll mostly use to prepare data for plots and calculations.
full_data = df1.copy()
# focus_data = full_data with dept column removed, created in next cell
salary
& dept
¶df1
to model_data
salary
as ordinal categorydept
Use :
full_data
dataframe with all features and feature engineeringfocus_data
full_data
dataframe with dept encoding removedoriginal_data
original data, no feature engineering# Set ordinal encoding for salary low, medium, high
# we'll get 0, 1, 2 in the salary field as an ordinal representation.
full_data['salary'] = (
full_data['salary'].astype('category')
.cat.set_categories(['low', 'medium', 'high'])
.cat.codes
)
# One Hot Encode dept, we'll get a new binary field for each dept
full_data = pd.get_dummies(full_data, columns = ['dept'])
# Duplicate dataframe but only focus features (excludes dept)
focus_data = full_data.loc[:,['satisfaction', 'last_eval', 'number_project', 'avg_mnth_hrs', 'tenure', 'accident',
'left', 'promotion','salary']]
We get higher correlation without the employee department info.
Project Disclaimer : Although I expect the models to filter these out anyway,
for self education purposes I'll model two datasets, one just with feature engineering and outliers removed,
and one with the same data but with the employee department removed in order to compare performance.
It's also interesting to see the satisfaction
correlation being as low as it is, as we'll see later this is one of the most important features to consider.
pd.options.display.float_format = '{:.4%}'.format
## All Features get correlations
df_corr = full_data.corr().mean().sort_values(ascending=False)
# Focus Features select data features
focus_corr = focus_data.corr().mean().sort_values(ascending=False)
# Focus Features select data features for left = 1
left_focus_corr = focus_data[focus_data['left'] == 1].corr().mean().sort_values(ascending=False)
result_df = pd.concat([df_corr, focus_corr, left_focus_corr], axis=1)
# rename columns to something meaningful
result_df.columns=['all features', 'focus features', 'left = 1 focus']
result_df
all features | focus features | left = 1 focus | |
---|---|---|---|
avg_mnth_hrs | 9.1727% | 19.3872% | 40.6857% |
last_eval | 9.0568% | 19.0199% | 44.4214% |
number_project | 8.8130% | 18.6848% | 37.6385% |
tenure | 8.1096% | 16.7102% | 43.2059% |
promotion | 6.2680% | 12.6328% | 11.6538% |
salary | 5.6554% | 11.3276% | 12.6628% |
accident | 4.8375% | 10.2237% | 12.9542% |
dept_management | 3.7989% | NaN | NaN |
left | 3.3160% | 7.2069% | NaN |
satisfaction | 2.8039% | 6.0132% | 16.5350% |
dept_marketing | 1.7520% | NaN | NaN |
dept_accounting | 1.6314% | NaN | NaN |
dept_randd | 1.4386% | NaN | NaN |
dept_hr | 1.3126% | NaN | NaN |
dept_product_mng | 1.1011% | NaN | NaN |
dept_it | 0.4065% | NaN | NaN |
dept_support | -0.9650% | NaN | NaN |
dept_technical | -1.7958% | NaN | NaN |
dept_sales | -3.1942% | NaN | NaN |
Visualise relationships between variables
%%time
sns.set(style="darkgrid")
#sns.pairplot(data=focus_data, hue='left', palette='colorblind')
# Using correlations to focus on key features
CPU times: user 1.88 ms, sys: 0 ns, total: 1.88 ms Wall time: 1.9 ms
Correlations
The following variables appear to have some significant correlation to each other and will be investigated further
avg_mnth_hrs
number_project
satisfaction
promotion
tenure
last_eval
Calculate Pearson Correlation Coefficients for complete set of fields. The Pearson correlation coefficient ranges from -1 (perfect negative correlation) to 1 (perfect positive correlation), with 0 indicating no linear correlation.
full_data
details all fields.
focus_data
details fields with higher correlation
## Focus on features with higher correlation values, remove encoded dept:
df_corr = full_data.corr().sort_values(by = 'left', ascending = False)
sns.set(style="darkgrid")
plt.figure(figsize=(10, 10))
sns.heatmap(df_corr, annot=True, annot_kws={"size": 8}, fmt='.0%', cmap='YlOrRd', cbar = False)
plt.show()
Drilling down to the features with higher correlations noted in the top left of the above correlation plot for the full dataset
The higher correlation means of the following features will need further investigation:
## Focus on features with higher correlation values, remove encoded dept:
df_corr = focus_data.corr().sort_values(by = 'left', ascending = False)
#df_corr.drop('left', axis = 1, inplace = True)
sns.heatmap(df_corr, annot=True, annot_kws={"size": 8}, fmt='.0%', cmap='YlOrRd')
plt.title(f'Correlation Score for All employees')
plt.show()
df = focus_data.copy() # Start with fresh copy
df = df.loc[(df['left'] == 1)] # Filter on left = 1
df = df.drop('left', axis=1) # drop the left column
df_corr = df.corr()
plt.figure(figsize = (6,6))
sns.heatmap(df_corr, annot=True, annot_kws={"size": 8}, fmt='.0%', cmap='YlOrRd')
plt.title('Correlation Score for employees who left')
plt.show()
Plot visualisations and detail analysis for each relevant feature of the dataset.
# Multi plot for variables.
continuous_vars = ['last_eval','number_project','avg_mnth_hrs','promotion','tenure',
'satisfaction', 'salary', 'accident']
df = full_data # .sort_values(by='number_project', ascending=True)
# Calculate the number of subplots needed based on the number of continuous variables (6)
num_subplots = len(continuous_vars)
# Create a subplot grid with the appropriate number of columns
num_columns = 2 # Adjust this based on layout preference
num_rows = (num_subplots + num_columns - 1) // num_columns # Calculate the number of rows
fig, axes = plt.subplots(num_rows, num_columns, figsize=(16, 16)) # Set plot sizes
sns.set(style="darkgrid")
# Flatten the axes if there's only one row
if num_rows == 1:
axes = [axes]
# Loop through the continuous variables
for i, var in enumerate(continuous_vars):
# Determine the current subplot's row and column
row = i // num_columns
col = i % num_columns # Modulus / get the remainder
# Create a histogram in the current subplot
sns.histplot(data=df, x=var, hue='left', bins=10, element='step', common_norm=False, ax=axes[row][col],
palette='colorblind')
axes[row][col].set_title(f'Histogram for {var}')
axes[row][col].set_xlabel(var)
axes[row][col].set_ylabel('Frequency')
axes[row][col].legend(title='left', labels=['Yes', 'No'])
axes[row][col].tick_params(axis='x', labelrotation=45)
# Remove any empty subplots
for i in range(num_subplots, num_rows * num_columns):
fig.delaxes(axes[i])
# Adjust spacing
plt.tight_layout()
# Display the combined plots
plt.show()
# Initialise Analysis Variables and Pandas Display options
var = 'last_eval' # Feature used for plot and detail
pd.options.display.float_format = '{:.2f}'.format #2 decimal places, floating point
full_data[var].describe()
count 11991.00 mean 0.72 std 0.17 min 0.36 25% 0.57 50% 0.72 75% 0.86 max 1.00 Name: last_eval, dtype: float64
last_eval
¶Data was grouped into bins to simplify EDA Analysis, lowest score was 0.36
last_eval
- Include¶Include `last_eval` in the model dataset
last_eval
¶Those who left fall generally into one of two groups either low scoring or high scoring with 28% of low scoring employees leaving and 17% of high scoring employees leaving. 46% of employees who left had a low or medium score, there's also a strong correlation to hours worked as seen on the pair plot and correlation plot, where those who left also tend to work more hours.
satisfaction
, number_project
, avg_mnth_hrs
and tenure
last_eval
score vs 10% of total employees who stayedlast_eval
score vs 38% of total employees who stayedlast_eval
score vs 51% of total employees who stayed# Calculate mean and median satisfaction scores of employees who left and those who stayed
full_data.groupby('left')['last_eval'].agg(["mean","median"])
mean | median | |
---|---|---|
left | ||
0 | 0.72 | 0.71 |
1 | 0.72 | 0.79 |
result = eda_single_bins(full_data,var,
[0.0, 0.5, 0.7, 1.0],
['low : 0.00 - 0.50',
'medium : 0.51-0.69',
'high : 0.70 - 1.00'],0)
result
Stayed Count | Left Count | Total Count | Total Perc of Total | Stayed Perc of Stayed Total | Left Perc of Left Total | Stayed % | Left % | |
---|---|---|---|---|---|---|---|---|
last_eval | ||||||||
low : 0.00 - 0.50 | 1032 | 398 | 1430 | 12 | 10 | 20 | 72 | 28 |
medium : 0.51-0.69 | 3805 | 512 | 4317 | 36 | 38 | 26 | 88 | 12 |
high : 0.70 - 1.00 | 5163 | 1081 | 6244 | 52 | 52 | 54 | 83 | 17 |
Total | 10000 | 1991 | 11991 | 100 | 100 | 100 | NaN | NaN |
%%time
df = full_data[['last_eval', 'satisfaction', 'avg_mnth_hrs', 'left', 'number_project', 'tenure']]
sns.set(style="darkgrid")
#sns.pairplot(data=df, hue = 'left', palette='colorblind')
CPU times: user 1.6 ms, sys: 228 µs, total: 1.83 ms Wall time: 1.56 ms
df = full_data[['last_eval', 'dept_support', 'avg_mnth_hrs', 'left', 'number_project']]
scatplot_single(df, 'last_eval', 'avg_mnth_hrs')
scatplot_single(full_data, 'last_eval', 'satisfaction')
return_plot = correlation_single(full_data, var)
returned_plot = quadplot(full_data, var, False)
plt.show()
<Figure size 800x500 with 0 Axes>
<Figure size 800x500 with 0 Axes>
# Initialise Analysis Variables and Pandas Display options
var = 'number_project' # Feature used for plot and detail
pd.options.display.float_format = '{:.2f}'.format #2 decimal places, floating point
full_data[var].describe(include='all')
count 11991.00 mean 3.80 std 1.16 min 2.00 25% 3.00 50% 4.00 75% 5.00 max 7.00 Name: number_project, dtype: float64
number_project
- Include¶Include `number_project` in the model dataset
number_project
¶Caveats - The number of projects is a simple metric that does not take into account either the complexity or duration of the project. A member of staff could only have worked on one project, but the complexity and duration of the project would not be taken into account with the available data.
People who left are mostly in the low and medium salary ranges
People who left are estimated to cost the business £42M (assuming 30% cost of annual salary to replace)
45% of employees who worked on 2 or less projects left
100% of employees who worked on seven projects left the company
26% of employees who worked on 5 or more projects left the company
50% of employees who left, worked on either 2 or less or more than 5 projects
Highest retention was with employees who worked on between 3-5 projects where 95% of the staff were retained.
Lowest retention was with employees who worked on 2 projects, 46% of the staff were retained or 6+ projects where 3% of the staff were retained.
# Calculate mean and median satisfaction scores of employees who left and those who stayed
full_data.groupby(['left'], observed=True)['number_project'].agg(["mean","median"])
mean | median | |
---|---|---|
left | ||
0 | 3.79 | 4.00 |
1 | 3.88 | 4.00 |
eda_single(full_data,var,2)
Stayed Count | Left Count | Total Count | Total Perc of Total | Stayed Perc of Stayed Total | Left Perc of Left Total | Stayed % | Left % | |
---|---|---|---|---|---|---|---|---|
number_project | ||||||||
2 | 725.00 | 857.00 | 1582.00 | 13.35 | 7.25 | 43.04 | 45.83 | 54.17 |
3 | 3482.00 | 38.00 | 3520.00 | 29.71 | 34.82 | 1.91 | 98.92 | 1.08 |
4 | 3448.00 | 237.00 | 3685.00 | 31.11 | 34.48 | 11.90 | 93.57 | 6.43 |
5 | 1890.00 | 343.00 | 2233.00 | 18.85 | 18.90 | 17.23 | 84.64 | 15.36 |
6 | 455.00 | 371.00 | 826.00 | 6.97 | 4.55 | 18.63 | 55.08 | 44.92 |
7 | 0.00 | 145.00 | 0.00 | 0.00 | 0.00 | 7.28 | 0.00 | 0.00 |
Total | 10000.00 | 1991.00 | 11846.00 | 100.00 | 100.00 | 100.00 | NaN | NaN |
eda_single_bins(full_data,var, [0, 3,5,7],
['low (<3)','normal (3-5)','high (>5)'],2)
Stayed Count | Left Count | Total Count | Total Perc of Total | Stayed Perc of Stayed Total | Left Perc of Left Total | Stayed % | Left % | |
---|---|---|---|---|---|---|---|---|
number_project | ||||||||
low (<3) | 4207.00 | 895.00 | 5102.00 | 42.55 | 42.07 | 44.95 | 82.46 | 17.54 |
normal (3-5) | 5338.00 | 580.00 | 5918.00 | 49.35 | 53.38 | 29.13 | 90.20 | 9.80 |
high (>5) | 455.00 | 516.00 | 971.00 | 8.10 | 4.55 | 25.92 | 46.86 | 53.14 |
Total | 10000.00 | 1991.00 | 11991.00 | 100.00 | 100.00 | 100.00 | NaN | NaN |
return_plot = correlation_single(full_data, var)
scatplot_single(full_data, 'number_project','avg_mnth_hrs')
returned_plot = quadplot(full_data, var, True)
returned_plot.show()
<Figure size 800x500 with 0 Axes>
<Figure size 800x500 with 0 Axes>
scatplot_single(focus_data, 'last_eval', 'number_project')
ret = eda_dual(focus_data, 'number_project', 'last_eval', 2)
ret.sort_values(by='Left Perc of Left Total', ascending=False)
Stayed Count | Left Count | Total Count | Total Perc of Total | Stayed Perc of Stayed Total | Left Perc of Left Total | Stayed % | Left % | |
---|---|---|---|---|---|---|---|---|
Total | 10000.00 | 1991.00 | 7598.00 | 100.00 | 100.00 | 100.00 | NaN | NaN |
(2, 0.46) | 12.00 | 85.00 | 97.00 | 1.28 | 0.12 | 4.27 | 12.37 | 87.63 |
(2, 0.48) | 12.00 | 77.00 | 89.00 | 1.17 | 0.12 | 3.87 | 13.48 | 86.52 |
(2, 0.51) | 23.00 | 71.00 | 94.00 | 1.24 | 0.23 | 3.57 | 24.47 | 75.53 |
(2, 0.54) | 11.00 | 69.00 | 80.00 | 1.05 | 0.11 | 3.47 | 13.75 | 86.25 |
... | ... | ... | ... | ... | ... | ... | ... | ... |
(5, 0.52) | 34.00 | 0.00 | 0.00 | 0.00 | 0.34 | 0.00 | 0.00 | 0.00 |
(5, 0.53) | 26.00 | 0.00 | 0.00 | 0.00 | 0.26 | 0.00 | 0.00 | 0.00 |
(5, 0.55) | 46.00 | 0.00 | 0.00 | 0.00 | 0.46 | 0.00 | 0.00 | 0.00 |
(5, 0.56) | 40.00 | 0.00 | 0.00 | 0.00 | 0.40 | 0.00 | 0.00 | 0.00 |
(2, 0.36) | 5.00 | 0.00 | 0.00 | 0.00 | 0.05 | 0.00 | 0.00 | 0.00 |
356 rows × 8 columns
%%time
# Create a plot for hours worked and number of projects
plot_df=full_data[['avg_mnth_hrs', 'number_project', 'left' ]]
sns.set(style="darkgrid")
sns.pairplot(data = plot_df, hue = 'left')
#display the graphs
plt.show()
CPU times: user 3.34 s, sys: 592 ms, total: 3.93 s Wall time: 3.32 s
# Initialise Analysis Variables and Pandas Display options
var = 'avg_mnth_hrs' # Feature used for plot and detail
pd.options.display.float_format = '{:.2f}'.format #2 decimal places, floating point
full_data[var].describe()
count 11991.00 mean 200.47 std 48.73 min 96.00 25% 157.00 50% 200.00 75% 243.00 max 310.00 Name: avg_mnth_hrs, dtype: float64
avg_mnth_hrs
¶Data was grouped into bins to simplify analysis
Label | Hrs/day |
---|---|
Low 0-150 | 0-7 hrs/day |
Std 151-200 | 7-9 hrs/day |
High 201-251 | 9-12 hrs/day |
V High 251-300 | 14-16 hrs/day |
Extreme 301-350+ | 16+ hrs/day |
avg_mnth_hrs
¶Average hours per day assuming 21.25 days per month
Average Monthly Hours | 150 | 200 | 250 | 300 | 350 |
---|---|---|---|---|---|
Avg Hours Day (ass. 21.25 days/month) | 7 | 9 | 12 | 14 | 16 |
# Calculate mean and median satisfaction scores of employees who left and those who stayed
full_data.groupby(['left'])['avg_mnth_hrs'].agg(["mean","median"])
mean | median | |
---|---|---|
left | ||
0 | 198.94 | 198.00 |
1 | 208.16 | 226.00 |
avg_hours = full_data.groupby('left')['avg_mnth_hrs'].mean()
perc_diff = ((avg_hours[1] - avg_hours[0]) / avg_hours[0]) #* 100
annual_diff = (avg_hours[1] - avg_hours[0]) * 12
# Display the result
print("Average Monthly Hours Comparison")
print()
print("Left Mean Mnthly Hrs : {:.2f}".format(avg_hours[1]))
print("Stayed Mean Mnthly Hrs : {:.2f}".format(avg_hours[0]))
print("Percentage Difference : {:.2f}%".format(perc_diff))
print("Annual Hrs Difference : {:.2f} hours".format(annual_diff))
print()
print("Employees who left worked {:.2f} more hours per year than retained staff".format(annual_diff))
total_hours = full_data.groupby(['salary', 'left'])['avg_mnth_hrs'].mean()
# Reset the index to make the result a DataFrame
total_hours = total_hours.reset_index()
# Calculate the total difference between retained and not retained for each salary group
total_hours['total_difference'] = total_hours.groupby('salary')['avg_mnth_hrs'].diff().fillna(0)
# Display the result
total_hours
Average Monthly Hours Comparison Left Mean Mnthly Hrs : 208.16 Stayed Mean Mnthly Hrs : 198.94 Percentage Difference : 0.05% Annual Hrs Difference : 110.63 hours Employees who left worked 110.63 more hours per year than retained staff
salary | left | avg_mnth_hrs | total_difference | |
---|---|---|---|---|
0 | 0 | 0 | 198.67 | 0.00 |
1 | 0 | 1 | 207.32 | 8.66 |
2 | 1 | 0 | 199.02 | 0.00 |
3 | 1 | 1 | 209.76 | 10.75 |
4 | 2 | 0 | 199.92 | 0.00 |
5 | 2 | 1 | 202.98 | 3.06 |
eda_single_bins(full_data, var, [0, 175, 200, 250, 300, 350],
[' Normal 0-175 : 0-8 hrs/day',
' 176-200 : 8-9 hrs/day',
' High 201-251 : 9-12 hrs/day',
' V High 251-300 : 14-16 hrs/day',
'Extreme 301-350 : 16+ hrs/day'],1)
Stayed Count | Left Count | Total Count | Total Perc of Total | Stayed Perc of Stayed Total | Left Perc of Left Total | Stayed % | Left % | |
---|---|---|---|---|---|---|---|---|
avg_mnth_hrs | ||||||||
Normal 0-175 : 0-8 hrs/day | 3471.0 | 890.0 | 4361.0 | 36.4 | 34.7 | 44.7 | 79.6 | 20.4 |
176-200 : 8-9 hrs/day | 1664.0 | 20.0 | 1684.0 | 14.0 | 16.6 | 1.0 | 98.8 | 1.2 |
High 201-251 : 9-12 hrs/day | 3160.0 | 376.0 | 3536.0 | 29.5 | 31.6 | 18.9 | 89.4 | 10.6 |
V High 251-300 : 14-16 hrs/day | 1705.0 | 613.0 | 2318.0 | 19.3 | 17.1 | 30.8 | 73.6 | 26.4 |
Extreme 301-350 : 16+ hrs/day | 0.0 | 92.0 | 92.0 | 0.8 | 0.0 | 4.6 | 0.0 | 100.0 |
Total | 10000.0 | 1991.0 | 11991.0 | 100.0 | 100.0 | 100.0 | NaN | NaN |
avg_mnth_hrs
- Include & Encode¶Include `avg_mnth_hrs` in the model dataset
Feature Engineer `avg_mnth_hrs' to a binary result called `overwork` where avg_mnth_hrs > 175 = overworked
returned_plot = correlation_single(full_data, var)
returned_plot = quadplot(full_data, var, False) # df, variable, annotate
<Figure size 800x500 with 0 Axes>
<Figure size 800x500 with 0 Axes>
# Create a violin plot
violinplot = sns.violinplot(x='left', y=var, data=full_data, hue='left', palette='colorblind')
violinplot.set_title(f'Violin Plot of {var} by left')
violinplot.axhline(y=177, color='red', linestyle='--', label='Threshold at 177')
violinplot.set_xlabel('left')
violinplot.set_ylabel(f'{var}')
Text(0, 0.5, 'avg_mnth_hrs')
# Initialise Analysis Variables and Pandas Display options
var = 'tenure' # Feature used for plot and detail
pd.options.display.float_format = '{:.2f}'.format #2 decimal places, floating point
print(full_data[var].describe())
count 11991.00 mean 3.36 std 1.33 min 2.00 25% 3.00 50% 3.00 75% 4.00 max 10.00 Name: tenure, dtype: float64
tenure
- include / Remove Outliers¶Include `tenure` in the model dataset
Consider removing outliers identified during data cleaning ops.
tenure
¶# Calculate mean and median satisfaction scores of employees who left and those who stayed
full_data.groupby(['left'])['tenure'].agg(["mean","median"])
mean | median | |
---|---|---|
left | ||
0 | 3.26 | 3.00 |
1 | 3.88 | 4.00 |
eda_single(full_data, var,1)
Stayed Count | Left Count | Total Count | Total Perc of Total | Stayed Perc of Stayed Total | Left Perc of Left Total | Stayed % | Left % | |
---|---|---|---|---|---|---|---|---|
tenure | ||||||||
2 | 2879.0 | 31.0 | 2910.0 | 24.9 | 28.8 | 1.6 | 98.9 | 1.1 |
3 | 4316.0 | 874.0 | 5190.0 | 44.3 | 43.2 | 43.9 | 83.2 | 16.8 |
4 | 1510.0 | 495.0 | 2005.0 | 17.1 | 15.1 | 24.9 | 75.3 | 24.7 |
5 | 580.0 | 482.0 | 1062.0 | 9.1 | 5.8 | 24.2 | 54.6 | 45.4 |
6 | 433.0 | 109.0 | 542.0 | 4.6 | 4.3 | 5.5 | 79.9 | 20.1 |
7 | 94.0 | 0.0 | 0.0 | 0.0 | 0.9 | 0.0 | 0.0 | 0.0 |
8 | 81.0 | 0.0 | 0.0 | 0.0 | 0.8 | 0.0 | 0.0 | 0.0 |
10 | 107.0 | 0.0 | 0.0 | 0.0 | 1.1 | 0.0 | 0.0 | 0.0 |
Total | 10000.0 | 1991.0 | 11709.0 | 100.0 | 100.0 | 100.0 | NaN | NaN |
eda_single_bins(full_data, 'tenure', [0,4,12],['<5 years','>5 years'],0)
Stayed Count | Left Count | Total Count | Total Perc of Total | Stayed Perc of Stayed Total | Left Perc of Left Total | Stayed % | Left % | |
---|---|---|---|---|---|---|---|---|
tenure | ||||||||
<5 years | 8705 | 1400 | 10105 | 84 | 87 | 70 | 86 | 14 |
>5 years | 1295 | 591 | 1886 | 16 | 13 | 30 | 69 | 31 |
Total | 10000 | 1991 | 11991 | 100 | 100 | 100 | NaN | NaN |
We are making assumptions here!
We would need some additional information in the dataset to identify the start year for each employee and which department managers belong to.
In management circles, it is common knowledge that the ideal number of direct subordinates a manager should have is 7±2 that is around 8 employees,plus or minus 2.
Currently within Salifort Motors, there are 9616 current employees (not management) and 384 Managers (not employees). This puts the employee manager ratio at 23 employees for every manager. Using conventional management thinking of 8 employees per manager, the client would need to recruit/develop a further 800+ managers to bring the employee to manager ratio down to more conventional levels.
This also links to the promotion field where the % of employees who have received promotions in the last five years is very low (1.6%) and the opportunity to grow management resources within the business is high, however this is not as simple as it seems, since 87% of employees have a tenure < 5 years
Consideration should be given to the development path for employees to move into management.
df = original_data.copy() # refresh df just in case
pd.options.display.float_format = '{:.2f}'.format #2 decimal places, floating point
# Calculate the total number of employees (left = 0)
# filter left = 0 and dept != management
total_employees = len(df.loc[((df['left'] == 0) & (df['dept'] != 'management'))])
# filter left = 0 and dept = management
total_managers = len(df.loc[((df['left'] == 0) & (df['dept'] == 'management'))])
# Calculate the total number of Accidents - just curious
total_accident = len(df.loc[((df['left'] == 0) & (df['accident'] == 1))])
# Calculate the number of employees who left (left = 1)
# filter left = 1 and dept != management
employees_left = len(df.loc[((df['left'] == 1) & (df['dept'] != 'management'))])
# filter left = 1 and dept = management
managers_left = len(df.loc[((df['left'] == 1) & (df['dept'] == 'management'))])
result = pd.DataFrame({'Total Employees' : total_employees,
'Employees Left' : employees_left,
'Employees Left Perc' : (employees_left / total_employees)*100,
'Total Mgrs' : total_managers,
'Mgrs Left' : managers_left,
'Mgrs Left Perc' : (managers_left / total_managers)*100,
'Employee / Mgr Ratio': (total_employees - employees_left) /(total_managers - managers_left),
'Total Accident' : total_accident,
'Accident Perc' : (total_accident / total_employees)*100
}, index = [0])
result
Total Employees | Employees Left | Employees Left Perc | Total Mgrs | Mgrs Left | Mgrs Left Perc | Employee / Mgr Ratio | Total Accident | Accident Perc | |
---|---|---|---|---|---|---|---|---|---|
0 | 9616 | 1939 | 20.16 | 384 | 52 | 13.54 | 23.12 | 1745 | 18.15 |
returned_plot = correlation_single(full_data, var)
returned_plot = quadplot(full_data, var, True)
returned_plot.show()
<Figure size 800x500 with 0 Axes>
<Figure size 800x500 with 0 Axes>
pd.options.display.float_format = '{:.2f}'.format #2 decimal places, floating point
df = full_data.copy()
# Count promotions by tenure
promotions_by_tenure = df.groupby('tenure')['promotion' ].agg(['count', 'sum']).reset_index()
# Calculate the total count by tenure
total_by_tenure = df['tenure'].value_counts().reset_index()
total_by_tenure.columns = ['tenure', 'Total by Tenure']
# Merge the two DataFrames to combine the counts
result_df = pd.merge(promotions_by_tenure, total_by_tenure, on='tenure')
# Calculate the percentage of tenure as a percentage of the total
result_df['percentage_of_total'] = (result_df['count'] / result_df['count'].sum()) * 100
# Calculate the percentage of promotions
result_df['promotions_percentage'] = (result_df['sum'] / df.shape[0]) * 100
# Rename the columns for clarity
result_df = result_df.rename(columns={'count':
'Count of Tenure', 'sum': 'Count of Promotions'})
summary_vars = ['Count of Tenure','Count of Promotions',
'Total by Tenure', 'percentage_of_total', 'promotions_percentage']
result_df.loc['Total'] = result_df[summary_vars].sum()
# Display the resulting table
result_df
tenure | Count of Tenure | Count of Promotions | Total by Tenure | percentage_of_total | promotions_percentage | |
---|---|---|---|---|---|---|
0 | 2.00 | 2910.00 | 39.00 | 2910.00 | 24.27 | 0.33 |
1 | 3.00 | 5190.00 | 89.00 | 5190.00 | 43.28 | 0.74 |
2 | 4.00 | 2005.00 | 22.00 | 2005.00 | 16.72 | 0.18 |
3 | 5.00 | 1062.00 | 11.00 | 1062.00 | 8.86 | 0.09 |
4 | 6.00 | 542.00 | 11.00 | 542.00 | 4.52 | 0.09 |
5 | 7.00 | 94.00 | 18.00 | 94.00 | 0.78 | 0.15 |
6 | 8.00 | 81.00 | 5.00 | 81.00 | 0.68 | 0.04 |
7 | 10.00 | 107.00 | 8.00 | 107.00 | 0.89 | 0.07 |
Total | NaN | 11991.00 | 203.00 | 11991.00 | 100.00 | 1.69 |
plt.figure(figsize=(8, 6))
sns.scatterplot(data=full_data, x='avg_mnth_hrs', y='promotion', hue='left', alpha=0.4, palette='colorblind')
plt.axvline(x=166.67, color='#ff6361', linestyle='--', label="175 Hrs")
plt.legend(labels=['left', 'stayed'])
plt.title('Monthly hours by promotion last 5 years', fontsize='14');
#plt.legend()
pd.set_option('display.max_rows', 100) # Display the entire results dataframe
df = original_data.copy() # copy original data to df used through the cell
# Group by 'dept' and 'tenure' and get the count
count_by_dept_and_tenure = df.groupby(['dept', 'tenure', 'left',
'promotion']).size().reset_index(name='count_by_dept_and_tenure')
# Group by 'dept' and get the total count
total_count_by_dept = df.groupby('dept').size().reset_index(name='total_count_by_dept')
# Merge the two DataFrames on 'dept'
result_df = pd.merge(count_by_dept_and_tenure, total_count_by_dept, on='dept')
# Calculate the percentage and add it to the DataFrame
result_df['perc_by_dept_and_tenure'] = (result_df['count_by_dept_and_tenure'] /
result_df['total_count_by_dept']) * 100
result_df.drop('total_count_by_dept', axis = 'columns', inplace = True)
result_df.sort_values(by='perc_by_dept_and_tenure', ascending = False)
# Display the combined DataFrame
result_df
dept | tenure | left | promotion | count_by_dept_and_tenure | perc_by_dept_and_tenure | |
---|---|---|---|---|---|---|
0 | accounting | 2 | 0 | 0 | 127 | 20.45 |
1 | accounting | 2 | 0 | 1 | 2 | 0.32 |
2 | accounting | 2 | 1 | 0 | 1 | 0.16 |
3 | accounting | 3 | 0 | 0 | 227 | 36.55 |
4 | accounting | 3 | 0 | 1 | 3 | 0.48 |
... | ... | ... | ... | ... | ... | ... |
162 | technical | 6 | 0 | 1 | 2 | 0.09 |
163 | technical | 6 | 1 | 0 | 23 | 1.02 |
164 | technical | 7 | 0 | 0 | 1 | 0.04 |
165 | technical | 8 | 0 | 0 | 5 | 0.22 |
166 | technical | 10 | 0 | 0 | 10 | 0.45 |
167 rows × 6 columns
df.groupby('dept')['tenure'].mean()
dept accounting 3.40 hr 3.26 it 3.35 management 3.98 marketing 3.42 product_mng 3.34 randd 3.32 sales 3.38 support 3.29 technical 3.31 Name: tenure, dtype: float64
# Initialise Analysis Variables and Pandas Display options
var = 'satisfaction' # Feature used for plot and detail
pd.options.display.float_format = '{:.2f}'.format #2 decimal places, floating point
full_data[var].describe()
count 11991.00 mean 0.63 std 0.24 min 0.09 25% 0.48 50% 0.66 75% 0.82 max 1.00 Name: satisfaction, dtype: float64
satisfaction
- include¶Include `satisfaction` in the model dataset
satisfaction
¶The feature is not collinear with any others Employees who left had lower mean and medium scores than those that stayed.
full_data.groupby(['left'])['satisfaction'].agg(["mean","median"])
mean | median | |
---|---|---|
left | ||
0 | 0.67 | 0.69 |
1 | 0.44 | 0.41 |
eda_single_bins(full_data,var,[0, 0.37, 0.67, 1.0],
['low 0.00 - 0.37', 'medium 0.38 - 0.66', 'high 0.67-1.00'],1)
Stayed Count | Left Count | Total Count | Total Perc of Total | Stayed Perc of Stayed Total | Left Perc of Left Total | Stayed % | Left % | |
---|---|---|---|---|---|---|---|---|
satisfaction | ||||||||
low 0.00 - 0.37 | 1121.0 | 704.0 | 1825.0 | 15.2 | 11.2 | 35.4 | 61.4 | 38.6 |
medium 0.38 - 0.66 | 3676.0 | 756.0 | 4432.0 | 37.0 | 36.8 | 38.0 | 82.9 | 17.1 |
high 0.67-1.00 | 5203.0 | 531.0 | 5734.0 | 47.8 | 52.0 | 26.7 | 90.7 | 9.3 |
Total | 10000.0 | 1991.0 | 11991.0 | 100.0 | 100.0 | 100.0 | NaN | NaN |
return_plot = correlation_single(full_data, var)
returned_plot = quadplot(full_data,var,False)
returned_plot.show()
<Figure size 800x500 with 0 Axes>
<Figure size 800x500 with 0 Axes>
# Visualisation satisfaction vs promotion
df = full_data.copy()
#set figure and axes
fig, ax = plt.subplots(1, 2, figsize = (12,6))
sns.set_theme(style="whitegrid")
sns.color_palette("colorblind", 9)
#Create a boxplot showing 'average_monthly_hours' distributions for 'number_project', comparing employees who stayed vs those who left
sns.violinplot(data = df, x = 'satisfaction', y = 'promotion', hue = 'left', orient = 'h', ax = ax[0], palette='colorblind')
ax[0].invert_yaxis()
ax[0].set_title('Satisfaction Vs Promotion', fontsize = '14')
#Create a histogram showing distribution of 'number_project', comparing employees who stayed vs those who left
sns.barplot(data = df, y = 'satisfaction', hue = 'left', ax = ax[1], palette='colorblind')
ax[1].set_title('Satisfaction Vs Promotion', fontsize = '14')
#display the graphs
plt.show()
# Initialise Analysis Variables and Pandas Display options
var = 'accident' # Feature used for plot and detail
pd.options.display.float_format = '{:.2f}'.format #2 decimal places, floating point
full_data[var].describe()
count 11991.00 mean 0.15 std 0.36 min 0.00 25% 0.00 50% 0.00 75% 0.00 max 1.00 Name: accident, dtype: float64
accident
- discard¶Do not include `accident` in the model dataset
accident
¶15% of all employees had an accident recorded against them.
This accident percentage is 6.5 times the national average of 2.3% [^1]. This column will be removed it seems wrong and needs further investigation before considering it for inclusion in the model.
There were 1850 accident incidents in the data, which is 15% of the total staff.
People who stayed recorded 1,745 accidents (94% of accidents were from people who stayed)
People who left recorded 105 accidents (6% of accidents were from people who left)
[^1] In 2022, the rate of injury cases was 2.3 cases per 100 FTE workers, unchanged from 2021. US Bureau Labour of Statistics, US Department of Labour - https://www.bls.gov/news.release/pdf/osh.pdf
eda_single(full_data, var,1)
Stayed Count | Left Count | Total Count | Total Perc of Total | Stayed Perc of Stayed Total | Left Perc of Left Total | Stayed % | Left % | |
---|---|---|---|---|---|---|---|---|
accident | ||||||||
0 | 8255.0 | 1886.0 | 10141.0 | 84.6 | 82.5 | 94.7 | 81.4 | 18.6 |
1 | 1745.0 | 105.0 | 1850.0 | 15.4 | 17.4 | 5.3 | 94.3 | 5.7 |
Total | 10000.0 | 1991.0 | 11991.0 | 100.0 | 100.0 | 100.0 | NaN | NaN |
returned_plot = correlation_single(full_data, var)
returned_plot = quadplot(full_data, var, True)
returned_plot.show()
<Figure size 800x500 with 0 Axes>
<Figure size 800x500 with 0 Axes>
# Initialise Analysis Variables and Pandas Display options
var = 'promotion' # Feature used for plot and detail
pd.options.display.float_format = '{:.2f}'.format #2 decimal places, floating point
# Describe variable
full_data[var].describe()
count 11991.00 mean 0.02 std 0.13 min 0.00 25% 0.00 50% 0.00 75% 0.00 max 1.00 Name: promotion, dtype: float64
promotion
- include¶Include `promotion` in the model dataset
promotion
¶93% of staff have a tenure of five or less than 5 years.
promotion_counts = full_data['promotion'].value_counts()
promotion_percentages = full_data['promotion'].value_counts(normalize=True) * 100
# Create a new dataframe to store the results
promotion_df = pd.DataFrame({
'Count': promotion_counts,
'Percentage': promotion_percentages
})
# Print the result dataframe
print(promotion_df)
Count Percentage promotion 0 11788 98.31 1 203 1.69
eda_single(full_data, var,2)
Stayed Count | Left Count | Total Count | Total Perc of Total | Stayed Perc of Stayed Total | Left Perc of Left Total | Stayed % | Left % | |
---|---|---|---|---|---|---|---|---|
promotion | ||||||||
0 | 9805.00 | 1983.00 | 11788.00 | 98.31 | 98.05 | 99.60 | 83.18 | 16.82 |
1 | 195.00 | 8.00 | 203.00 | 1.69 | 1.95 | 0.40 | 96.06 | 3.94 |
Total | 10000.00 | 1991.00 | 11991.00 | 100.00 | 100.00 | 100.00 | NaN | NaN |
returned_plot = correlation_single(full_data, var)
returned_plot = quadplot(full_data, var, True)
returned_plot.show()
<Figure size 800x500 with 0 Axes>
<Figure size 800x500 with 0 Axes>
promotion
vs tenure
¶100% of staff that left had a tenure < 5 years so would not qualify for a promotion.
df = full_data.copy()
# Filter the DataFrame for "left"
left_1_df = df.loc[df['left'] == 1]
left_0_df = df.loc[df['left'] == 0]
left_1_df = left_1_df.rename(columns={'promotion': 'left promotion'})
# Filter the DataFrame for "left" = 0
left_0_df = left_0_df.rename(columns={'promotion': 'stayed promotion'})
# Group by "tenure" and calculate counts for "left" = 1
left_1_counts = left_1_df.groupby('tenure')['left promotion'].sum().reset_index()
# Calculate percentages for "left" = 1
left_1_counts['left promotion perc'] = (left_1_counts['left promotion'] /
left_1_counts['left promotion'].sum()) * 100
# Group by "tenure" and calculate counts for "left" = 0
left_0_counts = left_0_df.groupby('tenure')['stayed promotion'].sum().reset_index()
# Calculate percentages for "left" = 0
left_0_counts['stayed promotion perc'] = (left_0_counts['stayed promotion'] /
left_0_counts['stayed promotion'].sum()) * 100
# Display the results for "left" = 1
#print("Counts and Percentages for Left = 1:")
#print(left_1_counts)
# Display the results for "left" = 0
#print("\nCounts and Percentages for Left = 0:")
#print(left_0_counts)
# Merge the DataFrames using an outer join
merged_df = left_0_counts.merge(left_1_counts, on='tenure', how='outer')
merged_df
tenure | stayed promotion | stayed promotion perc | left promotion | left promotion perc | |
---|---|---|---|---|---|
0 | 2 | 39 | 20.00 | 0.00 | 0.00 |
1 | 3 | 83 | 42.56 | 6.00 | 75.00 |
2 | 4 | 21 | 10.77 | 1.00 | 12.50 |
3 | 5 | 10 | 5.13 | 1.00 | 12.50 |
4 | 6 | 11 | 5.64 | 0.00 | 0.00 |
5 | 7 | 18 | 9.23 | NaN | NaN |
6 | 8 | 5 | 2.56 | NaN | NaN |
7 | 10 | 8 | 4.10 | NaN | NaN |
# Initialise Analysis Variables and Pandas Display options
var = 'salary' # Feature used for plot and detail
pd.options.display.float_format = '{:.2f}'.format #2 decimal places, floating point
salary
¶salary
= The employee's salary (low, medium, or high) encoded to 0,1,2
full_data[var].describe()
count 11991.00 mean 0.60 std 0.64 min 0.00 25% 0.00 50% 1.00 75% 1.00 max 2.00 Name: salary, dtype: float64
salary
- include¶Include `salary` in the model dataset
salary
¶# Totals by Salary
returned_df = eda_single(original_data, var, 2)
returned_df.reset_index()
returned_df.sort_values(by=('Left Count'), ascending = False, inplace = True)
returned_df
Stayed Count | Left Count | Total Count | Total Perc of Total | Stayed Perc of Stayed Total | Left Perc of Left Total | Stayed % | Left % | |
---|---|---|---|---|---|---|---|---|
salary | ||||||||
Total | 10000.00 | 1991.00 | 11991.00 | 100.00 | 100.00 | 100.00 | NaN | NaN |
low | 4566.00 | 1174.00 | 5740.00 | 47.87 | 45.66 | 58.97 | 79.55 | 20.45 |
medium | 4492.00 | 769.00 | 5261.00 | 43.87 | 44.92 | 38.62 | 85.38 | 14.62 |
high | 942.00 | 48.00 | 990.00 | 8.26 | 9.42 | 2.41 | 95.15 | 4.85 |
var ='salary'
df = full_data[full_data['dept_sales'] == 1][['salary', 'left']]
#df = full_data[['dept_support', 'left', 'salary']]
returned_df = eda_single(df, var, 2)
returned_df.reset_index()
returned_df.sort_values(by=('Left Count'), ascending = False, inplace = True)
returned_df
Stayed Count | Left Count | Total Count | Total Perc of Total | Stayed Perc of Stayed Total | Left Perc of Left Total | Stayed % | Left % | |
---|---|---|---|---|---|---|---|---|
salary | ||||||||
Total | 2689.00 | 550.00 | 3239.00 | 100.00 | 100.00 | 100.00 | NaN | NaN |
0 | 1234.00 | 319.00 | 1553.00 | 47.95 | 45.89 | 58.00 | 79.46 | 20.54 |
1 | 1230.00 | 219.00 | 1449.00 | 44.74 | 45.74 | 39.82 | 84.89 | 15.11 |
2 | 225.00 | 12.00 | 237.00 | 7.32 | 8.37 | 2.18 | 94.94 | 5.06 |
# Totals by salary & promotion
returned_df = eda_dual(original_data, var, 'promotion',2)
returned_df.index.name = var + ' / ' + 'promotion'
returned_df.sort_values(by='Left Count', ascending = False)
Stayed Count | Left Count | Total Count | Total Perc of Total | Stayed Perc of Stayed Total | Left Perc of Left Total | Stayed % | Left % | |
---|---|---|---|---|---|---|---|---|
salary / promotion | ||||||||
Total | 10000.00 | 1991.00 | 11947.00 | 100.00 | 100.00 | 100.00 | NaN | NaN |
(low, 0) | 4533.00 | 1169.00 | 5702.00 | 47.73 | 45.33 | 58.71 | 79.50 | 20.50 |
(medium, 0) | 4374.00 | 766.00 | 5140.00 | 43.02 | 43.74 | 38.47 | 85.10 | 14.90 |
(high, 0) | 898.00 | 48.00 | 946.00 | 7.92 | 8.98 | 2.41 | 94.93 | 5.07 |
(low, 1) | 33.00 | 5.00 | 38.00 | 0.32 | 0.33 | 0.25 | 86.84 | 13.16 |
(medium, 1) | 118.00 | 3.00 | 121.00 | 1.01 | 1.18 | 0.15 | 97.52 | 2.48 |
(high, 1) | 44.00 | 0.00 | 0.00 | 0.00 | 0.44 | 0.00 | 0.00 | 0.00 |
# Totals by salary & promotion
returned_df = eda_dual(original_data, var, 'tenure',2)
returned_df.index.name = var + ' / ' + 'Tenure'
returned_df.sort_values(by='Left Count', ascending = False)
Stayed Count | Left Count | Total Count | Total Perc of Total | Stayed Perc of Stayed Total | Left Perc of Left Total | Stayed % | Left % | |
---|---|---|---|---|---|---|---|---|
salary / Tenure | ||||||||
Total | 10000.00 | 1991.00 | 11405.00 | 100.00 | 100.00 | 100.00 | NaN | NaN |
(low, 3) | 1993.00 | 526.00 | 2519.00 | 22.09 | 19.93 | 26.42 | 79.12 | 20.88 |
(medium, 3) | 1927.00 | 325.00 | 2252.00 | 19.75 | 19.27 | 16.32 | 85.57 | 14.43 |
(low, 4) | 690.00 | 296.00 | 986.00 | 8.65 | 6.90 | 14.87 | 69.98 | 30.02 |
(low, 5) | 277.00 | 273.00 | 550.00 | 4.82 | 2.77 | 13.71 | 50.36 | 49.64 |
(medium, 5) | 260.00 | 195.00 | 455.00 | 3.99 | 2.60 | 9.79 | 57.14 | 42.86 |
(medium, 4) | 681.00 | 188.00 | 869.00 | 7.62 | 6.81 | 9.44 | 78.37 | 21.63 |
(low, 6) | 168.00 | 67.00 | 235.00 | 2.06 | 1.68 | 3.37 | 71.49 | 28.51 |
(medium, 6) | 219.00 | 42.00 | 261.00 | 2.29 | 2.19 | 2.11 | 83.91 | 16.09 |
(high, 3) | 396.00 | 23.00 | 419.00 | 3.67 | 3.96 | 1.16 | 94.51 | 5.49 |
(medium, 2) | 1259.00 | 19.00 | 1278.00 | 11.21 | 12.59 | 0.95 | 98.51 | 1.49 |
(high, 5) | 43.00 | 14.00 | 57.00 | 0.50 | 0.43 | 0.70 | 75.44 | 24.56 |
(low, 2) | 1362.00 | 12.00 | 1374.00 | 12.05 | 13.62 | 0.60 | 99.13 | 0.87 |
(high, 4) | 139.00 | 11.00 | 150.00 | 1.32 | 1.39 | 0.55 | 92.67 | 7.33 |
(low, 7) | 18.00 | 0.00 | 0.00 | 0.00 | 0.18 | 0.00 | 0.00 | 0.00 |
(low, 8) | 30.00 | 0.00 | 0.00 | 0.00 | 0.30 | 0.00 | 0.00 | 0.00 |
(low, 10) | 28.00 | 0.00 | 0.00 | 0.00 | 0.28 | 0.00 | 0.00 | 0.00 |
(high, 10) | 32.00 | 0.00 | 0.00 | 0.00 | 0.32 | 0.00 | 0.00 | 0.00 |
(high, 6) | 46.00 | 0.00 | 0.00 | 0.00 | 0.46 | 0.00 | 0.00 | 0.00 |
(high, 7) | 19.00 | 0.00 | 0.00 | 0.00 | 0.19 | 0.00 | 0.00 | 0.00 |
(high, 8) | 9.00 | 0.00 | 0.00 | 0.00 | 0.09 | 0.00 | 0.00 | 0.00 |
(medium, 7) | 57.00 | 0.00 | 0.00 | 0.00 | 0.57 | 0.00 | 0.00 | 0.00 |
(medium, 8) | 42.00 | 0.00 | 0.00 | 0.00 | 0.42 | 0.00 | 0.00 | 0.00 |
(medium, 10) | 47.00 | 0.00 | 0.00 | 0.00 | 0.47 | 0.00 | 0.00 | 0.00 |
(high, 2) | 258.00 | 0.00 | 0.00 | 0.00 | 2.58 | 0.00 | 0.00 | 0.00 |
df = original_data.copy()
# Group the data by 'department', 'salary', and 'left' and calculate the count in each group
grouped = df.groupby(['salary', 'dept', 'left']).size().reset_index(name='count')
# Pivot the data to create a summary table
summary_table = grouped.pivot_table(index=['salary', 'dept'], columns='left', values='count', fill_value=0)
# Calculate the percentage of people who left and add it as a new column
summary_table['Percentage Left'] = (summary_table[1] / (summary_table[0] + summary_table[1])) * 100
# Reset the index to have 'department' and 'salary' as regular columns
summary_table = summary_table.reset_index()
summary_table = summary_table.rename(columns={0: 'Stayed', 1: 'Left'})
summary_table.sort_values(by = 'Left', ascending = False)
summary_table = summary_table.sort_values(by=('Left'), ascending=False)
summary_table
left | salary | dept | Stayed | Left | Percentage Left |
---|---|---|---|---|---|
17 | low | sales | 1234.00 | 319.00 | 20.54 |
19 | low | technical | 900.00 | 238.00 | 20.91 |
27 | medium | sales | 1230.00 | 219.00 | 15.11 |
18 | low | support | 676.00 | 191.00 | 22.03 |
29 | medium | technical | 798.00 | 142.00 | 15.11 |
28 | medium | support | 712.00 | 116.00 | 14.01 |
12 | low | it | 389.00 | 87.00 | 18.28 |
14 | low | marketing | 234.00 | 76.00 | 24.52 |
22 | medium | it | 362.00 | 67.00 | 15.62 |
11 | low | hr | 234.00 | 62.00 | 20.95 |
15 | low | product_mng | 284.00 | 59.00 | 17.20 |
10 | low | accounting | 237.00 | 59.00 | 19.93 |
16 | low | randd | 271.00 | 51.00 | 15.84 |
21 | medium | hr | 219.00 | 48.00 | 17.98 |
25 | medium | product_mng | 244.00 | 47.00 | 16.15 |
20 | medium | accounting | 215.00 | 47.00 | 17.94 |
24 | medium | marketing | 268.00 | 33.00 | 10.96 |
13 | low | management | 107.00 | 32.00 | 23.02 |
26 | medium | randd | 294.00 | 31.00 | 9.54 |
23 | medium | management | 150.00 | 19.00 | 11.24 |
7 | high | sales | 225.00 | 12.00 | 5.06 |
9 | high | technical | 156.00 | 10.00 | 6.02 |
8 | high | support | 121.00 | 5.00 | 3.97 |
5 | high | product_mng | 48.00 | 4.00 | 7.69 |
2 | high | it | 67.00 | 4.00 | 5.63 |
1 | high | hr | 35.00 | 3.00 | 7.89 |
6 | high | randd | 44.00 | 3.00 | 6.38 |
4 | high | marketing | 59.00 | 3.00 | 4.84 |
0 | high | accounting | 60.00 | 3.00 | 4.76 |
3 | high | management | 127.00 | 1.00 | 0.78 |
returned_plot = correlation_single(full_data, var)
returned_plot.show()
returned_plot = quadplot(full_data, var, True)
returned_plot.show()
<Figure size 800x500 with 0 Axes>
<Figure size 800x500 with 0 Axes>
# Prepare data for heatmap
df = original_data.copy()
contingency_table = pd.crosstab(df['left'], df[var])
# Calculate column percentages
column_percentages = contingency_table / contingency_table.sum()
row_percentages = contingency_table.div(contingency_table.sum(axis=1), axis=0)
# Create a pivot table for the heatmap
heatmap_data = df.pivot_table(index='left', columns=var, aggfunc='size', fill_value=0)
# Create a heatmap
plt.figure(figsize=(12,4 ))
plt.subplot(1, 3, 1) # Subplot for count
sns.heatmap(heatmap_data, cmap='YlGnBu', annot=True, fmt='d', cbar=False)
plt.title(f'{var} v left (count)')
plt.xlabel(var)
plt.ylabel('left')
# Create a heatmap
plt.subplot(1, 3, 2) # Subplot for percentage of salary band
sns.heatmap(column_percentages, annot=True, cmap='YlGnBu', fmt='.1%', cbar=False)
plt.title(f'{var} (%) Vs. left by Salary Band')
plt.xlabel(var)
plt.ylabel('left')
# Create a heatmap
plt.subplot(1, 3, 3) # Subplot for percentage
ax = sns.heatmap(row_percentages, annot=True, cmap='YlGnBu', fmt='.1%', cbar=False)
annot_font_size = 10
plt.title(f'{var} (%) Vs. left by left')
#ax.set_xticklabels(ax.get_xticklabels(), fontsize=7)
#ax.set_yticklabels(ax.get_yticklabels(), fontsize=8)
plt.xlabel(var)
plt.ylabel('left')
plt.show()
# Create two new DataFrames for 'left = 1' and 'left = 0'
df = full_data.copy()
left_1 = df[df['left'] == 1]
left_0 = df[df['left'] == 0]
# Count the number of occurrences for each category in 'salary' for each group
all_counts = df['salary'].value_counts()
left_1_counts = left_1['salary'].value_counts()
left_0_counts = left_0['salary'].value_counts()
# Create pie chart subplots
fig, axs = plt.subplots(1, 3, figsize=(12, 5))
sns.set(style="darkgrid")
# Plot the pie chart for 'left = 1'
axs[2].pie(left_1_counts, labels=left_1_counts.index, autopct='%1.1f%%', startangle=90)
axs[2].set_title('left = 1')
# Plot the pie chart for 'left = 0'
axs[1].pie(left_0_counts, labels=left_0_counts.index, autopct='%1.1f%%', startangle=90)
axs[1].set_title('left = 0')
# Plot the pie chart for 'left = 0'
axs[0].pie(all_counts, labels=all_counts.index, autopct='%1.1f%%', startangle=90)
axs[0].set_title('All')
plt.show()
## Visualise Hours and Salary
df = full_data.copy()
#set figure and axes
fig, ax = plt.subplots(1, 2, figsize = (12,6))
sns.set(style="darkgrid")
sns.color_palette("husl", 9)
#Create a boxplot showing 'average_monthly_hours' distributions for 'number_project', comparing employees who stayed vs those who left
sns.boxplot(data = full_data, x = 'avg_mnth_hrs', y = 'salary', hue = 'left', orient = 'h', ax = ax[0], palette='colorblind')
ax[0].invert_yaxis()
ax[0].set_title('Monthly hours by Salary', fontsize = '14')
ax[0].set_yticks([0,1,2],['low', 'medium','high'], )
#Create a histogram showing distribution of 'number_project', comparing employees who stayed vs those who left and salary
hue_order = [0, 1, 2]
hue_labels = ['Low', 'Medium', 'High']
tenure_stay = full_data[full_data['left'] == 0]['salary']
tenure_left = full_data[full_data['left'] == 1]['salary']
sns.boxplot(data = full_data, x = 'left', y = 'number_project', hue = 'salary', hue_order=hue_order, ax = ax[1], palette='colorblind')
ax[1].set_title('Number of projects by Salary', fontsize = '14')
ax[1].set_xticks([0, 1], ['Not Left', 'Left'])
# Set custom labels for the hue using legend
handles, _ = plt.gca().get_legend_handles_labels()
plt.legend(handles, hue_labels, title='Salary', loc='upper right')
#display the graphs
plt.show()
df = original_data.copy()
# Group the data by 'department', 'salary', and 'left' and calculate the count in each group
grouped = df.groupby(['dept', 'salary', 'left']).size().reset_index(name='count')
# Pivot the data to create a summary table
summary_table = grouped.pivot_table(index=['dept', 'salary'], columns='left', values='count', fill_value=0)
# Calculate the percentage of people who left and add it as a new column
summary_table['Percentage Left'] = (summary_table[1] / (summary_table[0] + summary_table[1])) * 100
# Reset the index to have 'department' and 'salary' as regular columns
summary_table = summary_table.reset_index()
# Create a heatmap to visualize the count of people who left by 'department' and 'salary'
summary_table = summary_table.rename(columns={0: 'Stayed', 1: 'Left'})
summary_table.sort_values(by='Left', ascending=False, inplace=True)
summary_table
left | dept | salary | Stayed | Left | Percentage Left |
---|---|---|---|---|---|
22 | sales | low | 1234.00 | 319.00 | 20.54 |
28 | technical | low | 900.00 | 238.00 | 20.91 |
23 | sales | medium | 1230.00 | 219.00 | 15.11 |
25 | support | low | 676.00 | 191.00 | 22.03 |
29 | technical | medium | 798.00 | 142.00 | 15.11 |
26 | support | medium | 712.00 | 116.00 | 14.01 |
7 | it | low | 389.00 | 87.00 | 18.28 |
13 | marketing | low | 234.00 | 76.00 | 24.52 |
8 | it | medium | 362.00 | 67.00 | 15.62 |
4 | hr | low | 234.00 | 62.00 | 20.95 |
16 | product_mng | low | 284.00 | 59.00 | 17.20 |
1 | accounting | low | 237.00 | 59.00 | 19.93 |
19 | randd | low | 271.00 | 51.00 | 15.84 |
5 | hr | medium | 219.00 | 48.00 | 17.98 |
2 | accounting | medium | 215.00 | 47.00 | 17.94 |
17 | product_mng | medium | 244.00 | 47.00 | 16.15 |
14 | marketing | medium | 268.00 | 33.00 | 10.96 |
10 | management | low | 107.00 | 32.00 | 23.02 |
20 | randd | medium | 294.00 | 31.00 | 9.54 |
11 | management | medium | 150.00 | 19.00 | 11.24 |
21 | sales | high | 225.00 | 12.00 | 5.06 |
27 | technical | high | 156.00 | 10.00 | 6.02 |
24 | support | high | 121.00 | 5.00 | 3.97 |
15 | product_mng | high | 48.00 | 4.00 | 7.69 |
6 | it | high | 67.00 | 4.00 | 5.63 |
18 | randd | high | 44.00 | 3.00 | 6.38 |
12 | marketing | high | 59.00 | 3.00 | 4.84 |
3 | hr | high | 35.00 | 3.00 | 7.89 |
0 | accounting | high | 60.00 | 3.00 | 4.76 |
9 | management | high | 127.00 | 1.00 | 0.78 |
# Initialise Analysis Variables and Pandas Display options
var = 'dept' # Feature used for plot and detail
pd.options.display.float_format = '{:.2f}'.format #2 decimal places, floating point
dept
¶dept
= The employee's department
# Describe variable
original_data[var].describe()
count 11991 unique 10 top sales freq 3239 Name: dept, dtype: object
dept
- include¶Include `dept` in the model dataset
Attrition across departments is more or less the same. Nothing stands out from this, consider excluding it from one of the model test runs.
returned_df = eda_single(original_data, var, 2)
returned_df.sort_values(by='Left Count', ascending=False, inplace=True)
returned_df
Stayed Count | Left Count | Total Count | Total Perc of Total | Stayed Perc of Stayed Total | Left Perc of Left Total | Stayed % | Left % | |
---|---|---|---|---|---|---|---|---|
dept | ||||||||
Total | 10000.00 | 1991.00 | 11991.00 | 100.00 | 100.00 | 100.00 | NaN | NaN |
sales | 2689.00 | 550.00 | 3239.00 | 27.01 | 26.89 | 27.62 | 83.02 | 16.98 |
technical | 1854.00 | 390.00 | 2244.00 | 18.71 | 18.54 | 19.59 | 82.62 | 17.38 |
support | 1509.00 | 312.00 | 1821.00 | 15.19 | 15.09 | 15.67 | 82.87 | 17.13 |
it | 818.00 | 158.00 | 976.00 | 8.14 | 8.18 | 7.94 | 83.81 | 16.19 |
hr | 488.00 | 113.00 | 601.00 | 5.01 | 4.88 | 5.68 | 81.20 | 18.80 |
marketing | 561.00 | 112.00 | 673.00 | 5.61 | 5.61 | 5.63 | 83.36 | 16.64 |
product_mng | 576.00 | 110.00 | 686.00 | 5.72 | 5.76 | 5.52 | 83.97 | 16.03 |
accounting | 512.00 | 109.00 | 621.00 | 5.18 | 5.12 | 5.47 | 82.45 | 17.55 |
randd | 609.00 | 85.00 | 694.00 | 5.79 | 6.09 | 4.27 | 87.75 | 12.25 |
management | 384.00 | 52.00 | 436.00 | 3.64 | 3.84 | 2.61 | 88.07 | 11.93 |
# Prepare data for heatmap
df = original_data.copy()
contingency_table = pd.crosstab(df['left'], df[var])
# Calculate column percentages
column_percentages = contingency_table / contingency_table.sum()
row_percentages = contingency_table.div(contingency_table.sum(axis=1), axis=0)
# Create a pivot table for the heatmap
heatmap_data = df.pivot_table(index='left', columns=var, aggfunc='size', fill_value=0)
# Create a heatmap
fig, axes = plt.subplots(1, 2, figsize=(12,3 ))
sns.set(style="darkgrid")
#plt.subplot(2, 2, 1) # Subplot for count
sns.heatmap(heatmap_data, cmap='YlGnBu', annot=True, annot_kws={"size": 10},fmt='d', cbar=False, ax=axes[0])
axes[0].set_title(f'{var} v left (count)')
axes[0].set_xlabel(var)
axes[0].set_ylabel('left')
# Create a heatmap
#plt.subplot(2, 2, 1) # Subplot for percentage of salary band
sns.heatmap(column_percentages, annot=True, cmap='YlGnBu', annot_kws={"size": 10},fmt='.0%', cbar=False, ax=axes[1])
axes[1].set_title(f'{var} (%) Vs. left')
axes[1].set_xlabel(var)
axes[1].set_ylabel('left')
# Create a heatmap
#plt.subplot(2, 2, 2) # Subplot for percentage
#sns.heatmap(row_percentages, annot=True, cmap='YlGnBu', annot_kws={"size": 8},fmt='.1%', cbar=False, ax=axes[1,0])
#axes[1,0].set_title(f'{var} (%) Vs. left by left')
#axes[1,0].set_xlabel(var)
#axes[1,0].set_ylabel('left')
#percentage_left_by_dept = df.groupby('dept')['left'].mean() * 100
#left_1_counts = df.groupby('dept')['left'].value_counts()
# Create a correlation map and heatmap
#sns.heatmap(percentage_left_by_dept.reset_index().pivot_table(index='dept', columns='dept').T, annot=True, cbar=False, annot_kws={"size": 8}, fmt=".2f", cmap="YlGnBu", ax=axes[1,1])
#axes[1,1].set_title('Percentage Left by Department')
#sns.heatmap(left_1_counts.reset_index().pivot_table(index='dept', columns='dept').T, annot=True, annot_kws={"size": 10}, fmt=".0f", cmap="YlGnBu", cbar=False, ax=axes[2,0])
#axes[2,1].set_title('count Left by Department')
#fig.delaxes(axes[2, 1]) # Delete empty plots
plt.subplots_adjust(wspace=0.2, hspace=.3)
plt.show()
df = original_data.copy()
# Filter the DataFrame for left = 0 and left = 1
left_0_df = df[df['left'] == 0]
left_1_df = df[df['left'] == 1]
# Group and pivot the data for heatmaps
left_0_pivot = left_0_df.groupby(['dept', 'salary']).size().unstack(fill_value=0)
left_1_pivot = left_1_df.groupby(['dept', 'salary']).size().unstack(fill_value=0)
# Calculate percentage heatmaps
total_records = len(df)
print(total_records)
left_0_total = left_0_df.shape[0] # Count of n
left_1_total = left_1_df.shape[0] # Count of n
left_0_percentage = (left_0_pivot / left_0_total) # n as % of n = 0 total
left_1_percentage = (left_1_pivot / left_1_total) # n as % of n = 1 total
left_0_perc_of_tot = (left_0_pivot / (left_0_total + left_1_total)) # n as % of n total
left_1_perc_of_tot = (left_1_pivot / (left_0_total + left_1_total)) # n as % of n total
# Create subplots for the heatmaps
fig, axes = plt.subplots(3, 2, figsize=(12, 8))
sns.set(style="darkgrid")
# Create the count heatmap for left = 0
sns.heatmap(left_0_pivot, annot=True, annot_kws={"size": 10}, fmt='d', cmap='YlGnBu', ax=axes[0, 0])
axes[0, 0].set_title('Count Heatmap for Left = 0')
# Create the count heatmap for left = 1
sns.heatmap(left_1_pivot, annot=True, annot_kws={"size": 10}, fmt='d', cmap='YlGnBu', ax=axes[0, 1])
axes[0, 1].set_title('Count Heatmap for Left = 1')
# Create the percentage heatmap for left = 0
sns.heatmap(left_0_percentage, annot=True, annot_kws={"size": 10}, fmt='.2%', cmap='YlGnBu', ax=axes[1, 0])
axes[1, 0].set_title('Percentage Heatmap for % of Left = 0')
# Create the percentage heatmap for left = 1
sns.heatmap(left_1_percentage, annot=True, annot_kws={"size": 10}, fmt='.2%', cmap='YlGnBu', ax=axes[1, 1])
axes[1, 1].set_title('Percentage Heatmap for % of Left = 1')
# Create the percentage heatmap for left = 0
sns.heatmap(left_0_perc_of_tot, annot=True, annot_kws={"size": 10}, fmt='.2%', cmap='YlGnBu', ax=axes[2, 0])
axes[2, 0].set_title('Percentage Heatmap for Left = 0 % of Total')
# Create the percentage heatmap for left = 1
sns.heatmap(left_1_perc_of_tot, annot=True, annot_kws={"size": 10}, fmt='.2%', cmap='YlGnBu', ax=axes[2, 1])
axes[2, 1].set_title('Percentage Heatmap for Left = 1 % of Total')
# Adjust layout
plt.tight_layout()
# Show the plot
plt.show()
11991
returned_df = eda_dual(original_data, 'dept', 'salary', 2)
returned_df.index.name = "Dept / Salary"
returned_df.sort_values(by='Left Count', ascending = False)
Stayed Count | Left Count | Total Count | Total Perc of Total | Stayed Perc of Stayed Total | Left Perc of Left Total | Stayed % | Left % | |
---|---|---|---|---|---|---|---|---|
Dept / Salary | ||||||||
Total | 10000.00 | 1991.00 | 11991.00 | 100.00 | 100.00 | 100.00 | NaN | NaN |
(sales, low) | 1234.00 | 319.00 | 1553.00 | 12.95 | 12.34 | 16.02 | 79.46 | 20.54 |
(technical, low) | 900.00 | 238.00 | 1138.00 | 9.49 | 9.00 | 11.95 | 79.09 | 20.91 |
(sales, medium) | 1230.00 | 219.00 | 1449.00 | 12.08 | 12.30 | 11.00 | 84.89 | 15.11 |
(support, low) | 676.00 | 191.00 | 867.00 | 7.23 | 6.76 | 9.59 | 77.97 | 22.03 |
(technical, medium) | 798.00 | 142.00 | 940.00 | 7.84 | 7.98 | 7.13 | 84.89 | 15.11 |
(support, medium) | 712.00 | 116.00 | 828.00 | 6.91 | 7.12 | 5.83 | 85.99 | 14.01 |
(it, low) | 389.00 | 87.00 | 476.00 | 3.97 | 3.89 | 4.37 | 81.72 | 18.28 |
(marketing, low) | 234.00 | 76.00 | 310.00 | 2.59 | 2.34 | 3.82 | 75.48 | 24.52 |
(it, medium) | 362.00 | 67.00 | 429.00 | 3.58 | 3.62 | 3.37 | 84.38 | 15.62 |
(hr, low) | 234.00 | 62.00 | 296.00 | 2.47 | 2.34 | 3.11 | 79.05 | 20.95 |
(accounting, low) | 237.00 | 59.00 | 296.00 | 2.47 | 2.37 | 2.96 | 80.07 | 19.93 |
(product_mng, low) | 284.00 | 59.00 | 343.00 | 2.86 | 2.84 | 2.96 | 82.80 | 17.20 |
(randd, low) | 271.00 | 51.00 | 322.00 | 2.69 | 2.71 | 2.56 | 84.16 | 15.84 |
(hr, medium) | 219.00 | 48.00 | 267.00 | 2.23 | 2.19 | 2.41 | 82.02 | 17.98 |
(product_mng, medium) | 244.00 | 47.00 | 291.00 | 2.43 | 2.44 | 2.36 | 83.85 | 16.15 |
(accounting, medium) | 215.00 | 47.00 | 262.00 | 2.18 | 2.15 | 2.36 | 82.06 | 17.94 |
(marketing, medium) | 268.00 | 33.00 | 301.00 | 2.51 | 2.68 | 1.66 | 89.04 | 10.96 |
(management, low) | 107.00 | 32.00 | 139.00 | 1.16 | 1.07 | 1.61 | 76.98 | 23.02 |
(randd, medium) | 294.00 | 31.00 | 325.00 | 2.71 | 2.94 | 1.56 | 90.46 | 9.54 |
(management, medium) | 150.00 | 19.00 | 169.00 | 1.41 | 1.50 | 0.95 | 88.76 | 11.24 |
(sales, high) | 225.00 | 12.00 | 237.00 | 1.98 | 2.25 | 0.60 | 94.94 | 5.06 |
(technical, high) | 156.00 | 10.00 | 166.00 | 1.38 | 1.56 | 0.50 | 93.98 | 6.02 |
(support, high) | 121.00 | 5.00 | 126.00 | 1.05 | 1.21 | 0.25 | 96.03 | 3.97 |
(product_mng, high) | 48.00 | 4.00 | 52.00 | 0.43 | 0.48 | 0.20 | 92.31 | 7.69 |
(it, high) | 67.00 | 4.00 | 71.00 | 0.59 | 0.67 | 0.20 | 94.37 | 5.63 |
(randd, high) | 44.00 | 3.00 | 47.00 | 0.39 | 0.44 | 0.15 | 93.62 | 6.38 |
(marketing, high) | 59.00 | 3.00 | 62.00 | 0.52 | 0.59 | 0.15 | 95.16 | 4.84 |
(hr, high) | 35.00 | 3.00 | 38.00 | 0.32 | 0.35 | 0.15 | 92.11 | 7.89 |
(accounting, high) | 60.00 | 3.00 | 63.00 | 0.53 | 0.60 | 0.15 | 95.24 | 4.76 |
(management, high) | 127.00 | 1.00 | 128.00 | 1.07 | 1.27 | 0.05 | 99.22 | 0.78 |
eda_single(original_data,'salary', 2)
Stayed Count | Left Count | Total Count | Total Perc of Total | Stayed Perc of Stayed Total | Left Perc of Left Total | Stayed % | Left % | |
---|---|---|---|---|---|---|---|---|
salary | ||||||||
high | 942.00 | 48.00 | 990.00 | 8.26 | 9.42 | 2.41 | 95.15 | 4.85 |
low | 4566.00 | 1174.00 | 5740.00 | 47.87 | 45.66 | 58.97 | 79.55 | 20.45 |
medium | 4492.00 | 769.00 | 5261.00 | 43.87 | 44.92 | 38.62 | 85.38 | 14.62 |
Total | 10000.00 | 1991.00 | 11991.00 | 100.00 | 100.00 | 100.00 | NaN | NaN |
left=1
) (pre modelling assessment)# Review data for salary ranges
# Set output number format
pd.set_option('display.float_format', '{:.2f}'.format)
df = original_data.copy()
count_by_group = df.groupby(['salary', 'left']).size().reset_index(name='count')
# Display the result
print(count_by_group)
high_left = df[(df['salary'] == 'high') & (df['left'] == 1)]
medium_left = df[(df['salary'] == 'medium') & (df['left'] == 1)]
low_left = df[(df['salary'] == 'low') & (df['left'] == 1)]
# Initialise values
high_salary = 150000
medium_salary = 100000
low_salary = 50000
cost_percentage = 0.25
# Count the rows in the filtered DataFrame multiply by salary thresholds
# Assumes high salary @ £150K and cost to replace is 30% of salary
high_cost = high_left.shape[0] * high_salary * cost_percentage
# Assumes medium salary @ £100K and cost to replace is 30% of salary
medium_cost = medium_left.shape[0] * medium_salary * cost_percentage
# Assumes low salary @ £50K and cost to replace is 30% of salary
low_cost = low_left.shape[0] * low_salary * cost_percentage
# Count of employees
total_count = low_left.shape[0]+medium_left.shape[0]+high_left.shape[0]
print()
# Display the selected rows
print(color.BOLD + f"Cost of staff leaving (assuming replacement costs {cost_percentage:.0%} of total salary)" + color.END)
print('\u2500' * 70)
print(" | # of Staff | Cost Per Head | Est £ Cost")
print("Estimate High Salary Left Cost | {:>12,.0f} | {:>14,.0f} | {:>12,.0f} ".
format(high_left.shape[0], high_cost/high_left.shape[0], high_cost))
print("Estimate Medium Salary Left Cost | {:>12,.0f} | {:>14,.0f} | {:>12,.0f}"
.format(medium_left.shape[0], medium_cost/ medium_left.shape[0], medium_cost))
print("Estimate Low Salary Left Cost | {:>12,.0f} | {:>14,.0f} | {:>12,.0f}"
.format(low_left.shape[0], low_cost/ low_left.shape[0], low_cost,))
print(" | ============ | ======= | ==========")
print("Estimate TOTAL Salary Left Cost | {:>12,.0f} | {:>14,.0f} "
.format(low_cost+medium_cost+high_cost, total_count))
salary left count
0 high 0 942
1 high 1 48
2 low 0 4566
3 low 1 1174
4 medium 0 4492
5 medium 1 769
Cost of staff leaving (assuming replacement costs 25% of total salary)
──────────────────────────────────────────────────────────────────────
| # of Staff | Cost Per Head | Est £ Cost
Estimate High Salary Left Cost | 48 | 37,500 | 1,800,000
Estimate Medium Salary Left Cost | 769 | 25,000 | 19,225,000
Estimate Low Salary Left Cost | 1,174 | 12,500 | 14,675,000
| ============ | ======= | ==========
Estimate TOTAL Salary Left Cost | 35,700,000 | 1,991
cost_percentage = 0.5
# Assumes high salary @ £150K and cost to replace is 30% of salary
high_cost = high_left.shape[0] * high_salary * cost_percentage
# Assumes medium salary @ £100K and cost to replace is 30% of salary
medium_cost = medium_left.shape[0] * medium_salary * cost_percentage
# Assumes low salary @ £50K and cost to replace is 30% of salary
low_cost = low_left.shape[0] * low_salary * cost_percentage
# Count of employees
total_count = low_left.shape[0]+medium_left.shape[0]+high_left.shape[0]
print(color.BOLD + f"Cost of staff LEFT (assuming replacement costs {cost_percentage:.0%} of total salary)" + color.END)
print('\u2500' * 70)
print(" | # of Staff | Cost Per Head | Est £ Cost ")
print("Estimate High Salary Left Cost | {:>12,.0f} | {:>13,.0f} | {:>12,.0f} ".
format(high_left.shape[0], high_cost/high_left.shape[0], high_cost))
print("Estimate Medium Salary Left Cost | {:>12,.0f} | {:>13,.0f} | {:>12,.0f}"
.format(medium_left.shape[0], medium_cost/ medium_left.shape[0],medium_cost))
print("Estimate Low Salary Left Cost | {:>12,.0f} | {:>13,.0f} | {:>12,.0f}"
.format(low_left.shape[0], low_cost/ low_left.shape[0], low_cost))
print(" | ============ | ======= | ==========")
print("Estimate TOTAL Salary Left Cost | {:>12,.0f} | {:>13,.0f}"
.format(low_cost+medium_cost+high_cost, total_count))
Cost of staff LEFT (assuming replacement costs 50% of total salary)
──────────────────────────────────────────────────────────────────────
| # of Staff | Cost Per Head | Est £ Cost
Estimate High Salary Left Cost | 48 | 75,000 | 3,600,000
Estimate Medium Salary Left Cost | 769 | 50,000 | 38,450,000
Estimate Low Salary Left Cost | 1,174 | 25,000 | 29,350,000
| ============ | ======= | ==========
Estimate TOTAL Salary Left Cost | 71,400,000 | 1,991
## Cost of blanket pay increase of salary_increase
salary_increase = .3 # 30% Salary Increase
# Count the rows in the filtered DataFrame multiply by salary thresholds
high_cost_increase = high_left.shape[0] * high_salary * salary_increase
medium_cost_increase = medium_left.shape[0] * medium_salary * salary_increase
low_cost_increase = low_left.shape[0] * low_salary * salary_increase
print()
print()
print(color.BOLD + "Cost of {:.0f}% pay increase".format(salary_increase*100) + color.END)
print('\u2500' * 70)
print("Estimate Retention Costs")
print(" | # of Staff | Cost Per Head | Est £ Cost")
print("Estimate High Salary Left Cost | {:>12,.0f} | {:>14,.0f} | {:>12,.0f}"
.format(high_left.shape[0], high_cost_increase/ high_left.shape[0], high_cost_increase))
print("Estimate Medium Salary Left Cost | {:>12,.0f} | {:>14,.0f} | {:>12,.0f}"
.format(medium_left.shape[0], medium_cost_increase/ medium_left.shape[0], medium_cost_increase))
print("Estimate Low Salary Left Cost | {:>12,.0f} | {:>14,.0f} | {:>12,.0f}"
.format(low_left.shape[0], low_cost_increase/ low_left.shape[0], low_cost_increase))
print(" | =========== | ======= | ==========")
print("Estimate TOTAL Salary Left Cost | {:>12,.0f} | {:>14,.0f}"
.format(total_count, low_cost_increase+medium_cost_increase+high_cost_increase))
Cost of 30% pay increase
──────────────────────────────────────────────────────────────────────
Estimate Retention Costs
| # of Staff | Cost Per Head | Est £ Cost
Estimate High Salary Left Cost | 48 | 45,000 | 2,160,000
Estimate Medium Salary Left Cost | 769 | 30,000 | 23,070,000
Estimate Low Salary Left Cost | 1,174 | 15,000 | 17,610,000
| =========== | ======= | ==========
Estimate TOTAL Salary Left Cost | 1,991 | 42,840,000
People who left the company work between 3 to 10 hours per month more than people who are retained.
Annualised - Assuming 255 working days (UK)
perc_diff = ((avg_hours[1] - avg_hours[0]) / avg_hours[0]) * 100
annual_diff = (avg_hours[1] - avg_hours[0]) * 12
# Display the result
print("Average Monthly Hours Comparison")
print('\u2500' * 35)
print("Left : {:.2f}".format(avg_hours[1]))
print("Employed : {:.2f}".format(avg_hours[0]))
print("Percentage Difference : {:.2f}%".format(perc_diff))
print("Annual Hrs Difference : {:.0f} hours".format(annual_diff))
print()
print("Employees who left worked {:.0f} more hours per year than retained staff".format(annual_diff))
print()
total_hours = full_data.groupby(['salary', 'left'])['avg_mnth_hrs'].mean()
# Reset the index to make the result a DataFrame
total_hours = total_hours.reset_index()
# Calculate the total difference between retained and not retained for each salary group
total_hours['total_difference'] = total_hours.groupby('salary')['avg_mnth_hrs'].diff()
# Display the result
print(total_hours)
Average Monthly Hours Comparison ─────────────────────────────────── Left : 208.16 Employed : 198.94 Percentage Difference : 4.63% Annual Hrs Difference : 111 hours Employees who left worked 111 more hours per year than retained staff salary left avg_mnth_hrs total_difference 0 0 0 198.67 NaN 1 0 1 207.32 8.66 2 1 0 199.02 NaN 3 1 1 209.76 10.75 4 2 0 199.92 NaN 5 2 1 202.98 3.06
## Visualise Hours and Salary
plotdata = pd.DataFrame(total_hours)
sns.set(style="darkgrid")
plotdata.drop(['total_difference'], axis=1)
custom_palette = ["#007acc", "#ff3e46"]
# Create a grouped bar plot using Seaborn
plt.figure(figsize=(10, 6))
sns.set_palette(custom_palette)
sns.barplot(x='salary', y='avg_mnth_hrs', hue='left', data=plotdata, palette='colorblind')
plt.xlabel('Salary Level')
plt.ylabel('Hours Worked')
plt.title('Comparison of Hours Worked by Salary and Left Status')
plt.legend(title='Left', labels=['Not Left', 'Left'], loc='lower center', bbox_to_anchor=(0.5, -0.35))
plt.show()