Document Title | Salifort Motors - HR Team Summary |
Author | Rod Slater |
Version | 1.0 |
Created | 01-11-2023 |
Modified | 16-11-2023 |
Client Name | Salifort Motors |
Client Contact | Mr HR Team |
Client Email | hr@salifortmotors.it |
Client Project | HR Team Data Driven Solutions from Machine Learning Models |
Summary plots and Analysis for the HR Team
import pickle
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# set Pandas Display Options
pd.options.display.float_format = '{:.2f}'.format
pd.set_option('display.precision', 2)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 120)
load_path = "/home/hass/Documents/Learning/Salifort-Motors-Capstone-Project/00-data_cleaned/"
save_path = "/home/hass/Documents/Learning/Salifort-Motors-Capstone-Project/00-data_cleaned/"
# Load dataset into a dataframe
df = pd.read_csv(load_path + "data_cleaned_Ol_NoFE_AllFeat.csv") # 11991 rows, all data including outliers
#df1 = pd.read_csv(load_path + "data_cleaned_NoOl_NoFE_AllFeat.csv") # 11167 rows, no outliers (mostly from tenure)
# Display first few rows of the dataframe
df.head()
satisfaction | last_eval | number_project | avg_mnth_hrs | tenure | accident | left | promotion | dept | salary | |
---|---|---|---|---|---|---|---|---|---|---|
0 | 0.38 | 0.53 | 2 | 157 | 3 | 0 | 1 | 0 | sales | low |
1 | 0.80 | 0.86 | 5 | 262 | 6 | 0 | 1 | 0 | sales | medium |
2 | 0.11 | 0.88 | 7 | 272 | 4 | 0 | 1 | 0 | sales | medium |
3 | 0.72 | 0.87 | 5 | 223 | 5 | 0 | 1 | 0 | sales | low |
4 | 0.37 | 0.52 | 2 | 159 | 3 | 0 | 1 | 0 | sales | low |
pd.options.display.float_format = '{:.2f}'.format
counts = df['left'].value_counts()
perc = df['left'].value_counts(normalize=True)*100
summary_table = pd.DataFrame({"Counts " : counts,
"Percentage " : perc})
summary_table
Counts | Percentage | |
---|---|---|
left | ||
0 | 10000 | 83.40 |
1 | 1991 | 16.60 |
# Calculate counts and percentages by department
dept_counts = df.groupby('dept')['left'].value_counts().unstack().fillna(0) # counts, combine grouped columns into an index, replace an values with 0
dept_counts['total'] = dept_counts.sum(axis=1) # Total across columns
dept_counts['left_percentage'] = (dept_counts[1] / dept_counts['total']) * 100
# Plot the bar graph
fig, ax = plt.subplots(figsize=(10, 6))
dept_counts['left_percentage'].plot(kind='bar', color='blue', ax=ax)
# Add lines at 16.6% and 2.8%
ax.axhline(y=16.6, color='red', linestyle='--', label='16.6% Salifort Average')
ax.axhline(y=17.8, color='green', linestyle='--', label='17.8% Business Avg 2022-2023')
# Add labels and legend
ax.set_xlabel('Department')
ax.set_ylabel('Percentage Left')
ax.set_title('Percentage of Employees Left by Department')
ax.legend(['16.6% Salifort Avg','17.8% Business Avg 2022-23','left %'], loc='upper center', bbox_to_anchor=(0.5, -0.3), fancybox=True, shadow=True, ncol=3)
# Show the plot
plt.show()
## Employees who left percentage by department total and salary band / Total employees by department and salary band
df_left_1 = df[df['left'] == 1]
left_1 = df_left_1.groupby(['dept', 'salary']).size().unstack()
total_df = df.groupby(['dept', 'salary']).size().unstack()
df_percentage = (left_1 / total_df) * 100
# Reset index to bring 'department' back as a column
df_percentage.reset_index(inplace=True)
# Print the result
print(df_percentage)
salary dept high low medium 0 accounting 4.76 19.93 17.94 1 hr 7.89 20.95 17.98 2 it 5.63 18.28 15.62 3 management 0.78 23.02 11.24 4 marketing 4.84 24.52 10.96 5 product_mng 7.69 17.20 16.15 6 randd 6.38 15.84 9.54 7 sales 5.06 20.54 15.11 8 support 3.97 22.03 14.01 9 technical 6.02 20.91 15.11
melted_df = pd.melt(df_percentage,value_vars=['high','low','medium'], id_vars=['dept'], value_name='percentage')
melted_df.sort_values(by=['salary'], inplace=True, ascending=False)
# Plot the bar graph
x_order = ['low','medium','high'] # set order of labels of x axis
fig, ax = plt.subplots(figsize=(12, 8))
# Plot bars for each salary level in each department
#df_percentage.plot(kind='bar', x = 'dept', stacked=False, ax=ax, order=x_order)
sns.barplot(x = 'dept', y='percentage', data=melted_df, hue='salary', palette='colorblind')
# Add lines at 16.6% and 2.8%
plt.axhline(y=16.6, color='red', linestyle='--', label='Salifort Average Turnover')
ax.axhline(y=17.8, color='green', linestyle='--', label='US Business Avg 2022-23')
# Add labels and legend
ax.set_xlabel('Department')
ax.set_ylabel('%')
plt.xticks(rotation=90)
ax.set_title('Distribution of Salary Levels by Department for Employees who Left')
ax.legend(loc='upper center', bbox_to_anchor=(0.5, -0.2), fancybox=True, shadow=True, ncol=3)
# Show the plot
plt.show()
#left_df = df[df['left'] == 1][['salary', 'dept', 'last_eval', 'tenure', 'satisfaction', 'left', 'number_project', 'avg_mnth_hrs']]
#stayed_df = df[['salary', 'dept', 'last_eval', 'tenure', 'satisfaction', 'left', 'number_project', 'avg_mnth_hrs']]
#df['dept'].unique()
array(['sales', 'accounting', 'hr', 'technical', 'support', 'management', 'it', 'product_mng', 'marketing', 'randd'], dtype=object)
Box plots of feature vs salary for all employees next to employees who left
def plot_compare_company(var:str):
''' Violinplot, side by side comparison: All Company, employees who left, employees who stayed
in :
team : Str : Team name to analyse
var : dataset variable to plot
'''
#df[(df['left'] == 1)] # & (df['dept'] == 'sales')]
#left_df = df[(df['left'] == 1) & (df['dept'] == team)][['salary', 'dept', 'last_eval', 'tenure', 'satisfaction', 'left', 'number_project', 'avg_mnth_hrs']]
left_df = df[(df['left'] == 1) ][['salary', 'dept', 'last_eval', 'tenure', 'satisfaction', 'left', 'number_project', 'avg_mnth_hrs']]
#stayed_df = df[(df['left'] == 0) & (df['dept'] == team)][['salary', 'dept', 'last_eval', 'tenure', 'satisfaction', 'left', 'number_project', 'avg_mnth_hrs']]
stayed_df = df[(df['left'] == 0) ][['salary', 'dept', 'last_eval', 'tenure', 'satisfaction', 'left', 'number_project', 'avg_mnth_hrs']]
all_df= df[['salary', 'dept', 'last_eval', 'tenure', 'satisfaction', 'left', 'number_project', 'avg_mnth_hrs']]
labels = df['salary'].unique()
# Set the style of the visualization
sns.set(style="darkgrid")
#plt.figure(figsize=(8, 6))
fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(16, 6), sharey=True)
#sns.barplot(x='dept', y='satisfaction', hue='salary', data=df, ci=None, estimator='mean', palette='colorblind')
# sns.boxplot(x='dept', y=var, hue='left', data=df, palette='colorblind')
sns.violinplot(x='salary', y=var, data=all_df, hue='salary', ax=axes[0], palette='colorblind')
axes[0].set_title(f'{var} - All Employees')
axes[0].set_xlabel('Salary')
axes[0].set_ylabel(f'{var}')
axes[0].set_xticks(labels)
axes[0].set_xticklabels(labels,rotation=0)
#plt.title(f'Employees who left, {var} by Department')
sns.violinplot(x='salary', y=var, data=left_df, hue='salary', ax=axes[1], palette='colorblind')
axes[1].set_title(f'{var} - Employees who Left')
axes[1].set_xlabel('Salary')
axes[1].set_ylabel(f'{var}')
axes[1].set_xticks(labels)
axes[1].set_xticklabels(labels,rotation=0)
sns.violinplot(x='salary', y=var, data=stayed_df, hue='salary', ax=axes[2], palette='colorblind')
axes[2].set_title(f'{var} - Employees who Stayed')
axes[2].set_xlabel('Salary')
axes[2].set_ylabel(f'{var}')
axes[2].set_xticks(labels)
axes[2].set_xticklabels(labels,rotation=0)
plt.tight_layout()
# Show the plot
plt.show()
varlist= df.drop(columns=['accident', 'left','promotion'])
variables = varlist.columns
for var in variables:
plot_compare_company(var)
Plot of key features by dept quartile distribution & Violin plot
var = 'number_project' # Variable to plot
hsplit = 4 # Where to place the horizontal line
# List of unique departments
departments = df['dept'].unique()
#dep = ['sales']
#departments = np.array(dep)
# Set the style of the visualization
sns.set(style="darkgrid")
# Loop through departments
for department in departments:
# Subset data for the current department
department_data = df[df['dept'] == department]
# Create a figure with two subplots (box plot and scatter plot)
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(15, 6), sharey=False)
# Box plot
sns.boxplot(x=var, y='salary', data=department_data, hue = 'left', ax=axes[0], palette='colorblind')
axes[0].set_title(f'{var} - Quartile Distribution for {department}')
# axes[0].axhline(y=hsplit, color='red', linestyle='--', linewidth=2.5, label='Vertical Line 2')
# Scatter plot
#sns.scatterplot(x='salary', y='last_eval', data=department_data, hue='left', ax=axes[1], palette='colorblind')
sns.violinplot(x='salary', y=var, data=department_data, hue='left', ax=axes[1], palette='colorblind')
axes[1].axvline(x=0.08, color='red', linestyle='--', label='Vertical Line 1')
axes[1].axvline(x=0.32, color='blue', linestyle='--', label='Vertical Line 2')
axes[1].axvline(x=1.08, color='red', linestyle='--', label='Vertical Line 1')
axes[1].axvline(x=1.32, color='blue', linestyle='--', label='Vertical Line 2')
axes[1].axvline(x=2.08, color='red', linestyle='--', label='Vertical Line 1')
axes[1].axvline(x=2.32, color='blue', linestyle='--', label='Vertical Line 2')
axes[1].axhline(y=hsplit, color='red', linestyle='--', linewidth=2.5, label='Vertical Line 2')
axes[1].set_title(f'{var} - Violin Plot for {department}')
# Adjust layout
plt.tight_layout()
# Show the plots
plt.show()
var = 'avg_mnth_hrs' # Variable to plot
hsplit = 175 # Where to place the horizontal line
# List of unique departments
departments = df['dept'].unique()
#dep = ['sales']
#departments = np.array(dep)
# Set the style of the visualization
sns.set(style="whitegrid")
# Loop through departments
for department in departments:
# Subset data for the current department
department_data = df[df['dept'] == department]
# Calculate the percentage of outliers (for example, considering values beyond 1.5*IQR as outliers)
Q1 = df[var].quantile(0.25)
Q3 = df[var].quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - .25 * IQR
upper_bound = Q3 + .25 * IQR
outliers_percentage = ((department_data[var] < lower_bound) | (department_data[var] > upper_bound)).mean() * 100
# Annotate the plot with the percentage of outliers
# Create a figure with two subplots (box plot and scatter plot)
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(15, 6), sharey=True)
# Box plot
sns.boxplot(x='salary', y=var, data=department_data, hue = 'left', ax=axes[0], palette='colorblind')
axes[0].set_title(f'Quartile Distribution for {department}')
axes[0].axhline(y=hsplit, color='red', linestyle='--', linewidth=2.5, label='Vertical Line 2')
# Scatter plot
#sns.scatterplot(x='salary', y='last_eval', data=department_data, hue='left', ax=axes[1], palette='colorblind')
sns.violinplot(x='salary', y=var, data=department_data, hue='left', ax=axes[1], palette='colorblind')
axes[1].axvline(x=0.08, color='red', linestyle='--', label='Vertical Line 1')
axes[1].axvline(x=0.32, color='blue', linestyle='--', label='Vertical Line 2')
axes[1].axvline(x=1.08, color='red', linestyle='--', label='Vertical Line 1')
axes[1].axvline(x=1.32, color='blue', linestyle='--', label='Vertical Line 2')
axes[1].axvline(x=2.08, color='red', linestyle='--', label='Vertical Line 1')
axes[1].axvline(x=2.32, color='blue', linestyle='--', label='Vertical Line 2')
axes[1].axhline(y=hsplit, color='red', linestyle='--', linewidth=2.5, label='Vertical Line 2')
axes[1].set_title(f'Scatter Plot for {department}')
axes[1].text(0.5, 1.05, f'{outliers_percentage:.2f}% Outliers', transform=plt.gca().transAxes, fontsize=10, color='red')
# Adjust layout
plt.tight_layout()
# Show the plots
plt.show()