| Document Title | Salifort Motors - EDA Data Cleansing |
| --- | --- |
| Author | Rod Slater |
| Version | 1.0 |
| Created | 01-11-2023 |
| Modified | 16-11-2023 |
| Client Name | Salifort Motors |
| Client Contact | Mr HR Team |
| Client Email | hr@salifortmotors.it |
| Client Project | HR Team Data Driven Solutions from Machine Learning Models |
EDA Data Cleansing notebook for HR Data provided by Salifort Motors. This notebook details the steps taken to clean the data prior to data analysis.
The last_evaluation score is sourced from the team manager.
# Import packages
# Data manipulation
import numpy as np
import pandas as pd
# Data visualization
import matplotlib.pyplot as plt
import seaborn as sns
# Set Options
pd.set_option('display.max_columns', None)
# Data modelling Imports
from xgboost import XGBClassifier, XGBRegressor, plot_importance
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
# For metrics and helpful functions
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score,\
f1_score, confusion_matrix, ConfusionMatrixDisplay, classification_report
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.tree import plot_tree
# For saving models
import pickle
# Set pandas display options
pd.options.display.float_format = '{:.2f}'.format
pd.set_option('display.precision', 2)
pd.set_option('display.width', 120)
load_path = "/home/hass/Documents/Learning/Salifort-Motors-Capstone-Project/00-data_source/"
save_path = "/home/hass/Documents/Learning/Salifort-Motors-Capstone-Project/00-data_cleaned/"
# Load dataset into a dataframe
df0 = pd.read_csv(load_path + "HR_capstone_dataset.csv")
# Display first few rows of the dataframe
df0.head()
| | satisfaction_level | last_evaluation | number_project | average_monthly_hours | time_spend_company | Work_accident | left | promotion_last_5years | Department | salary |
|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.38 | 0.53 | 2 | 157 | 3 | 0 | 1 | 0 | sales | low |
1 | 0.80 | 0.86 | 5 | 262 | 6 | 0 | 1 | 0 | sales | medium |
2 | 0.11 | 0.88 | 7 | 272 | 4 | 0 | 1 | 0 | sales | medium |
3 | 0.72 | 0.87 | 5 | 223 | 5 | 0 | 1 | 0 | sales | low |
4 | 0.37 | 0.52 | 2 | 159 | 3 | 0 | 1 | 0 | sales | low |
Variable | Description |
---|---|
satisfaction_level | Employee-reported job satisfaction level [0–1] |
last_evaluation | Score of employee's last performance review [0–1] |
number_project | Number of projects employee contributes to |
average_monthly_hours | Average number of hours employee worked per month |
time_spend_company | How long the employee has been with the company (years) |
Work_accident | Whether or not the employee experienced an accident while at work |
left | Whether or not the employee left the company |
promotion_last_5years | Whether or not the employee was promoted in the last 5 years |
Department | The employee's department |
salary | The employee's salary band (low, medium, high) |
# Gather basic information about the data
df0.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14999 entries, 0 to 14998
Data columns (total 10 columns):
 #   Column                 Non-Null Count  Dtype
---  ------                 --------------  -----
 0   satisfaction_level     14999 non-null  float64
 1   last_evaluation        14999 non-null  float64
 2   number_project         14999 non-null  int64
 3   average_monthly_hours  14999 non-null  int64
 4   time_spend_company     14999 non-null  int64
 5   Work_accident          14999 non-null  int64
 6   left                   14999 non-null  int64
 7   promotion_last_5years  14999 non-null  int64
 8   Department             14999 non-null  object
 9   salary                 14999 non-null  object
dtypes: float64(2), int64(6), object(2)
memory usage: 1.1+ MB
# Gather descriptive statistics about the data
df0.describe(include='all')
| | satisfaction_level | last_evaluation | number_project | average_monthly_hours | time_spend_company | Work_accident | left | promotion_last_5years | Department | salary |
|---|---|---|---|---|---|---|---|---|---|---|
count | 14999.00 | 14999.00 | 14999.00 | 14999.00 | 14999.00 | 14999.00 | 14999.00 | 14999.00 | 14999 | 14999 |
unique | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 10 | 3 |
top | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | sales | low |
freq | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 4140 | 7316 |
mean | 0.61 | 0.72 | 3.80 | 201.05 | 3.50 | 0.14 | 0.24 | 0.02 | NaN | NaN |
std | 0.25 | 0.17 | 1.23 | 49.94 | 1.46 | 0.35 | 0.43 | 0.14 | NaN | NaN |
min | 0.09 | 0.36 | 2.00 | 96.00 | 2.00 | 0.00 | 0.00 | 0.00 | NaN | NaN |
25% | 0.44 | 0.56 | 3.00 | 156.00 | 3.00 | 0.00 | 0.00 | 0.00 | NaN | NaN |
50% | 0.64 | 0.72 | 4.00 | 200.00 | 3.00 | 0.00 | 0.00 | 0.00 | NaN | NaN |
75% | 0.82 | 0.87 | 5.00 | 245.00 | 4.00 | 0.00 | 0.00 | 0.00 | NaN | NaN |
max | 1.00 | 1.00 | 7.00 | 310.00 | 10.00 | 1.00 | 1.00 | 1.00 | NaN | NaN |
As a data cleaning step, rename the columns as needed: standardize the column names so that they are all in snake_case, correct any misspelled column names, and make column names more concise as needed.
# Display all column names
df0.columns
Index(['satisfaction_level', 'last_evaluation', 'number_project', 'average_monthly_hours', 'time_spend_company', 'Work_accident', 'left', 'promotion_last_5years', 'Department', 'salary'], dtype='object')
# Rename columns as needed
df0 = df0.rename(columns = {'satisfaction_level': 'satisfaction',
'last_evaluation' : 'last_eval',
'average_monthly_hours': 'avg_mnth_hrs',
'time_spend_company': 'tenure',
'Work_accident': 'accident',
'Department': 'dept',
'promotion_last_5years': 'promotion'})
# Display all column names after the update
df0.columns
Index(['satisfaction', 'last_eval', 'number_project', 'avg_mnth_hrs', 'tenure', 'accident', 'left', 'promotion', 'dept', 'salary'], dtype='object')
Check for any missing values in the data.
There are no missing values in any column.
# Check for missing values
df0.isna().sum()
satisfaction      0
last_eval         0
number_project    0
avg_mnth_hrs      0
tenure            0
accident          0
left              0
promotion         0
dept              0
salary            0
dtype: int64
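The same check can be expressed as a programmatic guard; a minimal sketch (not in the original notebook) that raises if nulls ever appear in a refreshed extract:
# Fail fast if any column contains missing values
assert df0.isna().sum().sum() == 0, "Unexpected missing values in the raw data"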
Check for any duplicate entries in the data.
Of the 14,999 total rows in the table, 3008 (20%) appear to be duplicated.
# Check for duplicates
df0.duplicated().sum()
3008
Before any cleaning is carried out, we need to check the class balance of the target variable.
We can see a class imbalance: roughly 76% of employees stayed (left = 0) and 24% left (left = 1).
print(df0['left'].value_counts())
print(df0['left'].value_counts(normalize=True))
left
0    11428
1     3571
Name: count, dtype: int64
left
0   0.76
1   0.24
Name: proportion, dtype: float64
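This 76/24 imbalance matters when the data is eventually split for modelling: the train_test_split imported above supports stratification, so both subsets keep the same proportion of leavers. A minimal, illustrative sketch only (the test_size and random_state values are assumptions, and the modelling notebooks would split the cleaned data rather than df0):
# Illustrative stratified split that preserves the ~76/24 class balance in both subsets
X = df0.drop(columns=['left'])
y = df0['left']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, stratify=y, random_state=42)
print(y_train.value_counts(normalize=True))
print(y_test.value_counts(normalize=True))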
Before removing any duplicated rows, we need to check the class balance within the duplicated rows.
The duplicated rows are split roughly evenly between the two classes (53% left, 47% stayed), and the 3,008 duplicates represent about 20% of the 14,999 total rows. We drop them to remove any impact of duplicates on the model.
# Subset the duplicated rows
df_duplicated = df0[df0.duplicated()]
print(df_duplicated['left'].value_counts())
print(df_duplicated['left'].value_counts(normalize=True))
left
1    1580
0    1428
Name: count, dtype: int64
left
1   0.53
0   0.47
Name: proportion, dtype: float64
# Inspect some rows containing duplicates as needed
df_duplicated.sample(20)
| | satisfaction | last_eval | number_project | avg_mnth_hrs | tenure | accident | left | promotion | dept | salary |
|---|---|---|---|---|---|---|---|---|---|---|
13561 | 0.53 | 0.61 | 3 | 148 | 10 | 0 | 0 | 0 | management | high |
14250 | 0.41 | 0.54 | 2 | 152 | 3 | 0 | 1 | 0 | technical | low |
14067 | 0.81 | 0.92 | 4 | 268 | 3 | 0 | 0 | 0 | sales | low |
13459 | 0.72 | 0.80 | 3 | 222 | 3 | 0 | 0 | 0 | IT | medium |
14203 | 0.62 | 0.85 | 3 | 237 | 3 | 1 | 0 | 0 | IT | medium |
13418 | 0.49 | 0.65 | 4 | 233 | 7 | 0 | 0 | 0 | sales | medium |
14930 | 0.11 | 0.89 | 6 | 268 | 4 | 0 | 1 | 0 | IT | medium |
13622 | 0.94 | 0.51 | 6 | 239 | 5 | 0 | 0 | 0 | sales | medium |
13695 | 0.78 | 0.65 | 3 | 139 | 3 | 0 | 0 | 0 | marketing | high |
12173 | 0.10 | 0.88 | 6 | 284 | 4 | 0 | 1 | 0 | sales | low |
13742 | 0.67 | 0.72 | 2 | 134 | 10 | 0 | 0 | 0 | management | high |
13693 | 0.87 | 0.97 | 5 | 151 | 3 | 0 | 0 | 0 | RandD | medium |
14224 | 0.41 | 0.55 | 2 | 148 | 3 | 0 | 1 | 0 | sales | low |
14440 | 0.78 | 0.98 | 5 | 239 | 6 | 0 | 1 | 0 | marketing | low |
14426 | 0.36 | 0.51 | 2 | 155 | 3 | 0 | 1 | 0 | IT | low |
14393 | 0.89 | 0.99 | 5 | 257 | 5 | 0 | 1 | 0 | accounting | medium |
14475 | 0.42 | 0.50 | 2 | 139 | 3 | 0 | 1 | 0 | hr | medium |
14533 | 0.61 | 0.47 | 2 | 253 | 3 | 0 | 1 | 0 | sales | low |
13640 | 0.78 | 0.57 | 4 | 157 | 3 | 0 | 0 | 0 | accounting | low |
14068 | 0.79 | 0.62 | 5 | 167 | 3 | 0 | 0 | 0 | sales | low |
# Drop duplicates and save resulting dataframe in a new variable as needed
print("Pre De-dupe Rows : ",df0.shape[0])
df1 = df0.drop_duplicates(keep = 'first')
df1.reset_index(inplace=True, drop=True)
# Display first few rows of new dataframe as needed
print("Post De-dupe Rows : ",df1.shape[0])
print("De-dupe rows removed : ", df0.shape[0] - df1.shape[0])
df1.head()
Pre De-dupe Rows :  14999
Post De-dupe Rows :  11991
De-dupe rows removed :  3008
| | satisfaction | last_eval | number_project | avg_mnth_hrs | tenure | accident | left | promotion | dept | salary |
|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.38 | 0.53 | 2 | 157 | 3 | 0 | 1 | 0 | sales | low |
1 | 0.80 | 0.86 | 5 | 262 | 6 | 0 | 1 | 0 | sales | medium |
2 | 0.11 | 0.88 | 7 | 272 | 4 | 0 | 1 | 0 | sales | medium |
3 | 0.72 | 0.87 | 5 | 223 | 5 | 0 | 1 | 0 | sales | low |
4 | 0.37 | 0.52 | 2 | 159 | 3 | 0 | 1 | 0 | sales | low |
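As a sanity check, the class balance can be recomputed on the de-duplicated data to see how far the original 76/24 split shifts once the duplicates (which skewed towards leavers) are removed:
# Re-check the target balance after removing duplicates
print(df1['left'].value_counts())
print(df1['left'].value_counts(normalize=True))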
print(df1['salary'].unique())
print(df1['dept'].unique())
['low' 'medium' 'high']
['sales' 'accounting' 'hr' 'technical' 'support' 'management' 'IT' 'product_mng' 'marketing' 'RandD']
# Normalise department names to lowercase for consistency
df1.loc[:, 'dept'] = df1['dept'].str.lower()
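A quick check (a sketch, not in the original notebook) confirms the department labels are now uniformly lowercase, e.g. 'IT' becomes 'it' and 'RandD' becomes 'randd':
# Verify department labels after normalising case
print(df1['dept'].unique())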
# Save dataframe to CSV: full dataset, tenure outliers retained, duplicates removed, no feature engineering
df1.to_csv(save_path + 'data_cleaned_Ol_NoFE_AllFeat.csv', encoding='utf-8', index=False)
df1.shape
(11991, 10)
Check for outliers in the data.
# Create a boxplot to visualize distribution of `tenure` and detect any outliers
plt.figure(figsize = (8,6))
plt.title('Outliers for tenure', fontsize = 12)
plt.xticks(fontsize = 12)
plt.yticks(fontsize = 12)
sns.boxplot(x = df1['tenure'], hue=df1['left'], palette='colorblind')
plt.show()
# Calculate percentiles and Interquartile range :
# The interquartile range (IQR) is a measure of statistical dispersion that
# describes the spread or variability of a dataset.
# Calculate 25th percentile
print(df1.shape)
perc25 = df1['tenure'].quantile(0.25)
# Calculate 75th percentile
perc75 = df1['tenure'].quantile(0.75)
# Calculate the IQR (interquartile range)
iqr = perc75 - perc25
# Define the upper and lower limits for non-outlier values in 'tenure'
upper_limit = perc75 + 1.5 * iqr
lower_limit = perc25 - 1.5 * iqr
# subset data containing outliers
outliers = df1[(df1['tenure'] > upper_limit) | (df1['tenure'] < lower_limit)]
num_df_rows = df1.shape[0]
num_outliers = len(outliers)
perc_outliers = num_outliers / num_df_rows * 100
# Summary
print("IQR : ", iqr)
print("Lower limit : ", lower_limit)
print("Upper limit : ", upper_limit)
print("# rows with outliers : ", num_outliers)
print("% rows with outliers : {:.2f} %".format(perc_outliers))
(11991, 10)
IQR :  1.0
Lower limit :  1.5
Upper limit :  5.5
# rows with outliers :  824
% rows with outliers : 6.87 %
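The same IQR rule can be wrapped in a small helper so that other numeric columns (for example avg_mnth_hrs or number_project) can be screened in the same way. This is a sketch added for illustration, not part of the original cleaning steps; the function name and the k=1.5 multiplier are assumptions:
def iqr_outlier_bounds(series, k=1.5):
    """Return (lower, upper) IQR-based outlier limits for a numeric Series."""
    q1, q3 = series.quantile(0.25), series.quantile(0.75)
    iqr_val = q3 - q1
    return q1 - k * iqr_val, q3 + k * iqr_val

# Example: screen average monthly hours with the same rule
low, high = iqr_outlier_bounds(df1['avg_mnth_hrs'])
n_out = ((df1['avg_mnth_hrs'] < low) | (df1['avg_mnth_hrs'] > high)).sum()
print(low, high, n_out)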
# REMOVE OUTLIERS
print(df1.shape)
# Remove the rows identified in the 'outliers' subset from df1
df1 = df1[~df1.isin(outliers)].dropna()
# Reset the index of the modified DataFrame (optional)
df1.reset_index(drop=True, inplace=True)
print(df1.shape)
(11991, 10)
(11167, 10)
Some types of ML model (notably regression-based models such as logistic regression) are more sensitive to outliers than others, so a second version of the dataset, with the tenure outliers removed, is saved separately.
# Save dataframe to CSV: tenure outliers removed, duplicates removed, no feature engineering
df1.to_csv(save_path + 'data_cleaned_NoOl_NoFE_AllFeat.csv', encoding='utf-8', index=False)
df1.shape
(11167, 10)
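As a closing sanity check (a sketch, not part of the original notebook), both cleaned files can be reloaded to confirm they round-trip with the expected shapes:
# Reload both cleaned CSVs and confirm their shapes
for fname in ['data_cleaned_Ol_NoFE_AllFeat.csv', 'data_cleaned_NoOl_NoFE_AllFeat.csv']:
    check_df = pd.read_csv(save_path + fname)
    print(fname, check_df.shape)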