# Import packages

# Data manipulation
import numpy as np
import pandas as pd

# Data visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Data modelling Imports
from xgboost import XGBClassifier, XGBRegressor, plot_importance

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# For metrics and helpful functions
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score,\
f1_score, confusion_matrix, ConfusionMatrixDisplay, classification_report
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.tree import plot_tree

from scipy import stats

# For saving models
import pickle

# Add a little colour

class color:
   PURPLE = '\033[95m'
   CYAN = '\033[96m'
   DARKCYAN = '\033[36m'
   BLUE = '\033[94m'
   GREEN = '\033[92m'
   YELLOW = '\033[93m'
   RED = '\033[91m'
   BOLD = '\033[1m'
   UNDERLINE = '\033[4m'
   END = '\033[0m'

def correlation_single(dataframe: object, var: str):
    """Return a single row from the correlation plot.
    in  : dataframe: Object = The dataframe to EDA
          var: String       = variable to summarise
    out : correlation heatmap plot
    """
    df_corr = dataframe.corr().sort_values(by = var, axis = 1, ascending = False) # create a correlation dataframe

    corr_index = df_corr.index.get_loc(var) +1 # Get the row with the data we want
    
    plt.figure(figsize=(10, 1))
    sns.heatmap(df_corr.iloc[corr_index-1:corr_index,:20], annot=True, annot_kws={"size": 10}, 
                fmt='.0%', cmap='YlOrRd', cbar = False) # do plot 'magic'
    
    return plt

def eda_single(dataframe: object, var: str, decplace:int):

    """Return a df summary of the EDA variable.
    in  : dataframe: Object = The dataframe to EDA
          var: String       = variable to summarise
          decplace: int     = Number of decimal place for pandas.options.display.float_format
    out : dataframe summary of the variable 
    """

    float_option = '{:.' + str(decplace) + 'f}'
    pd.options.display.float_format = float_option.format #2 decimal places, floating point


    
    # Create separate DataFrames for 'left = 0' and 'left = 1'
    left_0_df = dataframe[dataframe['left'] == 0]
    left_1_df = dataframe[dataframe['left'] == 1]
    
    # Calculate the count of rows within each category for both groups
    left_0_count = left_0_df.groupby(left_0_df[var], observed=False).size().fillna(0)
    left_1_count = left_1_df.groupby(left_1_df[var], observed=False).size().fillna(0)
     
   
    
    # Calculate the total count for each category
    total_count = left_0_count + left_1_count
    
    # Calculate the percentage of the column total where 'left = 1' or 'left = 0' for each category
    left_0_percentage = ((left_0_count / total_count) * 100)#.fillna(100)#.replace([np.inf, -np.inf], 100)
    left_1_percentage = ((left_1_count / total_count) * 100)#.fillna(100).replace([np.inf, -np.inf], 100)

   
    # Combine the results into a summary table
    summary_table = pd.DataFrame({
        'Stayed Count': left_0_count,
        'Left Count': left_1_count,
        'Total Count': left_0_count+left_1_count,
        'Total Perc of Total' : (left_1_count+left_0_count)/(left_1_count+left_0_count).sum()*100,
        'Stayed Perc of Stayed Total' : (left_0_count/left_0_count.sum()*100),
        'Left Perc of Left Total' : (left_1_count/left_1_count.sum()*100),
        'Stayed %': left_0_percentage,
        'Left %': left_1_percentage
    })

    summary_table = summary_table.fillna(0)
    
    summary_vars = ['Stayed Count','Left Count', 'Total Count', 'Total Perc of Total', 
                    'Stayed Perc of Stayed Total', 'Left Perc of Left Total']# , 'Stayed %','Left %']
    summary_table.loc['Total'] = summary_table[summary_vars].sum()
    return summary_table

def eda_dual(dataframe: object, var1: str, var2: str, decplace: int):

    """Return a df summary of the EDA variable.
    in  : dataframe: Object = The dataframe to EDA
          var1: String      = variable to summarise
          var2: String      = second variable
          decplace: int     = Number of decimal place for pandas.options.display.float_format
    out : binned dataframe summary of the variable 
    """

    float_option = '{:.' + str(decplace) + 'f}'
    pd.options.display.float_format = float_option.format #2 decimal places, floating point
    
    # Create separate DataFrames for 'left = 0' and 'left = 1'
    left_0_df = dataframe[dataframe['left'] == 0]
    left_1_df = dataframe[dataframe['left'] == 1]
    
    # Calculate the count of rows within each category for both groups
    left_0_count = left_0_df.groupby([left_0_df[var1], left_0_df[var2]], observed=False).size().fillna(0)
    left_1_count = left_1_df.groupby([left_1_df[var1], left_1_df[var2]], observed=False).size().fillna(0)
    
    # Calculate the total count for each category
    total_count = left_0_count + left_1_count
    
    # Calculate the percentage of the column total where 'left = 1' or 'left = 0' for each category
    left_0_percentage = ((left_0_count / total_count) * 100)
    left_1_percentage = ((left_1_count / total_count) * 100)

    

    # Combine the results into a summary table
    summary_table = pd.DataFrame({
        'Stayed Count': left_0_count,
        'Left Count': left_1_count,
        'Total Count': left_0_count+left_1_count,
        'Total Perc of Total' : (left_1_count+left_0_count)/(left_1_count+left_0_count).sum()*100,
        'Stayed Perc of Stayed Total' : (left_0_count/left_0_count.sum()*100),
        'Left Perc of Left Total' : (left_1_count/left_1_count.sum()*100),
        'Stayed %': left_0_percentage,
        'Left %': left_1_percentage
    })

    summary_table = summary_table.fillna(0)
    summary_table.index.name = "dave"
    

    summary_vars = ['Stayed Count','Left Count', 'Total Count', 'Total Perc of Total', 
                    'Stayed Perc of Stayed Total', 'Left Perc of Left Total']# , 'Stayed %','Left %']
    summary_table.loc['Total'] = summary_table[summary_vars].sum()
    return summary_table

def eda_single_bins(dataframe: object, var: str, bins: list, labels: list, decplace: int):

    """Return a BINNED df summary of the EDA variable.
    in  : dataframe: Object = The dataframe to EDA
        : var: String       = variable to summarise
        : bins: List        = list object of the bin groupings e.g. [0, 0.33, 0.5, 0.7, 1] 
        : labels: List      = list object of the bin labels e.g. ['low', 'medium', 'high'] 
        : decplace: int     = Number of decimal place for pandas.options.display.float_format
    out : binned dataframe summary of the variable 
    """

    float_option = '{:.' + str(decplace) + 'f}' # construct option
    pd.options.display.float_format = float_option.format 


    # Define the bins for categorising satisfaction scores, for EDA only. 
#    bins = [0, 1, 2, 3, 4, 5, 6, 7]
#    labels = ['1', '2','3','4', '5', '6', '7']
    
    # Create separate DataFrames for 'left = 0' and 'left = 1'
    left_0_df = dataframe[dataframe['left'] == 0]
    left_1_df = dataframe[dataframe['left'] == 1]
    
    # Calculate the count of rows within each category for both groups
    left_0_count = left_0_df.groupby(pd.cut(left_0_df[var], bins=bins, labels=labels),observed=False).size()
    left_1_count = left_1_df.groupby(pd.cut(left_1_df[var], bins=bins, labels=labels),observed=False).size()
    
    # Calculate the total count for each category
    total_count = left_0_count + left_1_count
    
    # Calculate the percentage of the column total where 'left = 1' or 'left = 0' for each category
    left_0_percentage = (left_0_count / total_count) * 100
    left_1_percentage = (left_1_count / total_count) * 100
    
    # Combine the results into a summary table
    summary_table = pd.DataFrame({
        'Stayed Count': left_0_count,
        'Left Count': left_1_count,
        'Total Count': left_0_count+left_1_count,
        'Total Perc of Total' : (left_1_count+left_0_count)/(left_1_count+left_0_count).sum()*100,
        'Stayed Perc of Stayed Total' : (left_0_count/left_0_count.sum()*100),
        'Left Perc of Left Total' : (left_1_count/left_1_count.sum()*100),
        'Stayed %': left_0_percentage,
        'Left %': left_1_percentage
    })
    
    summary_vars = ['Stayed Count','Left Count', 'Total Count', 'Total Perc of Total', 
                    'Stayed Perc of Stayed Total', 'Left Perc of Left Total']# , 'Stayed %','Left %']
    summary_table.loc['Total'] = summary_table[summary_vars].sum()
    return summary_table

def quadplot(dataframe:object, var:str, annotate:bool):
    """Quad plot 2x2 histogram, heatmap, barplot, violinplot, 
    in  : dataframe     : Object        = Dataframe to analyse 
        : var           : Variable      = variable to summarise
        : annotate      : Bool          = Annotate the heatmap (if lots of data select False) True / False
    out : 4 x plots of the variable. 
    """

    # Quad plot 2x2 histogram, heatmap, barplot, violinplot, 
    
    # Create a figure with two subplots (histogram and scatterplot)
    fig, axes = plt.subplots(2, 2, figsize=(12, 12))
    sns.set(style="darkgrid")

    binary_labels=['stayed','left']
    
    # Plot a histogram of the continuous variable
    sns.histplot(data=dataframe, x=var, hue='left', bins=3, 
                            element='step', common_norm=False, ax=axes[0,0], palette='colorblind')
    #histplot.legend(title='left', labels=['left', 'stayed'])
    axes[0,0].set_title(f'Histogram of {var} by left')
    axes[0,0].set_xlabel(f'{var}')
    axes[0,0].set_ylabel('Frequency')
    axes[0,0].legend(loc='upper center', labels=['left', 'stayed'],
    bbox_to_anchor=(0.5, 0.5), fancybox=True, shadow=True, ncol=2) # title='left',
    
    # Create a pivot table for the heatmap
    heatmap_data = dataframe.pivot_table(index='left', columns=var, aggfunc='size', fill_value=0)
    sns.set(style="darkgrid")
    # Create a heatmap
    plt.figure(figsize=(8, 5))
    sns.heatmap(heatmap_data, cmap='Oranges', annot=annotate, vmin=min(heatmap_data.min()), 
                vmax=max(heatmap_data.max()), fmt=".1f", 
                cbar_kws={'label': 'Values'} , cbar=False, ax=axes[0,1], yticklabels=binary_labels)
    axes[0,1].set_title(f'Heatmap of {var} by left')
    axes[0,1].set_xlabel(var)
   # axes[0,1].set_ylabel('left')

   
    
    
    # Create a scatterplot
    plt.figure(figsize=(8, 5))
    sns.set(style="darkgrid")
    sns.boxplot(x='left', y=var, data=dataframe, hue='left', ax=axes[1, 0], palette='colorblind')
    #boxplot.legend(title='left', labels=['left', 'stayed'])
    axes[1, 0].set_title(f'Box Plot of {var} by left')
    axes[1, 0].set_xlabel('left')
    axes[1, 0].set_ylabel(var)
    axes[1, 0].set_xticks([0,1],['stayed','left'])
    
    # Create a violin plot
    sns.set(style="darkgrid")
    sns.violinplot(x='left', y=var, data=dataframe, hue='left', ax=axes[1, 1], palette='colorblind')
    axes[1, 1].set_title(f'Violin Plot of {var} by left')
    axes[1, 1].set_xlabel('left')
    axes[1, 1].set_ylabel(f'{var}')
    axes[1, 1].set_xticks([0,1],['stayed','left'])
    # Adjust plot spacing
    fig.subplots_adjust(hspace=0.3, wspace=0.25)
    
    # Show the combined plot
    return plt

#returned_plot = quadplot(df1, 'last_eval', False)
#returned_plot.show()

def scatplot_single(dataframe: object, x_var: str, y_var: str):
    """Create scatterplot of two variables, 
    in  : dataframe : Object = Dataframe to analyse 
        : x_var     : Str    = First variable to plot on x-axis
        : y_var     : Str    = Second variable to plot on y-axis
    out : single scatterplot
    """
    dataframe.sort_values(by='left', ascending = False)
    sns.set(style="darkgrid")
    # Create scatterplot of two variables
    plt.figure(figsize=(10, 6))
    sns.scatterplot(data=dataframe, x=x_var, y=y_var, hue='left', alpha=1, palette='colorblind')
    plt.title(f'{x_var}  by {y_var}', fontsize='14');

def scatplot_multi(dataframe: object, var1: str):
    """Create scatterplots of one variable against all variables in dataframe, 
    in  : dataframe : Object = Dataframe to analyse 
        : var1: Variable      = Variable to plot on y-axis
    out : single scatterplot
    """

    column_list = ['last_eval','number_project','avg_mnth_hrs','promotion','tenure','satisfaction', 'salary', 'accident']

    column_list.remove(var1)

    #column_list


    df = dataframe.copy() # .sort_values(by='number_project', ascending=True)

    # Calculate the number of subplots needed based on the number of continuous variables (6) 
    num_subplots = len(column_list)

    # Create a subplot grid with the appropriate number of columns
    num_columns = 2  # Adjust this based on  layout preference
    num_rows = (num_subplots + num_columns - 1) // num_columns  # Calculate the number of rows

    fig, axes = plt.subplots(num_rows, num_columns, figsize=(18, 18))#, sharey = True) # Set plot sizes
    fig.subplots_adjust(hspace=0.5, wspace=0.2)
    sns.set(style="darkgrid")

    # Flatten the axes if there's only one row
    if num_rows == 1:
        axes = [axes]

    # Loop through the continuous variables
    for i, var in enumerate(column_list):
        # Determine the current subplot's row and column
        row = i // num_columns
        col = i % num_columns # Modulus / get the remainder 
        
        # Create a histogram in the current subplot
        sns.scatterplot(data=df, x=var, y = var1, hue='left', alpha=0.19, ax=axes[row][col], palette='colorblind')
        #axes[row][col].set_title(f'Histogram for {var}')
        axes[row][col].set_xlabel(var)
        axes[row][col].set_ylabel(var1)
        #axes[row][col].legend(loc ='upper center', bbox_to_anchor=(0.6,-0.5), fancybox=True, shadow=True, ncol = 2, 
        title='left', labels=['Yes', 'No'])
        axes[row][col].tick_params(axis='x', labelrotation=45)




    # Adjust spacing
    plt.tight_layout()

# set Pandas Display Options
pd.options.display.float_format = '{:.2f}'.format
pd.set_option('display.precision', 2)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 120)

load_path = "/home/hass/Documents/Learning/Salifort-Motors-Capstone-Project/00-data_cleaned/"
save_path = "/home/hass/Documents/Learning/Salifort-Motors-Capstone-Project/00-data_cleaned/"

# Load dataset into a dataframe
df1 = pd.read_csv(load_path + "data_cleaned_Ol_NoFE_AllFeat.csv")          # 11991 rows, all data including outliers
#df1  = pd.read_csv(load_path + "data_cleaned_NoOl_NoFE_AllFeat.csv")      # 11167 rows, no outliers (mostly from tenure)

# Display first few rows of the dataframe

df1.head()

pd.options.display.float_format = '{:.2f}'.format
counts = df1['left'].value_counts()
perc = df1['left'].value_counts(normalize=True)*100

summary_table = pd.DataFrame({"Counts " : counts,
                              "Percentage " : perc})

summary_table

# set Pandas Display Options
pd.options.display.float_format = '{:.2f}'.format
pd.set_option('display.precision', 2)

df1.describe(include='all')

df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11991 entries, 0 to 11990
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   satisfaction    11991 non-null  float64
 1   last_eval       11991 non-null  float64
 2   number_project  11991 non-null  int64  
 3   avg_mnth_hrs    11991 non-null  int64  
 4   tenure          11991 non-null  int64  
 5   accident        11991 non-null  int64  
 6   left            11991 non-null  int64  
 7   promotion       11991 non-null  int64  
 8   dept            11991 non-null  object 
 9   salary          11991 non-null  object 
dtypes: float64(2), int64(6), object(2)
memory usage: 936.9+ KB

df1.columns

Index(['satisfaction', 'last_eval', 'number_project', 'avg_mnth_hrs', 'tenure', 'accident', 'left', 'promotion',
       'dept', 'salary'],
      dtype='object')

# Copy df1 to model_data as eda dataframe
# copy df1 to original_data - has dept in columns, required for some plots and analysis
original_data = df1.copy() 
# copy df1 to full_data which is Feature Engineered. We'll mostly use to prepare data for plots and calculations.
full_data     = df1.copy() 
# focus_data  = full_data with dept column removed, created in next cell

# Set ordinal encoding for salary low, medium, high
# we'll get 0, 1, 2 in the salary field as an ordinal representation.

full_data['salary'] = (
    full_data['salary'].astype('category')
    .cat.set_categories(['low', 'medium', 'high']) 
    .cat.codes
)


# One Hot Encode dept,  we'll get a new binary field for each dept
full_data = pd.get_dummies(full_data, columns = ['dept']) 

# Duplicate dataframe but only focus features (excludes dept)
focus_data = full_data.loc[:,['satisfaction', 'last_eval', 'number_project', 'avg_mnth_hrs', 'tenure', 'accident', 
'left', 'promotion','salary']]

pd.options.display.float_format = '{:.4%}'.format

## All Features get correlations
df_corr = full_data.corr().mean().sort_values(ascending=False)


# Focus Features select data features
focus_corr = focus_data.corr().mean().sort_values(ascending=False)

# Focus Features select data features for left = 1
left_focus_corr = focus_data[focus_data['left'] == 1].corr().mean().sort_values(ascending=False) 

result_df = pd.concat([df_corr, focus_corr, left_focus_corr], axis=1)

# rename columns to something meaningful
result_df.columns=['all features', 'focus features', 'left = 1 focus']

result_df

%%time
sns.set(style="darkgrid")
#sns.pairplot(data=focus_data, hue='left', palette='colorblind')

# Using correlations to focus on key features

CPU times: user 1.88 ms, sys: 0 ns, total: 1.88 ms
Wall time: 1.9 ms

## Focus on features with higher correlation values, remove encoded dept:
df_corr = full_data.corr().sort_values(by = 'left', ascending = False)
sns.set(style="darkgrid")
plt.figure(figsize=(10, 10))
sns.heatmap(df_corr, annot=True, annot_kws={"size": 8}, fmt='.0%', cmap='YlOrRd', cbar = False)
plt.show()

## Focus on features with higher correlation values, remove encoded dept:
df_corr = focus_data.corr().sort_values(by = 'left', ascending = False)
#df_corr.drop('left', axis = 1, inplace = True)

sns.heatmap(df_corr, annot=True, annot_kws={"size": 8}, fmt='.0%', cmap='YlOrRd')
plt.title(f'Correlation Score for All employees')
plt.show()

df = focus_data.copy() # Start with fresh copy

df = df.loc[(df['left'] == 1)] # Filter on left = 1
df = df.drop('left', axis=1) # drop the left column

df_corr = df.corr()

plt.figure(figsize = (6,6))
sns.heatmap(df_corr, annot=True, annot_kws={"size": 8}, fmt='.0%', cmap='YlOrRd')
plt.title('Correlation Score for employees who left')
plt.show()

# Multi plot for variables.

continuous_vars = ['last_eval','number_project','avg_mnth_hrs','promotion','tenure',
                   'satisfaction', 'salary', 'accident']

df = full_data # .sort_values(by='number_project', ascending=True)

# Calculate the number of subplots needed based on the number of continuous variables (6) 
num_subplots = len(continuous_vars)

# Create a subplot grid with the appropriate number of columns
num_columns = 2  # Adjust this based on  layout preference
num_rows = (num_subplots + num_columns - 1) // num_columns  # Calculate the number of rows

fig, axes = plt.subplots(num_rows, num_columns, figsize=(16, 16)) # Set plot sizes
sns.set(style="darkgrid")
# Flatten the axes if there's only one row
if num_rows == 1:
    axes = [axes]

# Loop through the continuous variables
for i, var in enumerate(continuous_vars):
    # Determine the current subplot's row and column
    row = i // num_columns
    col = i % num_columns # Modulus / get the remainder 

    # Create a histogram in the current subplot
    sns.histplot(data=df, x=var, hue='left', bins=10, element='step', common_norm=False, ax=axes[row][col], 
    palette='colorblind')
    axes[row][col].set_title(f'Histogram for {var}')
    axes[row][col].set_xlabel(var)
    axes[row][col].set_ylabel('Frequency')
    axes[row][col].legend(title='left', labels=['Yes', 'No'])
    axes[row][col].tick_params(axis='x', labelrotation=45)

# Remove any empty subplots
for i in range(num_subplots, num_rows * num_columns):
    fig.delaxes(axes[i])

# Adjust spacing
plt.tight_layout()

# Display the combined plots
plt.show()

# Initialise Analysis Variables and Pandas Display options

var = 'last_eval' # Feature used for plot and detail

pd.options.display.float_format = '{:.2f}'.format #2 decimal places, floating point

full_data[var].describe()

count   11991.00
mean        0.72
std         0.17
min         0.36
25%         0.57
50%         0.72
75%         0.86
max         1.00
Name: last_eval, dtype: float64

# Calculate mean and median satisfaction scores of employees who left and those who stayed
full_data.groupby('left')['last_eval'].agg(["mean","median"])

result = eda_single_bins(full_data,var, 
                         [0.0, 0.5, 0.7, 1.0], 
                         ['low : 0.00 - 0.50', 
                          'medium : 0.51-0.69', 
                          'high : 0.70 - 1.00'],0)
result

%%time
df = full_data[['last_eval', 'satisfaction', 'avg_mnth_hrs', 'left', 'number_project', 'tenure']] 


sns.set(style="darkgrid")
#sns.pairplot(data=df, hue = 'left', palette='colorblind')

CPU times: user 1.6 ms, sys: 228 µs, total: 1.83 ms
Wall time: 1.56 ms

df = full_data[['last_eval', 'dept_support', 'avg_mnth_hrs', 'left', 'number_project']] 

scatplot_single(df, 'last_eval', 'avg_mnth_hrs')

scatplot_single(full_data, 'last_eval', 'satisfaction')

return_plot = correlation_single(full_data, var)

returned_plot = quadplot(full_data, var, False)


plt.show()

<Figure size 800x500 with 0 Axes>

<Figure size 800x500 with 0 Axes>

# Initialise Analysis Variables and Pandas Display options

var = 'number_project' # Feature used for plot and detail

pd.options.display.float_format = '{:.2f}'.format #2 decimal places, floating point

full_data[var].describe(include='all')

count   11991.00
mean        3.80
std         1.16
min         2.00
25%         3.00
50%         4.00
75%         5.00
max         7.00
Name: number_project, dtype: float64

# Calculate mean and median satisfaction scores of employees who left and those who stayed
full_data.groupby(['left'], observed=True)['number_project'].agg(["mean","median"])

eda_single(full_data,var,2)

eda_single_bins(full_data,var, [0, 3,5,7],
                ['low (<3)','normal (3-5)','high (>5)'],2)

return_plot = correlation_single(full_data, var)

scatplot_single(full_data, 'number_project','avg_mnth_hrs')

returned_plot = quadplot(full_data, var, True)
returned_plot.show()

<Figure size 800x500 with 0 Axes>

<Figure size 800x500 with 0 Axes>

scatplot_single(focus_data, 'last_eval', 'number_project')

ret = eda_dual(focus_data, 'number_project', 'last_eval', 2)
ret.sort_values(by='Left Perc of Left Total', ascending=False)

%%time

# Create a plot for hours worked and number of projects

plot_df=full_data[['avg_mnth_hrs', 'number_project', 'left' ]]


sns.set(style="darkgrid")
sns.pairplot(data = plot_df,  hue = 'left')


#display the graphs
plt.show()

CPU times: user 3.34 s, sys: 592 ms, total: 3.93 s
Wall time: 3.32 s

# Initialise Analysis Variables and Pandas Display options

var = 'avg_mnth_hrs' # Feature used for plot and detail

pd.options.display.float_format = '{:.2f}'.format #2 decimal places, floating point

full_data[var].describe()

count   11991.00
mean      200.47
std        48.73
min        96.00
25%       157.00
50%       200.00
75%       243.00
max       310.00
Name: avg_mnth_hrs, dtype: float64

# Calculate mean and median satisfaction scores of employees who left and those who stayed
full_data.groupby(['left'])['avg_mnth_hrs'].agg(["mean","median"])

avg_hours = full_data.groupby('left')['avg_mnth_hrs'].mean()
perc_diff = ((avg_hours[1] - avg_hours[0]) / avg_hours[0]) #* 100
annual_diff = (avg_hours[1] - avg_hours[0]) * 12

# Display the result
print("Average Monthly Hours Comparison")
print()
print("Left Mean Mnthly Hrs   : {:.2f}".format(avg_hours[1]))
print("Stayed Mean Mnthly Hrs : {:.2f}".format(avg_hours[0]))
print("Percentage Difference  :   {:.2f}%".format(perc_diff))
print("Annual Hrs Difference  : {:.2f} hours".format(annual_diff))
print()
print("Employees who left worked {:.2f} more hours per year than retained staff".format(annual_diff))

total_hours = full_data.groupby(['salary', 'left'])['avg_mnth_hrs'].mean()

# Reset the index to make the result a DataFrame
total_hours = total_hours.reset_index()

# Calculate the total difference between retained and not retained for each salary group
total_hours['total_difference'] = total_hours.groupby('salary')['avg_mnth_hrs'].diff().fillna(0)


# Display the result
total_hours

Average Monthly Hours Comparison

Left Mean Mnthly Hrs   : 208.16
Stayed Mean Mnthly Hrs : 198.94
Percentage Difference  :   0.05%
Annual Hrs Difference  : 110.63 hours

Employees who left worked 110.63 more hours per year than retained staff

eda_single_bins(full_data, var, [0, 175, 200, 250, 300, 350], 
                ['    Normal 0-175 :   0-8  hrs/day',
                 '         176-200 :   8-9  hrs/day', 
                 '    High 201-251 :  9-12  hrs/day', 
                 '  V High 251-300 : 14-16  hrs/day',
                 'Extreme 301-350 :    16+ hrs/day'],1)

returned_plot = correlation_single(full_data, var)

returned_plot = quadplot(full_data, var, False) # df, variable, annotate

<Figure size 800x500 with 0 Axes>

<Figure size 800x500 with 0 Axes>

# Create a violin plot
violinplot = sns.violinplot(x='left', y=var, data=full_data, hue='left', palette='colorblind')
violinplot.set_title(f'Violin Plot of {var} by left')
violinplot.axhline(y=177, color='red', linestyle='--', label='Threshold at 177')
violinplot.set_xlabel('left')
violinplot.set_ylabel(f'{var}')

Text(0, 0.5, 'avg_mnth_hrs')

# Initialise Analysis Variables and Pandas Display options

var = 'tenure' # Feature used for plot and detail

pd.options.display.float_format = '{:.2f}'.format #2 decimal places, floating point

print(full_data[var].describe())

count   11991.00
mean        3.36
std         1.33
min         2.00
25%         3.00
50%         3.00
75%         4.00
max        10.00
Name: tenure, dtype: float64

# Calculate mean and median satisfaction scores of employees who left and those who stayed
full_data.groupby(['left'])['tenure'].agg(["mean","median"])

eda_single(full_data, var,1)

eda_single_bins(full_data, 'tenure', [0,4,12],['<5 years','>5 years'],0)

df = original_data.copy() # refresh df just in case

pd.options.display.float_format = '{:.2f}'.format #2 decimal places, floating point

# Calculate the total number of employees (left = 0)

# filter left = 0 and dept != management
total_employees  = len(df.loc[((df['left'] == 0) & (df['dept'] != 'management'))]) 
# filter left = 0 and dept = management
total_managers   = len(df.loc[((df['left'] == 0) & (df['dept'] == 'management'))]) 

# Calculate the total number of Accidents - just curious
total_accident   = len(df.loc[((df['left'] == 0) & (df['accident'] == 1))])

# Calculate the number of employees who left (left = 1)

# filter left = 1 and dept != management
employees_left   = len(df.loc[((df['left'] == 1) & (df['dept'] != 'management'))]) 
# filter left = 1 and dept = management
managers_left    = len(df.loc[((df['left'] == 1) & (df['dept'] == 'management'))]) 

result = pd.DataFrame({'Total Employees'     : total_employees,
                       'Employees Left'      : employees_left,
                       'Employees Left Perc' : (employees_left / total_employees)*100,
                       'Total Mgrs'          : total_managers,
                       'Mgrs Left'           : managers_left,
                       'Mgrs Left Perc'     : (managers_left / total_managers)*100,
                       'Employee / Mgr Ratio': (total_employees - employees_left) /(total_managers - managers_left),
                       'Total Accident'      : total_accident,
                       'Accident Perc'       : (total_accident / total_employees)*100
                       }, index = [0])

result

returned_plot = correlation_single(full_data, var)

returned_plot = quadplot(full_data, var, True)
returned_plot.show()

<Figure size 800x500 with 0 Axes>

<Figure size 800x500 with 0 Axes>

pd.options.display.float_format = '{:.2f}'.format #2 decimal places, floating point

df = full_data.copy()

# Count promotions by tenure
promotions_by_tenure = df.groupby('tenure')['promotion' ].agg(['count', 'sum']).reset_index()

# Calculate the total count by tenure
total_by_tenure = df['tenure'].value_counts().reset_index()
total_by_tenure.columns = ['tenure', 'Total by Tenure']

# Merge the two DataFrames to combine the counts
result_df = pd.merge(promotions_by_tenure, total_by_tenure, on='tenure')

# Calculate the percentage of tenure as a percentage of the total
result_df['percentage_of_total'] = (result_df['count'] / result_df['count'].sum()) * 100

# Calculate the percentage of promotions
result_df['promotions_percentage'] = (result_df['sum'] / df.shape[0]) * 100

# Rename the columns for clarity
result_df = result_df.rename(columns={'count': 
                                      'Count of Tenure', 'sum': 'Count of Promotions'})

summary_vars = ['Count of Tenure','Count of Promotions', 
                'Total by Tenure', 'percentage_of_total', 'promotions_percentage']

result_df.loc['Total'] = result_df[summary_vars].sum()
# Display the resulting table
result_df

plt.figure(figsize=(8, 6))
sns.scatterplot(data=full_data, x='avg_mnth_hrs', y='promotion', hue='left', alpha=0.4, palette='colorblind')
plt.axvline(x=166.67, color='#ff6361', linestyle='--', label="175 Hrs")
plt.legend(labels=['left', 'stayed'])
plt.title('Monthly hours by promotion last 5 years', fontsize='14');
#plt.legend()

pd.set_option('display.max_rows', 100) # Display the entire results dataframe

df = original_data.copy() # copy original data to df used through the cell


# Group by 'dept' and 'tenure' and get the count
count_by_dept_and_tenure = df.groupby(['dept', 'tenure', 'left', 
                                       'promotion']).size().reset_index(name='count_by_dept_and_tenure')

# Group by 'dept' and get the total count
total_count_by_dept = df.groupby('dept').size().reset_index(name='total_count_by_dept')

# Merge the two DataFrames on 'dept'
result_df = pd.merge(count_by_dept_and_tenure, total_count_by_dept, on='dept')

# Calculate the percentage and add it to the DataFrame
result_df['perc_by_dept_and_tenure'] = (result_df['count_by_dept_and_tenure'] / 
                                        result_df['total_count_by_dept']) * 100

result_df.drop('total_count_by_dept', axis = 'columns', inplace = True)
result_df.sort_values(by='perc_by_dept_and_tenure', ascending = False)

# Display the combined DataFrame
result_df

df.groupby('dept')['tenure'].mean()

dept
accounting    3.40
hr            3.26
it            3.35
management    3.98
marketing     3.42
product_mng   3.34
randd         3.32
sales         3.38
support       3.29
technical     3.31
Name: tenure, dtype: float64

# Initialise Analysis Variables and Pandas Display options

var = 'satisfaction' # Feature used for plot and detail

pd.options.display.float_format = '{:.2f}'.format #2 decimal places, floating point

full_data[var].describe()

count   11991.00
mean        0.63
std         0.24
min         0.09
25%         0.48
50%         0.66
75%         0.82
max         1.00
Name: satisfaction, dtype: float64

full_data.groupby(['left'])['satisfaction'].agg(["mean","median"])

eda_single_bins(full_data,var,[0, 0.37, 0.67, 1.0],
                ['low 0.00 - 0.37', 'medium 0.38 - 0.66', 'high 0.67-1.00'],1)

return_plot = correlation_single(full_data, var)

returned_plot = quadplot(full_data,var,False)
returned_plot.show()

<Figure size 800x500 with 0 Axes>

<Figure size 800x500 with 0 Axes>

# Visualisation satisfaction vs promotion

df = full_data.copy()

#set figure and axes
fig, ax = plt.subplots(1, 2, figsize = (12,6))
sns.set_theme(style="whitegrid")
sns.color_palette("colorblind", 9)

#Create a boxplot showing 'average_monthly_hours' distributions for 'number_project', comparing employees who stayed vs those who left

sns.violinplot(data = df, x = 'satisfaction', y = 'promotion', hue = 'left', orient = 'h', ax = ax[0], palette='colorblind')
ax[0].invert_yaxis()
ax[0].set_title('Satisfaction Vs Promotion', fontsize = '14')

#Create a histogram showing distribution of 'number_project', comparing employees who stayed vs those who left


sns.barplot(data = df, y = 'satisfaction', hue = 'left',  ax = ax[1], palette='colorblind')
ax[1].set_title('Satisfaction Vs Promotion', fontsize = '14')

#display the graphs
plt.show()

# Initialise Analysis Variables and Pandas Display options

var = 'accident' # Feature used for plot and detail

pd.options.display.float_format = '{:.2f}'.format #2 decimal places, floating point

full_data[var].describe()

count   11991.00
mean        0.15
std         0.36
min         0.00
25%         0.00
50%         0.00
75%         0.00
max         1.00
Name: accident, dtype: float64

eda_single(full_data, var,1)

returned_plot = correlation_single(full_data, var)

returned_plot = quadplot(full_data, var, True)
returned_plot.show()

<Figure size 800x500 with 0 Axes>

<Figure size 800x500 with 0 Axes>

# Initialise Analysis Variables and Pandas Display options

var = 'promotion' # Feature used for plot and detail

pd.options.display.float_format = '{:.2f}'.format #2 decimal places, floating point

# Describe variable
full_data[var].describe()

count   11991.00
mean        0.02
std         0.13
min         0.00
25%         0.00
50%         0.00
75%         0.00
max         1.00
Name: promotion, dtype: float64

promotion_counts = full_data['promotion'].value_counts()
promotion_percentages = full_data['promotion'].value_counts(normalize=True) * 100

# Create a new dataframe to store the results
promotion_df = pd.DataFrame({
    'Count': promotion_counts,
    'Percentage': promotion_percentages
})

# Print the result dataframe
print(promotion_df)

           Count  Percentage
promotion                   
0          11788       98.31
1            203        1.69

eda_single(full_data, var,2)

returned_plot = correlation_single(full_data, var)

returned_plot = quadplot(full_data, var, True)
returned_plot.show()

<Figure size 800x500 with 0 Axes>

<Figure size 800x500 with 0 Axes>

df = full_data.copy()

# Filter the DataFrame for "left"

left_1_df = df.loc[df['left'] == 1]
left_0_df = df.loc[df['left'] == 0]

left_1_df = left_1_df.rename(columns={'promotion': 'left promotion'})


# Filter the DataFrame for "left" = 0

left_0_df = left_0_df.rename(columns={'promotion': 'stayed promotion'})

# Group by "tenure" and calculate counts for "left" = 1
left_1_counts = left_1_df.groupby('tenure')['left promotion'].sum().reset_index()

# Calculate percentages for "left" = 1
left_1_counts['left promotion perc'] = (left_1_counts['left promotion'] / 
                                        left_1_counts['left promotion'].sum()) * 100

# Group by "tenure" and calculate counts for "left" = 0
left_0_counts = left_0_df.groupby('tenure')['stayed promotion'].sum().reset_index()

# Calculate percentages for "left" = 0
left_0_counts['stayed promotion perc'] = (left_0_counts['stayed promotion'] / 
                                          left_0_counts['stayed promotion'].sum()) * 100

# Display the results for "left" = 1
#print("Counts and Percentages for Left = 1:")
#print(left_1_counts)

# Display the results for "left" = 0
#print("\nCounts and Percentages for Left = 0:")
#print(left_0_counts)

# Merge the DataFrames using an outer join
merged_df = left_0_counts.merge(left_1_counts, on='tenure', how='outer')

merged_df

# Initialise Analysis Variables and Pandas Display options

var = 'salary' # Feature used for plot and detail

pd.options.display.float_format = '{:.2f}'.format #2 decimal places, floating point

full_data[var].describe()

count   11991.00
mean        0.60
std         0.64
min         0.00
25%         0.00
50%         1.00
75%         1.00
max         2.00
Name: salary, dtype: float64

# Totals by Salary 
returned_df = eda_single(original_data, var, 2)
returned_df.reset_index()
returned_df.sort_values(by=('Left Count'), ascending = False, inplace = True)
returned_df

var ='salary'
df = full_data[full_data['dept_sales'] == 1][['salary', 'left']]
#df = full_data[['dept_support', 'left', 'salary']]
returned_df = eda_single(df, var, 2)
returned_df.reset_index()
returned_df.sort_values(by=('Left Count'), ascending = False, inplace = True)
returned_df

# Totals by salary & promotion 
returned_df = eda_dual(original_data, var, 'promotion',2)
returned_df.index.name = var + ' / ' + 'promotion'
returned_df.sort_values(by='Left Count', ascending = False)

# Totals by salary & promotion 
returned_df = eda_dual(original_data, var, 'tenure',2)
returned_df.index.name = var + ' / ' + 'Tenure'
returned_df.sort_values(by='Left Count', ascending = False)

df = original_data.copy()

# Group the data by 'department', 'salary', and 'left' and calculate the count in each group
grouped = df.groupby(['salary', 'dept', 'left']).size().reset_index(name='count')

# Pivot the data to create a summary table
summary_table = grouped.pivot_table(index=['salary', 'dept'], columns='left', values='count', fill_value=0)

# Calculate the percentage of people who left and add it as a new column
summary_table['Percentage Left'] = (summary_table[1] / (summary_table[0] + summary_table[1])) * 100

# Reset the index to have 'department' and 'salary' as regular columns
summary_table = summary_table.reset_index()

summary_table  = summary_table.rename(columns={0: 'Stayed', 1: 'Left'})
summary_table.sort_values(by = 'Left', ascending = False)

summary_table = summary_table.sort_values(by=('Left'), ascending=False)
summary_table

returned_plot = correlation_single(full_data, var) 
returned_plot.show()

returned_plot = quadplot(full_data, var, True)
returned_plot.show()

<Figure size 800x500 with 0 Axes>

<Figure size 800x500 with 0 Axes>

# Prepare data for heatmap
df = original_data.copy()

contingency_table = pd.crosstab(df['left'], df[var])

# Calculate column percentages
column_percentages = contingency_table / contingency_table.sum()
row_percentages = contingency_table.div(contingency_table.sum(axis=1), axis=0) 

# Create a pivot table for the heatmap
heatmap_data = df.pivot_table(index='left', columns=var, aggfunc='size', fill_value=0)

# Create a heatmap 

plt.figure(figsize=(12,4 ))

plt.subplot(1, 3, 1)  # Subplot for count
sns.heatmap(heatmap_data, cmap='YlGnBu', annot=True, fmt='d', cbar=False)
plt.title(f'{var} v left (count)')
plt.xlabel(var)
plt.ylabel('left')

# Create a heatmap 

plt.subplot(1, 3, 2)  # Subplot for percentage of salary band
sns.heatmap(column_percentages, annot=True, cmap='YlGnBu', fmt='.1%', cbar=False)
plt.title(f'{var} (%) Vs. left by Salary Band')
plt.xlabel(var)
plt.ylabel('left')

# Create a heatmap 

plt.subplot(1, 3, 3)  # Subplot for percentage
ax = sns.heatmap(row_percentages, annot=True, cmap='YlGnBu', fmt='.1%', cbar=False)
annot_font_size = 10
plt.title(f'{var} (%) Vs. left by left')
#ax.set_xticklabels(ax.get_xticklabels(), fontsize=7)
#ax.set_yticklabels(ax.get_yticklabels(), fontsize=8)
plt.xlabel(var)
plt.ylabel('left')


plt.show()

# Create two new DataFrames for 'left = 1' and 'left = 0'

df = full_data.copy()
left_1 = df[df['left'] == 1]
left_0 = df[df['left'] == 0]

# Count the number of occurrences for each category in 'salary' for each group
all_counts = df['salary'].value_counts()
left_1_counts = left_1['salary'].value_counts()
left_0_counts = left_0['salary'].value_counts()

# Create pie chart subplots
fig, axs = plt.subplots(1, 3, figsize=(12, 5))
sns.set(style="darkgrid")

# Plot the pie chart for 'left = 1'
axs[2].pie(left_1_counts, labels=left_1_counts.index, autopct='%1.1f%%', startangle=90)
axs[2].set_title('left = 1')

# Plot the pie chart for 'left = 0'
axs[1].pie(left_0_counts, labels=left_0_counts.index, autopct='%1.1f%%', startangle=90)
axs[1].set_title('left = 0')

# Plot the pie chart for 'left = 0'
axs[0].pie(all_counts, labels=all_counts.index, autopct='%1.1f%%', startangle=90)
axs[0].set_title('All')

plt.show()

## Visualise Hours and Salary

df = full_data.copy()

#set figure and axes
fig, ax = plt.subplots(1, 2, figsize = (12,6))
sns.set(style="darkgrid")
sns.color_palette("husl", 9)

#Create a boxplot showing 'average_monthly_hours' distributions for 'number_project', comparing employees who stayed vs those who left
sns.boxplot(data = full_data, x = 'avg_mnth_hrs', y = 'salary', hue = 'left', orient = 'h', ax = ax[0], palette='colorblind')
ax[0].invert_yaxis()
ax[0].set_title('Monthly hours by Salary', fontsize = '14')
ax[0].set_yticks([0,1,2],['low', 'medium','high'], )

#Create a histogram showing distribution of 'number_project', comparing employees who stayed vs those who left and salary

hue_order = [0, 1, 2]
hue_labels = ['Low', 'Medium', 'High']
tenure_stay = full_data[full_data['left'] == 0]['salary']
tenure_left = full_data[full_data['left'] == 1]['salary']
sns.boxplot(data = full_data, x = 'left', y = 'number_project', hue = 'salary', hue_order=hue_order, ax = ax[1], palette='colorblind')
ax[1].set_title('Number of projects by Salary', fontsize = '14')
ax[1].set_xticks([0, 1], ['Not Left', 'Left'])

# Set custom labels for the hue using legend
handles, _ = plt.gca().get_legend_handles_labels()
plt.legend(handles, hue_labels, title='Salary', loc='upper right')

#display the graphs
plt.show()

df = original_data.copy()

# Group the data by 'department', 'salary', and 'left' and calculate the count in each group
grouped = df.groupby(['dept', 'salary', 'left']).size().reset_index(name='count')

# Pivot the data to create a summary table
summary_table = grouped.pivot_table(index=['dept', 'salary'], columns='left', values='count', fill_value=0)

# Calculate the percentage of people who left and add it as a new column
summary_table['Percentage Left'] = (summary_table[1] / (summary_table[0] + summary_table[1])) * 100



# Reset the index to have 'department' and 'salary' as regular columns
summary_table = summary_table.reset_index()

# Create a heatmap to visualize the count of people who left by 'department' and 'salary'
summary_table  = summary_table.rename(columns={0: 'Stayed', 1: 'Left'})
summary_table.sort_values(by='Left', ascending=False, inplace=True)
summary_table

# Initialise Analysis Variables and Pandas Display options

var = 'dept' # Feature used for plot and detail

pd.options.display.float_format = '{:.2f}'.format #2 decimal places, floating point

# Describe variable

original_data[var].describe()

count     11991
unique       10
top       sales
freq       3239
Name: dept, dtype: object

returned_df = eda_single(original_data, var, 2)
returned_df.sort_values(by='Left Count', ascending=False, inplace=True)
returned_df

# Prepare data for heatmap
df = original_data.copy()

contingency_table = pd.crosstab(df['left'], df[var])

# Calculate column percentages
column_percentages = contingency_table / contingency_table.sum()
row_percentages = contingency_table.div(contingency_table.sum(axis=1), axis=0) 

# Create a pivot table for the heatmap
heatmap_data = df.pivot_table(index='left', columns=var, aggfunc='size', fill_value=0)

# Create a heatmap 
fig, axes = plt.subplots(1, 2, figsize=(12,3 ))
sns.set(style="darkgrid")

#plt.subplot(2, 2, 1)  # Subplot for count
sns.heatmap(heatmap_data, cmap='YlGnBu', annot=True, annot_kws={"size": 10},fmt='d', cbar=False, ax=axes[0])
axes[0].set_title(f'{var} v left (count)')
axes[0].set_xlabel(var)
axes[0].set_ylabel('left')

# Create a heatmap 

#plt.subplot(2, 2, 1)  # Subplot for percentage of salary band
sns.heatmap(column_percentages, annot=True, cmap='YlGnBu', annot_kws={"size": 10},fmt='.0%', cbar=False, ax=axes[1])
axes[1].set_title(f'{var} (%) Vs. left')
axes[1].set_xlabel(var)
axes[1].set_ylabel('left')

# Create a heatmap 

#plt.subplot(2, 2, 2)  # Subplot for percentage
#sns.heatmap(row_percentages, annot=True, cmap='YlGnBu', annot_kws={"size": 8},fmt='.1%', cbar=False, ax=axes[1,0])
#axes[1,0].set_title(f'{var} (%) Vs. left by left')
#axes[1,0].set_xlabel(var)
#axes[1,0].set_ylabel('left')

#percentage_left_by_dept = df.groupby('dept')['left'].mean() * 100
#left_1_counts = df.groupby('dept')['left'].value_counts()

# Create a correlation map and heatmap
#sns.heatmap(percentage_left_by_dept.reset_index().pivot_table(index='dept', columns='dept').T, annot=True, cbar=False, annot_kws={"size": 8}, fmt=".2f", cmap="YlGnBu", ax=axes[1,1])
#axes[1,1].set_title('Percentage Left by Department')

#sns.heatmap(left_1_counts.reset_index().pivot_table(index='dept', columns='dept').T, annot=True, annot_kws={"size": 10}, fmt=".0f", cmap="YlGnBu", cbar=False, ax=axes[2,0])
#axes[2,1].set_title('count Left by Department')

#fig.delaxes(axes[2, 1]) # Delete empty plots
plt.subplots_adjust(wspace=0.2, hspace=.3)


plt.show()

df = original_data.copy()

# Filter the DataFrame for left = 0 and left = 1
left_0_df = df[df['left'] == 0]
left_1_df = df[df['left'] == 1]

# Group and pivot the data for heatmaps
left_0_pivot = left_0_df.groupby(['dept', 'salary']).size().unstack(fill_value=0)
left_1_pivot = left_1_df.groupby(['dept', 'salary']).size().unstack(fill_value=0)

# Calculate percentage heatmaps
total_records = len(df)
print(total_records)
left_0_total = left_0_df.shape[0] # Count of n
left_1_total = left_1_df.shape[0] # Count of n
left_0_percentage = (left_0_pivot / left_0_total)  # n as % of n = 0 total
left_1_percentage = (left_1_pivot / left_1_total)  # n as % of n = 1 total
left_0_perc_of_tot = (left_0_pivot / (left_0_total + left_1_total))  # n as % of n  total
left_1_perc_of_tot = (left_1_pivot / (left_0_total  + left_1_total))  # n as % of n  total

# Create subplots for the heatmaps
fig, axes = plt.subplots(3, 2, figsize=(12, 8))
sns.set(style="darkgrid")

# Create the count heatmap for left = 0
sns.heatmap(left_0_pivot, annot=True, annot_kws={"size": 10}, fmt='d', cmap='YlGnBu', ax=axes[0, 0])
axes[0, 0].set_title('Count Heatmap for Left = 0')

# Create the count heatmap for left = 1
sns.heatmap(left_1_pivot, annot=True, annot_kws={"size": 10}, fmt='d', cmap='YlGnBu', ax=axes[0, 1])
axes[0, 1].set_title('Count Heatmap for Left = 1')

# Create the percentage heatmap for left = 0
sns.heatmap(left_0_percentage, annot=True, annot_kws={"size": 10}, fmt='.2%', cmap='YlGnBu', ax=axes[1, 0])
axes[1, 0].set_title('Percentage Heatmap for % of Left = 0')

# Create the percentage heatmap for left = 1
sns.heatmap(left_1_percentage, annot=True, annot_kws={"size": 10}, fmt='.2%', cmap='YlGnBu', ax=axes[1, 1])
axes[1, 1].set_title('Percentage Heatmap for % of Left = 1')

# Create the percentage heatmap for left = 0
sns.heatmap(left_0_perc_of_tot, annot=True, annot_kws={"size": 10}, fmt='.2%', cmap='YlGnBu', ax=axes[2, 0])
axes[2, 0].set_title('Percentage Heatmap for Left = 0 % of Total')

# Create the percentage heatmap for left = 1
sns.heatmap(left_1_perc_of_tot, annot=True, annot_kws={"size": 10}, fmt='.2%', cmap='YlGnBu', ax=axes[2, 1])
axes[2, 1].set_title('Percentage Heatmap for Left = 1 % of Total')

# Adjust layout
plt.tight_layout()

# Show the plot
plt.show()

11991

returned_df = eda_dual(original_data, 'dept', 'salary', 2)
returned_df.index.name = "Dept / Salary"
returned_df.sort_values(by='Left Count', ascending = False)

eda_single(original_data,'salary', 2)

# Review data for salary ranges

# Set output number format
pd.set_option('display.float_format', '{:.2f}'.format)

df = original_data.copy()

count_by_group = df.groupby(['salary', 'left']).size().reset_index(name='count')

# Display the result
print(count_by_group)

high_left   = df[(df['salary'] == 'high') & (df['left'] == 1)]
medium_left = df[(df['salary'] == 'medium') & (df['left'] == 1)]
low_left    = df[(df['salary'] == 'low') & (df['left'] == 1)]


# Initialise values

high_salary   = 150000
medium_salary = 100000
low_salary    =  50000
cost_percentage = 0.25 

# Count the rows in the filtered DataFrame multiply by salary thresholds

# Assumes high salary @ £150K and cost to replace is 30% of salary
high_cost = high_left.shape[0] * high_salary * cost_percentage

# Assumes medium salary @ £100K and cost to replace is 30% of salary
medium_cost = medium_left.shape[0] * medium_salary * cost_percentage

# Assumes low salary @ £50K and cost to replace is 30% of salary
low_cost = low_left.shape[0] * low_salary * cost_percentage

# Count of employees
total_count = low_left.shape[0]+medium_left.shape[0]+high_left.shape[0]


print()
# Display the selected rows

print(color.BOLD + f"Cost of staff leaving (assuming replacement costs {cost_percentage:.0%} of total salary)" + color.END)
print('\u2500' * 70)
print("                                  |   # of Staff |  Cost Per Head |   Est £ Cost")
print("Estimate High Salary Left Cost    | {:>12,.0f} | {:>14,.0f} | {:>12,.0f} ".
      format(high_left.shape[0], high_cost/high_left.shape[0], high_cost))
print("Estimate Medium Salary Left Cost  | {:>12,.0f} | {:>14,.0f} | {:>12,.0f}"
      .format(medium_left.shape[0], medium_cost/ medium_left.shape[0], medium_cost))
print("Estimate Low Salary Left Cost     | {:>12,.0f} | {:>14,.0f} | {:>12,.0f}"
      .format(low_left.shape[0], low_cost/ low_left.shape[0], low_cost,))
print("                                  | ============ |        ======= |   ==========")
print("Estimate TOTAL Salary Left Cost   | {:>12,.0f} | {:>14,.0f} "
      .format(low_cost+medium_cost+high_cost, total_count))

   salary  left  count
0    high     0    942
1    high     1     48
2     low     0   4566
3     low     1   1174
4  medium     0   4492
5  medium     1    769

Cost of staff leaving (assuming replacement costs 25% of total salary)
──────────────────────────────────────────────────────────────────────
                                  |   # of Staff |  Cost Per Head |   Est £ Cost
Estimate High Salary Left Cost    |           48 |         37,500 |    1,800,000 
Estimate Medium Salary Left Cost  |          769 |         25,000 |   19,225,000
Estimate Low Salary Left Cost     |        1,174 |         12,500 |   14,675,000
                                  | ============ |        ======= |   ==========
Estimate TOTAL Salary Left Cost   |   35,700,000 |          1,991

cost_percentage = 0.5 

# Assumes high salary @ £150K and cost to replace is 30% of salary
high_cost = high_left.shape[0] * high_salary * cost_percentage

# Assumes medium salary @ £100K and cost to replace is 30% of salary
medium_cost = medium_left.shape[0] * medium_salary * cost_percentage

# Assumes low salary @ £50K and cost to replace is 30% of salary
low_cost = low_left.shape[0] * low_salary * cost_percentage

# Count of employees
total_count = low_left.shape[0]+medium_left.shape[0]+high_left.shape[0]

print(color.BOLD + f"Cost of staff LEFT (assuming replacement costs {cost_percentage:.0%} of total salary)" + color.END)
print('\u2500' * 70)
print("                                  |  # of Staff  | Cost Per Head | Est £ Cost ")
print("Estimate High Salary Left Cost    | {:>12,.0f} | {:>13,.0f} | {:>12,.0f} ".
      format(high_left.shape[0], high_cost/high_left.shape[0], high_cost))
print("Estimate Medium Salary Left Cost  | {:>12,.0f} | {:>13,.0f} | {:>12,.0f}"
      .format(medium_left.shape[0], medium_cost/ medium_left.shape[0],medium_cost))
print("Estimate Low Salary Left Cost     | {:>12,.0f} | {:>13,.0f} | {:>12,.0f}"
      .format(low_left.shape[0], low_cost/ low_left.shape[0], low_cost))
print("                                  | ============ |       ======= |   ==========")
print("Estimate TOTAL Salary Left Cost   | {:>12,.0f} | {:>13,.0f}"
      .format(low_cost+medium_cost+high_cost, total_count))

Cost of staff LEFT (assuming replacement costs 50% of total salary)
──────────────────────────────────────────────────────────────────────
                                  |  # of Staff  | Cost Per Head | Est £ Cost 
Estimate High Salary Left Cost    |           48 |        75,000 |    3,600,000 
Estimate Medium Salary Left Cost  |          769 |        50,000 |   38,450,000
Estimate Low Salary Left Cost     |        1,174 |        25,000 |   29,350,000
                                  | ============ |       ======= |   ==========
Estimate TOTAL Salary Left Cost   |   71,400,000 |         1,991

## Cost of blanket pay increase of salary_increase

salary_increase = .3 # 30% Salary Increase


# Count the rows in the filtered DataFrame multiply by salary thresholds
high_cost_increase = high_left.shape[0] * high_salary * salary_increase 
medium_cost_increase = medium_left.shape[0] * medium_salary  * salary_increase 
low_cost_increase = low_left.shape[0] * low_salary  * salary_increase 

print()
print()
print(color.BOLD + "Cost of {:.0f}% pay increase".format(salary_increase*100) + color.END)
print('\u2500' * 70)
print("Estimate Retention Costs")
print("                                  |   # of Staff |  Cost Per Head | Est £ Cost")
print("Estimate High Salary Left Cost    | {:>12,.0f} | {:>14,.0f} | {:>12,.0f}"
      .format(high_left.shape[0], high_cost_increase/ high_left.shape[0], high_cost_increase))
print("Estimate Medium Salary Left Cost  | {:>12,.0f} | {:>14,.0f} | {:>12,.0f}"
      .format(medium_left.shape[0], medium_cost_increase/ medium_left.shape[0], medium_cost_increase))
print("Estimate Low Salary Left Cost     | {:>12,.0f} | {:>14,.0f} | {:>12,.0f}"
      .format(low_left.shape[0], low_cost_increase/ low_left.shape[0], low_cost_increase))
print("                                  |  =========== |        ======= |   ==========")
print("Estimate TOTAL Salary Left Cost   | {:>12,.0f} |                {:>14,.0f}"
      .format(total_count, low_cost_increase+medium_cost_increase+high_cost_increase))


Cost of 30% pay increase
──────────────────────────────────────────────────────────────────────
Estimate Retention Costs
                                  |   # of Staff |  Cost Per Head | Est £ Cost
Estimate High Salary Left Cost    |           48 |         45,000 |    2,160,000
Estimate Medium Salary Left Cost  |          769 |         30,000 |   23,070,000
Estimate Low Salary Left Cost     |        1,174 |         15,000 |   17,610,000
                                  |  =========== |        ======= |   ==========
Estimate TOTAL Salary Left Cost   |        1,991 |                    42,840,000

perc_diff = ((avg_hours[1] - avg_hours[0]) / avg_hours[0]) * 100
annual_diff = (avg_hours[1] - avg_hours[0]) * 12

# Display the result
print("Average Monthly Hours Comparison")
print('\u2500' * 35)
print("Left                  : {:.2f}".format(avg_hours[1]))
print("Employed              : {:.2f}".format(avg_hours[0]))
print("Percentage Difference :   {:.2f}%".format(perc_diff))
print("Annual Hrs Difference : {:.0f} hours".format(annual_diff))
print()
print("Employees who left worked {:.0f} more hours per year than retained staff".format(annual_diff))
print()
total_hours = full_data.groupby(['salary', 'left'])['avg_mnth_hrs'].mean()

# Reset the index to make the result a DataFrame
total_hours = total_hours.reset_index()

# Calculate the total difference between retained and not retained for each salary group
total_hours['total_difference'] = total_hours.groupby('salary')['avg_mnth_hrs'].diff()


# Display the result
print(total_hours)

Average Monthly Hours Comparison
───────────────────────────────────
Left                  : 208.16
Employed              : 198.94
Percentage Difference :   4.63%
Annual Hrs Difference : 111 hours

Employees who left worked 111 more hours per year than retained staff

   salary  left  avg_mnth_hrs  total_difference
0       0     0        198.67               NaN
1       0     1        207.32              8.66
2       1     0        199.02               NaN
3       1     1        209.76             10.75
4       2     0        199.92               NaN
5       2     1        202.98              3.06

## Visualise Hours and Salary
plotdata = pd.DataFrame(total_hours)
sns.set(style="darkgrid")
plotdata.drop(['total_difference'], axis=1)
custom_palette = ["#007acc", "#ff3e46"]

# Create a grouped bar plot using Seaborn
plt.figure(figsize=(10, 6))
sns.set_palette(custom_palette)
sns.barplot(x='salary', y='avg_mnth_hrs', hue='left', data=plotdata, palette='colorblind')
plt.xlabel('Salary Level')
plt.ylabel('Hours Worked')
plt.title('Comparison of Hours Worked by Salary and Left Status')
plt.legend(title='Left', labels=['Not Left', 'Left'], loc='lower center', bbox_to_anchor=(0.5, -0.35))
plt.show()

Document Title	Salifort Motors EDA Analysis
Author	Rod Slater
Version	1.0
Created	01-11-2023
Modified	16-11-2023

Client Name	Salifort Motors
Client Contact	Mr HR Team
Client Email	hr@salifortmotors.it
Client Project	HR Team Data Driven Solutions from Machine Learning Models

	all features	focus features	left = 1 focus
avg_mnth_hrs	9.1727%	19.3872%	40.6857%
last_eval	9.0568%	19.0199%	44.4214%
number_project	8.8130%	18.6848%	37.6385%
tenure	8.1096%	16.7102%	43.2059%
promotion	6.2680%	12.6328%	11.6538%
salary	5.6554%	11.3276%	12.6628%
accident	4.8375%	10.2237%	12.9542%
dept_management	3.7989%	NaN	NaN
left	3.3160%	7.2069%	NaN
satisfaction	2.8039%	6.0132%	16.5350%
dept_marketing	1.7520%	NaN	NaN
dept_accounting	1.6314%	NaN	NaN
dept_randd	1.4386%	NaN	NaN
dept_hr	1.3126%	NaN	NaN
dept_product_mng	1.1011%	NaN	NaN
dept_it	0.4065%	NaN	NaN
dept_support	-0.9650%	NaN	NaN
dept_technical	-1.7958%	NaN	NaN
dept_sales	-3.1942%	NaN	NaN

	Stayed Count	Left Count	Total Count	Total Perc of Total	Stayed Perc of Stayed Total	Left Perc of Left Total	Stayed %	Left %
number_project
2	725.00	857.00	1582.00	13.35	7.25	43.04	45.83	54.17
3	3482.00	38.00	3520.00	29.71	34.82	1.91	98.92	1.08
4	3448.00	237.00	3685.00	31.11	34.48	11.90	93.57	6.43
5	1890.00	343.00	2233.00	18.85	18.90	17.23	84.64	15.36
6	455.00	371.00	826.00	6.97	4.55	18.63	55.08	44.92
7	0.00	145.00	0.00	0.00	0.00	7.28	0.00	0.00
Total	10000.00	1991.00	11846.00	100.00	100.00	100.00	NaN	NaN

	Stayed Count	Left Count	Total Count	Total Perc of Total	Stayed Perc of Stayed Total	Left Perc of Left Total	Stayed %	Left %
number_project
low (<3)	4207.00	895.00	5102.00	42.55	42.07	44.95	82.46	17.54
normal (3-5)	5338.00	580.00	5918.00	49.35	53.38	29.13	90.20	9.80
high (>5)	455.00	516.00	971.00	8.10	4.55	25.92	46.86	53.14
Total	10000.00	1991.00	11991.00	100.00	100.00	100.00	NaN	NaN

	satisfaction	last_eval	number_project	avg_mnth_hrs	tenure	left	dept	salary
0	0.38	0.53	2	157	3	1	sales	low
1	0.80	0.86	5	262	6	1	sales	medium
2	0.11	0.88	7	272	4	1	sales	medium
3	0.72	0.87	5	223	5	1	sales	low
4	0.37	0.52	2	159	3	1	sales	low

	satisfaction	last_eval	number_project	avg_mnth_hrs	tenure	accident	left	promotion	dept	salary
count	11991.00	11991.00	11991.00	11991.00	11991.00	11991.00	11991.00	11991.00	11991	11991
unique	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	10	3
top	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	sales	low
freq	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	3239	5740
mean	0.63	0.72	3.80	200.47	3.36	0.15	0.17	0.02	NaN	NaN
std	0.24	0.17	1.16	48.73	1.33	0.36	0.37	0.13	NaN	NaN
min	0.09	0.36	2.00	96.00	2.00	0.00	0.00	0.00	NaN	NaN
25%	0.48	0.57	3.00	157.00	3.00	0.00	0.00	0.00	NaN	NaN
50%	0.66	0.72	4.00	200.00	3.00	0.00	0.00	0.00	NaN	NaN
75%	0.82	0.86	5.00	243.00	4.00	0.00	0.00	0.00	NaN	NaN
max	1.00	1.00	7.00	310.00	10.00	1.00	1.00	1.00	NaN	NaN

	Stayed Count	Left Count	Total Count	Total Perc of Total	Stayed Perc of Stayed Total	Left Perc of Left Total	Stayed %	Left %
last_eval
low : 0.00 - 0.50	1032	398	1430	12	10	20	72	28
medium : 0.51-0.69	3805	512	4317	36	38	26	88	12
high : 0.70 - 1.00	5163	1081	6244	52	52	54	83	17
Total	10000	1991	11991	100	100	100	NaN	NaN

Label	Hrs/day
Low 0-150	0-7 hrs/day
Std 151-200	7-9 hrs/day
High 201-251	9-12 hrs/day
V High 251-300	14-16 hrs/day
Extreme 301-350+	16+ hrs/day

Average Monthly Hours	150	200	250	300	350
Avg Hours Day (ass. 21.25 days/month)	7	9	12	14	16

	salary	left	avg_mnth_hrs	total_difference
0	0	0	198.67	0.00
1	0	1	207.32	8.66
2	1	0	199.02	0.00
3	1	1	209.76	10.75
4	2	0	199.92	0.00
5	2	1	202.98	3.06

	Stayed Count	Left Count	Total Count	Total Perc of Total	Stayed Perc of Stayed Total	Left Perc of Left Total	Stayed %	Left %
avg_mnth_hrs
Normal 0-175 : 0-8 hrs/day	3471.0	890.0	4361.0	36.4	34.7	44.7	79.6	20.4
176-200 : 8-9 hrs/day	1664.0	20.0	1684.0	14.0	16.6	1.0	98.8	1.2
High 201-251 : 9-12 hrs/day	3160.0	376.0	3536.0	29.5	31.6	18.9	89.4	10.6
V High 251-300 : 14-16 hrs/day	1705.0	613.0	2318.0	19.3	17.1	30.8	73.6	26.4
Extreme 301-350 : 16+ hrs/day	0.0	92.0	92.0	0.8	0.0	4.6	0.0	100.0
Total	10000.0	1991.0	11991.0	100.0	100.0	100.0	NaN	NaN

	Stayed Count	Left Count	Total Count	Total Perc of Total	Stayed Perc of Stayed Total	Left Perc of Left Total	Stayed %	Left %
tenure
2	2879.0	31.0	2910.0	24.9	28.8	1.6	98.9	1.1
3	4316.0	874.0	5190.0	44.3	43.2	43.9	83.2	16.8
4	1510.0	495.0	2005.0	17.1	15.1	24.9	75.3	24.7
5	580.0	482.0	1062.0	9.1	5.8	24.2	54.6	45.4
6	433.0	109.0	542.0	4.6	4.3	5.5	79.9	20.1
7	94.0	0.0	0.0	0.0	0.9	0.0	0.0	0.0
8	81.0	0.0	0.0	0.0	0.8	0.0	0.0	0.0
10	107.0	0.0	0.0	0.0	1.1	0.0	0.0	0.0
Total	10000.0	1991.0	11709.0	100.0	100.0	100.0	NaN	NaN

	Stayed Count	Left Count	Total Count	Total Perc of Total	Stayed Perc of Stayed Total	Left Perc of Left Total	Stayed %	Left %
tenure
<5 years	8705	1400	10105	84	87	70	86	14
>5 years	1295	591	1886	16	13	30	69	31
Total	10000	1991	11991	100	100	100	NaN	NaN

	tenure	Count of Tenure	Count of Promotions	Total by Tenure	percentage_of_total	promotions_percentage
0	2.00	2910.00	39.00	2910.00	24.27	0.33
1	3.00	5190.00	89.00	5190.00	43.28	0.74
2	4.00	2005.00	22.00	2005.00	16.72	0.18
3	5.00	1062.00	11.00	1062.00	8.86	0.09
4	6.00	542.00	11.00	542.00	4.52	0.09
5	7.00	94.00	18.00	94.00	0.78	0.15
6	8.00	81.00	5.00	81.00	0.68	0.04
7	10.00	107.00	8.00	107.00	0.89	0.07
Total	NaN	11991.00	203.00	11991.00	100.00	1.69

	Stayed Count	Left Count	Total Count	Total Perc of Total	Stayed Perc of Stayed Total	Left Perc of Left Total	Stayed %	Left %
Total	10000.00	1991.00	7598.00	100.00	100.00	100.00	NaN	NaN
(2, 0.46)	12.00	85.00	97.00	1.28	0.12	4.27	12.37	87.63
(2, 0.48)	12.00	77.00	89.00	1.17	0.12	3.87	13.48	86.52
(2, 0.51)	23.00	71.00	94.00	1.24	0.23	3.57	24.47	75.53
(2, 0.54)	11.00	69.00	80.00	1.05	0.11	3.47	13.75	86.25
...	...	...	...	...	...	...	...	...
(5, 0.52)	34.00	0.00	0.00	0.00	0.34	0.00	0.00	0.00
(5, 0.53)	26.00	0.00	0.00	0.00	0.26	0.00	0.00	0.00
(5, 0.55)	46.00	0.00	0.00	0.00	0.46	0.00	0.00	0.00
(5, 0.56)	40.00	0.00	0.00	0.00	0.40	0.00	0.00	0.00
(2, 0.36)	5.00	0.00	0.00	0.00	0.05	0.00	0.00	0.00

	dept	tenure	left	promotion	count_by_dept_and_tenure	perc_by_dept_and_tenure
0	accounting	2	0	0	127	20.45
1	accounting	2	0	1	2	0.32
2	accounting	2	1	0	1	0.16
3	accounting	3	0	0	227	36.55
4	accounting	3	0	1	3	0.48
...	...	...	...	...	...	...
162	technical	6	0	1	2	0.09
163	technical	6	1	0	23	1.02
164	technical	7	0	0	1	0.04
165	technical	8	0	0	5	0.22
166	technical	10	0	0	10	0.45

	mean	median
left
0	0.67	0.69
1	0.44	0.41

	Stayed Count	Left Count	Total Count	Total Perc of Total	Stayed Perc of Stayed Total	Left Perc of Left Total	Stayed %	Left %
salary / promotion
Total	10000.00	1991.00	11947.00	100.00	100.00	100.00	NaN	NaN
(low, 0)	4533.00	1169.00	5702.00	47.73	45.33	58.71	79.50	20.50
(medium, 0)	4374.00	766.00	5140.00	43.02	43.74	38.47	85.10	14.90
(high, 0)	898.00	48.00	946.00	7.92	8.98	2.41	94.93	5.07
(low, 1)	33.00	5.00	38.00	0.32	0.33	0.25	86.84	13.16
(medium, 1)	118.00	3.00	121.00	1.01	1.18	0.15	97.52	2.48
(high, 1)	44.00	0.00	0.00	0.00	0.44	0.00	0.00	0.00

	Stayed Count	Left Count	Total Count	Total Perc of Total	Stayed Perc of Stayed Total	Left Perc of Left Total	Stayed %	Left %
satisfaction
low 0.00 - 0.37	1121.0	704.0	1825.0	15.2	11.2	35.4	61.4	38.6
medium 0.38 - 0.66	3676.0	756.0	4432.0	37.0	36.8	38.0	82.9	17.1
high 0.67-1.00	5203.0	531.0	5734.0	47.8	52.0	26.7	90.7	9.3
Total	10000.0	1991.0	11991.0	100.0	100.0	100.0	NaN	NaN

	Stayed Count	Left Count	Total Count	Total Perc of Total	Stayed Perc of Stayed Total	Left Perc of Left Total	Stayed %	Left %
accident
0	8255.0	1886.0	10141.0	84.6	82.5	94.7	81.4	18.6
1	1745.0	105.0	1850.0	15.4	17.4	5.3	94.3	5.7
Total	10000.0	1991.0	11991.0	100.0	100.0	100.0	NaN	NaN

	Stayed Count	Left Count	Total Count	Total Perc of Total	Stayed Perc of Stayed Total	Left Perc of Left Total	Stayed %	Left %
promotion
0	9805.00	1983.00	11788.00	98.31	98.05	99.60	83.18	16.82
1	195.00	8.00	203.00	1.69	1.95	0.40	96.06	3.94
Total	10000.00	1991.00	11991.00	100.00	100.00	100.00	NaN	NaN

	tenure	stayed promotion	stayed promotion perc	left promotion	left promotion perc
0	2	39	20.00	0.00	0.00
1	3	83	42.56	6.00	75.00
2	4	21	10.77	1.00	12.50
3	5	10	5.13	1.00	12.50
4	6	11	5.64	0.00	0.00
5	7	18	9.23	NaN	NaN
6	8	5	2.56	NaN	NaN
7	10	8	4.10	NaN	NaN

	Stayed Count	Left Count	Total Count	Total Perc of Total	Stayed Perc of Stayed Total	Left Perc of Left Total	Stayed %	Left %
salary
Total	2689.00	550.00	3239.00	100.00	100.00	100.00	NaN	NaN
0	1234.00	319.00	1553.00	47.95	45.89	58.00	79.46	20.54
1	1230.00	219.00	1449.00	44.74	45.74	39.82	84.89	15.11
2	225.00	12.00	237.00	7.32	8.37	2.18	94.94	5.06

	Stayed Count	Left Count	Total Count	Total Perc of Total	Stayed Perc of Stayed Total	Left Perc of Left Total	Stayed %	Left %
salary / Tenure
Total	10000.00	1991.00	11405.00	100.00	100.00	100.00	NaN	NaN
(low, 3)	1993.00	526.00	2519.00	22.09	19.93	26.42	79.12	20.88
(medium, 3)	1927.00	325.00	2252.00	19.75	19.27	16.32	85.57	14.43
(low, 4)	690.00	296.00	986.00	8.65	6.90	14.87	69.98	30.02
(low, 5)	277.00	273.00	550.00	4.82	2.77	13.71	50.36	49.64
(medium, 5)	260.00	195.00	455.00	3.99	2.60	9.79	57.14	42.86
(medium, 4)	681.00	188.00	869.00	7.62	6.81	9.44	78.37	21.63
(low, 6)	168.00	67.00	235.00	2.06	1.68	3.37	71.49	28.51
(medium, 6)	219.00	42.00	261.00	2.29	2.19	2.11	83.91	16.09
(high, 3)	396.00	23.00	419.00	3.67	3.96	1.16	94.51	5.49
(medium, 2)	1259.00	19.00	1278.00	11.21	12.59	0.95	98.51	1.49
(high, 5)	43.00	14.00	57.00	0.50	0.43	0.70	75.44	24.56
(low, 2)	1362.00	12.00	1374.00	12.05	13.62	0.60	99.13	0.87
(high, 4)	139.00	11.00	150.00	1.32	1.39	0.55	92.67	7.33
(low, 7)	18.00	0.00	0.00	0.00	0.18	0.00	0.00	0.00
(low, 8)	30.00	0.00	0.00	0.00	0.30	0.00	0.00	0.00
(low, 10)	28.00	0.00	0.00	0.00	0.28	0.00	0.00	0.00
(high, 10)	32.00	0.00	0.00	0.00	0.32	0.00	0.00	0.00
(high, 6)	46.00	0.00	0.00	0.00	0.46	0.00	0.00	0.00
(high, 7)	19.00	0.00	0.00	0.00	0.19	0.00	0.00	0.00
(high, 8)	9.00	0.00	0.00	0.00	0.09	0.00	0.00	0.00
(medium, 7)	57.00	0.00	0.00	0.00	0.57	0.00	0.00	0.00
(medium, 8)	42.00	0.00	0.00	0.00	0.42	0.00	0.00	0.00
(medium, 10)	47.00	0.00	0.00	0.00	0.47	0.00	0.00	0.00
(high, 2)	258.00	0.00	0.00	0.00	2.58	0.00	0.00	0.00

	Stayed Count	Left Count	Total Count	Total Perc of Total	Stayed Perc of Stayed Total	Left Perc of Left Total	Stayed %	Left %
Dept / Salary
Total	10000.00	1991.00	11991.00	100.00	100.00	100.00	NaN	NaN
(sales, low)	1234.00	319.00	1553.00	12.95	12.34	16.02	79.46	20.54
(technical, low)	900.00	238.00	1138.00	9.49	9.00	11.95	79.09	20.91
(sales, medium)	1230.00	219.00	1449.00	12.08	12.30	11.00	84.89	15.11
(support, low)	676.00	191.00	867.00	7.23	6.76	9.59	77.97	22.03
(technical, medium)	798.00	142.00	940.00	7.84	7.98	7.13	84.89	15.11
(support, medium)	712.00	116.00	828.00	6.91	7.12	5.83	85.99	14.01
(it, low)	389.00	87.00	476.00	3.97	3.89	4.37	81.72	18.28
(marketing, low)	234.00	76.00	310.00	2.59	2.34	3.82	75.48	24.52
(it, medium)	362.00	67.00	429.00	3.58	3.62	3.37	84.38	15.62
(hr, low)	234.00	62.00	296.00	2.47	2.34	3.11	79.05	20.95
(accounting, low)	237.00	59.00	296.00	2.47	2.37	2.96	80.07	19.93
(product_mng, low)	284.00	59.00	343.00	2.86	2.84	2.96	82.80	17.20
(randd, low)	271.00	51.00	322.00	2.69	2.71	2.56	84.16	15.84
(hr, medium)	219.00	48.00	267.00	2.23	2.19	2.41	82.02	17.98
(product_mng, medium)	244.00	47.00	291.00	2.43	2.44	2.36	83.85	16.15
(accounting, medium)	215.00	47.00	262.00	2.18	2.15	2.36	82.06	17.94
(marketing, medium)	268.00	33.00	301.00	2.51	2.68	1.66	89.04	10.96
(management, low)	107.00	32.00	139.00	1.16	1.07	1.61	76.98	23.02
(randd, medium)	294.00	31.00	325.00	2.71	2.94	1.56	90.46	9.54
(management, medium)	150.00	19.00	169.00	1.41	1.50	0.95	88.76	11.24
(sales, high)	225.00	12.00	237.00	1.98	2.25	0.60	94.94	5.06
(technical, high)	156.00	10.00	166.00	1.38	1.56	0.50	93.98	6.02
(support, high)	121.00	5.00	126.00	1.05	1.21	0.25	96.03	3.97
(product_mng, high)	48.00	4.00	52.00	0.43	0.48	0.20	92.31	7.69
(it, high)	67.00	4.00	71.00	0.59	0.67	0.20	94.37	5.63
(randd, high)	44.00	3.00	47.00	0.39	0.44	0.15	93.62	6.38
(marketing, high)	59.00	3.00	62.00	0.52	0.59	0.15	95.16	4.84
(hr, high)	35.00	3.00	38.00	0.32	0.35	0.15	92.11	7.89
(accounting, high)	60.00	3.00	63.00	0.53	0.60	0.15	95.24	4.76
(management, high)	127.00	1.00	128.00	1.07	1.27	0.05	99.22	0.78

	Stayed Count	Left Count	Total Count	Total Perc of Total	Stayed Perc of Stayed Total	Left Perc of Left Total	Stayed %	Left %
salary
high	942.00	48.00	990.00	8.26	9.42	2.41	95.15	4.85
low	4566.00	1174.00	5740.00	47.87	45.66	58.97	79.55	20.45
medium	4492.00	769.00	5261.00	43.87	44.92	38.62	85.38	14.62
Total	10000.00	1991.00	11991.00	100.00	100.00	100.00	NaN	NaN

Table of contents¶

Table of contents ¶