import pickle
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# set Pandas Display Options
pd.options.display.float_format = '{:.2f}'.format
pd.set_option('display.precision', 2)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 120)

load_path = "/home/hass/Documents/Learning/Salifort-Motors-Capstone-Project/00-data_cleaned/"
save_path = "/home/hass/Documents/Learning/Salifort-Motors-Capstone-Project/00-data_cleaned/"

# Load dataset into a dataframe
df = pd.read_csv(load_path + "data_cleaned_Ol_NoFE_AllFeat.csv")          # 11991 rows, all data including outliers
#df1  = pd.read_csv(load_path + "data_cleaned_NoOl_NoFE_AllFeat.csv")      # 11167 rows, no outliers (mostly from tenure)

# Display first few rows of the dataframe

df.head()

pd.options.display.float_format = '{:.2f}'.format
counts = df['left'].value_counts()
perc = df['left'].value_counts(normalize=True)*100

summary_table = pd.DataFrame({"Counts " : counts,
                              "Percentage " : perc})

summary_table

# Calculate counts and percentages by department
dept_counts = df.groupby('dept')['left'].value_counts().unstack().fillna(0) # counts, combine grouped columns into an index, replace an values with 0
dept_counts['total'] = dept_counts.sum(axis=1)  # Total across columns
dept_counts['left_percentage'] = (dept_counts[1] / dept_counts['total']) * 100 

# Plot the bar graph
fig, ax = plt.subplots(figsize=(10, 6))
dept_counts['left_percentage'].plot(kind='bar', color='blue', ax=ax)

# Add lines at 16.6% and 2.8%
ax.axhline(y=16.6, color='red', linestyle='--', label='16.6% Salifort Average')
ax.axhline(y=17.8, color='green', linestyle='--', label='17.8% Business Avg 2022-2023')

# Add labels and legend
ax.set_xlabel('Department')
ax.set_ylabel('Percentage Left')
ax.set_title('Percentage of Employees Left by Department')
ax.legend(['16.6% Salifort Avg','17.8% Business Avg 2022-23','left %'], loc='upper center', bbox_to_anchor=(0.5, -0.3), fancybox=True, shadow=True, ncol=3)

# Show the plot
plt.show()

## Employees who left percentage by department total and salary band / Total employees by department and salary band

df_left_1 = df[df['left'] == 1]
left_1 = df_left_1.groupby(['dept', 'salary']).size().unstack()

total_df = df.groupby(['dept', 'salary']).size().unstack()

df_percentage = (left_1 / total_df) * 100

# Reset index to bring 'department' back as a column
df_percentage.reset_index(inplace=True)

# Print the result
print(df_percentage)

salary         dept  high   low  medium
0        accounting  4.76 19.93   17.94
1                hr  7.89 20.95   17.98
2                it  5.63 18.28   15.62
3        management  0.78 23.02   11.24
4         marketing  4.84 24.52   10.96
5       product_mng  7.69 17.20   16.15
6             randd  6.38 15.84    9.54
7             sales  5.06 20.54   15.11
8           support  3.97 22.03   14.01
9         technical  6.02 20.91   15.11

melted_df = pd.melt(df_percentage,value_vars=['high','low','medium'], id_vars=['dept'], value_name='percentage')

melted_df.sort_values(by=['salary'], inplace=True, ascending=False)



# Plot the bar graph

x_order = ['low','medium','high'] # set order of labels of x axis

fig, ax = plt.subplots(figsize=(12, 8))

# Plot bars for each salary level in each department
#df_percentage.plot(kind='bar', x = 'dept', stacked=False, ax=ax, order=x_order)

sns.barplot(x = 'dept', y='percentage', data=melted_df,  hue='salary', palette='colorblind')


# Add lines at 16.6% and 2.8%
plt.axhline(y=16.6, color='red', linestyle='--', label='Salifort Average Turnover')
ax.axhline(y=17.8, color='green', linestyle='--', label='US Business Avg 2022-23')

# Add labels and legend
ax.set_xlabel('Department')
ax.set_ylabel('%')
plt.xticks(rotation=90)
ax.set_title('Distribution of Salary Levels by Department for Employees who Left')

ax.legend(loc='upper center', bbox_to_anchor=(0.5, -0.2), fancybox=True, shadow=True, ncol=3)


# Show the plot
plt.show()

#left_df = df[df['left'] == 1][['salary', 'dept', 'last_eval', 'tenure', 'satisfaction', 'left', 'number_project', 'avg_mnth_hrs']]
#stayed_df = df[['salary', 'dept', 'last_eval', 'tenure', 'satisfaction', 'left', 'number_project', 'avg_mnth_hrs']]

#df['dept'].unique()

array(['sales', 'accounting', 'hr', 'technical', 'support', 'management',
       'it', 'product_mng', 'marketing', 'randd'], dtype=object)

def plot_compare_company(var:str):

    ''' Violinplot, side by side comparison: All Company, employees who left, employees who stayed
     in : 
        team : Str : Team name to analyse
        var  : dataset variable to plot
    '''

    #df[(df['left'] == 1)] # & (df['dept'] == 'sales')]

    #left_df = df[(df['left'] == 1) & (df['dept'] == team)][['salary', 'dept', 'last_eval', 'tenure', 'satisfaction', 'left', 'number_project', 'avg_mnth_hrs']]
    left_df = df[(df['left'] == 1) ][['salary', 'dept', 'last_eval', 'tenure', 'satisfaction', 'left', 'number_project', 'avg_mnth_hrs']]
    #stayed_df = df[(df['left'] == 0) & (df['dept'] == team)][['salary', 'dept', 'last_eval', 'tenure', 'satisfaction', 'left', 'number_project', 'avg_mnth_hrs']]
    stayed_df = df[(df['left'] == 0) ][['salary', 'dept', 'last_eval', 'tenure', 'satisfaction', 'left', 'number_project', 'avg_mnth_hrs']]
    
    all_df= df[['salary', 'dept', 'last_eval', 'tenure', 'satisfaction', 'left', 'number_project', 'avg_mnth_hrs']]
    
    labels = df['salary'].unique()
    # Set the style of the visualization
    sns.set(style="darkgrid")

    
    #plt.figure(figsize=(8, 6))
    fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(16, 6), sharey=True)
    #sns.barplot(x='dept', y='satisfaction', hue='salary', data=df, ci=None, estimator='mean', palette='colorblind')
    # sns.boxplot(x='dept', y=var, hue='left', data=df, palette='colorblind')

    sns.violinplot(x='salary', y=var, data=all_df,  hue='salary', ax=axes[0], palette='colorblind')
    axes[0].set_title(f'{var} - All Employees')
    axes[0].set_xlabel('Salary')
    axes[0].set_ylabel(f'{var}')
    axes[0].set_xticks(labels)
    axes[0].set_xticklabels(labels,rotation=0)
    #plt.title(f'Employees who left, {var} by Department')

    sns.violinplot(x='salary', y=var, data=left_df,  hue='salary', ax=axes[1], palette='colorblind')
    axes[1].set_title(f'{var} - Employees who Left')

    axes[1].set_xlabel('Salary')
    axes[1].set_ylabel(f'{var}')
    axes[1].set_xticks(labels)
    axes[1].set_xticklabels(labels,rotation=0)

    sns.violinplot(x='salary', y=var, data=stayed_df,  hue='salary', ax=axes[2], palette='colorblind')
    axes[2].set_title(f'{var} - Employees who Stayed')

    axes[2].set_xlabel('Salary')
    axes[2].set_ylabel(f'{var}')
    axes[2].set_xticks(labels)
    axes[2].set_xticklabels(labels,rotation=0)

    plt.tight_layout()
    # Show the plot
    plt.show()

varlist= df.drop(columns=['accident', 'left','promotion'])
variables = varlist.columns

for var in variables:

    plot_compare_company(var)

var = 'number_project' # Variable to plot
hsplit = 4 # Where to place the horizontal line

# List of unique departments
departments = df['dept'].unique()
#dep = ['sales']
#departments = np.array(dep)

# Set the style of the visualization
sns.set(style="darkgrid")

# Loop through departments
for department in departments:
    # Subset data for the current department
    department_data = df[df['dept'] == department]
    
    # Create a figure with two subplots (box plot and scatter plot)
    fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(15, 6), sharey=False)

    # Box plot
    sns.boxplot(x=var, y='salary', data=department_data, hue = 'left', ax=axes[0], palette='colorblind')
    axes[0].set_title(f'{var} - Quartile Distribution for {department}')
   # axes[0].axhline(y=hsplit, color='red', linestyle='--', linewidth=2.5, label='Vertical Line 2')

    # Scatter plot
    #sns.scatterplot(x='salary', y='last_eval', data=department_data, hue='left', ax=axes[1], palette='colorblind')
    sns.violinplot(x='salary', y=var, data=department_data, hue='left', ax=axes[1], palette='colorblind')
    axes[1].axvline(x=0.08, color='red', linestyle='--', label='Vertical Line 1')
    axes[1].axvline(x=0.32, color='blue', linestyle='--', label='Vertical Line 2')
    axes[1].axvline(x=1.08, color='red', linestyle='--', label='Vertical Line 1')
    axes[1].axvline(x=1.32, color='blue', linestyle='--', label='Vertical Line 2')
    axes[1].axvline(x=2.08, color='red', linestyle='--', label='Vertical Line 1')
    axes[1].axvline(x=2.32, color='blue', linestyle='--', label='Vertical Line 2')
    axes[1].axhline(y=hsplit, color='red', linestyle='--', linewidth=2.5,  label='Vertical Line 2')
    axes[1].set_title(f'{var} - Violin Plot for {department}')

    # Adjust layout
    plt.tight_layout()

    # Show the plots
    plt.show()

var = 'avg_mnth_hrs' # Variable to plot
hsplit = 175 # Where to place the horizontal line

# List of unique departments
departments = df['dept'].unique()
#dep = ['sales']
#departments = np.array(dep)
# Set the style of the visualization
sns.set(style="whitegrid")

# Loop through departments
for department in departments:
    # Subset data for the current department
    department_data = df[df['dept'] == department]
    
    # Calculate the percentage of outliers (for example, considering values beyond 1.5*IQR as outliers)
    Q1 = df[var].quantile(0.25)
    Q3 = df[var].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - .25 * IQR
    upper_bound = Q3 + .25 * IQR

    outliers_percentage = ((department_data[var] < lower_bound) | (department_data[var] > upper_bound)).mean() * 100

# Annotate the plot with the percentage of outliers
    



    # Create a figure with two subplots (box plot and scatter plot)
    fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(15, 6), sharey=True)

    # Box plot
    sns.boxplot(x='salary', y=var, data=department_data, hue = 'left', ax=axes[0], palette='colorblind')
    axes[0].set_title(f'Quartile Distribution for {department}')
    axes[0].axhline(y=hsplit, color='red', linestyle='--', linewidth=2.5, label='Vertical Line 2')

    # Scatter plot
    #sns.scatterplot(x='salary', y='last_eval', data=department_data, hue='left', ax=axes[1], palette='colorblind')
    sns.violinplot(x='salary', y=var, data=department_data, hue='left', ax=axes[1], palette='colorblind')
    axes[1].axvline(x=0.08, color='red', linestyle='--', label='Vertical Line 1')
    axes[1].axvline(x=0.32, color='blue', linestyle='--', label='Vertical Line 2')
    axes[1].axvline(x=1.08, color='red', linestyle='--', label='Vertical Line 1')
    axes[1].axvline(x=1.32, color='blue', linestyle='--', label='Vertical Line 2')
    axes[1].axvline(x=2.08, color='red', linestyle='--', label='Vertical Line 1')
    axes[1].axvline(x=2.32, color='blue', linestyle='--', label='Vertical Line 2')
    axes[1].axhline(y=hsplit, color='red', linestyle='--', linewidth=2.5,  label='Vertical Line 2')
    axes[1].set_title(f'Scatter Plot for {department}')
    axes[1].text(0.5, 1.05, f'{outliers_percentage:.2f}% Outliers', transform=plt.gca().transAxes, fontsize=10, color='red')

    # Adjust layout
    plt.tight_layout()

    # Show the plots
    plt.show()

Document Title	Salifort Motors - HR Team Summary
Author	Rod Slater
Version	1.0
Created	01-11-2023
Modified	16-11-2023

Client Name	Salifort Motors
Client Contact	Mr HR Team
Client Email	hr@salifortmotors.it
Client Project	HR Team Data Driven Solutions from Machine Learning Models

Salifort Motors - HR Team Summary ¶

Document Information ¶

Client Details ¶

Document Overview ¶

Table of contents¶

Visualisations - counts and percentages by department ¶

Employees who left percentage by department total and salary range ¶

Distribution of Salary Levels by Department for Employees who Left ¶

Visualisations - Comparison of All Employees Vs stayed employees vs left employees ¶

Visualisations of variables By Department ¶

	satisfaction	last_eval	number_project	avg_mnth_hrs	tenure	left	dept	salary
0	0.38	0.53	2	157	3	1	sales	low
1	0.80	0.86	5	262	6	1	sales	medium
2	0.11	0.88	7	272	4	1	sales	medium
3	0.72	0.87	5	223	5	1	sales	low
4	0.37	0.52	2	159	3	1	sales	low

	Counts	Percentage
left
0	10000	83.40
1	1991	16.60