import pickle
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Pandas display configuration: floats rendered with two decimals, every
# column shown, and a display width of 120 characters.
pd.set_option('display.float_format', '{:.2f}'.format)
pd.options.display.precision = 2
pd.options.display.max_columns = None
pd.options.display.width = 120

# Add a little colour

class color:
    """ANSI escape sequences used to style text printed to the terminal.

    Concatenate one of the codes before the text and ``END`` after it to
    switch the styling back off.
    """

    # Text attributes
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'

    # Reset all styling
    END = '\033[0m'

    # Foreground colours
    DARKCYAN = '\033[36m'
    RED = '\033[91m'
    GREEN = '\033[92m'
    YELLOW = '\033[93m'
    BLUE = '\033[94m'
    PURPLE = '\033[95m'
    CYAN = '\033[96m'

# Directory holding the cleaned datasets; save_path points at the same place.
load_path = "/home/hass/Documents/Learning/Salifort-Motors-Capstone-Project/00-data_cleaned/"
save_path = load_path

# Dataset used for the exploratory plots. Two variants exist in load_path:
#   data_cleaned_NoOl_FE_AllFeat.csv  -> 11991 rows, all data including outliers
#   data_cleaned_Ol_NoFE_AllFeat.csv  -> 11167 rows, no outliers (mostly from tenure)
dataset_file = "data_cleaned_Ol_NoFE_AllFeat.csv"
df = pd.read_csv(load_path + dataset_file)

#df.describe()

#team = 'management'
#var = 'last_eval'


def _mark_company_average(ax, hsplit):
    """Draw the dashed red company-average line on *ax* and caption it."""
    ax.axhline(y=hsplit, color='red', linestyle='--', linewidth=2.5, label='Company Average')
    # Caption is placed slightly above the line, at x = 0.5 in data coordinates.
    ax.text(0.5, hsplit + .05, 'Company Average', color='red')


def plot_compare(team, var):
    """Show a 2x3 grid of violin plots of *var* for one department.

    The top row splits *var* by salary band and the bottom row by tenure.
    In each row the columns are: all employees (company wide), employees of
    *team* who left, and employees of *team* who stayed. Every panel carries
    a dashed red line at the company-wide mean of *var*.

    Reads the module-level ``df`` and displays the figure with ``plt.show()``.

    Parameters
    ----------
    team : str
        Department to analyse, as it appears in ``df['dept']``.
    var : str
        Name of the column to plot on the y axis; its mean is computed, so it
        must be numeric.
    """
    team_name = team.upper() + " Dept"

    cols = ['salary', 'dept', 'last_eval', 'tenure', 'satisfaction', 'left', 'number_project', 'avg_mnth_hrs']
    in_team = df['dept'] == team

    all_df = df[cols]
    left_df = df[(df['left'] == 1) & in_team][cols]
    stayed_df = df[(df['left'] == 0) & in_team][cols]

    # (data, title) for the three columns of the grid; both rows share them.
    panels = [
        (all_df, f'{var} - All Employees'),
        (left_df, f'{var} - {team_name} - Employees who Left'),
        (stayed_df, f'{var} - {team_name} - Employees who Stayed'),
    ]

    # Salary bands in their natural order rather than order of appearance.
    x_order = ['low', 'medium', 'high']
    # Numeric tick positions matching x_order. Using positions instead of the
    # category strings means the labelling does not depend on the axis
    # already carrying string categories.
    salary_ticks = list(range(len(x_order)))

    # Set the style of the visualization
    sns.set(style="darkgrid")

    fig, axes = plt.subplots(nrows=2, ncols=3, figsize=(16, 8), sharey=True)

    # Company-wide mean of the variable, drawn as the reference line.
    hsplit = all_df[var].mean()

    # Top row: variable by salary band
    for ax, (data, title) in zip(axes[0], panels):
        sns.violinplot(x='salary', y=var, data=data, hue='salary', order=x_order, ax=ax, palette='colorblind')
        ax.set_title(title)
        ax.set_xlabel('Salary')
        ax.set_ylabel(f'{var}')
        ax.set_xticks(salary_ticks)
        ax.set_xticklabels(x_order, rotation=0)
        _mark_company_average(ax, hsplit)

    # Bottom row: variable by tenure.
    # NOTE: when var is 'tenure' this row plots tenure against itself.
    for ax, (data, title) in zip(axes[1], panels):
        sns.violinplot(x='tenure', y=var, data=data, hue='tenure', ax=ax, palette='colorblind')
        ax.set_title(title)
        ax.set_xlabel('Tenure')
        ax.set_ylabel(f'{var}')
        _mark_company_average(ax, hsplit)
        # Replace any hue legend with an empty, frameless one.
        ax.legend([], [], frameon=False)

    plt.tight_layout()
    # Show the plot
    plt.show()

def plot_count_compare(team):
    """Show three bar charts of employee counts per salary band.

    The panels are, left to right: all employees (company wide), employees of
    *team* who left, and employees of *team* who stayed. Each panel has its
    own y scale.

    Reads the module-level ``df`` and displays the figure with ``plt.show()``.

    Parameters
    ----------
    team : str
        Department to analyse, as it appears in ``df['dept']``.
    """
    team_name = team.upper() + " Dept"

    cols = ['salary', 'dept', 'last_eval', 'tenure', 'satisfaction', 'left', 'number_project', 'avg_mnth_hrs']
    in_team = df['dept'] == team

    # (data, title) for each of the three panels.
    panels = [
        (df[cols], 'All Employees'),
        (df[(df['left'] == 1) & in_team][cols], f'dept {team_name} - Employees who Left'),
        (df[(df['left'] == 0) & in_team][cols], f'dept {team_name} - Employees who Stayed'),
    ]

    # Salary bands in their natural order rather than order of appearance.
    x_order = ['low', 'medium', 'high']
    # Numeric tick positions matching x_order, so the labelling does not
    # depend on the axis already carrying string categories.
    salary_ticks = list(range(len(x_order)))

    # Set the style of the visualization
    sns.set(style="darkgrid")

    fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(16, 6), sharey=False)

    for ax, (data, title) in zip(axes, panels):
        sns.countplot(x='salary', data=data, order=x_order, ax=ax, hue='salary', palette='colorblind')
        ax.set_title(title)
        ax.set_xlabel('Salary')
        ax.set_ylabel('Count')
        ax.set_xticks(salary_ticks)
        ax.set_xticklabels(x_order, rotation=90)

    plt.tight_layout()
    # Show the plot
    plt.show()

# PLOT - set `team` to the department to analyse. Available departments:
# 'sales', 'accounting', 'hr', 'technical', 'support', 'management', 'it',
# 'product_mng', 'marketing', 'randd'
team = 'it'

# One 2x3 violin-plot figure per numeric variable, in this order.
for var in ('tenure', 'satisfaction', 'last_eval', 'avg_mnth_hrs', 'number_project'):
    plot_compare(team, var)

# Salary is categorical, so it gets count plots instead of violin plots.
var = 'salary'
plot_count_compare(team)

# Read dataset used to train model
df = pd.read_csv(load_path + "data_cleaned_NoOl_FE_AllFeat.csv")          # 11991 rows, all data including outliers
# Sort the columns by name. Presumably this has to match the column order the
# model was trained with - confirm against the training notebook.
df.sort_index(axis=1, inplace=True)

# Leave-probability cut-offs used in the summary below
high_risk_threshold = 0.9
medium_risk_threshold = 0.7
predict_true_threshold = 0.5

# path to load/save pickled models
model_path = "/home/hass/Documents/Learning/Salifort-Motors-Capstone-Project/04-pickle-ML-models/"

# Load the trained XGBoost model.
# NOTE: pickle.load can execute arbitrary code; only load model files that
# come from a trusted location.
with open(model_path + '/hr_xg1-AllFeat.pickle', 'rb') as model_fh:
    model_file_xg = pickle.load(model_fh)

# Current employees are the rows with left == 0; the target column is removed
# before the rows are passed to the model.
temp_df = df[df['left'] == 0]
features_current_empl = temp_df.drop(columns='left')

# Predictions for CURRENT EMPLOYEES using model_file_xg for XGBoost.
# Column 1 is taken as the probability of leaving, column 0 of staying.
probabilities_cur = model_file_xg.predict_proba(features_current_empl)
leave_probabilities_cur = probabilities_cur[:, 1]
stay_probabilities_cur = probabilities_cur[:, 0]

# Add the probabilities to the features DataFrame
features_current_empl['leave_probability'] = leave_probabilities_cur
features_current_empl['stay_probability'] = stay_probabilities_cur

# Highest leave probability first
predictions_cur = features_current_empl.sort_values(by='leave_probability', ascending=False)

# Summarise predictions for the department chosen earlier via `team`
selected_department = team
dept_column = 'dept_' + selected_department

# Keep the selected department's indicator column ...
selected_dept_column = predictions_cur[dept_column]

# ... and every column that is not a department indicator
other_columns = predictions_cur.drop(columns=[col for col in predictions_cur.columns if col.startswith('dept')])

# Selected department column first, then the remaining columns
new_df = pd.concat([selected_dept_column, other_columns], axis=1)

# Keep only the rows that belong to the selected department. The explicit
# bool cast keeps this a row mask even if the indicator is stored as 0/1.
predictions_cur = new_df[new_df[dept_column].astype(bool)]

employee_count_cur = len(predictions_cur)


def _risk_summary(threshold):
    """Return (count, percentage) of selected employees above *threshold*."""
    count = int((predictions_cur['leave_probability'] > threshold).sum())
    # Guard against a department with no current employees.
    perc = (count / employee_count_cur) * 100 if employee_count_cur else 0.0
    return count, perc


high_risk_count_cur, high_risk_perc_cur = _risk_summary(high_risk_threshold)
medium_risk_count_cur, medium_risk_perc_cur = _risk_summary(medium_risk_threshold)
predict_risk_count_cur, predict_risk_perc_cur = _risk_summary(predict_true_threshold)

print("\n" + color.BOLD + f"XGBoost Summary for CURRENT {team.upper()} Employees\n" + color.END)
print('\u2500' * 55)

# Print the result. The counts are cumulative: each threshold also counts the
# employees above the higher thresholds.
for threshold_name, threshold, risk_count, risk_perc in (
    ("high_risk_threshold", high_risk_threshold, high_risk_count_cur, high_risk_perc_cur),
    ("medium_risk_threshold", medium_risk_threshold, medium_risk_count_cur, medium_risk_perc_cur),
    ("predict_risk_threshold", predict_true_threshold, predict_risk_count_cur, predict_risk_perc_cur),
):
    print("\n" + color.BOLD + f"probabilities > {threshold_name} {threshold:.0%}" + color.END)
    print(f'Count of employees with leave probability above {threshold:.0%}        : {risk_count:.0f}')
    print(f'Percentage of employees with leave probability above {threshold:.0%}   : {risk_perc:.2f}%')

XGBoost Summary for CURRENT IT Employees

───────────────────────────────────────────────────────

probabilities > high_risk_threshold 90%
Count of employees with leave probability above 90%        : 1
Percentage of employees with leave probability above 90%   : 0.13%

probabilities > medium_risk_threshold 70%
Count of employees with leave probability above 70%        : 3
Percentage of employees with leave probability above 70%   : 0.39%

probabilities > predict_risk_threshold 50%
Count of employees with leave probability above 50%        : 7
Percentage of employees with leave probability above 50%   : 0.92%

Document Title	Salifort Motors - Team Summary
Author	Rod Slater
Version	1.0
Created	01-11-2023
Modified	16-11-2023

Client Name	Salifort Motors
Client Contact	Mr HR Team
Client Email	hr@salifortmotors.it
Client Project	HR Team Data Driven Solutions from Machine Learning Models

Salifort Motors - Team Summary

Document Information

Client Details

Prepared for the IT Team

Introduction

Employee Tenure

Self-Reported Satisfaction

Management Evaluation Scores

Work Hours

Project Involvement

Salary Levels

Employee Retention Prediction

Employees at risk of leaving

Salifort Motors - Team Summary

Document Information

Client Details

Prepared for the IT Team

Introduction

Employee Tenure

Self-Reported Satisfaction

Management Evaluation Scores

Work Hours

Project Involvement

Salary Levels

Employee Retention Prediction

Employees at risk of leaving

Document Information

Client Details