import pickle
import pandas as pd

# Add a little colour

class color:
    '''ANSI escape sequences used to style text printed to the terminal.

    Prefix a string with one or more of these codes and finish it with
    ``color.END`` to reset the terminal formatting.
    '''

    # Foreground colours
    PURPLE = '\033[95m'
    CYAN = '\033[96m'
    DARKCYAN = '\033[36m'
    BLUE = '\033[94m'
    GREEN = '\033[92m'
    YELLOW = '\033[93m'
    RED = '\033[91m'

    # Text attributes
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'

    # Reset all colours/attributes
    END = '\033[0m'

# Source folder for the cleaned data
load_path = "/home/hass/Documents/Learning/Salifort-Motors-Capstone-Project/00-data_cleaned/"
#save_path = "/home/hass/Documents/Learning/Salifort-Motors-Capstone-Project/04-pickle-ML-models/" # destination for pickle saved models

# Folder the pickled models are loaded from / saved to
model_path = "/home/hass/Documents/Learning/Salifort-Motors-Capstone-Project/04-pickle-ML-models/"

# Folder where the predictions of each model applied to the HR datafile are saved
predictions_path = "/home/hass/Documents/Learning/Salifort-Motors-Capstone-Project/99-documentation-project/07-predictions/"

# Read the "no department" dataset and order its columns alphabetically
NoDept_df = pd.read_csv(
    load_path + "data_cleaned_NoOl_FE_NoDept.csv", index_col=False
).sort_index(axis=1)

# Notebook display of the (sorted) column names
NoDept_df.columns

Index(['last_eval', 'left', 'number_project', 'overworked', 'promotion',
       'salary', 'satisfaction', 'tenure'],
      dtype='object')

# NOTE: pickle.load can execute arbitrary code embedded in the file, so only
# load model files from a location you trust.

# Trained Random Forest model
with open(model_path + '/hr_rf2-NOdept.pickle', 'rb') as rf_handle:
    model_file_rf = pickle.load(rf_handle)

# Trained Decision Tree model
with open(model_path + '/hr_dt2-NOdept.pickle', 'rb') as dt_handle:
    model_file_dt = pickle.load(dt_handle)

# Trained XGBoost model
with open(model_path + '/hr_xg2-NOdept.pickle', 'rb') as xg_handle:
    model_file_xg = pickle.load(xg_handle)

# Review costs of salary ranges for staff predicted to leave
def predict_salary_increase(modelname, title, salary_inc_perc):
    '''    Print a summary table of the cost of a salary increase for current employees predicted to leave

    Scores every current employee (left == 0) in the module-level NoDept_df
    with the selected model, counts how many have a leave probability above
    each module-level threshold (high_risk_threshold, medium_risk_threshold,
    predict_true_threshold) and prices a salary increase for each count.

    In:
        modelname       : str   : which loaded model to use, one of
                                  'model_file_dt', 'model_file_xg', 'model_file_rf'
        title           : str   : summary title, usually the full description of the
                                  model e.g. 'Decision Tree', 'XGBoost', 'Random Forest'
        salary_inc_perc : float : fraction to increase salary by e.g. 0.50

    Out:  print of the summary (returns None)

    Raises:
        ValueError : modelname is not one of the three known model names
    '''

    # Set output number format (this changes the pandas display option globally)
    pd.set_option('display.float_format', '{:.2f}'.format)

    # Resolve the model up front so an unknown name fails with a clear message
    # instead of an UnboundLocalError further down.
    if modelname == "model_file_dt":
        model = model_file_dt
    elif modelname == "model_file_xg":
        model = model_file_xg
    elif modelname == "model_file_rf":
        model = model_file_rf
    else:
        raise ValueError(
            f"Unknown modelname {modelname!r}; expected 'model_file_dt', "
            "'model_file_xg' or 'model_file_rf'"
        )

    # Features of employees who stayed (left == 0)
    features_current_empl = NoDept_df.loc[
        NoDept_df['left'] == 0,
        ['last_eval', 'number_project', 'overworked', 'promotion',
         'salary', 'satisfaction', 'tenure'],
    ]

    # Column 1 of predict_proba is used as the probability of leaving
    leave_probabilities_cur = model.predict_proba(features_current_empl)[:, 1]

    # Number of current employees above each probability threshold
    high_risk_count_cur = int((leave_probabilities_cur > high_risk_threshold).sum())
    medium_risk_count_cur = int((leave_probabilities_cur > medium_risk_threshold).sum())
    low_risk_count_cur = int((leave_probabilities_cur > predict_true_threshold).sum())

    # Assumed salary (£) used to price each row of the table
    high_salary = 150000
    medium_salary = 100000
    low_salary = 50000

    # Cost per head is the assumed salary times the increase; it is computed
    # directly (not as cost / count) so a zero count cannot divide by zero.
    high_per_head = high_salary * salary_inc_perc
    medium_per_head = medium_salary * salary_inc_perc
    low_per_head = low_salary * salary_inc_perc

    # NOTE(review): each row labelled with a salary band is priced from the
    # count of a *risk tier* (high salary <- high risk count, etc.), and every
    # count is "probability > threshold", so when the thresholds are nested an
    # employee above the highest threshold is included in all three counts and
    # in the total. Confirm this is the intended costing model.
    high_cost = high_risk_count_cur * high_salary * salary_inc_perc
    medium_cost = medium_risk_count_cur * medium_salary * salary_inc_perc
    low_cost = low_risk_count_cur * low_salary * salary_inc_perc

    total_count = high_risk_count_cur + medium_risk_count_cur + low_risk_count_cur

    print()
    print(color.BOLD + f"{title} - Cost of salary increase of {salary_inc_perc:.0%} of total salary" + color.END)
    print('\u2500' * 70)
    print("                                  |   # of Staff |  Cost Per Head |   Est £ Cost")
    print("Estimate High Salary Left Cost    | {:>12,.0f} | {:>14,.0f} | {:>12,.0f} ".
          format(high_risk_count_cur, high_per_head, high_cost))
    print("Estimate Medium Salary Left Cost  | {:>12,.0f} | {:>14,.0f} | {:>12,.0f}"
          .format(medium_risk_count_cur, medium_per_head, medium_cost))
    print("Estimate Low Salary Left Cost     | {:>12,.0f} | {:>14,.0f} | {:>12,.0f}"
          .format(low_risk_count_cur, low_per_head, low_cost))
    print("                                  | ============ |         ====== |    =========")
    print("Estimate TOTAL Salary Left Cost   | {:>12,.0f} |                | {:>12,.0f} "
          .format(total_count, low_cost + medium_cost + high_cost))

#predict_salary_increase('model_file_xg', "Test", 0.50)

def model_predictions(modelname, title, total_left=1991):
    '''    Print prediction summaries for current employees and for employees who left

    For each group in the module-level NoDept_df (left == 0, then left == 1)
    the selected model scores every row, and the count / percentage of rows
    with a leave probability above each module-level threshold
    (high_risk_threshold, medium_risk_threshold, predict_true_threshold) is
    printed. For the group that left, the share of total_left that was
    predicted to leave is also printed. Finally the scored rows of the last
    group processed (left == 1), sorted by leave probability, are written to
    an HTML file named after the title under predictions_path.

    In:
        modelname  : str : which loaded model to use, one of
                           'model_file_dt', 'model_file_rf', 'model_file_xg'
        title      : str : summary title, usually the full description of the
                           model e.g. 'Decision Tree', 'XGBoost', 'Random Forest'
        total_left : int : denominator for the final "% of employees who left"
                           line; defaults to 1991, the value that used to be
                           hard-coded. NOTE(review): the printed LEFT-group
                           percentages imply that group has fewer rows than
                           1991, so confirm which population is intended.

    Out: print of the summary (returns None); writes one HTML file

    Raises:
        ValueError : modelname is not one of the three known model names
    '''
    # Resolve the model up front so an unknown name fails with a clear message
    # instead of an UnboundLocalError inside the loop.
    if modelname == "model_file_dt":
        model = model_file_dt
    elif modelname == "model_file_rf":
        model = model_file_rf
    elif modelname == "model_file_xg":
        model = model_file_xg
    else:
        raise ValueError(
            f"Unknown modelname {modelname!r}; expected 'model_file_dt', "
            "'model_file_rf' or 'model_file_xg'"
        )

    feature_columns = ['last_eval', 'number_project', 'overworked', 'promotion',
                       'salary', 'satisfaction', 'tenure']

    # (label printed in the heading, threshold value) for each reported tier
    tiers = (
        ("high_risk_threshold", high_risk_threshold),
        ("medium_risk_threshold", medium_risk_threshold),
        ("low_risk_threshold", predict_true_threshold),
    )

    predictions_cur = None
    for left_flag in (0, 1):

        # Employees who stayed (left == 0) on the first pass, employees who
        # left (left == 1) on the second. Copy so that adding the probability
        # columns below does not write into a slice of NoDept_df.
        group = NoDept_df.loc[NoDept_df['left'] == left_flag, feature_columns].copy()

        probabilities = model.predict_proba(group)
        group['leave_probability'] = probabilities[:, 1]
        group['stay_probability'] = probabilities[:, 0]

        # Highest leave probability first
        predictions_cur = group.sort_values(by='leave_probability', ascending=False)
        group_size = len(predictions_cur)

        if left_flag == 0:
            print("\n" + color.BOLD + f"{title} Prediction Summary for CURRENT Employees\n" + color.END)
        else:
            print("\n\n" + color.BOLD + f"{title} Prediction Summary for LEFT Employees (how good is the model?)\n" + color.END)
        print('\u2500' * 55)

        tier_count = 0
        for tier_label, threshold in tiers:
            tier_count = int((predictions_cur['leave_probability'] > threshold).sum())
            # An empty group would otherwise divide by zero
            tier_perc = (tier_count / group_size) * 100 if group_size else 0.0

            print("\n" + color.BOLD + f"probabilities > {tier_label} {threshold:.0%}" + color.END)
            print(f'Count of employees with leave probability above {threshold:.0%}        : {tier_count:.0f}')
            print(f'Percentage of employees with leave probability above {threshold:.0%}   : {tier_perc:.2f}%')

        if left_flag == 1:
            # tier_count now holds the count for the last tier (predict_true_threshold)
            predicted_share = tier_count / total_left * 100 if total_left else 0.0
            print("\n" + color.BOLD + color.UNDERLINE + "\n% of employees left that were predicted\n" + color.END)
            # Use the caller's title; the label used to say "XGBoost" for every model
            print(f'{title} Predicted to leave > ({predict_true_threshold:.0%} )/ % of employees who left         : {predicted_share:.2f} %')

    # Save the scored rows of the last group (employees who left). The file is
    # named after the model title so runs for different models no longer
    # overwrite each other; for title 'XGBoost' the name is unchanged.
    predictions_cur.to_html(predictions_path + f"Predictions_{title.replace(' ', '_')}.html")

def will_they_leave_NoDept(satisfaction: float, last_eval: float, num_proj: int, tenure: int, overworked: int, promotion: int, salary: int):
    '''
    Print, for each of the three loaded models, whether an employee is predicted to leave and with what probability

    In:
        satisfaction:       Satisfaction score 0-1 float
        last_eval:          Last evaluation score 0-1 float
        num_proj:           Number of projects the employee has worked on 1-7 int
        tenure:             Number of years employed 2-10 int
        overworked:         1 if the employee works more than 175 hours a month, else 0
        promotion:          1 for yes, 0 for no
        salary:             0 = low, 1 = medium, 2 = high

    Out: prints one statement of prediction and % probability per model
         (Random Forest, Decision Tree, XGBoost); returns None
    '''

    # Keys are in alphabetical order, the same order NoDept_df's columns are
    # sorted into elsewhere in this file.
    input_data2 = {
        'last_eval'         : last_eval,
        'number_project'    : num_proj,
        'overworked'        : overworked,
        'promotion'         : promotion,  # 1 for yes, 0 for no
        'salary'            : salary,     # 0 =low, 1 = medium, 2 = high
        'satisfaction'      : satisfaction,
        'tenure'            : tenure,
    }

    # Convert the input data to a single-row DataFrame
    input_df2 = pd.DataFrame(input_data2, index=[0])

    # Same report for every model; only the heading and the model differ
    models = (
        ("Random Forest Model 2 - No departments", model_file_rf),
        ("Decision Tree Model 2 - No departments", model_file_dt),
        ("XGBoost Model 2 - No departments", model_file_xg),
    )

    for heading, model in models:
        # Make a prediction; column 1 of predict_proba is used as P(leave)
        predicted_class = model.predict(input_df2)
        probability = model.predict_proba(input_df2)[:, 1]

        # Output the prediction and probability
        print()
        print(color.BOLD + heading + color.END)
        print('\u2500' * 40)
        if predicted_class[0] == 1:
            print("The employee is **LIKELY** to leave with a probability of {:.2f}%".format(probability[0] * 100))
        else:
            # Report the probability of staying when the prediction is "stay"
            print("The employee is not likely to leave with a probability of {:.2f}%".format((1 - probability[0]) * 100))

# Example employee record: amend the values below to test whether an employee
# is predicted to leave.
# Note key features: tenure, promotion, num_projects, last_eval.

satisfaction            = 0.36      # Continuous [0-1] demo example 0.36 predicts to leave, 0.7 predicts to stay
last_evaluation_score   = 0.46      # Continuous range [0-1]
number_of_projects      = 1         # Discrete range [0-7]
employee_tenure         = 3         # Discrete range [2-10]
employee_overworked     = 0         # Binary [0,1]
promotion_l5y           = 0         # Binary range [0,1]
salary                  = 1         # 0 = low, 1 = medium, 2 = high

# NOTE(review): avg_mnth_hrs is not passed to the prediction below, and 300 hrs
# is above the 175 hrs "overworked" mark while employee_overworked is 0.
avg_mnth_hrs            = 300       # Discrete range [96 - 310] 175 hrs = overworked

will_they_leave_NoDept(
    satisfaction=satisfaction,
    last_eval=last_evaluation_score,
    num_proj=number_of_projects,
    tenure=employee_tenure,
    overworked=employee_overworked,
    promotion=promotion_l5y,
    salary=salary,
)

Random Forest Model 2 - No departments
────────────────────────────────────────
The employee is **LIKELY** to leave with a probability of 97.09%

Decision Tree Model 2 - No departments
────────────────────────────────────────
The employee is **LIKELY** to leave with a probability of 95.89%

XGBoost Model 2 - No departments
────────────────────────────────────────
The employee is **LIKELY** to leave with a probability of 96.11%

# Read both cleaned datasets and order their columns alphabetically
df1 = pd.read_csv(load_path + "data_cleaned_NoOl_FE_AllFeat.csv")

# sort_index returns a new frame, so df1 itself is left untouched
AllFeat_df = df1.sort_index(axis=1)
NoDept_df = pd.read_csv(load_path + "data_cleaned_NoOl_FE_NoDept.csv").sort_index(axis=1)

#print(AllFeat_df.columns)
#print(NoDept_df.columns)

# Feature rows of the employees who left (left == 1), without the target column
AllFeat_LEFT_df = AllFeat_df.loc[AllFeat_df['left'] == 1].drop(columns=['left'])
NoDept_LEFT_df = NoDept_df.loc[NoDept_df['left'] == 1].drop(columns=['left'])

#print(AllFeat_LEFT_df.shape[0])
#print(NoDept_LEFT_df.shape[0])

# Leave-probability thresholds used by the summary functions
high_risk_threshold = 0.9
medium_risk_threshold = 0.7
predict_true_threshold = 0.5

# Random Forest: prediction summary, then cost of a 50% salary increase
model_predictions(modelname='model_file_rf', title='Random Forest')
predict_salary_increase(modelname='model_file_rf', title='Random Forest', salary_inc_perc=0.50)

Random Forest Prediction Summary for CURRENT Employees

───────────────────────────────────────────────────────

probabilities > high_risk_threshold 90%
Count of employees with leave probability above 90%        : 26
Percentage of employees with leave probability above 90%   : 0.28%

probabilities > medium_risk_threshold 70%
Count of employees with leave probability above 70%        : 48
Percentage of employees with leave probability above 70%   : 0.52%

probabilities > low_risk_threshold 50%
Count of employees with leave probability above 50%        : 59
Percentage of employees with leave probability above 50%   : 0.64%


Random Forest Prediction Summary for LEFT Employees (how good is the model?)

───────────────────────────────────────────────────────

probabilities > high_risk_threshold 90%
Count of employees with leave probability above 90%        : 1704
Percentage of employees with leave probability above 90%   : 90.54%

probabilities > medium_risk_threshold 70%
Count of employees with leave probability above 70%        : 1715
Percentage of employees with leave probability above 70%   : 91.13%

probabilities > low_risk_threshold 50%
Count of employees with leave probability above 50%        : 1721
Percentage of employees with leave probability above 50%   : 91.45%


% of employees left that were predicted

XGBoost Predicted to leave > (50% )/ % of employees who left         : 86.44 %


Random Forest Model Predictions of salary cost increases

Cost of salary increase of 50% of total salary)
──────────────────────────────────────────────────────────────────────
                                  |   # of Staff |  Cost Per Head |   Est £ Cost
Estimate High Salary Left Cost    |           26 |         75,000 |    1,950,000 
Estimate Medium Salary Left Cost  |           48 |         50,000 |    2,400,000
Estimate Low Salary Left Cost     |           59 |         25,000 |    1,475,000
                                  | ============ |         ====== |    =========
Estimate TOTAL Salary Left Cost   |          133 |                |    5,825,000

# XGBoost: prediction summary, then cost of a 50% salary increase
model_predictions(modelname='model_file_xg', title='XGBoost')
predict_salary_increase(modelname='model_file_xg', title='XGBoost', salary_inc_perc=0.50)

XGBoost Prediction Summary for CURRENT Employees

───────────────────────────────────────────────────────

probabilities > high_risk_threshold 90%
Count of employees with leave probability above 90%        : 12
Percentage of employees with leave probability above 90%   : 0.13%

probabilities > medium_risk_threshold 70%
Count of employees with leave probability above 70%        : 33
Percentage of employees with leave probability above 70%   : 0.36%

probabilities > low_risk_threshold 50%
Count of employees with leave probability above 50%        : 67
Percentage of employees with leave probability above 50%   : 0.72%


XGBoost Prediction Summary for LEFT Employees (how good is the model?)

───────────────────────────────────────────────────────

probabilities > high_risk_threshold 90%
Count of employees with leave probability above 90%        : 1360
Percentage of employees with leave probability above 90%   : 72.26%

probabilities > medium_risk_threshold 70%
Count of employees with leave probability above 70%        : 1712
Percentage of employees with leave probability above 70%   : 90.97%

probabilities > low_risk_threshold 50%
Count of employees with leave probability above 50%        : 1731
Percentage of employees with leave probability above 50%   : 91.98%


% of employees left that were predicted

XGBoost Predicted to leave > (50% )/ % of employees who left         : 86.94 %


XGBoost Model Predictions of salary cost increases

Cost of salary increase of 50% of total salary)
──────────────────────────────────────────────────────────────────────
                                  |   # of Staff |  Cost Per Head |   Est £ Cost
Estimate High Salary Left Cost    |           12 |         75,000 |      900,000 
Estimate Medium Salary Left Cost  |           33 |         50,000 |    1,650,000
Estimate Low Salary Left Cost     |           67 |         25,000 |    1,675,000
                                  | ============ |         ====== |    =========
Estimate TOTAL Salary Left Cost   |          112 |                |    4,225,000

# Decision Tree: prediction summary, then cost of a 50% salary increase
model_predictions(modelname='model_file_dt', title='Decision Tree')
predict_salary_increase(modelname='model_file_dt', title='Decision Tree', salary_inc_perc=0.50)

Decision Tree Prediction Summary for CURRENT Employees

───────────────────────────────────────────────────────

probabilities > high_risk_threshold 90%
Count of employees with leave probability above 90%        : 40
Percentage of employees with leave probability above 90%   : 0.43%

probabilities > medium_risk_threshold 70%
Count of employees with leave probability above 70%        : 87
Percentage of employees with leave probability above 70%   : 0.94%

probabilities > low_risk_threshold 50%
Count of employees with leave probability above 50%        : 87
Percentage of employees with leave probability above 50%   : 0.94%


Decision Tree Prediction Summary for LEFT Employees (how good is the model?)

───────────────────────────────────────────────────────

probabilities > high_risk_threshold 90%
Count of employees with leave probability above 90%        : 1342
Percentage of employees with leave probability above 90%   : 71.31%

probabilities > medium_risk_threshold 70%
Count of employees with leave probability above 70%        : 1730
Percentage of employees with leave probability above 70%   : 91.92%

probabilities > low_risk_threshold 50%
Count of employees with leave probability above 50%        : 1730
Percentage of employees with leave probability above 50%   : 91.92%


% of employees left that were predicted

XGBoost Predicted to leave > (50% )/ % of employees who left         : 86.89 %


Decision Tree Model Predictions of salary cost increases

Cost of salary increase of 50% of total salary)
──────────────────────────────────────────────────────────────────────
                                  |   # of Staff |  Cost Per Head |   Est £ Cost
Estimate High Salary Left Cost    |           40 |         75,000 |    3,000,000 
Estimate Medium Salary Left Cost  |           87 |         50,000 |    4,350,000
Estimate Low Salary Left Cost     |           87 |         25,000 |    2,175,000
                                  | ============ |         ====== |    =========
Estimate TOTAL Salary Left Cost   |          214 |                |    9,525,000

Document Title	Salifort Motors - Demonstration Model - Best Choice Random Forest
Author	Rod Slater
Version	1.0
Created	01-11-2023
Modified	16-11-2023

Client Name	Salifort Motors
Client Contact	Mr HR Team
Client Email	hr@salifortmotors.it
Client Project	HR Team Data Driven Solutions from Machine Learning Models

Salifort Motors - Demonstration model ¶

Document Information ¶

Client Details ¶

Document Overview ¶

Table of contents ¶

Initialise Notebook ¶

Import Packages ¶

Initialise Notebook Options ¶

Load the dataset ¶

Pickle Load models developed using the Focus Features dataset ¶

Predict cost of salary increase for employees predicted to leave by the ML Model ¶

Predictions Summary of the model ¶

`will_they_leave` function - `Focus Features` ¶

Interactive Model Comparison Demonstration ¶

Model prediction Summaries ¶

Load and Prepare data ¶

Set Prediction Probability Thresholds for reporting ¶

Summary - Random Forest Model Predictions ¶

Summary XGBoost Predictions ¶

Summary Decision Tree Predictions ¶

Summarise model findings ¶

View Results in browser ¶

Table of contents¶

Table of contents ¶