#!/usr/bin/env python3
import pandas as pd
from datetime import datetime
from termcolor import colored

# Set defaults
source_path = "/home/hass/Development/smmtdata-evolved/ocr/"
#source_path = "ocr/"
import_name = source_path + "OUT_1b_fueltype_CLEANSE_newformat-2023.csv"
output_name = source_path + "OUT_1c_fueltype_FINISHED_newformat-2023.csv"

error_name  = source_path + "OUT_1d_fueltype_FINISHED_newformat_errors-2023.log"

# Constants
num_lines = 0           # lines in the import file, determined later
rowcount = 10           # Number of lines per month including month header
fieldcount = 8          # Number of columns with data to capture (rowcount - 2 lines for headers)

import_dataframe = pd.read_csv(import_name, header=None, dtype={0 : object, 1 : object, 2 : object})

# Error logging
debug = 0
error_file = open(error_name, 'w')
logfile = open(source_path + "OUT_1b_cleansed_fueltype_newformat.log","w")

def error_rep(input_row, sourcedate, field, error):
    error_file.write("line : " + str(input_row) + " : " + sourcedate + " " + field + " " + error + "\n")

## Count lines for import
with open(import_name, 'r') as linecount:
    num_lines = sum(1 for line in linecount)

# setout the expectations : how many lines and the input and output filenames, which on some terminals will be clickable.
print("The generated file needs to be validated, OCR is not 100% reliable")
print("================================================================================================================================")

print("Import File       : " + import_name)

print("Expecting         : " + str(int(num_lines / rowcount * fieldcount * 2)) + " Rows") 

print("Output File       : " + output_name)

print("================================================================================================================================")
print("\n")

The generated file needs to be validated, OCR is not 100% reliable
================================================================================================================================
Import File       : /home/hass/Development/smmtdata-evolved/ocr/OUT_1b_fueltype_CLEANSE_newformat-2023.csv
Expecting         : 16 Rows
Output File       : /home/hass/Development/smmtdata-evolved/ocr/OUT_1c_fueltype_FINISHED_newformat-2023.csv
================================================================================================================================

# Create Pandas dataframe for the transposed record
output_table = pd.DataFrame(columns=[
"Usage",
"Source",
"Year",
"Month",
"Topcat",
"Subcat",
"Quantity",
"Acquire_Date",
"Acquire_By",
"Acquire_Method",
"fileDate"
]
)

TOPCATLOOKUP = {
    "Diesel" : "Fossil",
    "Petrol": "Fossil",
    "AFV": "xHEV",
    "Total": "Fuel_TOTAL",
    "Private": "Customer",
    "Fleet": "Customer",
    "Business": "Customer",
    "Total" : "Fuel Total",
    "Mhev_diesel": "xHEV",
    "Hev" : "xHEV",
    "Total": "Total",
    "Mhev_petrol": "xHEV",
    "Bev" : "Plugin",
    "Phev" : "Plugin"
    }

def get_tcat(g_scat):    # Get topcat Tp Level Category for Fuel Type
    try:
        g_topcat = TOPCATLOOKUP[g_scat]
    except KeyError:
        g_topcat = "err"
    return g_topcat

SCATLOOKUP = {
    "Diesel" : "Diesel",
    "Petrol" : "Petrol",
    "AFV" : "AFV",
    "Total": "Total",
    "Private": "Private",
    "Fleet": "Fleet",
    "Business": "Business",
    "Mhev_diesel": "MHEV_Diesel",
    "Hev" : "HEV",
    "Hev" : "xHEV",
    "Mhev_petrol": "MHEV_Petrol",
    "Bev" : "BEV",
    "Phev" : "PHEV"
}


def get_scat(g_scat): 
    try:
        g_scat = SCATLOOKUP[g_scat]
    except KeyError:
        g_scat = "err"
    return g_scat

def addtotbl(t_cat, s_cat, month, year, src, qty):
    if debug == 1 : print("Saw   : " + " " + s_cat + " " +  month + " " +  year + " " +  src + " " +  qty)
    logfile.write("Saw   : " + " " + s_cat + " " +  month + " " +  year + " " +  src + " " +  qty + "\n")



    new_row = {"Usage" : "Private",
            "Source" : src,
            "Year" : year,
            "Month" : month,
            "Topcat" : t_cat,
            "Subcat" : s_cat,
            "Quantity" : qty,
            "Acquire_Date" : datetime.now().strftime("%d-%m-%y"),
            "Acquire_By" : "Tesseract-new",
            "Acquire_Method" : "Manual",
            "fileDate" : month + "-" + year}
    output_table.loc[len(output_table)] = new_row
    #print(new_row)
    #a = input("press any key")

# Declare variables for data transposition
#i = 0 
input_row = 0
input_col = 0

for i in range(0, num_lines, rowcount):  # start, number of lines in file, number of rows per month
    input_row = i 
    out_month = import_dataframe.iloc[input_row, 0]
    input_row += 1
    source_year = import_dataframe.iloc[input_row, 1]
    out_source = out_month + "-" + source_year
    out_year0 = import_dataframe.iloc[input_row, 1]
    out_year1 = import_dataframe.iloc[input_row, 2]
    input_row += 1

    for x in range(0,rowcount - 2): # Subtract 2 for month / year header
        subcat = import_dataframe.iloc[input_row, 0].capitalize()
        out_topcat = get_tcat(subcat)
        out_subcat = get_scat(subcat)
        out_qty = import_dataframe.iloc[input_row, 1]
        if out_qty.isnumeric() != True:
            out_qty = "err"
            error_rep(input_row+1, out_source, "out_total0", out_subcat)
        if out_topcat == "err":
            error_rep(input_row+1, out_source, out_subcat, out_qty)
        if out_subcat == "err":
            error_rep(input_row+1, out_source, out_subcat, out_qty)


        if debug == 1 : print("Called : " + out_subcat + " " + out_month + " " +  out_year0 + " " +  "Primary" + " " +  str(out_qty))

        addtotbl(out_topcat, out_subcat, out_month, out_year0, "Primary", out_qty) # Fields added in function : 

        out_qty = import_dataframe.iloc[input_row, 2]
        if out_qty.isnumeric() != True:
            out_qty = "err"
            error_rep(input_row+1, out_source, "out_total1", out_subcat)
        
        if debug == 1 : print("Called : " + out_subcat + " " + out_month + " " +  out_year1 + " " +  "Secondary" + " " +  str(out_qty))

        addtotbl(out_topcat, out_subcat, out_month, out_year1, "Secondary", out_qty)



        input_row +=1
    input_row +=1


#print(output_table)
error_file.close()
output_table.to_csv(output_name, index=True)
#jname = source_path + "1c-json.json"
#output_table.to_json(jname, index=True)


print("\n")
print("The generated file needs to be validated, OCR is not 100% reliable")
print("================================================================================================================================")

print("Import File       : " + import_name)

print("Output File       : " + output_name)

print("Expecting         : " + str(int(num_lines / rowcount * fieldcount * 2)) + " Rows")
print("Output File       : " + str(len(output_table.index)) + " Rows")
print("\n")
print("Error report : " + error_name)
print("\n")
print("================================================================================================================================")

print("Errors detected:")
print("================================================================================================================================")

error_file=open(error_name,'r')
line=error_file.readline()
if line == "":
    print("No Errors")
else:
    while(line!=""):
        error_line = colored(line, "black", "on_white").replace("\n","") 
        print(error_line)
        line=error_file.readline()
error_file.close()
logfile.close()


The generated file needs to be validated, OCR is not 100% reliable
================================================================================================================================
Import File       : /home/hass/Development/smmtdata-evolved/ocr/OUT_1b_fueltype_CLEANSE_newformat-2023.csv
Output File       : /home/hass/Development/smmtdata-evolved/ocr/OUT_1c_fueltype_FINISHED_newformat-2023.csv
Expecting         : 16 Rows
Output File       : 16 Rows


Error report : /home/hass/Development/smmtdata-evolved/ocr/OUT_1d_fueltype_FINSIHED_newformat_errors-2023.log


================================================================================================================================
Errors detected:
================================================================================================================================
No Errors

Notebook Name	Function
SMMT-1a-OCR-fueltype-newformat.php	Optical Character recognition of the image files in the import directory. Save to unclean csv
SMMT-1b-CLEANSE-fueltype-newformat	Datacleanse, error fetection and feature engineering. Saves to cleansed file
SMMT-1b-CLEANSE-fueltype-newformat	Datacleanse, error detection and feature engineering. Saves to cleansed csv file in long data format.

Save to fuel type in the new format

Import Libraries¶

Initialise notebook options¶

Setup Error Logging¶

Initialise Pandas dataframe¶

Topcat and Subcat search and replace.¶

Process import file¶