• Site Navigation
    • Project Home
  • Project Documentation
    • Project Details
    • Tableau Dashboard
  • Customer / Fuel Type (new) Notebooks
    • SMMT-1a-OCR-fueltype-newformat.ipyn
    • SMMT-1b-CLEANSE-fueltype-newformat.ipynb
    • SMMT-1c-SAVE-fueltype-newformat.ipynb
  • Fuel Type (old)
    • SMMT-2a-OCR-fueltype-oldformat.ipynb
    • SMMT-2b-CLEANSE-fueltype-oldformat.ipynb
    • SMMT-2c-SAVE-fueltype-oldformat.ipynb
  • Customer Type (old)
    • SMMT-3a-OCR-custtype-newformat.ipynb
    • SMMT-3b-CLEANSE-custtype-newformat.ipynb
    • SMMT-3c-SAVE-custtype-newformat.ipynb

Import Libraries¶

In [ ]:
#!/usr/bin/env python3
import pandas as pd
from datetime import datetime
from termcolor import colored

Initialise Notebook variables and options¶

In [ ]:
source_path = "ocr/"
import_name = source_path + "OUT_2b_cleansed_fueltype_oldformat.csv"
output_name = source_path + "OUT_2c_fueltype_oldformat.csv"
error_name  = source_path + "OUT_2d_fueltype_oldformat_errors.log"

# Constants
num_lines = 0           # lines in the import file, determined later
fieldcount = 3          # Number of lines per month including month header
cat_count = 7           # Number of categories

Format dataframe and save as longdata¶

In [ ]:
## Count lines for import
with open(import_name, 'r') as linecount:
    num_lines = sum(1 for line in linecount)
    print('Total Import lines      :', num_lines) 
    
print("\n")
print("\n")
print("================================================================================================================================")
print("\n")
print("Opening File      : " + import_name)
print("Expecting         : " + str(int(num_lines / fieldcount * cat_count * 2)) + " Rows")
print("Output File       : " + output_name)
print("\n")
print("================================================================================================================================")
print("\n")


df = pd.read_csv(import_name, header=None, dtype={0 : object, 1 : object, 2 : object, 3 : object, 4 : object, 5 : object
                                                 , 6 : object, 7 : object, 8 : object, 9 : object})
# Error Logging
debug = 0
error_file = open(error_name, 'w')
def error_rep(input_row, sourcedate, field, error):
    error_file.write("Input Line : " + str(input_row) + " : " + out_source + " " + field + " " + error + "\n")

Feature Replacement Lookups, feature engineering lookups¶

In [ ]:
TOPCATLOOKUP = {
    "Diesel" : "Fossil",
    "Petrol": "Fossil",
    "AFV": "Xhev",
    "Total": "Fuel_TOTAL",
    "Private": "Customer",
    "Fleet": "Customer",
    "Business": "Customer"
    }
def get_tcat(g_scat):    
    try:
        g_topcat = TOPCATLOOKUP[g_scat]
    except KeyError:
        g_topcat = "err"

    return g_topcat


SCATLOOKUP = {
    "Diesel" : "Diesel",
    "Petrol" : "Petrol",
    "AFV" : "AFV",
    "Total": "Total",
    "Private": "Private",
    "Fleet": "Fleet",
    "Business": "Business",
    "Mhev_diesel": "MHEV_Diesel",
    "Hev" : "HEV",
    "Hev" : "xHEV",
    "Mhev_petrol": "MHEV_Petrol",
    "Bev" : "Plugin",
    "Phev" : "Plugin"
}

def get_scat(g_scat):
    try:
        g_scat = SCATLOOKUP[g_scat]
    except KeyError:
        g_scat = "err"
    return g_scat

Initialise output pandas dataframe¶

In [ ]:
# Create Pandas dataframe for the transposed record
output_table = pd.DataFrame(columns=[
"Usage",
"Source",
"Year",
"Month",
"Topcat",
"Subcat",
"Quantity",
"Acquire_Date",
"Acquire_By",
"Acquire_Method",
"fileDate"
]
)

#print(output_table)

# Declare variables for data transposition

input_row = 0
input_col = 0


#print(df.iloc[input_row, input_col])


newrow_num = 0
# print(df.dtypes)

out_acquire_date = datetime.now().strftime('%d-%m-%y')
out_acquire_by = "tesseract-old"
out_acquire_method = "Manual"
out_usage = "Private"
#
for x in range(0, num_lines, fieldcount): # start, lines in file, rows per month
    input_row = x
    out_month = df.iloc[input_row, input_col].capitalize()
    source_year = df.iloc[input_row+1, input_col]
    out_source = out_month + "-" + source_year

    for i in range(1, 3): # Three rows per month one for each year + header
        out_year = df.iloc[input_row+i, input_col]


        for y in range(1, 8): # 8 fields per year
            o_subcat = df.iloc[input_row , input_col + y]
            # out_topcat = TOPCATLOOKUP[out_sub_cat]
            out_topcat = get_tcat(o_subcat)
 
            out_subcat = get_scat(o_subcat)

            out_qty = df.iloc[input_row + i , input_col + y]

            # Error checking and reporting
            if out_topcat == "err": 
                error_rep(input_row, out_source, o_subcat, out_qty)
            if out_subcat == "err":
                error_rep(input_row, out_source, out_subcat, out_qty)
            if out_qty.isnumeric() != True:
                out_qty = "err"
                error_rep(input_row+ i +1 , out_source, out_subcat, out_qty)
            if source_year == out_year:
                out_source = "Primary"
            else:
                out_source = "Secondary"

            # Insert new row into output dataframe
            new_row = { "Usage"          : out_usage,
                        "Source"         : out_source,
                        "Year"           : out_year,
                        "Month"          : out_month,
                        "Topcat"         : out_topcat,
                        "Subcat"         : out_subcat,
                        "Quantity"       : out_qty,
                        "Acquire_Date"   : out_acquire_date,
                        "Acquire_By"     : out_acquire_by,
                        "Acquire_Method" : out_acquire_method,
                        "fileDate"       : out_month + "-"  + out_year}
            output_table.loc[len(output_table)] = new_row
            if debug == 1: print(new_row)



#print(output_table)
error_file.close()
output_table.to_csv(output_name, index=True)
#jname = source_path + "2c-json.json"
#output_table.to_json(jname, index=True)

print("\n")
print("\n")
print("The generated file needs to be validated, OCR is not 100% reliable")
print("================================================================================================================================")

print("\n")
print("      Import File : " + import_name)
print("      Output File : " + output_name)
print("        Expecting : " + str(int(num_lines / fieldcount * cat_count * 2)) + " Rows")
print("      Output File : " + str(len(output_table.index)) + " Rows")
print("     Error report : " + error_name)
print("\n")
print("================================================================================================================================")
print("\n")



error_file=open(error_name,'r')
line=error_file.readline()
if line == "":
    print("No Errors")
else:
    print("Errors detected, edit SOURCE FILE :" + import_name)
    print("================================================================================================================================")   
    while(line!=""):
        error_line = colored(line, "black", "on_white").replace("\n","") 
        print(error_line)
        line=error_file.readline()
error_file.close()