Import Packages¶
In [ ]:
#!/usr/bin/env python3
from termcolor import colored
Initialise Notebook Variables¶
In [ ]:
# Input and output locations, relative to the notebook's working directory.
# (Absolute development paths are kept below for reference.)
# source_path = "/home/hass/Development/Learning/Python/smmtdata/ocr/"
source_path = "ocr/"
import_name = source_path + "OUT_2a_fueltype_ocr_oldformat.txt"
# Read every OCR line into memory up front. The context manager closes the
# input handle immediately instead of leaving it open until garbage collection.
with open(import_name, 'r') as source_file:
    import_file = source_file.readlines()
# output_path = "/home/hass/Development/Learning/Python/smmtdata/ocr/"
output_path = "ocr/"
output_name = output_path + "OUT_2b_cleansed_fueltype_oldformat.csv"
# Deliberately left open: the cleansing cell writes to this handle and closes
# it once every line has been processed.
output_file = open(output_name, 'w')
Replacements & removals lists¶
List of (old, new) replacement tuples
In [ ]:
# Ordered (old, new) substring pairs. The cleansing loop applies them to each
# kept line with str.replace, first to last, so the order of the entries matters.
REPLACEMENTS = [
# Normalise the mild-hybrid fuel labels, with or without the separating space,
# to a single space-free token.
("Mhev diesel", "MHEV_Diesel"),
("Mhevdiesel", "MHEV_Diesel"),
("Mhev petrol", "MHEV_Petrol"),
("Mhevpetrol", "MHEV_Petrol"),
# Drop a lone 3, 5 or 2 that stands between two spaces.
# NOTE(review): presumably stray OCR tokens - confirm these never match real data.
(" 3 ", " "),
(" 5 ", " "),
(" 2 ", " "),
# NOTE(review): as written this pair replaces a space with a space (a no-op).
# It may originally have collapsed a double space - confirm against the notebook.
(" ", " "),
# Finally turn every remaining space into a comma, the separator the
# field-count check splits on.
(" ", ",")
]
1 - Clean the data¶
- Remove unwanted change and market share lines
- Correct Tesseract common errors
- Remove blank lines
- Generate a clean CSV for the PANDAS import
- Track line numbers for error reporting
In [ ]:
# --- Banner: announce which OCR file is being cleansed -----------------------
print("\n")
print("\n")
print("DATA CLEANSING : " + import_name)
uline = colored("================================================================================================================================", "black", "on_white")
print(uline)
print("\n")

# Lines containing any of these substrings are the unwanted change / market
# share rows and are dropped ("Mit" is presumably an OCR misread of "Mkt").
SKIP_MARKERS = ("change", "mkt", "Mkt", "Mit")
# Number of comma-separated fields a cleansed row is expected to have.
EXPECTED_FIELDS = 8

# 1-based position in the OCR file, reported when a row has the wrong number
# of fields. Skipped lines are counted too, so the number matches the source.
line_count = 1

try:
    for line in import_file:
        print("before \n==============\n" + line)

        # str.strip() removes "\n", "\r" and "\f" as well as spaces, so this
        # single test covers every blank-line variant.
        is_blank = not line.strip()
        is_unwanted = any(marker in line for marker in SKIP_MARKERS)

        if not (is_blank or is_unwanted):
            # Apply the ordered (old, new) pairs from the REPLACEMENTS list.
            for old, new in REPLACEMENTS:
                line = line.replace(old, new)

            # Flag rows whose field count is off, but still write them out so
            # they can be corrected by hand in the generated CSV.
            field_count = len(line.split(","))
            if field_count != EXPECTED_FIELDS:
                print("================================================================================================================================")
                print("Line Error : " + str(line_count) + " Field Count : " + str(field_count))
                print(colored(line, "black", "on_white").replace("\n", ""))

            output_file.write(line.strip())
            output_file.write("\n")

        line_count += 1
finally:
    # Close (and flush) the CSV even if a line raises part-way through.
    output_file.close()

# --- Summary -----------------------------------------------------------------
print("\n")
print("The generated file needs to be validated, OCR is not 100% reliable")
print("================================================================================================================================")
print("\n")
print(" Import File : " + import_name)
print("\n")
print(" Output File : " + output_name)
print("\n")
print("================================================================================================================================")