Import Packages¶
In [23]:
#!/usr/bin/env python3
from PIL import Image
import psutil
import pytesseract
from os import listdir
from os.path import isfile, join
from pathlib import Path
Initialise Notebook options and variables¶
In [24]:
# Directory that is scanned for the PNG images to OCR.
source_path = "/home/hass/Development/smmtdata-evolved/fuel_type/"
#source_path = "../fuel_type/"

# Directory that receives the OCR text output.
output_path = "/home/hass/Development/smmtdata-evolved/ocr/"
#output_path = "../ocr/"

# Full path of the text file collecting the OCR results.
output_name = f"{output_path}OUT_2a_fueltype_ocr_oldformat-2023.txt"

# Append mode: text already in the file is kept and new text is added after it.
# doit() writes through this module-level name.
output_file = open(output_name, mode="a")
Define functions¶
In [25]:
def doit(img_crop):
    """OCR an image, echo the recognised text and append it to the output file.

    Args:
        img_crop: Image handed to ``pytesseract.image_to_string``.

    Returns:
        None. The recognised text is printed and written to the module-level
        ``output_file`` handle, which must be open when this is called.
    """
    # Whitelist restricts recognition to space, digits and ASCII letters;
    # --psm 6 selects tesseract's page segmentation mode 6.
    text = pytesseract.image_to_string(img_crop, config="-c tessedit_char_whitelist=' 0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz'  --psm 6")
    print(text)
    # `text` is one string; iterating over it yields single characters, so the
    # former loop issued one write() per character. A single write produces
    # the same file content.
    output_file.write(text)
# Set Debug Parameters
# debug == 1: the processing loop prints each image's width and waits for
# keyboard input instead of running OCR; any other value runs the OCR path.
debug = 0
Process the file through tesseract OCR¶
- Iterate over the `*-old.png` files in the source (`fuel_type`) directory
- Crop the image
- OCR the image
- Store the text (in the doit function)
In [26]:
# Get fuel data from the old smmt format
##################################################################
# Only the top half of each image (at full width) is OCR'd; per the original
# note, that is the part needed for fuel type and customer type.

# Reuse the handle opened in the setup cell; re-open it only when an earlier
# run of this cell has already closed it. Opening the file again on every
# iteration would drop the setup-cell handle without closing it.
if output_file.closed:
    output_file = open(output_name, "a")

try:
    # glob() yields paths in arbitrary order; sorting keeps the order of the
    # appended text reproducible between runs.
    for path in sorted(Path(source_path).glob("*-old.png")):
        print(path)
        # The context manager releases the image's file handle after each file.
        with Image.open(path) as img:
            img_crop = img.crop((0, 0, img.width, img.height / 2))
            if debug == 1:  # double check filename = file contents
                print(img.width)
                input()  # pause until Enter is pressed
            else:
                display(img_crop)
                doit(img_crop)  # OCR and the save function
                # Push this image's text to disk so partial results survive
                # an interruption, as the per-iteration close used to do.
                output_file.flush()
finally:
    # Always close the output file, even if OCR fails part-way through.
    output_file.close()

print("\n")
print("\n")
print("=========================================")
print("output file saved in " + output_name)
print("=========================================")
========================================= output file saved in /home/hass/Development/smmtdata-evolved/ocr/OUT_2a_fueltype_ocr_oldformat-2023.txt =========================================