Import Packages¶
In [23]:
#!/usr/bin/env python3
from PIL import Image
import psutil
import pytesseract
from os import listdir
from os.path import isfile, join
from pathlib import Path
Initialise Notebook options and variables¶
In [24]:
# Directory that is scanned for the input PNG images.
# (Relative alternative: "../fuel_type/")
source_path = "/home/hass/Development/smmtdata-evolved/fuel_type/"

# Directory that receives the OCR text file.
# (Relative alternative: "../ocr/")
output_path = "/home/hass/Development/smmtdata-evolved/ocr/"

# output_path already ends with "/", so join() produces exactly the same
# string as plain concatenation would.
output_name = join(output_path, "OUT_2a_fueltype_ocr_oldformat-2023.txt")

# Append mode: text from earlier runs is kept and new text is added after it.
# The processing cell further down also opens this path and assigns the
# handle to `output_file`.
output_file = open(output_name, mode="a")
Define functions¶
In [25]:
def doit(img_crop, out=None):
    """OCR an image, echo the recognised text and append it to the output file.

    Parameters
    ----------
    img_crop
        Image handed unchanged to ``pytesseract.image_to_string`` (the
        notebook passes a cropped PIL image).
    out : optional
        Writable text stream that receives the recognised text. When omitted,
        the notebook-level ``output_file`` handle is used; it is looked up at
        call time, so it must be open when this function runs.

    Returns
    -------
    None
        The text is printed and written as side effects only.
    """
    # Restrict recognition to space, digits and ASCII letters, and use page
    # segmentation mode 6 (treat the image as a single uniform block of text).
    ocr_config = (
        "-c tessedit_char_whitelist="
        "' 0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz'"
        " --psm 6"
    )
    text = pytesseract.image_to_string(img_crop, config=ocr_config)
    print(text)

    # Write the whole string in one call. Iterating over a str yields single
    # characters (not lines), so the previous loop issued one write() per
    # character for the same resulting file content.
    target = output_file if out is None else out
    target.write(text)
# Debug switch read by the processing loop.
#   0 -> display each cropped image inline in the notebook.
#   1 -> instead print the image width and wait for keyboard input after each
#        image, so the file name can be checked against the image contents.
debug = 0
Process the file through tesseract OCR¶
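- Iterate over the `*-old.png` images in the source (`fuel_type`) directory
- Crop each image to its top half
- OCR the cropped image
- Append the recognised text to the output file in the `ocr` directory (done inside the `doit` function)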
In [26]:
# Get fuel data from the old smmt format
##################################################################
# Only the top half of each image is needed (fuel type and customer type),
# so every image is cropped to full width x half height before OCR.

# Release the handle that is still open from an earlier cell (or an earlier,
# interrupted run of this cell) so that only one handle to the output file
# exists while this cell is writing.
if not output_file.closed:
    output_file.close()

# `output_file` is deliberately bound at notebook level: doit() writes to it.
# A single `with` block guarantees the file is flushed and closed even if
# OCR fails part-way through the batch.
with open(output_name, "a") as output_file:
    # sorted(): glob order depends on the filesystem; sorting makes the order
    # of the appended text reproducible between runs and machines.
    for path in sorted(Path(source_path).glob("*-old.png")):
        print(path)
        # The context manager closes the underlying image file after use.
        with Image.open(path) as img:
            img_crop = img.crop((0, 0, img.width, img.height / 2))
            if debug == 1:  # double check filename = file contents
                print(img.width)
                input()  # pause until the user presses Enter
            else:
                display(img_crop)
            # OCR the crop and append the text to the output file.
            # NOTE(review): this runs in debug mode as well as normal mode -
            # confirm that is intended.
            doit(img_crop)

print("\n")
print("\n")
print("=========================================")
print("output file saved in " + output_name)
print("=========================================")
========================================= output file saved in /home/hass/Development/smmtdata-evolved/ocr/OUT_2a_fueltype_ocr_oldformat-2023.txt =========================================