Import Libraries¶
In [35]:
#!/usr/bin/env python3
from PIL import Image
import pytesseract
from os import listdir
from os.path import isfile, join
from pathlib import Path
import psutil
Initialise notebook options¶
In [36]:
source_path = "/home/hass/Development/smmtdata-evolved/cust_type/"
output_path = "/home/hass/Development/smmtdata-evolved/ocr/"
output_name = output_path + "OUT_3a_custtype_OCR_newformat-2023.txt"
Define Functions¶
In [37]:
def doit(img_crop):
    output_file = open(output_name ,"a")
    text = pytesseract.image_to_string(img_crop, config="-c tessedit_char_whitelist=' 0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz'  --psm 6")
    print(text)
    
    for line in text:
        output_file.write(line)
    output_file.close()
    #display(img_crop)
    #input()
   # a = input()
   # for proc in psutil.process_iter():
   #     #print(proc)
   #     if proc.name() == "shotwell":
   #         proc.kill()
   #     if proc.name() == "display":
   #         proc.kill()
In [38]:
# Set debug to 1 to display cropped images rather than run OCR
debug = 0
# ** img_spec DEPRECATED ** Different images formats and resolutions used over the years, a cludgy way of dealing with this:
file_specs = [['*2023*-cars.png'   , (0, 0, 655, 400)],
              ['*202x3*-cars.png'  , (0, 0, 880, 500)],
              ['*2022*-cars.png'   , (0, 0, 880, 500)],
              ['*2021*-cars.png'   , (0, 0, 650, 380)],
              ['*20x21*-cars.png'  , (0, 0, 880, 500)],
              ['*2020*-cars.png'   , (0, 0, 650, 380)],
              ['*20x20*-cars.png'  , (0, 0, 880, 500)],
              ['*2019*-cars.png'   , (0, 0, 650, 380)],
              ['*20x19*-cars.png'  , (0, 0, 880, 500)]
              ]
In [39]:
for path, img_spec in file_specs: #img_spec deprecated, newer images are more consistent, only required for old format
    for path in Path(source_path).glob(path):
        print(path)
        img = Image.open(path)
        wid, hgt = img.size 
        img_spec = (0,0,wid*.55, hgt *.45)
        im_crop = img.crop(img_spec)
        if debug == 1: 
            pass #Image._show(im_crop)
        else:
            
            display(im_crop)
            doit(im_crop)
            print(f"edit - {output_name}\n")
print("\n")
print("=========================================")
print("output file saved in " + output_name)
print("=========================================")
/home/hass/Development/smmtdata-evolved/cust_type/June-Sales-2023-and-YTD-cars.png
JUNE 2023 2022 Private 79798 69485 Fleet 92699 67243 Business 4769 4230 TOTAL 177266 140958 edit - /home/hass/Development/smmtdata-evolved/ocr/OUT_3a_custtype_OCR_newformat-2023.txt /home/hass/Development/smmtdata-evolved/cust_type/May-Sales-2023-and-YTD-cars.png
MAY 2023 2022 Private 65932 66242 Fleet 76207 55649 Business 3065 2503 TOTAL 145204 124394 edit - /home/hass/Development/smmtdata-evolved/ocr/OUT_3a_custtype_OCR_newformat-2023.txt /home/hass/Development/smmtdata-evolved/cust_type/July-Sales-2023-and-YTD-cars.png
JULY 2023 2022 Private 60045 59883 Fleet 80961 50014 Business 2915 2265 TOTAL 143921 112162 edit - /home/hass/Development/smmtdata-evolved/ocr/OUT_3a_custtype_OCR_newformat-2023.txt /home/hass/Development/smmtdata-evolved/cust_type/Sept-Sales-2023-and-YTD-cars.png
SEPTEMBER 2023 2022 Private 122944 2 116227 Fleet 143256 101761 Business 6410 7281 TOTAL 272610 225269 edit - /home/hass/Development/smmtdata-evolved/ocr/OUT_3a_custtype_OCR_newformat-2023.txt /home/hass/Development/smmtdata-evolved/cust_type/Nov-Sales-202x3-and-YTD-cars.png
NOVEMBER 2023 2022 Private 60506 64291 Fleet 93049 74185 Business 2970 4413 TOTAL 156525 142889 edit - /home/hass/Development/smmtdata-evolved/ocr/OUT_3a_custtype_OCR_newformat-2023.txt /home/hass/Development/smmtdata-evolved/cust_type/October-Sales-202x3-and-YTD-cars.png
OCTOBER 2023 2022 Private 62915 62738 Fleet 87479 67911 Business 3135 3695 TOTAL 153529 134344 edit - /home/hass/Development/smmtdata-evolved/ocr/OUT_3a_custtype_OCR_newformat-2023.txt /home/hass/Development/smmtdata-evolved/cust_type/Nov-Sales-2022-and-YTD-cars.png
NOVEMBER 2022 2021 Private 64292 62621 Fleet 74184 51005 Business 4413 2080 TOTAL 142889 115706 edit - /home/hass/Development/smmtdata-evolved/ocr/OUT_3a_custtype_OCR_newformat-2023.txt ========================================= output file saved in /home/hass/Development/smmtdata-evolved/ocr/OUT_3a_custtype_OCR_newformat-2023.txt =========================================