• Site Navigation
    • Project Home
  • Project Documentation
    • Project Details
    • Tableau Dashboard
  • Customer / Fuel Type (new) Notebooks
    • SMMT-1a-OCR-fueltype-newformat.ipyn
    • SMMT-1b-CLEANSE-fueltype-newformat.ipynb
    • SMMT-1c-SAVE-fueltype-newformat.ipynb
  • Fuel Type (old)
    • SMMT-2a-OCR-fueltype-oldformat.ipynb
    • SMMT-2b-CLEANSE-fueltype-oldformat.ipynb
    • SMMT-2c-SAVE-fueltype-oldformat.ipynb
  • Customer Type (old)
    • SMMT-3a-OCR-custtype-newformat.ipynb
    • SMMT-3b-CLEANSE-custtype-newformat.ipynb
    • SMMT-3c-SAVE-custtype-newformat.ipynb

Import Libraries¶

In [35]:
#!/usr/bin/env python3
from PIL import Image
import pytesseract
from os import listdir
from os.path import isfile, join
from pathlib import Path
import psutil

Initialise notebook options¶

In [36]:
source_path = "/home/hass/Development/smmtdata-evolved/cust_type/"

output_path = "/home/hass/Development/smmtdata-evolved/ocr/"
output_name = output_path + "OUT_3a_custtype_OCR_newformat-2023.txt"

Define Functions¶

In [37]:
def doit(img_crop):
    output_file = open(output_name ,"a")

    text = pytesseract.image_to_string(img_crop, config="-c tessedit_char_whitelist=' 0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz'  --psm 6")
    print(text)
    
    for line in text:
        output_file.write(line)
    output_file.close()
    #display(img_crop)
    #input()
   # a = input()
   # for proc in psutil.process_iter():
   #     #print(proc)
   #     if proc.name() == "shotwell":
   #         proc.kill()
   #     if proc.name() == "display":
   #         proc.kill()
In [38]:
# Set debug to 1 to display cropped images rather than run OCR
debug = 0


# ** img_spec DEPRECATED ** Different images formats and resolutions used over the years, a cludgy way of dealing with this:
file_specs = [['*2023*-cars.png'   , (0, 0, 655, 400)],
              ['*202x3*-cars.png'  , (0, 0, 880, 500)],
              ['*2022*-cars.png'   , (0, 0, 880, 500)],
              ['*2021*-cars.png'   , (0, 0, 650, 380)],
              ['*20x21*-cars.png'  , (0, 0, 880, 500)],
              ['*2020*-cars.png'   , (0, 0, 650, 380)],
              ['*20x20*-cars.png'  , (0, 0, 880, 500)],
              ['*2019*-cars.png'   , (0, 0, 650, 380)],
              ['*20x19*-cars.png'  , (0, 0, 880, 500)]
              ]
In [39]:
for path, img_spec in file_specs: #img_spec deprecated, newer images are more consistent, only required for old format

    for path in Path(source_path).glob(path):
        print(path)
        img = Image.open(path)
        wid, hgt = img.size 
        img_spec = (0,0,wid*.55, hgt *.45)
        im_crop = img.crop(img_spec)
        if debug == 1: 
            pass #Image._show(im_crop)
        else:
            
            display(im_crop)
            doit(im_crop)
            print(f"edit - {output_name}\n")

print("\n")
print("=========================================")

print("output file saved in " + output_name)

print("=========================================")
/home/hass/Development/smmtdata-evolved/cust_type/June-Sales-2023-and-YTD-cars.png
No description has been provided for this image
JUNE
2023 2022
Private 79798  69485
Fleet 92699  67243
Business 4769 4230
TOTAL  177266  140958

edit - /home/hass/Development/smmtdata-evolved/ocr/OUT_3a_custtype_OCR_newformat-2023.txt

/home/hass/Development/smmtdata-evolved/cust_type/May-Sales-2023-and-YTD-cars.png
No description has been provided for this image
MAY
2023 2022
Private 65932  66242
Fleet 76207 55649
Business 3065 2503
TOTAL  145204  124394

edit - /home/hass/Development/smmtdata-evolved/ocr/OUT_3a_custtype_OCR_newformat-2023.txt

/home/hass/Development/smmtdata-evolved/cust_type/July-Sales-2023-and-YTD-cars.png
No description has been provided for this image
JULY
2023 2022
Private 60045 59883
Fleet 80961 50014
Business  2915 2265
TOTAL  143921  112162

edit - /home/hass/Development/smmtdata-evolved/ocr/OUT_3a_custtype_OCR_newformat-2023.txt

/home/hass/Development/smmtdata-evolved/cust_type/Sept-Sales-2023-and-YTD-cars.png
No description has been provided for this image
SEPTEMBER
2023 2022
Private 122944 2 116227
Fleet 143256 101761
Business 6410  7281
TOTAL  272610  225269

edit - /home/hass/Development/smmtdata-evolved/ocr/OUT_3a_custtype_OCR_newformat-2023.txt

/home/hass/Development/smmtdata-evolved/cust_type/Nov-Sales-202x3-and-YTD-cars.png
No description has been provided for this image
NOVEMBER
2023 2022
Private 60506 64291
Fleet 93049  74185
Business 2970 4413
TOTAL  156525  142889

edit - /home/hass/Development/smmtdata-evolved/ocr/OUT_3a_custtype_OCR_newformat-2023.txt

/home/hass/Development/smmtdata-evolved/cust_type/October-Sales-202x3-and-YTD-cars.png
No description has been provided for this image
OCTOBER
2023 2022
Private 62915  62738
Fleet 87479  67911
Business 3135 3695
TOTAL  153529  134344

edit - /home/hass/Development/smmtdata-evolved/ocr/OUT_3a_custtype_OCR_newformat-2023.txt

/home/hass/Development/smmtdata-evolved/cust_type/Nov-Sales-2022-and-YTD-cars.png
No description has been provided for this image
NOVEMBER
2022 2021
Private 64292 62621
Fleet 74184  51005
Business 4413 2080
TOTAL  142889  115706

edit - /home/hass/Development/smmtdata-evolved/ocr/OUT_3a_custtype_OCR_newformat-2023.txt



=========================================
output file saved in /home/hass/Development/smmtdata-evolved/ocr/OUT_3a_custtype_OCR_newformat-2023.txt
=========================================