#!/usr/bin/env python3
import psutil
from PIL import Image
import pytesseract
from os import listdir
import os
from os.path import isfile, join
from pathlib import Path
from IPython.display import Image as pimage, display
from subprocess import PIPE, Popen, STDOUT, SubprocessError
from colorama import Fore, Style, init

# Directory that is searched for the chart images to OCR.
source_path = "/home/hass/Development/smmtdata-evolved/fuel_type/"
# Directory for the OCR results, and the text file inside it that receives them.
output_path = "/home/hass/Development/smmtdata-evolved/ocr/"
output_name = os.path.join(output_path, "OUT_1a_fueltype_OCR_newformat-2023.txt")

# Tell pytesseract which tesseract executable to invoke (absolute path to the binary).
pytesseract.pytesseract.tesseract_cmd = r'/usr/bin/tesseract'

def doit(img_crop):
    """Run Tesseract OCR on an image and append the recognised text to the output file.

    The text is appended to the file named by the module-level ``output_name``.

    Args:
        img_crop: The image to recognise; it is passed unchanged to
            ``pytesseract.image_to_string``.

    Returns:
        The text returned by ``pytesseract.image_to_string``.
    """
    # The whitelist limits recognition to spaces, digits and ASCII letters;
    # "--psm 6" asks Tesseract to treat the image as a single uniform block of text.
    text = pytesseract.image_to_string(img_crop, config="-c tessedit_char_whitelist=' 0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz'  --psm 6")

    # OCR runs before the file is opened, and the context manager closes the
    # handle even if the write fails, so a failure never leaks an open file.
    with open(output_name, "a", encoding="utf-8") as output_file:
        output_file.write(text)

    return text

# Set debug to 1 to skip the display/OCR/save step: images are still opened and
# cropped, but nothing is shown, recognised or written to the output file.
debug = 0


# Glob patterns of the image files to process, as [pattern, crop_box] pairs.
# The crop_box element is deprecated and ignored: the crop rectangle is now
# derived from each image's reported size inside the loop below.
file_specs_new = [['*.png', (0, 0, 655, 850)]]

# Running counter printed next to every processed file, intended to help find
# the matching place in the output file when correcting OCR errors. It starts
# at 1 and is incremented once per spec and once per image.
line_num = 1
for pattern, _deprecated_crop_box in file_specs_new:
    line_num += 1
    # glob() yields files in arbitrary order; sorting makes the processing
    # order, the counter values and the appended output reproducible.
    for path in sorted(Path(source_path).glob(pattern)):
        line_num += 1

        # The context manager releases the image file handle once the crop has
        # been taken; the cropped image is a separate object and stays usable.
        with Image.open(path) as img:
            wid, hgt = img.size
            # Keep the top-left region: 55% of the width and 45% of the height.
            img_spec = (0, 0, wid * .55, hgt * .45)
            im_crop = img.crop(img_spec)

        if debug == 1:
            continue

        display(im_crop)
        # Full file path and counter; can act as a hyperlink in some terminals.
        print(f"{wid} x {hgt} - {path} - {line_num}")
        # Output text file path; can act as a hyperlink in some terminals.
        print(f"Edit {output_name}")
        # OCR the crop and append the recognised text to the output file.
        text = doit(im_crop)
        print(text)

1618 x 1899 - /home/hass/Development/smmtdata-evolved/fuel_type/May-Fuel-2023-and-YTD-cars.png - 3
Edit /home/hass/Development/smmtdata-evolved/ocr/OUT_1a_fueltype_OCR_newformat-2023.txt
MAY

2023 2022
Diesel 5758 7614
Petrol 59766  56767
MHEVdiesel 5316 5823
MHEV petrol 23034 16842
BEV 24513 15448
PHEV 9025 7339
HEV 17792 14561
TOTAL 145204 124394

Notebook Name	Function
SMMT-1a-OCR-fueltype-newformat.ipynb	Optical character recognition (OCR) of the image files in the import directory. Saves to an uncleaned CSV file.
SMMT-1b-CLEANSE-fueltype-newformat.ipynb	Data cleanse, error detection and feature engineering. Saves to cleansed file
SMMT-1b-CLEANSE-fueltype-newformat.ipynb	Data cleanse, error detection and feature engineering. Saves to cleansed csv file in long data format.

Import and OCR the image files to wide format CSV

Notebook for importing and OCR processing images

Import Python packages

Initialise notebook variables and options

Define functions

`<deprecated>` img dimensions for cropping

Process the file through Tesseract OCR

Import and OCR the image files to wide format CSV

Notebook for importing and OCR processing images

Import Python packages

Initialise notebook variables and options

Define functions

`<deprecated>` img dimensions for cropping

Process the file through Tesseract OCR

`<deprecated>` img dimensions for cropping