from pytesseract import pytesseract

from PIL import Image
import os
from config import paths
from service.AbstractOCRService import AbstractOCRService
import re

class TesseractOCRService(AbstractOCRService):
    """
    OCR service that runs Tesseract over every file found in
    ``paths.SLIDES_OUTPUT_FOLDER`` and returns the concatenated, normalized text.
    """

    # Marker appended after the text of every slide so callers can split on it.
    SLIDE_SEPARATOR = " ##### "

    # Compiled once: the same patterns are applied to every slide.
    _NON_ALNUM_PATTERN = re.compile(r'[^a-zA-Z0-9\s]')
    _WHITESPACE_PATTERN = re.compile(r'\s+')
    _DIGIT_RUN_PATTERN = re.compile(r'(\d+)')

    def __init__(self):
        # os.listdir returns entries in arbitrary order, so sort them to make
        # the slide order in the OCR output deterministic.
        self.images_paths = sorted(
            os.listdir(paths.SLIDES_OUTPUT_FOLDER),
            key=self._natural_sort_key,
        )

    @classmethod
    def _natural_sort_key(cls, file_name: str):
        """
        Build a sort key that orders embedded numbers numerically
        (e.g. ``slide2.png`` before ``slide10.png``).
        :param file_name: name of a file in the slides folder
        :return: key usable with ``sorted``
        """
        # Splitting on a capturing group alternates text / digit-run / text...,
        # so odd positions are always digit runs and can be compared as ints.
        parts = [
            int(token) if index % 2 else token.lower()
            for index, token in enumerate(cls._DIGIT_RUN_PATTERN.split(file_name))
        ]
        # The raw name breaks ties between names that differ only in letter case.
        return parts, file_name

    def perform_ocr(self) -> str:
        """
        Run OCR on every slide image and join the postprocessed results.
        The text of each slide is followed by ``SLIDE_SEPARATOR``. The result is
        also stored in ``self.text``.
        :return: postprocessed text of all slides
        """
        slide_texts = []
        for image_path in self.images_paths:
            full_path = os.path.join(paths.SLIDES_OUTPUT_FOLDER, image_path)
            # Context manager closes the underlying file handle after OCR.
            with Image.open(full_path) as image:
                result = pytesseract.image_to_string(image)
            slide_texts.append(self.postprocess_ocr_output(result))
        self.text = "".join(
            slide_text + self.SLIDE_SEPARATOR for slide_text in slide_texts
        )
        return self.text

    def postprocess_ocr_output(self, ocr_text: str) -> str:
        """
        Normalize raw OCR output.
        First removes every character that is not an ASCII letter, a digit or
        whitespace, then collapses each whitespace run into a single space,
        strips leading/trailing whitespace and lowercases the result.
        :param ocr_text: raw text returned by the OCR engine
        :return: postprocessed text
        """
        cleaned_text = self._NON_ALNUM_PATTERN.sub('', ocr_text)
        collapsed_text = self._WHITESPACE_PATTERN.sub(' ', cleaned_text)
        return collapsed_text.strip().lower()




