#!/bin/python3
from logger_config import get_logger
from pdf2image import convert_from_path  
from PIL import Image, ImageEnhance, ImageFilter
from concurrent.futures import ThreadPoolExecutor
import PyPDF2  
import tesserocr  
import sys
import io
from logger_config import get_logger
import os
import uuid
import subprocess

# Module-level logger obtained from the project's logger_config helper.
logger = get_logger()

# Module-level OCR language string.
# NOTE(review): no function visible in this file reads this value;
# extract_text_from_images assigns its own local `lang`. Confirm whether
# it is still needed.
lang = 'chi_sim'
# Path of the external OCR executable that extract_text_from_images_paddle
# launches; paddle_support() checks that something exists at this path.
paddle_path = "/opt/print-control/getImgTextOcr"

# Check whether the given path can be opened as a PDF.
def is_pdf_file(file_path):
    """Return True if ``file_path`` can be opened and read by ``PyPDF2.PdfReader``.

    Any exception raised while opening the file or constructing the reader
    is swallowed and reported as ``False``.
    """
    try:
        with open(file_path, 'rb') as stream:
            PyPDF2.PdfReader(stream)
    except Exception:
        return False
    else:
        return True

# Extract the embedded text of a PDF.
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of ``pdf_path`` with newlines and spaces removed.

    Pages are read in order through ``PyPDF2.PdfReader`` and the results of
    their ``extract_text()`` calls are concatenated.
    """
    with open(pdf_path, 'rb') as pdf_file:
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        # Extraction must happen while the file is still open.
        page_texts = [page.extract_text() for page in pdf_reader.pages]
    joined = "".join(page_texts)
    without_newlines = joined.replace('\n', '')
    return without_newlines.replace(' ', '')

def delete_file(file_path):
    """Remove ``file_path`` if it exists.

    Returns:
        True when the path existed and was removed; False when it did not
        exist or when any exception occurred (the exception is logged at
        debug level).
    """
    try:
        if not os.path.exists(file_path):
            return False
        os.remove(file_path)
    except Exception as e:
        logger.debug(f"文件删除失败: {str(e)}")
        return False
    return True

def save_image_to_cache(image):
    """Save ``image`` as a PNG under the per-user cache directory.

    The file is written to ``~/.cache/kydlp-print/`` (created with mode
    0o755 if missing) under a random UUID4 file name.

    Returns:
        The path of the written file, or None if creating the directory or
        saving the image raised (the exception is logged at debug level).
    """
    # Earlier comments in this function mention /tmp/.cache/kydlp/print/ and
    # /var/spool/cups/tmp/.cache/kydlp-print/ as alternative locations.
    cache_dir = os.path.expanduser("~/.cache/kydlp-print/")
    target_path = os.path.join(cache_dir, str(uuid.uuid4()))

    try:
        os.makedirs(cache_dir, mode=0o755, exist_ok=True)
        image.save(target_path, format='PNG')
    except Exception as e:
        logger.debug(f"操作失败: {str(e)}")
        return None
    return target_path

# Report whether the external OCR executable is present.
def paddle_support():
    """Return True when something exists at ``paddle_path``, otherwise False.

    Only existence is checked; the path is not verified to be executable.
    """
    return os.path.exists(paddle_path)

# Extract text from page images with the external paddle OCR executable.
def extract_text_from_images_paddle(images):
    """Run the external OCR program on each image and return the joined text.

    Each image is written to a cache file with ``save_image_to_cache``, the
    program at ``paddle_path`` is run on that file, and its stdout is
    collected. The cache file is removed afterwards, even when launching the
    program raises.

    Args:
        images: iterable of image objects accepted by
            ``save_image_to_cache`` (presumably the PIL images returned by
            ``convert_from_path`` — confirm against the caller).

    Returns:
        The concatenated OCR output with newlines and spaces removed, the
        same normalisation the other extractors in this file apply. Images
        that could not be written to the cache are skipped.
    """
    logger.debug(f"调试：使用paddleocr")
    chunks = []
    for img in images:
        img_path = save_image_to_cache(img)
        if img_path is None:
            # save_image_to_cache reports failure by returning None; passing
            # None on to subprocess.run would raise TypeError.
            logger.debug("调试：图片缓存失败，跳过该图片")
            continue
        try:
            # NOTE(review): the meaning of the trailing "4" argument is
            # defined by the external tool — confirm.
            result = subprocess.run(
                [paddle_path, img_path, "4"],
                capture_output=True,
                text=True)
        finally:
            # Always clean up the cached image, even if the launch failed.
            delete_file(img_path)
        if result.returncode != 0:
            logger.debug(
                f"调试：paddleocr 退出码 {result.returncode}: {result.stderr}")
        chunks.append(result.stdout)
    # The original called replace('', ''), a no-op; spaces are stripped here
    # as in the sibling extractors.
    return "".join(chunks).replace('\n', '').replace(' ', '')

# Extract text from page images with tesserocr.
def extract_text_from_images(images):
    """OCR every image with tesserocr and return the joined text.

    Each image is re-encoded as PNG into an in-memory buffer, reopened with
    PIL and passed to ``tesserocr.image_to_text`` with the language string
    ``chi_tra+chi_sim+eng``. Newlines and spaces are removed from the result.
    """
    # Local value; it shadows the module-level `lang`.
    lang = "chi_tra+chi_sim+eng"
    logger.debug(f"调试：使用tesserocr")
    recognized = []
    for page_image in images:
        png_buffer = io.BytesIO()
        page_image.save(png_buffer, format='PNG')
        png_buffer.seek(0)  # rewind so Image.open reads from the start
        reopened = Image.open(png_buffer)
        recognized.append(tesserocr.image_to_text(reopened, lang=lang))
    logger.debug(f"调试：当前lang模式为： {lang}")
    combined = "".join(recognized)
    return combined.replace('\n', '').replace(' ', '')

# Convert a PDF into images.
def conver_pdf_to_image(filepath):
    """Return the result of ``pdf2image.convert_from_path(filepath)``.

    A debug message is logged once the conversion call has returned.
    """
    rendered_pages = convert_from_path(filepath)
    logger.debug(f"调试：将 PDF 转换为图片")
    return rendered_pages

def get_text_from_pdf(filepath):
    """Return the text of the PDF at ``filepath``, falling back to OCR.

    The embedded text is tried first via ``extract_text_from_pdf``. If that
    yields an empty string, the PDF is converted to images and OCR'd with the
    paddle executable when ``paddle_support()`` is true, otherwise with
    tesserocr. Both results are logged at debug level.
    """
    # Try the embedded text of an ordinary PDF first.
    pdf_text = extract_text_from_pdf(filepath)
    logger.debug(f"调试：从普通pdf中获取文本为： {pdf_text}")
    if pdf_text:
        return pdf_text

    # No embedded text: treat it as an image-based PDF and pick the OCR
    # backend before rendering the pages.
    if paddle_support():
        ocr_backend = extract_text_from_images_paddle
    else:
        ocr_backend = extract_text_from_images
    pdf_text = ocr_backend(conver_pdf_to_image(filepath))
    logger.debug(f"调试：ocr从图片类pdf中获取文本为： {pdf_text}")
    return pdf_text

if __name__ == '__main__':
    # Script entry point: print the extracted text of the PDF named by the
    # first command-line argument. Nothing is printed when the argument is
    # missing, empty, or not accepted by is_pdf_file.
    if len(sys.argv) > 1:
        filepath = sys.argv[1]
        if filepath and is_pdf_file(filepath):
            print(get_text_from_pdf(filepath))
