Tesseract Installation
- #!/usr/bin/python3
- #
- # Python OCR PDF Extraction
- # https://github.com/tesseract-ocr/tesseract
- #
- # sudo apt install tesseract-ocr
- # sudo apt install libtesseract-dev
- # pip install pytesseract PyPDF2 pdfplumber opencv-python pillow
- # pip install pdf2image
- # sudo apt-get install poppler-utils
- # sudo apt-get install tesseract-ocr-chi-sim # Simplified Chinese
- # sudo apt-get install tesseract-ocr-chi-tra # Traditional Chinese
- # tesseract --list-langs
import shutil

import cv2
import numpy as np
import pytesseract
from pdf2image import convert_from_path
from PIL import Image
from PyPDF2 import PdfReader
# Path to the Tesseract executable. Prefer whichever binary is found on PATH
# so the script works where Tesseract is not installed under /usr/bin, and
# fall back to the usual Debian/Ubuntu location when none is found.
pytesseract.pytesseract.tesseract_cmd = (
    shutil.which("tesseract") or "/usr/bin/tesseract"
)
def preprocess_image(pil_image):
    """Binarize a PIL image with OpenCV so it is easier to OCR.

    The image is normalized to RGB, reduced to a single grayscale channel,
    and thresholded with Otsu's method.

    Args:
        pil_image: A ``PIL.Image.Image`` in any mode Pillow can convert to
            RGB.

    Returns:
        A 2-D ``numpy`` array holding the thresholded image, in which every
        pixel is either 0 or 255.
    """
    # Normalize the mode first: np.array() on a grayscale ("L") or palette
    # ("P") image yields a 2-D array, which cvtColor rejects as RGB input.
    rgb_pixels = np.array(pil_image.convert("RGB"))
    # Go straight from RGB to grayscale; an intermediate BGR copy is not
    # needed to obtain the same luminance values.
    gray_image = cv2.cvtColor(rgb_pixels, cv2.COLOR_RGB2GRAY)
    # THRESH_OTSU makes OpenCV choose the threshold itself, so the 128
    # passed here only acts as a placeholder.
    _, thresh_image = cv2.threshold(
        gray_image, 128, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU
    )
    return thresh_image
def extract_text_from_pdf(pdf_path, *, lang="chi_sim"):
    """Return the text of a PDF, falling back to OCR for scanned documents.

    The embedded text layer is read first. Only when that layer is empty or
    pure whitespace is every page rendered to an image, preprocessed with
    ``preprocess_image`` and passed to Tesseract.

    Args:
        pdf_path: Path of the PDF file to read.
        lang: Tesseract language code handed to
            ``pytesseract.image_to_string``. Defaults to ``"chi_sim"``
            (Simplified Chinese); the matching language data must be
            installed (see ``tesseract --list-langs``).

    Returns:
        The text of all pages concatenated into one string, without any
        separator inserted between pages.
    """
    reader = PdfReader(pdf_path)
    # `or ""` covers pages for which extract_text() returns nothing.
    text = "".join(page.extract_text() or "" for page in reader.pages)

    # A usable text layer exists, so OCR is unnecessary.
    if text.strip():
        return text

    # No text layer: assume a scanned PDF and OCR every rendered page.
    page_images = convert_from_path(pdf_path)
    ocr_text = "".join(
        pytesseract.image_to_string(
            # Tesseract is fed a PIL image, so wrap the OpenCV array again.
            Image.fromarray(preprocess_image(page_image)),
            lang=lang,
        )
        for page_image in page_images
    )
    # Keep whatever whitespace the text layer produced in front of the OCR
    # output, matching the accumulated result of the direct-extraction pass.
    return text + ocr_text
# Example usage. Guarded so that it runs only when the file is executed as a
# script; importing the module no longer triggers a PDF read and OCR pass.
if __name__ == "__main__":
    pdf_path = "scan_2025-01-02_09.31.pdf"
    extracted_text = extract_text_from_pdf(pdf_path)
    print(extracted_text)
免责声明:如果侵犯了您的权益,请联系站长,我们会及时删除侵权内容,谢谢合作!更多信息请访问主页:qidao123.com(ToB企服之家,中国第一个企服评测及商务社交产业平台)。