Python OCR PDF Extraction

打印 上一主题 下一主题

主题 923|帖子 923|积分 2773

Tesseract Installation


  • ocr2.py
  1. #!/usr/bin/python3
  2. #
  3. # Python OCR PDF Extraction
  4. # https://github.com/tesseract-ocr/tesseract
  5. #
  6. # sudo apt install tesseract-ocr
  7. # sudo apt install libtesseract-dev
  8. # pip install pytesseract PyPDF2 pdfplumber opencv-python pillow
  9. # pip install pdf2image
  10. # sudo apt-get install poppler-utils
  11. # sudo apt-get install tesseract-ocr-chi-sim  # Simplified Chinese
  12. # sudo apt-get install tesseract-ocr-chi-tra  # Traditional Chinese
  13. # tesseract --list-langs
  14. import pytesseract
  15. from pdf2image import convert_from_path
  16. from PyPDF2 import PdfReader
  17. import cv2
  18. import numpy as np
  19. from PIL import Image
# Tell pytesseract which Tesseract binary to invoke. '/usr/bin/tesseract' is
# where `sudo apt install tesseract-ocr` places it on Debian/Ubuntu; update
# this path to match your system (check with `which tesseract`).
pytesseract.pytesseract.tesseract_cmd = '/usr/bin/tesseract'
  22. def preprocess_image(pil_image):
  23.     """
  24.     Preprocesses an image for OCR using OpenCV.
  25.     Converts to grayscale, applies thresholding.
  26.     """
  27.     # Convert PIL image to OpenCV format
  28.     open_cv_image = np.array(pil_image)
  29.     # Convert RGB to BGR (OpenCV default format)
  30.     open_cv_image = cv2.cvtColor(open_cv_image, cv2.COLOR_RGB2BGR)
  31.     # Convert to grayscale
  32.     gray_image = cv2.cvtColor(open_cv_image, cv2.COLOR_BGR2GRAY)
  33.     # Apply binary thresholding
  34.     _, thresh_image = cv2.threshold(gray_image, 128, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
  35.     return thresh_image
  36. def extract_text_from_pdf(pdf_path):
  37.     # First try extracting text from the PDF directly
  38.     reader = PdfReader(pdf_path)
  39.     text = ""
  40.     for page in reader.pages:
  41.         text += page.extract_text() or ""
  42.     # If no text is extracted, assume it's a scanned PDF and use OCR
  43.     if not text.strip():
  44.         images = convert_from_path(pdf_path)
  45.         for image in images:
  46.             # Preprocess image for better OCR results
  47.             preprocessed_image = preprocess_image(image)
  48.             # Convert OpenCV image back to PIL format for Tesseract
  49.             pil_image = Image.fromarray(preprocessed_image)
  50.             # Perform OCR
  51.             text += pytesseract.image_to_string(pil_image, lang='chi_sim')
  52.     return text
  53. # Example usage
  54. pdf_path = "scan_2025-01-02_09.31.pdf"
  55. extracted_text = extract_text_from_pdf(pdf_path)
  56. print(extracted_text)
复制代码
免责声明:如果侵犯了您的权益,请联系站长,我们会及时删除侵权内容,谢谢合作!更多信息从访问主页:qidao123.com:ToB企服之家,中国第一个企服评测及商务社交产业平台。
回复

使用道具 举报

0 个回复

倒序浏览

快速回复

您需要登录后才可以回帖 登录 or 立即注册

本版积分规则

王國慶

金牌会员
这个人很懒什么都没写!
快速回复 返回顶部 返回列表