使用easyocr、PyPDF2对图像及PDF文档进行识别 - qidao123.com技术社区-IT企服评测·应用市场

pip install easyocr PyPDF2 Pillow

复制代码

import os
import time
import easyocr
from PyPDF2 import PdfReader
from PIL import Image

复制代码

# Directory handed to EasyOCR as its model storage location; created if missing.
model_storage_directory = './easyocr_models'
os.makedirs(name=model_storage_directory, exist_ok=True)

复制代码

def check_network(url='https://www.baidu.com', timeout=5):
    """Report whether ``url`` can be opened within ``timeout`` seconds.

    Args:
        url: Address probed to decide whether the network is usable.
        timeout: Seconds passed to ``urllib.request.urlopen``.

    Returns:
        True if the URL could be opened, False if opening it failed.
    """
    # Local imports keep this probe self-contained.
    import http.client
    import urllib.request

    try:
        # The context manager closes the response instead of leaking it.
        with urllib.request.urlopen(url, timeout=timeout):
            return True
    except (OSError, ValueError, http.client.HTTPException):
        # URLError and socket timeouts are OSError subclasses; a malformed
        # URL raises ValueError. KeyboardInterrupt and SystemExit are
        # deliberately left to propagate.
        return False

复制代码

# Build the shared EasyOCR reader (Simplified Chinese + English) at import
# time; the script cannot do anything useful without it, so any failure
# terminates the process with exit status 1.
try:
    print("Initializing EasyOCR...")
    print(f"Model storage directory: {os.path.abspath(model_storage_directory)}")
    if not check_network():
        print("Network connection failed. Please check your internet connection.")
        # Raise SystemExit directly: the exit() helper is injected by the
        # ``site`` module and is not guaranteed to exist in every runtime.
        # SystemExit is not an Exception subclass, so the handler below
        # does not intercept it.
        raise SystemExit(1)
    print("Downloading models (this may take several minutes)...")
    reader = easyocr.Reader(
        ['ch_sim', 'en'],
        model_storage_directory=model_storage_directory,
        download_enabled=True,
        verbose=True,
    )
    print("EasyOCR initialized successfully")
except Exception as e:
    print(f"Failed to initialize EasyOCR: {str(e)}")
    raise SystemExit(1)

复制代码

def process_image(image_path):
    """Run OCR on an image file and return the recognised text.

    Each detection contributes one line. If anything fails, the error is
    printed and an empty string is returned.
    """
    try:
        # Element [1] of every detection is joined as text, one per line.
        recognised = (detection[1] for detection in reader.readtext(image_path))
        return '\n'.join(recognised)
    except Exception as err:
        print(f"Error processing image {image_path}: {str(err)}")
        return ""

复制代码

def process_pdf(pdf_path):
    """Extract the text of every page in a PDF file.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The text of all pages concatenated in page order, or an empty
        string (after printing the error) if the file cannot be processed.
    """
    try:
        # Named pdf_reader so it does not shadow the module-level OCR
        # ``reader`` that process_image relies on.
        pdf_reader = PdfReader(pdf_path)
        # join() avoids repeated string re-allocation; ``or ""`` keeps one
        # page without extractable text from discarding the whole document.
        return "".join(page.extract_text() or "" for page in pdf_reader.pages)
    except Exception as e:
        print(f"Error processing PDF {pdf_path}: {str(e)}")
        return ""

复制代码

def save_text(text, output_path):
    """Write ``text`` to ``output_path`` as UTF-8, replacing any existing file."""
    with open(file=output_path, mode='w', encoding='utf-8') as out_file:
        out_file.write(text)

复制代码

def main():
    """OCR every image and PDF under the current directory into text files.

    The first creatable directory among several candidates becomes the
    output folder. While the walk runs, everything printed is mirrored into
    ``ocr_log.txt`` inside that folder. Each input ``<name>.<ext>`` is saved
    as ``<name>.txt``; when that name was already produced during this run
    (or would clash with the log file) a numeric suffix is appended so no
    result is silently overwritten.

    Raises:
        SystemExit: With status 1 if no output directory can be created.
    """
    import sys

    # Try several candidate output locations and keep the first usable one.
    output_folders = [
        './output_text',  # relative to the current directory
        os.path.expanduser('~/ocr_output'),  # user's home directory
        os.path.join(os.getcwd(), 'ocr_output'),  # current working directory
    ]
    output_folder = None
    for folder in output_folders:
        try:
            os.makedirs(folder, exist_ok=True)
        except OSError as e:
            print(f"Failed to create output directory {folder}: {str(e)}")
            continue
        output_folder = folder
        print(f"Using output directory: {os.path.abspath(output_folder)}")
        break
    if output_folder is None:
        print("Error: Could not create any output directory")
        sys.exit(1)

    log_file = os.path.join(output_folder, 'ocr_log.txt')

    class Logger(object):
        """File-like object that copies every write to the terminal and the log."""

        def __init__(self, filename):
            self.terminal = sys.stdout
            self.log = open(filename, "a", encoding='utf-8')

        def write(self, message):
            self.terminal.write(message)
            self.log.write(message)

        def flush(self):
            self.terminal.flush()
            self.log.flush()

        def close(self):
            self.log.close()

    # Output names handed out during this run; seeded with the log file so
    # an input such as ``ocr_log.png`` cannot overwrite the open log.
    used_outputs = {os.path.normcase(log_file)}

    def reserve_output_path(base_name):
        """Return a .txt path in the output folder not yet used in this run."""
        candidate = os.path.join(output_folder, f"{base_name}.txt")
        counter = 2
        while os.path.normcase(candidate) in used_outputs:
            candidate = os.path.join(output_folder, f"{base_name}_{counter}.txt")
            counter += 1
        used_outputs.add(os.path.normcase(candidate))
        return candidate

    # Supported image formats.
    image_extensions = ('.bmp', '.jpg', '.jpeg', '.png', '.tiff', '.gif')

    tee = Logger(log_file)
    sys.stdout = tee
    try:
        print("OCR Processing Log\n")
        print(f"Starting OCR processing at {time.strftime('%Y-%m-%d %H:%M:%S')}")
        # Walk the current directory and all of its sub-directories.
        for root, _dirs, files in os.walk('.'):
            for file in files:
                file_path = os.path.join(root, file)
                base_name, ext = os.path.splitext(file)
                suffix = ext.lower()
                if suffix in image_extensions:
                    kind = "image"
                elif suffix == '.pdf':
                    kind = "PDF"
                else:
                    continue
                try:
                    print(f"Processing {kind}: {file_path}")
                    if kind == "image":
                        text = process_image(file_path)
                    else:
                        text = process_pdf(file_path)
                    output_path = reserve_output_path(base_name)
                    save_text(text, output_path)
                    print(f"Successfully processed {kind}: {file_path} -> {output_path}")
                    # Log-only summary line, written through the same UTF-8
                    # handle so ordering and encoding stay consistent.
                    tee.log.write(f"Success: {file_path} -> {output_path}\n")
                except Exception as e:
                    error_msg = f"Error processing {file_path}: {str(e)}"
                    print(error_msg)
                    tee.log.write(error_msg + "\n")
    finally:
        # Always hand stdout back and release the log file handle.
        sys.stdout = tee.terminal
        tee.close()

复制代码

# Run the batch job only when this file is executed as a script.
if __name__ == "__main__":
    main()

复制代码

python ocr_process.py

复制代码

import os
import time
import easyocr
from PyPDF2 import PdfReader
from PIL import Image
# Directory handed to EasyOCR as its model storage location; created if missing.
model_storage_directory = './easyocr_models'
os.makedirs(model_storage_directory, exist_ok=True)
# Check network connectivity.
def check_network(url='https://www.baidu.com', timeout=5):
    """Report whether ``url`` can be opened within ``timeout`` seconds.

    Args:
        url: Address probed to decide whether the network is usable.
        timeout: Seconds passed to ``urllib.request.urlopen``.

    Returns:
        True if the URL could be opened, False if opening it failed.
    """
    # Local imports keep this probe self-contained.
    import http.client
    import urllib.request

    try:
        # The context manager closes the response instead of leaking it.
        with urllib.request.urlopen(url, timeout=timeout):
            return True
    except (OSError, ValueError, http.client.HTTPException):
        # URLError and socket timeouts are OSError subclasses; a malformed
        # URL raises ValueError. KeyboardInterrupt and SystemExit are
        # deliberately left to propagate.
        return False
# Initialise the shared EasyOCR reader (Simplified Chinese + English) at
# import time; the script cannot do anything useful without it, so any
# failure terminates the process with exit status 1.
try:
    print("Initializing EasyOCR...")
    print(f"Model storage directory: {os.path.abspath(model_storage_directory)}")
    if not check_network():
        print("Network connection failed. Please check your internet connection.")
        # Raise SystemExit directly: the exit() helper is injected by the
        # ``site`` module and is not guaranteed to exist in every runtime.
        # SystemExit is not an Exception subclass, so the handler below
        # does not intercept it.
        raise SystemExit(1)
    print("Downloading models (this may take several minutes)...")
    reader = easyocr.Reader(
        ['ch_sim', 'en'],
        model_storage_directory=model_storage_directory,
        download_enabled=True,
        verbose=True,
    )
    print("EasyOCR initialized successfully")
except Exception as e:
    print(f"Failed to initialize EasyOCR: {str(e)}")
    raise SystemExit(1)
def process_image(image_path):
    """Run OCR on an image file and return the recognised text.

    Args:
        image_path: Path of the image to read.

    Returns:
        The recognised strings joined with newlines, or an empty string
        (after printing the error) if processing fails.
    """
    try:
        # Extract text with EasyOCR.
        result = reader.readtext(image_path)
        # Merge all recognition results, one per line.
        return '\n'.join(item[1] for item in result)
    except Exception as e:
        print(f"Error processing image {image_path}: {str(e)}")
        return ""


def process_pdf(pdf_path):
    """Extract the text of every page in a PDF file.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The text of all pages concatenated in page order, or an empty
        string (after printing the error) if the file cannot be processed.
    """
    try:
        # Named pdf_reader so it does not shadow the module-level OCR
        # ``reader`` that process_image relies on.
        pdf_reader = PdfReader(pdf_path)
        # join() avoids repeated string re-allocation; ``or ""`` keeps one
        # page without extractable text from discarding the whole document.
        return "".join(page.extract_text() or "" for page in pdf_reader.pages)
    except Exception as e:
        print(f"Error processing PDF {pdf_path}: {str(e)}")
        return ""
def save_text(text, output_path):
    """Write ``text`` to ``output_path`` as UTF-8, replacing any existing file."""
    with open(file=output_path, mode='w', encoding='utf-8') as out_file:
        out_file.write(text)
def main():
    """OCR every image and PDF under the current directory into text files.

    The first creatable directory among several candidates becomes the
    output folder. While the walk runs, everything printed is mirrored into
    ``ocr_log.txt`` inside that folder. Each input ``<name>.<ext>`` is saved
    as ``<name>.txt``; when that name was already produced during this run
    (or would clash with the log file) a numeric suffix is appended so no
    result is silently overwritten.

    Raises:
        SystemExit: With status 1 if no output directory can be created.
    """
    import sys

    # Try several candidate output locations and keep the first usable one.
    output_folders = [
        './output_text',  # relative to the current directory
        os.path.expanduser('~/ocr_output'),  # user's home directory
        os.path.join(os.getcwd(), 'ocr_output'),  # current working directory
    ]
    output_folder = None
    for folder in output_folders:
        try:
            os.makedirs(folder, exist_ok=True)
        except OSError as e:
            print(f"Failed to create output directory {folder}: {str(e)}")
            continue
        output_folder = folder
        print(f"Using output directory: {os.path.abspath(output_folder)}")
        break
    if output_folder is None:
        print("Error: Could not create any output directory")
        sys.exit(1)

    log_file = os.path.join(output_folder, 'ocr_log.txt')

    class Logger(object):
        """File-like object that copies every write to the terminal and the log."""

        def __init__(self, filename):
            self.terminal = sys.stdout
            self.log = open(filename, "a", encoding='utf-8')

        def write(self, message):
            self.terminal.write(message)
            self.log.write(message)

        def flush(self):
            self.terminal.flush()
            self.log.flush()

        def close(self):
            self.log.close()

    # Output names handed out during this run; seeded with the log file so
    # an input such as ``ocr_log.png`` cannot overwrite the open log.
    used_outputs = {os.path.normcase(log_file)}

    def reserve_output_path(base_name):
        """Return a .txt path in the output folder not yet used in this run."""
        candidate = os.path.join(output_folder, f"{base_name}.txt")
        counter = 2
        while os.path.normcase(candidate) in used_outputs:
            candidate = os.path.join(output_folder, f"{base_name}_{counter}.txt")
            counter += 1
        used_outputs.add(os.path.normcase(candidate))
        return candidate

    # Supported image formats.
    image_extensions = ('.bmp', '.jpg', '.jpeg', '.png', '.tiff', '.gif')

    tee = Logger(log_file)
    sys.stdout = tee
    try:
        print("OCR Processing Log\n")
        print(f"Starting OCR processing at {time.strftime('%Y-%m-%d %H:%M:%S')}")
        # Walk the current directory and all of its sub-directories.
        for root, _dirs, files in os.walk('.'):
            for file in files:
                file_path = os.path.join(root, file)
                base_name, ext = os.path.splitext(file)
                suffix = ext.lower()
                if suffix in image_extensions:
                    kind = "image"
                elif suffix == '.pdf':
                    kind = "PDF"
                else:
                    continue
                try:
                    print(f"Processing {kind}: {file_path}")
                    if kind == "image":
                        text = process_image(file_path)
                    else:
                        text = process_pdf(file_path)
                    output_path = reserve_output_path(base_name)
                    save_text(text, output_path)
                    print(f"Successfully processed {kind}: {file_path} -> {output_path}")
                    # Log-only summary line, written through the same UTF-8
                    # handle so ordering and encoding stay consistent.
                    tee.log.write(f"Success: {file_path} -> {output_path}\n")
                except Exception as e:
                    error_msg = f"Error processing {file_path}: {str(e)}"
                    print(error_msg)
                    tee.log.write(error_msg + "\n")
    finally:
        # Always hand stdout back and release the log file handle.
        sys.stdout = tee.terminal
        tee.close()
# Run the batch job only when this file is executed as a script.
if __name__ == "__main__":
    main()

复制代码