目录
0 环境准备
- 已安装miniconda环境
1 环境依赖
本文在以下文章的基础上修改而来:
python识别扫描版PDF文件,获取扫描版PDF文件的文本内容-CSDN博客
2 安装环境依赖包
pip install PyMuPDF
pip install paddlepaddle
pip install paddleocr
pip install opencv-python scikit-image
pip install common dual tight data prox
3 程序逻辑实现
3.1 导入依赖包
import os
import time
import fitz
import numpy as np
from PIL import Image
from paddleocr import PaddleOCR
from pdf_ocr_reader import PDFOCR
3.2 定义将十六进制颜色字符串转换成RGB的函数
def hex_to_rgb(hex_color):
    """Convert a hexadecimal colour code to an ``(r, g, b)`` tuple.

    Args:
        hex_color: Colour string such as ``"#f33c34"`` or ``"f33c34"``.
            Leading ``#`` characters are ignored. The three-digit shorthand
            (``"#f3c"``) is expanded by doubling each digit. Digits after
            the sixth are ignored.

    Returns:
        A tuple of three ints, each in the range 0-255.

    Raises:
        ValueError: If fewer than six hex digits are available (after
            shorthand expansion) or a digit is not valid hexadecimal.
    """
    digits = hex_color.lstrip('#')
    if len(digits) == 3:
        # CSS-style shorthand: "f3c" means "ff33cc".
        digits = ''.join(ch * 2 for ch in digits)
    if len(digits) < 6:
        # Without this check a 5-digit string would be parsed with a
        # one-digit last channel instead of being rejected.
        raise ValueError(
            f"expected a 3- or 6-digit hex colour, got {hex_color!r}"
        )
    return tuple(int(digits[i:i + 2], 16) for i in (0, 2, 4))
3.3 定义相关参数
# Configuration: hex colour of the watermark to look for.
WATERMARK_HEX = "#f33c34"
# Colour tolerance. Colour images can use a larger value, grayscale images a
# smaller one.
TOLERANCE = 100
# Threshold for the share of watermark pixels (compared against a percentage).
MIN_WATERMARK_PERCENT = 1

# RGB window around the watermark colour, limited to the 0-255 channel range.
target_rgb = hex_to_rgb(WATERMARK_HEX)
lower_bound = np.maximum(np.array(target_rgb) - TOLERANCE, 0)
upper_bound = np.minimum(np.array(target_rgb) + TOLERANCE, 255)

# The same window expressed as a single grayscale value.
target_gray = hex_to_gray(WATERMARK_HEX)
lower_bound_gray = max(0, target_gray - TOLERANCE)
upper_bound_gray = min(255, target_gray + TOLERANCE)
3.4 定义PDF处理类
3.4.1 定义PDFOCREnhanced类
此类继承自参考博客的PDFOCR类
class PDFOCREnhanced(PDFOCR):
3.4.2 定义init方法
def __init__(self, file_path, output_txt):
    """Initialise the base class and create the PaddleOCR engine.

    Args:
        file_path: Forwarded unchanged to the base-class initialiser.
        output_txt: Forwarded unchanged to the base-class initialiser.
    """
    super().__init__(file_path, output_txt)
    # Engine options, spelled out once and handed to PaddleOCR as keywords.
    ocr_options = {
        "use_angle_cls": True,
        "lang": "ch",
        "use_gpu": True,
        "show_log": True,
    }
    self.ocr = PaddleOCR(**ocr_options)
3.4.3 定义探测水印方法
def _auto_detect_water(self, img_array, img):
    """Mark watermark-coloured pixels and decide whether the page has a watermark.

    Args:
        img_array: Pixel array of the page. A 2-D array is handled as
            grayscale; otherwise only the first three channels are compared.
        img: Image object whose ``width`` and ``height`` give the pixel count.

    Returns:
        ``(has_watermark, mask)``. ``mask`` is a boolean array that is true
        where a pixel lies inside the configured colour window, and
        ``has_watermark`` is true when those pixels make up at least
        ``MIN_WATERMARK_PERCENT`` percent of ``img.width * img.height``.
    """
    if img_array.ndim == 2:
        # Grayscale: compare against the single-channel bounds.
        not_too_dark = img_array >= lower_bound_gray
        not_too_bright = img_array <= upper_bound_gray
        mask = not_too_dark & not_too_bright
    else:
        # Colour: every one of the first three channels must be in range.
        rgb = img_array[:, :, :3]
        per_channel = (rgb >= lower_bound) & (rgb <= upper_bound)
        mask = np.all(per_channel, axis=-1)

    total_pixels = img.width * img.height
    watermark_ratio = np.sum(mask) / total_pixels * 100
    return watermark_ratio >= MIN_WATERMARK_PERCENT, mask
3.4.4 定义删除水印方法
def _remove_watermark(self, mask, img_array):
    """Return a copy of the page with every masked pixel painted white.

    Args:
        mask: Boolean array marking the pixels to blank out.
        img_array: Pixel array of the page (2-D, or with a trailing channel axis).

    Returns:
        A PIL image built from the cleaned array.
    """
    blank = np.full_like(img_array, 255)
    # With a channel axis present, the mask needs a trailing axis so that it
    # broadcasts across the channels.
    selector = mask if img_array.ndim == 2 else mask[..., None]
    return Image.fromarray(np.where(selector, blank, img_array))
3.4.5 定义paddle获取pdf文字方法
def ocr_recognition(self, image):
    """Run OCR on one image and return the recognised text.

    Args:
        image: Image handed straight to ``self.ocr.ocr`` (the caller in this
            file passes a NumPy array).

    Returns:
        The recognised text lines joined with newlines, or an empty string
        when nothing was recognised.
    """
    result = self.ocr.ocr(image, cls=True)
    # Only the first entry of the result is used. PaddleOCR 2.x can return
    # [None] for an image without any detected text; that list is truthy, so
    # the entry itself has to be checked before iterating over it.
    page_lines = result[0] if result else None
    if not page_lines:
        return ''
    # The text of each line is read from line[1][0].
    return '\n'.join(line[1][0] for line in page_lines)
3.4.6 定义识别pdf图像主流程方法
def _ocr_images(self):
    """Remove watermarks from each page image, OCR it, and write the text out.

    Files in ``self.temp_img_dir`` are processed in sorted filename order.
    When a watermark is detected, the cleaned image overwrites the original
    file. The text recognised for each image is written to
    ``self.output_txt`` (UTF-8) followed by a newline.
    """
    with open(self.output_txt, 'w', encoding='utf-8') as f:
        for img_file in sorted(os.listdir(self.temp_img_dir)):
            img_path = os.path.join(self.temp_img_dir, img_file)
            # Read the pixels and release the file before it may be
            # overwritten below. The image is no longer re-saved onto itself
            # here: that write was only needed by the optional preprocessing
            # (median-filter denoising / Otsu thresholding), which is disabled.
            with Image.open(img_path) as img:
                img_array = np.array(img)
                has_watermark, mask = self._auto_detect_water(img_array, img)

            # Watermark handling.
            ocr_input = img_array
            if has_watermark:
                print(f"检测到水印: {img_file}")
                clean_img = self._remove_watermark(mask, img_array)
                clean_img.save(img_path)  # overwrite the original image
                ocr_input = np.array(clean_img)

            # OCR runs on the (possibly cleaned) pixel array.
            text = self.ocr_recognition(ocr_input)
            f.write(text + '\n')
            print(f"已完成 {img_file} 识别")
3.5 定义main方法
if __name__ == '__main__':
    # The current timestamp is embedded in the output file name.
    started_at = time.time()
    source_pdf = './in_pdf/Txxxx71.pdf'
    result_txt = f'./out/output-{started_at}.txt'
    processor = PDFOCREnhanced(source_pdf, result_txt)
    try:
        processor._pdf_to_images(zoom=4)
        processor._ocr_images()
    finally:
        # Cleanup runs whether or not the steps above raised.
        processor._cleanup()
4 完整代码
import os
import time
import fitz
import numpy as np
from PIL import Image
from paddleocr import PaddleOCR
from pdf_ocr_reader import PDFOCR
def hex_to_rgb(hex_color):
    """Convert a hexadecimal colour code to an ``(r, g, b)`` tuple.

    Args:
        hex_color: Colour string such as ``"#f33c34"`` or ``"f33c34"``.
            Leading ``#`` characters are ignored. The three-digit shorthand
            (``"#f3c"``) is expanded by doubling each digit. Digits after
            the sixth are ignored.

    Returns:
        A tuple of three ints, each in the range 0-255.

    Raises:
        ValueError: If fewer than six hex digits are available (after
            shorthand expansion) or a digit is not valid hexadecimal.
    """
    digits = hex_color.lstrip('#')
    if len(digits) == 3:
        # CSS-style shorthand: "f3c" means "ff33cc".
        digits = ''.join(ch * 2 for ch in digits)
    if len(digits) < 6:
        # Without this check a 5-digit string would be parsed with a
        # one-digit last channel instead of being rejected.
        raise ValueError(
            f"expected a 3- or 6-digit hex colour, got {hex_color!r}"
        )
    return tuple(int(digits[i:i + 2], 16) for i in (0, 2, 4))
def hex_to_gray(hex_color):
    """Convert a hexadecimal colour code to a single grayscale value.

    Args:
        hex_color: Six-digit hex colour; leading ``#`` characters are ignored.

    Returns:
        The weighted sum ``0.299 * r + 0.587 * g + 0.114 * b`` as a float.
    """
    digits = hex_color.lstrip('#')
    red = int(digits[0:2], 16)
    green = int(digits[2:4], 16)
    blue = int(digits[4:6], 16)
    # Accumulate left to right so the floating-point result is unchanged.
    gray = 0.299 * red
    gray = gray + 0.587 * green
    gray = gray + 0.114 * blue
    return gray
# Configuration: hex colour of the watermark to look for.
WATERMARK_HEX = "#f33c34"
# Alternative value kept for reference: WATERMARK_HEX = "#000000"
# Colour tolerance. Colour images can use a larger value, grayscale images a
# smaller one.
TOLERANCE = 100
# Image resolution.
DPI = 300
# Threshold for the share of watermark pixels (compared against a percentage).
MIN_WATERMARK_PERCENT = 1

# RGB window around the watermark colour, limited to the 0-255 channel range.
target_rgb = hex_to_rgb(WATERMARK_HEX)
lower_bound = np.maximum(np.array(target_rgb) - TOLERANCE, 0)
upper_bound = np.minimum(np.array(target_rgb) + TOLERANCE, 255)

# The same window expressed as a single grayscale value.
target_gray = hex_to_gray(WATERMARK_HEX)
lower_bound_gray = max(0, target_gray - TOLERANCE)
upper_bound_gray = min(255, target_gray + TOLERANCE)
class PDFOCREnhanced(PDFOCR):
    """PDFOCR variant that blanks out a coloured watermark before running OCR.

    The watermark colour window and detection threshold come from the
    module-level settings (``lower_bound``, ``upper_bound``,
    ``lower_bound_gray``, ``upper_bound_gray``, ``MIN_WATERMARK_PERCENT``).
    """

    def __init__(self, file_path, output_txt):
        """Initialise the base class and create the PaddleOCR engine.

        Args:
            file_path: Forwarded unchanged to the base-class initialiser.
            output_txt: Forwarded unchanged to the base-class initialiser.
        """
        super().__init__(file_path, output_txt)
        self.ocr = PaddleOCR(use_angle_cls=True, lang="ch", use_gpu=True, show_log=True)

    def _auto_detect_water(self, img_array, img):
        """Mark watermark-coloured pixels and decide whether the page has a watermark.

        Args:
            img_array: Pixel array of the page. A 2-D array is handled as
                grayscale; otherwise only the first three channels are compared.
            img: Image object whose ``width`` and ``height`` give the pixel count.

        Returns:
            ``(has_watermark, mask)``. ``mask`` is a boolean array that is true
            where a pixel lies inside the configured colour window, and
            ``has_watermark`` is true when those pixels make up at least
            ``MIN_WATERMARK_PERCENT`` percent of ``img.width * img.height``.
        """
        if len(img_array.shape) == 2:
            # Grayscale: compare against the single-channel bounds.
            mask = (img_array >= lower_bound_gray) & (img_array <= upper_bound_gray)
        else:
            # Colour: every one of the first three channels must be in range.
            mask = np.all((img_array[:, :, :3] >= lower_bound) &
                          (img_array[:, :, :3] <= upper_bound), axis=-1)
        watermark_ratio = np.sum(mask) / (img.width * img.height) * 100
        return watermark_ratio >= MIN_WATERMARK_PERCENT, mask

    def _remove_watermark(self, mask, img_array):
        """Return a PIL image of the page with every masked pixel painted white.

        Args:
            mask: Boolean array marking the pixels to blank out.
            img_array: Pixel array of the page (2-D, or with a trailing channel axis).
        """
        white_bg = np.full_like(img_array, 255)
        if len(img_array.shape) == 2:
            output_array = np.where(mask, white_bg, img_array)
        else:
            # Add a trailing axis so the mask broadcasts across the channels.
            output_array = np.where(mask[..., None], white_bg, img_array)
        return Image.fromarray(output_array)

    def _pdf_to_images(self, zoom=3, page_numbers=None):
        """Render PDF pages to PNG files in ``self.temp_img_dir``.

        Args:
            zoom: Scale factor applied to both axes to raise the resolution.
            page_numbers: Optional iterable of zero-based page indices to
                render. ``None`` (the default) renders every page. Previously
                a single hard-coded page index was rendered, which failed on
                shorter documents.
        """
        mat = fitz.Matrix(zoom, zoom)
        doc = fitz.open(self.file_path)
        try:
            if page_numbers is None:
                page_numbers = range(doc.page_count)
            for page_num in page_numbers:
                page = doc.load_page(page_num)
                pix = page.get_pixmap(matrix=mat, alpha=False)
                # File names use the one-based page number, zero-padded to
                # five digits, so sorted() returns them in page order.
                p_index = str(page_num + 1).zfill(5)
                img_path = os.path.join(self.temp_img_dir, f'page_{p_index}.png')
                pix.save(img_path)
                print(f"已完成 {img_path} 存储")
        finally:
            # Close the document even if rendering a page fails.
            doc.close()

    def ocr_recognition(self, image):
        """Run OCR on one image and return the recognised text.

        Args:
            image: Image handed straight to ``self.ocr.ocr``.

        Returns:
            The recognised text lines joined with newlines, or an empty
            string when nothing was recognised.
        """
        result = self.ocr.ocr(image, cls=True)
        # Only the first entry of the result is used. PaddleOCR 2.x can return
        # [None] for an image without any detected text; that list is truthy,
        # so the entry itself has to be checked before iterating over it.
        page_lines = result[0] if result else None
        if not page_lines:
            return ''
        # The text of each line is read from line[1][0].
        return '\n'.join(line[1][0] for line in page_lines)

    def _ocr_images(self):
        """Remove watermarks from each page image, OCR it, and write the text out.

        Files in ``self.temp_img_dir`` are processed in sorted filename order.
        When a watermark is detected, the cleaned image overwrites the
        original file. The text recognised for each image is written to
        ``self.output_txt`` (UTF-8) followed by a newline.
        """
        with open(self.output_txt, 'w', encoding='utf-8') as f:
            for img_file in sorted(os.listdir(self.temp_img_dir)):
                img_path = os.path.join(self.temp_img_dir, img_file)
                # Read the pixels and release the file before it may be
                # overwritten below. The unmodified image is no longer
                # re-saved onto itself, which was a redundant write.
                with Image.open(img_path) as img:
                    img_array = np.array(img)
                    has_watermark, mask = self._auto_detect_water(img_array, img)

                # Watermark handling.
                ocr_input = img_array
                if has_watermark:
                    print(f"检测到水印: {img_file}")
                    clean_img = self._remove_watermark(mask, img_array)
                    clean_img.save(img_path)  # overwrite the original image
                    ocr_input = np.array(clean_img)

                # OCR runs on the (possibly cleaned) pixel array.
                text = self.ocr_recognition(ocr_input)
                f.write(text + '\n')
                print(f"已完成 {img_file} 识别")
if __name__ == '__main__':
    # The current timestamp is embedded in the output file name.
    started_at = time.time()
    source_pdf = './in_pdf/T2xxx71.pdf'
    result_txt = f'./out/output-{started_at}.txt'
    processor = PDFOCREnhanced(source_pdf, result_txt)
    try:
        processor._pdf_to_images(zoom=4)
        processor._ocr_images()
    finally:
        # Cleanup runs whether or not the steps above raised.
        processor._cleanup()