批量PDF电子发票识别与Excel导出系统
下面是一个完整的Python解决方案,用于批量识别PDF电子发票并提取关键信息到Excel。系统采用模块化设计,结合OCR和文本解析技术,可处理扫描版和数字版PDF发票。
import os
import re
import pytesseract
import pdf2image
import pandas as pd
from PIL import Image
import numpy as np
from pdfminer.high_level import extract_text
from datetime import datetime
import time
import logging
from concurrent.futures import ThreadPoolExecutor
import fitz # PyMuPDF
# 配置日志
logging.basicConfig(
level=logging.INFO,
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
handlers=[
logging.FileHandler("invoice_processing.log"),
logging.StreamHandler()
]
)
logger = logging.getLogger("InvoiceProcessor")
# 配置Tesseract路径(根据系统修改)
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe' # Windows示例
# pytesseract.pytesseract.tesseract_cmd = '/usr/bin/tesseract' # Linux/macOS
# Regular expressions for the key invoice fields. Group 1 of every pattern is
# the field value.
#
# Field labels may be followed by an ASCII colon, a full-width colon (U+FF1A)
# or whitespace, and amounts may be prefixed by the yen sign (U+00A5), the
# full-width yen sign (U+FFE5) or "人民币". The non-ASCII punctuation is written
# as \u escapes (understood by the `re` module) so the two variants stay
# distinguishable even if the file's text is normalized by an editor or tool.
#
# NOTE(review): 'invoice_number' captures at most 10 digits; confirm whether
# longer invoice numbers need to be supported.
PATTERNS = {
    'invoice_code': r'发票代码[:\uff1a\s]*([0-9]{10,12})',
    'invoice_number': r'发票号码[:\uff1a\s]*([0-9]{8,10})',
    'invoice_date': r'开票日期[:\uff1a\s]*(\d{4}年\d{1,2}月\d{1,2}日|\d{4}-\d{1,2}-\d{1,2})',
    'amount': r'金额合计[:\uff1a\s]*(?:[\u00a5\uffe5]|人民币)?\s*([\d,]+\.\d{2})',
    'tax_amount': r'税额合计[:\uff1a\s]*(?:[\u00a5\uffe5]|人民币)?\s*([\d,]+\.\d{2})',
    'total_amount': r'价税合计[:\uff1a\s]*(?:[\u00a5\uffe5]|人民币)?\s*([\d,]+\.\d{2})',
    'seller_name': r'销售方[:\uff1a]\s*([^\s]{6,40})',
    'seller_tax_id': r'销售方纳税人识别号[:\uff1a\s]*([0-9A-Za-z]{15,20})',
    'buyer_name': r'购买方[:\uff1a]\s*([^\s]{6,40})',
    'buyer_tax_id': r'购买方纳税人识别号[:\uff1a\s]*([0-9A-Za-z]{15,20})',
}
class InvoiceProcessor:
def __init__(self, pdf_folder, output_excel):
self.pdf_folder = pdf_folder
self.output_excel = output_excel
self.invoice_data = []
self.processed_files = 0
self.failed_files = 0
self.start_time = time.time()
def _convert_pdf_to_images(self, pdf_path, dpi=200, poppler_path=None):
    """Render the pages of a PDF into a list of images.

    Args:
        pdf_path: Path of the PDF file to render.
        dpi: Rendering resolution handed to pdf2image.
        poppler_path: Directory containing the Poppler binaries. When
            omitted, the Windows example directory below is used if it
            exists; otherwise ``None`` is passed so that pdf2image looks
            Poppler up on PATH.

    Returns:
        The list returned by ``pdf2image.convert_from_path``, or an empty
        list when conversion fails (the error is logged, not raised).
    """
    if poppler_path is None:
        default_poppler = r'C:\Program Files\poppler-23.11.0\Library\bin'  # Windows example
        # e.g. '/opt/homebrew/bin' on macOS
        # Fall back to PATH lookup instead of failing on machines where the
        # example directory does not exist.
        poppler_path = default_poppler if os.path.isdir(default_poppler) else None

    try:
        return pdf2image.convert_from_path(
            pdf_path,
            dpi=dpi,
            poppler_path=poppler_path,
        )
    except Exception as e:
        # Best effort: a PDF that cannot be rendered yields no pages.
        # The replacement fields are kept on one line; splitting them across
        # lines is a SyntaxError before Python 3.12.
        logger.error(f"PDF转换失败: {pdf_path} - {e}")
        return []
def _preprocess_image(self, image):
    """Clean up a page image to improve OCR accuracy.

    Steps: convert to grayscale, binarize with Otsu's method, then apply
    non-local-means denoising.

    Args:
        image: A PIL image (anything ``np.array`` can turn into a 2-D
            grayscale or 3-D RGB/RGBA pixel array).

    Returns:
        A new single-channel PIL image built from the denoised result.
    """
    # Local import: the module-level import block does not bring cv2 into
    # scope, so without this the method raises NameError on first use.
    import cv2

    pixels = np.array(image)

    # Grayscale conversion. Arrays built from PIL images are in RGB(A)
    # channel order, not OpenCV's BGR, so the RGB conversion codes are used.
    if pixels.ndim <= 2:
        gray = pixels
    elif pixels.shape[2] == 4:
        gray = cv2.cvtColor(pixels, cv2.COLOR_RGBA2GRAY)
    else:
        gray = cv2.cvtColor(pixels, cv2.COLOR_RGB2GRAY)

    # Binarization. With THRESH_OTSU the threshold is chosen automatically;
    # the 150 is kept from the original call but is not used.
    _, binary = cv2.threshold(gray, 150, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)

    # Denoising (h=10, templateWindowSize=7, searchWindowSize=21).
    denoised = cv2.fastNlMeansDenoising(binary, None, 10, 7, 21)
    return Image.fromarray(denoised)
def _ocr_image(self, image):
    """Run Tesseract OCR on one image and return the recognized text.

    The image is first passed through ``_preprocess_image``. Tesseract is
    invoked with OCR engine mode 3, page segmentation mode 6 and the
    ``chi_sim+eng`` language data.

    Args:
        image: The page image to recognize.

    Returns:
        The recognized text, or an empty string if preprocessing or OCR
        raised (the error is logged, not propagated).
    """
    try:
        processed_img = self._preprocess_image(image)
        custom_config = r'--oem 3 --psm 6 -l chi_sim+eng'
        return pytesseract.image_to_string(processed_img, config=custom_config)
    except Exception as e:
        # Best effort: a page that cannot be recognized contributes no text.
        # The replacement field is kept on one line; splitting it across
        # lines is a SyntaxError before Python 3.12.
        logger.error(f"OCR处理失败: {e}")
        return ""
def _extract_from_digital_pdf(self, pdf_path):
    """Extract the embedded text of a digital (non-scanned) PDF.

    Args:
        pdf_path: Path of the PDF file.

    Returns:
        The text returned by pdfminer's ``extract_text``, or an empty string
        if extraction raised (the error is logged, not propagated).
    """
    try:
        return extract_text(pdf_path)
    except Exception as e:
        # Best effort: an unreadable PDF yields no text.
        # The replacement fields are kept on one line; splitting them across
        # lines is a SyntaxError before Python 3.12.
        logger.error(f"数字PDF提取失败: {pdf_path} - {e}")
        return ""
def _parse_invoice_data(self, text):
"""从文本中解析发票信息"""
result = {
}
for key, pattern in PATTERNS.items():
match = re.search(pattern, text)
if match:
result[key] = match.group(1).strip()
else:
result