import io
import os

import fitz  # pymupdf
import pdfplumber
from opencc import OpenCC

file_path = '/document/pdf/xxx.pdf'
output_dir = '/classification/pdf/images'


# Image extraction demo
def extract_images_from_pdf(pdf_path, output_dir):
    """Extract every embedded image of a PDF into ``output_dir``.

    Files are named ``image_<page>_<index>.<ext>``, where ``<page>`` and
    ``<index>`` are 1-based and ``<ext>`` is the extension reported by
    PyMuPDF for the embedded image.

    Args:
        pdf_path: Path of the PDF file to read.
        output_dir: Directory that receives the image files; it is created
            (including missing parents) when it does not exist yet.
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(output_dir, exist_ok=True)

    # The context manager closes the document even if a page or an image
    # raises part-way through the extraction.
    with fitz.open(pdf_path) as doc:
        for page_num in range(doc.page_count):
            page = doc.load_page(page_num)

            # full=True asks for the complete image descriptor tuples; the
            # first item of each tuple is the image's xref number.
            page_images = page.get_images(full=True)
            for image_index, img in enumerate(page_images, start=1):
                xref = img[0]
                base_image = doc.extract_image(xref)
                image_bytes = base_image["image"]
                image_ext = base_image["ext"]

                image_path = os.path.join(
                    output_dir,
                    f"image_{page_num + 1}_{image_index}.{image_ext}",
                )
                # The extracted bytes are already a complete image file in
                # the ``ext`` format, so write them verbatim instead of
                # decoding and re-encoding them (which is lossy for JPEG and
                # fails for formats the encoder cannot write).
                with open(image_path, "wb") as image_file:
                    image_file.write(image_bytes)


# Usage example (runs when the module is executed or imported)
extract_images_from_pdf(file_path, output_dir)

# Traditional Chinese -> Simplified Chinese converter
cc = OpenCC('t2s')


def read_pdf_with_pdfplumber(file_path):
    """Print the page count and the simplified-Chinese text of page one.

    The text of the first page is extracted with pdfplumber, passed through
    the module-level ``cc`` converter and printed. Nothing is returned.

    Args:
        file_path: Path of the PDF file to read.
    """
    with pdfplumber.open(file_path) as pdf:
        num_pages = len(pdf.pages)
        print(f"Number of pages: {num_pages}")

        # A PDF without pages has no first page to read.
        if num_pages == 0:
            return

        # Fall back to an empty string in case no text could be extracted,
        # so the converter always receives a str.
        text = pdf.pages[0].extract_text() or ""
        text = cc.convert(text)
        print(text)


# Example usage (runs when the module is executed or imported)
read_pdf_with_pdfplumber(file_path)