PPT转Word:
import re
from pptx import Presentation
from docx import Document
from docx.shared import Inches
from io import BytesIO
from PIL import Image
def clean_text(text):
    """Strip characters that cannot be stored in a Word (XML) document.

    Tab, newline and carriage return are kept because they are legal in XML
    and carry layout information; removing them would glue adjacent lines
    and words together. A vertical tab is converted to a newline, since
    python-pptx uses it to represent a soft line break inside a paragraph.
    Every other C0 control character and DEL is removed.

    :param text: raw text taken from a slide shape
    :return: the text with XML-invalid control characters removed
    """
    # Preserve soft line breaks before the vertical tab is stripped below.
    text = text.replace('\x0b', '\n')
    # Everything in \x00-\x1F except \t (\x09), \n (\x0A), \r (\x0D); plus DEL.
    return re.sub(r'[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]', '', text)
def ppt_to_word(ppt_file, word_file):
    """Copy the text and pictures of a PowerPoint deck into a Word document.

    Slides and their shapes are visited in the order python-pptx yields them.
    Each non-empty paragraph of a text frame becomes its own Word paragraph,
    and each picture shape is inserted at a fixed width of 3 inches.

    :param ppt_file: source presentation, passed straight to ``Presentation``
    :param word_file: destination, passed straight to ``Document.save``
    """
    # Imported locally so this function does not rely on a new file-level import.
    from pptx.enum.shapes import MSO_SHAPE_TYPE

    prs = Presentation(ppt_file)
    doc = Document()
    for slide in prs.slides:
        for shape in slide.shapes:
            if shape.has_text_frame:
                # One Word paragraph per slide paragraph: reading the whole
                # text frame as a single string and cleaning it would merge
                # all of its paragraphs into one run-on line.
                for paragraph in shape.text_frame.paragraphs:
                    cleaned_text = clean_text(paragraph.text)  # drop invalid characters
                    # Empty placeholders / blank lines would only add noise.
                    if cleaned_text:
                        doc.add_paragraph(cleaned_text)
            # Named enum member instead of the bare value 13.
            if shape.shape_type == MSO_SHAPE_TYPE.PICTURE:
                # Wrap the picture's original bytes in an in-memory stream.
                image_stream = BytesIO(shape.image.blob)
                # Insert the picture into the Word document.
                doc.add_paragraph().add_run().add_picture(image_stream, width=Inches(3))
    doc.save(word_file)
    print(f"转换完成,文件保存为 {word_file}")
# Example: convert a sample deck into a Word document.
ppt_to_word(
    ppt_file="PPTs/Fixed Asset.pptx",
    word_file="Documents/fixed asset.docx",
)
PDF转Word:
import sys
import PyPDF2
from docx import Document
def extract_text_from_pdf(pdf_path):
    """
    Extract the text content of a PDF file.

    :param pdf_path: path of the PDF file to read
    :return: the extracted text as one string; every page that yields text
             contributes that text followed by a newline
    """
    with open(pdf_path, 'rb') as pdf_handle:
        pdf_reader = PyPDF2.PdfReader(pdf_handle)
        # Lazily pull the text of each page, in page order.
        per_page = (page.extract_text() for page in pdf_reader.pages)
        # Pages that produce no text are skipped entirely. The join runs
        # while the file is still open, because extraction reads from it.
        return "".join(f"{chunk}\n" for chunk in per_page if chunk)
def write_text_to_docx(text, docx_path):
    """
    Write text into a new Word document as a single paragraph.

    Characters that are not allowed in XML are removed first, because
    python-docx refuses such strings with a ``ValueError``. Text extracted
    from PDFs frequently contains them (form feeds, NUL bytes, ...).
    Tab, newline and carriage return are kept.

    :param text: the text content to write
    :param docx_path: path of the output docx file
    """
    # Imported locally so this function does not rely on a file-level import.
    import re

    # C0 controls except \t, \n, \r; lone surrogates; the two non-characters.
    xml_safe_text = re.sub(
        r'[\x00-\x08\x0B\x0C\x0E-\x1F\uD800-\uDFFF\uFFFE\uFFFF]', '', text
    )
    document = Document()
    document.add_paragraph(xml_safe_text)
    document.save(docx_path)
if __name__ == '__main__':
    # Command-line entry point: <input.pdf> [output.docx]
    if len(sys.argv) < 2:
        print("用法: python extract_pdf_text.py 输入文件.pdf [输出文件.docx]")
        sys.exit(1)
    pdf_path = sys.argv[1]
    # Use the output file name if one was given, otherwise "output.docx".
    output_path = sys.argv[2] if len(sys.argv) > 2 else "output.docx"
    try:
        print("正在提取 PDF 文本...")
        text = extract_text_from_pdf(pdf_path)
        print("正在写入到 Word 文档...")
        write_text_to_docx(text, output_path)
        print(f"转换成功!输出文档为:{output_path}")
    except Exception as e:
        # Top-level boundary: report the failure and exit non-zero so shells
        # and calling scripts can tell the conversion did not succeed.
        print("转换失败:", e)
        sys.exit(1)