你想实现一个基于 Streamlit 的 PDF 编辑器,功能包括:
- PDF 转 Word
- PDF 表格提取
- PDF 拆分页面
- PDF 转图片
下面是一个初步的 Streamlit 应用框架代码示例，集成了以上功能。使用了常见库，如 `pdfplumber`、`PyMuPDF`（`fitz`）、`pdf2image`、`pdf2docx` 等。
✅ 安装依赖
pip install streamlit pdfplumber PyMuPDF pdf2image pdf2docx pandas pillow
还需安装 poppler（`pdf2image` 依赖它来渲染页面）：
- Windows：下载 poppler 的 Windows 预编译包（例如 oschwartz10612/poppler-windows 的 Release），解压后将其 `bin` 目录加入 PATH
- macOS：`brew install poppler`
- Linux：`sudo apt install poppler-utils`
✅ `app.py` 示例代码
import streamlit as st
import pdfplumber
import fitz # PyMuPDF
from pdf2image import convert_from_path
from pdf2docx import Converter
import os
import tempfile
import pandas as pd
from PIL import Image
# Streamlit PDF toolbox: PDF -> Word, table extraction, page splitting and
# PDF -> PNG. Streamlit re-executes this whole script on every widget
# interaction, which drives two design decisions below:
#   * all scratch files live in a per-run temporary directory that is removed
#     when the run ends, so nothing accumulates on disk;
#   * conversion results are kept as bytes in st.session_state, so download
#     buttons are still rendered on the rerun triggered by a download click.

st.set_page_config(page_title="PDF 编辑器", layout="wide")
st.title("📄 Streamlit PDF 编辑器")


def _table_to_dataframe(table):
    """Build a DataFrame from a table given as a list of rows.

    The first row is used as the header. Empty or ``None`` header cells are
    replaced with ``column_<position>`` and repeated names get a numeric
    suffix, so the resulting column labels are always unique strings.

    Args:
        table: List of rows, each row a list of cell values (str or None).

    Returns:
        A ``pandas.DataFrame`` holding the rows after the header; an empty
        DataFrame when ``table`` is empty.
    """
    if not table:
        return pd.DataFrame()

    header, *rows = table
    used = set()
    columns = []
    for position, cell in enumerate(header, start=1):
        base = str(cell).strip() if cell is not None else ""
        base = base or f"column_{position}"
        name, suffix = base, 1
        while name in used:
            suffix += 1
            name = f"{base}_{suffix}"
        used.add(name)
        columns.append(name)
    return pd.DataFrame(rows, columns=columns)


def _read_bytes(path):
    """Return the full binary content of the file at ``path``."""
    with open(path, "rb") as handle:
        return handle.read()


uploaded_file = st.file_uploader("上传 PDF 文件", type=["pdf"])

if uploaded_file:
    # getvalue() returns the whole buffer regardless of the stream position,
    # unlike read(), which yields nothing once the stream has been consumed.
    pdf_bytes = uploaded_file.getvalue()

    # Results cached in the session belong to one specific upload; discard
    # them as soon as a different file is uploaded.
    upload_id = (uploaded_file.name, len(pdf_bytes), hash(pdf_bytes))
    if st.session_state.get("pdf_upload_id") != upload_id:
        st.session_state["pdf_upload_id"] = upload_id
        st.session_state["pdf_results"] = {}
    results = st.session_state["pdf_results"]

    with tempfile.TemporaryDirectory() as work_dir:
        pdf_path = os.path.join(work_dir, "input.pdf")
        with open(pdf_path, "wb") as tmp_file:
            tmp_file.write(pdf_bytes)
        st.success("PDF 上传成功")

        # Feature selection
        option = st.sidebar.selectbox("选择功能", [
            "📑 PDF 转 Word",
            "📋 提取表格",
            "🪓 拆分页面",
            "🖼️ PDF 转图片",
        ])

        # PDF -> Word
        if option == "📑 PDF 转 Word":
            if st.button("开始转换", key="convert_docx"):
                docx_output = os.path.join(work_dir, "converted.docx")
                with st.spinner("正在转换..."):
                    cv = Converter(pdf_path)
                    try:
                        cv.convert(docx_output, start=0, end=None)
                    finally:
                        # Release the PDF handle even if conversion fails.
                        cv.close()
                    results["docx"] = _read_bytes(docx_output)
                st.success("转换完成")
            if "docx" in results:
                st.download_button(
                    "下载 Word 文件",
                    results["docx"],
                    file_name="converted.docx",
                )

        # Table extraction
        elif option == "📋 提取表格":
            page_num = st.number_input("选择页码", min_value=1, value=1)
            with pdfplumber.open(pdf_path) as pdf:
                # Pages are exposed through pdf.pages; the PDF object itself
                # supports neither len() nor indexing.
                page_total = len(pdf.pages)
                if page_num <= page_total:
                    tables = pdf.pages[page_num - 1].extract_tables()
                else:
                    tables = None
                    st.warning(f"页码超出范围，该 PDF 共 {page_total} 页")

            if tables:
                for idx, table in enumerate(tables):
                    df = _table_to_dataframe(table)
                    st.write(f"表格 {idx + 1}")
                    st.dataframe(df)
                    csv = df.to_csv(index=False).encode("utf-8")
                    st.download_button(
                        f"下载表格 {idx + 1}",
                        csv,
                        file_name=f"table_{idx + 1}.csv",
                        mime="text/csv",
                    )
            elif tables is not None:
                st.warning("该页未检测到表格")

        # Page splitting
        elif option == "🪓 拆分页面":
            with fitz.open(pdf_path) as doc:
                page_total = len(doc)
                st.write(f"PDF 共 {page_total} 页")
                selected_pages = st.multiselect(
                    "选择要拆分的页码", list(range(1, page_total + 1))
                )
                if st.button("拆分并下载"):
                    split_pages = {}
                    for page_num in selected_pages:
                        output_path = os.path.join(
                            work_dir, f"split_page_{page_num}.pdf"
                        )
                        with fitz.open() as new_doc:
                            new_doc.insert_pdf(
                                doc,
                                from_page=page_num - 1,
                                to_page=page_num - 1,
                            )
                            new_doc.save(output_path)
                        split_pages[page_num] = _read_bytes(output_path)
                    results["split"] = split_pages

            # Rendered outside the button branch so every page stays
            # downloadable after the first download click reruns the script.
            for page_num, page_bytes in results.get("split", {}).items():
                st.download_button(
                    f"下载 第{page_num}页",
                    page_bytes,
                    file_name=f"split_page_{page_num}.pdf",
                    mime="application/pdf",
                )

        # PDF -> images
        elif option == "🖼️ PDF 转图片":
            dpi = st.slider("选择分辨率 (DPI)", 50, 300, 150)
            st.info("每页将被转为 PNG 图片")
            if st.button("开始转换", key="convert_images"):
                with st.spinner("转换中..."):
                    images = convert_from_path(pdf_path, dpi=dpi)
                    page_pngs = []
                    for i, img in enumerate(images):
                        img_path = os.path.join(work_dir, f"page_{i+1}.png")
                        img.save(img_path)
                        page_pngs.append(_read_bytes(img_path))
                    results["images"] = page_pngs

            for i, png_bytes in enumerate(results.get("images", [])):
                # NOTE(review): use_column_width is kept from the original;
                # newer Streamlit releases deprecate it in favour of
                # use_container_width - switch once the minimum version is known.
                st.image(png_bytes, caption=f"Page {i+1}", use_column_width=True)
                st.download_button(
                    f"下载 Page {i+1}",
                    png_bytes,
                    file_name=f"page_{i+1}.png",
                    mime="image/png",
                )
✅ 功能扩展建议
- ✅ PDF 合并：用 `PyPDF2` 或 `pypdf` 实现多文件合并。
- ✅ PDF 添加水印 / 文字注释：用 `reportlab` + `PyPDF2` 实现。
- ✅ OCR 图片识别：嵌入 `pytesseract` 对图片页进行文本提取。