olmOCR: A Large Model for Structured, Precise Extraction of Complex PDF Content

Published: 2025-03-20

A PDF extraction tool built with Streamlit and the olmOCR large model.
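The script below wraps olmOCR's command-line pipeline in a small Streamlit UI: it shells out to python -m olmocr.pipeline <workspace> --pdfs <file>, reads the JSONL results written under results/, renders a side-by-side HTML preview with olmocr.viewer.dolmaviewer, and displays the extracted text plus the document metadata. Assuming olmocr and streamlit are installed (for example via pip install olmocr streamlit), save the script as app.py and launch it with streamlit run app.py.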

import os
import json
import subprocess
import pandas as pd
from pathlib import Path
import time
import re
import streamlit as st
import streamlit.components.v1 as components

# Create the workspace directory
WORKSPACE_DIR = "olmocr_workspace"
os.makedirs(WORKSPACE_DIR, exist_ok=True)

def modify_html_for_better_display(html_content):
    """修改HTML以便在Streamlit中更好地显示"""
    if not html_content:
        return html_content
    
    # Widen the container
    html_content = html_content.replace('<div class="container">', 
                                       '<div class="container" style="max-width: 100%; width: 100%;">')
    
    # Increase the text size
    html_content = html_content.replace('<style>', 
                                       '<style>\nbody {font-size: 16px;}\n.text-content {font-size: 16px; line-height: 1.5;}\n')
    
    # Balance the width of the image and text columns
    html_content = html_content.replace('<div class="row">', 
                                       '<div class="row" style="display: flex; flex-wrap: wrap;">')
    html_content = html_content.replace('<div class="col-md-6">', 
                                       '<div class="col-md-6" style="flex: 0 0 50%; max-width: 50%; padding: 15px;">')
    
    # Add spacing between pages
    html_content = html_content.replace('<div class="page">', 
                                       '<div class="page" style="margin-bottom: 30px; border-bottom: 1px solid #ccc; padding-bottom: 20px;">')
    
    # Let images scale to the column width
    html_content = re.sub(r'<img([^>]*)style="([^"]*)"', 
                         r'<img\1style="max-width: 100%; height: auto; \2"', 
                         html_content)
    
    # Inject zoom controls
    zoom_controls = """
    <div style="position: fixed; bottom: 20px; right: 20px; background: #fff; padding: 10px; border-radius: 5px; box-shadow: 0 0 10px rgba(0,0,0,0.2); z-index: 1000;">
        <button onclick="document.body.style.zoom = parseFloat(document.body.style.zoom || 1) + 0.1;" style="margin-right: 5px;">Zoom in</button>
        <button onclick="document.body.style.zoom = parseFloat(document.body.style.zoom || 1) - 0.1;">Zoom out</button>
    </div>
    """
    html_content = html_content.replace('</body>', f'{zoom_controls}</body>')
    
    return html_content
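Because the zoom controls are added by a plain string replacement, they only appear when the viewer HTML actually contains a closing </body> tag. A quick way to sanity-check the function in isolation (a hypothetical snippet, not part of the tool itself):

html = modify_html_for_better_display("<html><style></style><body><p>demo</p></body></html>")
assert "Zoom in" in html  # the controls were injected just before </body>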

def process_pdf(pdf_file):
    """处理PDF文件并返回结果"""
    if pdf_file is None:
        return "请上传PDF文件", "", None, None
    
    # Create a unique working directory for this job
    timestamp = int(time.time())
    work_dir = os.path.join(WORKSPACE_DIR, f"job_{timestamp}")
    os.makedirs(work_dir, exist_ok=True)
    
    # Save the uploaded PDF into the job directory
    pdf_path = os.path.join(work_dir, "input.pdf")
    with open(pdf_path, "wb") as f:
        f.write(pdf_file.read())
    
    # Build the olmOCR pipeline command
    cmd = ["python", "-m", "olmocr.pipeline", work_dir, "--pdfs", pdf_path]
    
    try:
        # Run the command and wait for it to finish
        process = subprocess.run(
            cmd,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
            check=True
        )
        
        # The pipeline's stdout becomes the processing log
        log_text = process.stdout
        
        # The pipeline writes its output under <work_dir>/results
        results_dir = os.path.join(work_dir, "results")
        if not os.path.exists(results_dir):
            return f"Processing finished, but no results directory was created\n\nLog output:\n{log_text}", "", None, None
        
        # Look for the output JSONL file(s)
        output_files = list(Path(results_dir).glob("output_*.jsonl"))
        if not output_files:
            return f"Processing finished, but no output file was found\n\nLog output:\n{log_text}", "", None, None
        
        # Read the JSONL output (one JSON document per line)
        output_file = output_files[0]
        with open(output_file, "r") as f:
            content = f.read().strip()
            if not content:
                return f"The output file is empty\n\nLog output:\n{log_text}", "", None, None

            # Parse line by line: json.loads on the whole file would fail
            # if the pipeline emitted more than one record
            records = [json.loads(line) for line in content.splitlines() if line.strip()]
            result = records[0]
            extracted_text = "\n\n".join(r.get("text") or "" for r in records) or "No text content found"
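            # For reference, a single record looks roughly like this
            # (illustrative; the exact fields depend on the olmOCR version):
            # {"id": "...", "text": "...", "metadata": {"Source-File": "input.pdf", ...}, "attributes": {...}}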
            
            # Generate the HTML preview with olmOCR's dolmaviewer
            try:
                preview_cmd = ["python", "-m", "olmocr.viewer.dolmaviewer", str(output_file)]
                subprocess.run(preview_cmd, check=True)
            except Exception as e:
                log_text += f"\nFailed to generate the HTML preview: {str(e)}"
            
            # dolmaviewer writes its previews into ./dolma_previews; pick the
            # newest file so concurrent jobs do not read each other's output
            html_files = sorted(Path("dolma_previews").glob("*.html"),
                                key=lambda p: p.stat().st_mtime, reverse=True)
            html_content = ""
            if html_files:
                try:
                    with open(html_files[0], "r", encoding="utf-8") as hf:
                        html_content = hf.read()
                        # Tweak the HTML for display inside Streamlit
                        html_content = modify_html_for_better_display(html_content)
                except Exception as e:
                    log_text += f"\nFailed to read the HTML preview: {str(e)}"
            
            # Build a metadata table for display
            metadata = result.get("metadata", {})
            meta_rows = [[key, value] for key, value in metadata.items()]

            df = pd.DataFrame(meta_rows, columns=["Attribute", "Value"])
            
            return log_text, extracted_text, html_content, df
        
    except subprocess.CalledProcessError as e:
        return f"Command failed: {e.stderr}", "", None, None
    except Exception as e:
        return f"An error occurred during processing: {str(e)}", "", None, None

# Set up the Streamlit app
st.title("一个黑客创业者: PDF Text Extraction Tool with Streamlit and olmOCR")

def process_and_display(pdf_file):
    log_text, extracted_text, html_content, df = process_pdf(pdf_file)

    st.header("Processing log")
    st.text(log_text)

    st.header("Extracted text")
    st.text_area("Extracted text", value=extracted_text, height=200,
                 label_visibility="collapsed")

    if html_content:
        st.header("HTML preview")
        components.html(html_content, height=800, scrolling=True)

    if df is not None and not df.empty:
        st.header("Document metadata")
        st.dataframe(df)

# File upload
uploaded_file = st.file_uploader("Upload a PDF file", type="pdf")

# Run the pipeline inline when the button is clicked. The original version
# used an on_click callback (a lambda referencing process_and_display before
# it was defined); st.* output from inside a callback does not render in the
# normal page flow, so the plain if-button pattern is used instead.
if uploaded_file is not None:
    if st.button("Process PDF"):
        process_and_display(uploaded_file)
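One side effect worth noting: every upload leaves a job_<timestamp> directory under olmocr_workspace, and dolma_previews grows over time as well. A minimal housekeeping sketch, assuming jobs older than one day can be discarded (the threshold is an arbitrary choice for illustration, not something olmOCR requires):

import os
import shutil
import time

def clean_old_jobs(workspace="olmocr_workspace", max_age_seconds=86400):
    """Delete job directories not modified within max_age_seconds."""
    now = time.time()
    for name in os.listdir(workspace):
        path = os.path.join(workspace, name)
        if os.path.isdir(path) and now - os.path.getmtime(path) > max_age_seconds:
            shutil.rmtree(path, ignore_errors=True)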