java,poi,提取ppt文件中的文字内容

发布于:2025-03-20 ⋅ 阅读:(21) ⋅ 点赞:(0)

注意,不涉及图片处理。

先上pom依赖:

        <!-- Handles PPTX files: provides the XSLF classes (XMLSlideShow, XSLFSlide, ...) used by the code below -->
        <dependency>
            <groupId>org.apache.poi</groupId>
            <artifactId>poi-ooxml</artifactId>
            <version>5.2.3</version>
        </dependency>
        <!-- Handles legacy PPT files: provides the HSLF classes (HSLFSlideShow, HSLFSlide, ...) used by the code below -->
        <dependency>
            <groupId>org.apache.poi</groupId>
            <artifactId>poi-scratchpad</artifactId>
            <version>5.2.3</version>
        </dependency>

代码:

/**
     * Demo entry point: extracts the text of one presentation and prints it to standard output.
     *
     * <p>The presentation path is taken from the first command-line argument when one is
     * supplied; otherwise the sample path {@code C:\xx.pptx} is used.
     *
     * @param args optional; {@code args[0]} is the full path of the .ppt/.pptx file to read
     */
    public static void main(String[] args) {
        // Full path of the presentation to process; a command-line argument overrides the sample path.
        String filePath = (args != null && args.length > 0) ? args[0] : "C:\\xx.pptx";
        try {
            // Raises POI's safety cap on the size of a single byte array it will allocate
            // to 160,000,000 bytes. This does not allocate memory by itself; it only lets
            // POI read larger records/parts without aborting.
            IOUtils.setByteArrayMaxOverride(160000000);
            String content = readPresentation(filePath);
            System.out.println(content);
        } catch (Exception e) {
            // Top-level boundary of a demo program: report the failure and exit normally.
            e.printStackTrace();
        }
    }

    /**
     * Extracts the text of a presentation, picking the reader from the file extension.
     *
     * <p>The extension check is case-insensitive: paths ending in {@code .pptx} are handed
     * to {@code readPPTX}, paths ending in {@code .ppt} to {@code readPPT}.
     *
     * @param filePath path of the presentation file
     * @return the text produced by the selected reader
     * @throws IllegalArgumentException if the path ends in neither {@code .pptx} nor {@code .ppt}
     * @throws Exception whatever the selected reader throws
     */
    public static String readPresentation(String filePath) throws Exception {
        String lowerCasePath = filePath.toLowerCase();
        if (lowerCasePath.endsWith(".pptx")) {
            return readPPTX(filePath);
        }
        if (lowerCasePath.endsWith(".ppt")) {
            return readPPT(filePath);
        }
        throw new IllegalArgumentException("Unsupported file format");
    }

    // 处理PPTX文件
    /**
     * Extracts the text of the text shapes in a PPTX (OOXML) presentation.
     *
     * <p>Slides are visited in order. For every shape returned directly by
     * {@code slide.getShapes()} that is an {@code XSLFTextShape}, its text is appended
     * followed by a newline. Shapes of any other type are skipped.
     *
     * @param filePath path of the .pptx file to read
     * @return the collected text, each text shape terminated by {@code "\n"}
     * @throws Exception if the file cannot be opened or parsed
     */
    private static String readPPTX(String filePath) throws Exception {
        StringBuilder content = new StringBuilder();
        // try-with-resources releases the file handle and the slide show even when
        // parsing or text extraction fails (mirrors the handling in readPPT).
        try (FileInputStream in = new FileInputStream(filePath);
             XMLSlideShow ppt = new XMLSlideShow(in)) {
            for (XSLFSlide slide : ppt.getSlides()) {
                for (XSLFShape shape : slide.getShapes()) {
                    if (shape instanceof XSLFTextShape) {
                        content.append(((XSLFTextShape) shape).getText()).append("\n");
                    }
                }
            }
        }
        return content.toString();
    }

    // 处理PPT文件
    /**
     * Extracts the text of a legacy PPT (binary) presentation.
     *
     * <p>For each slide, two passes are made: first the text of every shape that is an
     * {@code HSLFTextShape}, then every paragraph returned by
     * {@code slide.getTextParagraphs()} (appended via its string form). Each piece is
     * followed by a newline.
     *
     * @param filePath path of the .ppt file to read
     * @return the collected text
     * @throws Exception if the file cannot be opened or parsed
     */
    private static String readPPT(String filePath) throws Exception {
        StringBuilder text = new StringBuilder();
        try (HSLFSlideShow slideShow = new HSLFSlideShow(new FileInputStream(filePath))) {
            for (HSLFSlide slide : slideShow.getSlides()) {
                // Pass 1: text carried by the slide's shapes.
                for (HSLFShape candidate : slide.getShapes()) {
                    if (!(candidate instanceof HSLFTextShape)) {
                        continue;
                    }
                    String shapeText = ((HSLFTextShape) candidate).getText();
                    text.append(shapeText).append("\n");
                }
                // Pass 2: the slide's text paragraphs (kept for compatibility with older files).
                // NOTE(review): these paragraphs may overlap with the shape text collected
                // in pass 1, which would repeat content in the output -- confirm on real files.
                for (List<HSLFTextParagraph> paragraphGroup : slide.getTextParagraphs()) {
                    for (HSLFTextParagraph paragraph : paragraphGroup) {
                        text.append(paragraph).append("\n");
                    }
                }
            }
        }
        return text.toString();
    }

最终效果与 WPS 自带的"PPT 转 Word"功能在只勾选"文本"时的输出差不多。