java读取txt,doc,docx文档格式的文本内容

发布于:2024-06-28 ⋅ 阅读:(15) ⋅ 点赞:(0)

 读取 txt、doc、docx 三种格式文档的文本内容。不同格式采用不同的读取逻辑,以避免文本内容出现乱码。

这里需要在 Maven 的 pom.xml 中引入以下依赖:

三个 POI 依赖的版本号最好保持一致

  <!-- Apache POI: keep all three artifacts on the same version. -->
  <!-- poi-ooxml supplies XWPFDocument / XWPFWordExtractor (used for .docx). -->
  <dependency>
      <groupId>org.apache.poi</groupId>
      <artifactId>poi-ooxml</artifactId>
      <version>5.2.3</version>
  </dependency>
  <dependency>
      <groupId>org.apache.poi</groupId>
      <artifactId>poi</artifactId>
      <version>5.2.3</version>
  </dependency>
  <!-- poi-scratchpad supplies HWPFDocument / WordExtractor (used for .doc). -->
  <dependency>
      <groupId>org.apache.poi</groupId>
      <artifactId>poi-scratchpad</artifactId>
      <version>5.2.3</version>
  </dependency>
  <!-- Supplies org.mozilla.universalchardet.UniversalDetector, which readTextFile uses. -->
  <dependency>
      <groupId>com.github.albfernandez</groupId>
      <artifactId>juniversalchardet</artifactId>
      <version>2.4.0</version>
  </dependency>
    /**
     * Extracts the text of an uploaded file, choosing the reader by file extension.
     *
     * <p>{@code .txt} is handled by {@code readTextFile}, {@code .doc} by {@code readDocFile}
     * and {@code .docx} by {@code readDocxFile}. The extension is compared case-insensitively,
     * so {@code REPORT.TXT} and {@code report.txt} are treated alike.
     *
     * @param file the uploaded file
     * @return the extracted text, or an empty string when the original filename is missing
     *     or does not end with one of the supported extensions
     * @throws IOException if the upload cannot be read
     */
    private String getContent(MultipartFile file) throws IOException {
        String fileName = file.getOriginalFilename();
        if (fileName == null) {
            return "";
        }

        // Lower-case with a fixed locale so the match does not depend on the
        // server's default locale (e.g. the Turkish dotless-i rules).
        String lowerName = fileName.toLowerCase(java.util.Locale.ROOT);
        if (lowerName.endsWith(".txt")) {
            return readTextFile(file.getBytes());
        }
        if (lowerName.endsWith(".doc")) {
            return readDocFile(file);
        }
        if (lowerName.endsWith(".docx")) {
            return readDocxFile(file);
        }
        return "";
    }


    /**
     * Charsets tried, in this order, when decoding a text file.
     *
     * <p>The list is wrapped in an unmodifiable view so the priority order cannot be
     * changed at runtime.
     */
    private static final List<Charset> FALLBACK_ENCODINGS = java.util.Collections.unmodifiableList(
            Arrays.asList(
                    StandardCharsets.UTF_8,
                    Charset.forName("GBK"),
                    Charset.forName("GB2312"),
                    StandardCharsets.ISO_8859_1
            )
    );


   /**
     * Decodes the raw bytes of a {@code .txt} file into a string.
     *
     * <p>Decoding is attempted in three stages, returning the first result that
     * {@code isValidContent} accepts:
     * <ol>
     *   <li>the charset reported by {@code UniversalDetector}, if it reports one and the
     *       JVM supports it;</li>
     *   <li>each charset in {@code FALLBACK_ENCODINGS}, in list order;</li>
     *   <li>UTF-8 as a last resort, without validation.</li>
     * </ol>
     *
     * @param fileBytes the file content; may be {@code null} or empty
     * @return the decoded text, or an empty string when {@code fileBytes} is {@code null}
     *     or empty
     */
    private String readTextFile(byte[] fileBytes) {
        // Nothing to detect or decode; every decoding of zero bytes is "" anyway.
        if (fileBytes == null || fileBytes.length == 0) {
            return "";
        }

        // Ask UniversalDetector for the most likely encoding of the whole buffer.
        UniversalDetector detector = new UniversalDetector(null);
        detector.handleData(fileBytes, 0, fileBytes.length);
        detector.dataEnd();

        String encoding = detector.getDetectedCharset();
        if (encoding != null) {
            Charset detected = null;
            try {
                detected = Charset.forName(encoding);
            } catch (IllegalArgumentException ignored) {
                // Covers both IllegalCharsetNameException and UnsupportedCharsetException:
                // the detector named a charset this JVM cannot decode, so fall through to
                // the fallback list instead of failing the whole read.
            }
            if (detected != null) {
                String content = new String(fileBytes, detected);
                if (isValidContent(content)) {
                    return content;
                }
            }
        }

        // Detection was inconclusive or rejected; try the common encodings in order.
        for (Charset charset : FALLBACK_ENCODINGS) {
            String content = new String(fileBytes, charset);
            if (isValidContent(content)) {
                return content;
            }
        }

        // Every candidate was rejected; return the UTF-8 decoding as a best effort.
        return new String(fileBytes, StandardCharsets.UTF_8);
    }


    /**
     * Extracts the text of a legacy Word ({@code .doc}) upload.
     *
     * <p>The upload stream and the {@code HWPFDocument} built from it are both closed
     * before this method returns, whether or not extraction succeeds.
     *
     * @param file the uploaded {@code .doc} file
     * @return the text produced by {@code WordExtractor.getText()}
     * @throws IOException if the stream cannot be opened or the document cannot be read
     */
    private static String readDocFile(MultipartFile file) throws IOException {
        try (InputStream source = file.getInputStream();
             HWPFDocument document = new HWPFDocument(source)) {
            return new WordExtractor(document).getText();
        }
    }




    /**
     * Extracts the text of a Word 2007+ ({@code .docx}) upload.
     *
     * <p>The upload stream and the {@code XWPFDocument} are managed by try-with-resources,
     * so both are closed even when parsing or text extraction throws.
     *
     * @param file the uploaded {@code .docx} file
     * @return the text produced by {@code XWPFWordExtractor.getText()}
     * @throws IOException if the stream cannot be opened or the document cannot be read
     */
    private String readDocxFile(MultipartFile file) throws IOException {
        // Resources close in reverse order: the document first, then the stream.
        try (InputStream inputStream = file.getInputStream();
             XWPFDocument docx = new XWPFDocument(inputStream)) {
            XWPFWordExtractor extractor = new XWPFWordExtractor(docx);
            return extractor.getText();
        }
    }