读取 txt、doc、docx 三种格式文档的文本内容。不同格式采用不同的读取逻辑,以避免文本内容出现乱码。
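这里需要在 Maven 的 pom.xml 中引入以下 Apache POI 依赖:
三个 POI 依赖的版本最好保持一致。(另外,readTextFile 中使用的 UniversalDetector 不属于 POI,应来自 juniversalchardet 库,需要另行引入对应依赖。)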
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-ooxml</artifactId>
<version>5.2.3</version>
</dependency>
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi</artifactId>
<version>5.2.3</version>
</dependency>
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-scratchpad</artifactId>
<version>5.2.3</version>
</dependency>
/**
 * Extracts the text content of an uploaded file, choosing the reader by file extension.
 *
 * <p>Supported extensions are {@code .txt}, {@code .doc} and {@code .docx}. The extension is
 * matched case-insensitively, so names such as {@code REPORT.DOCX} are handled too.
 *
 * @param file the uploaded file to read
 * @return the extracted text, or an empty string when the file has no original filename or
 *         its extension is not one of the supported ones
 * @throws IOException if the upload cannot be read or the format-specific reader fails
 */
private String getContent(MultipartFile file) throws IOException {
    String fileName = file.getOriginalFilename();
    if (fileName == null) {
        return "";
    }
    if (hasExtensionIgnoreCase(fileName, ".txt")) {
        return readTextFile(file.getBytes());
    }
    if (hasExtensionIgnoreCase(fileName, ".doc")) {
        return readDocFile(file);
    }
    if (hasExtensionIgnoreCase(fileName, ".docx")) {
        return readDocxFile(file);
    }
    return "";
}

/** Returns whether {@code fileName} ends with {@code extension}, ignoring letter case. */
private static boolean hasExtensionIgnoreCase(String fileName, String extension) {
    // regionMatches returns false (rather than throwing) when the name is shorter than the
    // extension, because the computed offset is then negative.
    int offset = fileName.length() - extension.length();
    return fileName.regionMatches(true, offset, extension, 0, extension.length());
}
/**
 * Charsets tried, in this order, by {@code readTextFile} when decoding the bytes of a text
 * file.
 *
 * NOTE(review): ISO-8859-1 can decode any byte sequence, so it presumably acts as a
 * catch-all last resort; whether earlier entries are ever rejected depends on
 * {@code isValidContent} — confirm.
 * NOTE(review): GB2312 is listed after GBK; GBK is generally a superset of GB2312, so this
 * entry may be redundant — confirm.
 */
private static final List<Charset> FALLBACK_ENCODINGS = Arrays.asList(
StandardCharsets.UTF_8,
Charset.forName("GBK"),
Charset.forName("GB2312"),
StandardCharsets.ISO_8859_1
);
/**
 * Decodes the bytes of a plain-text ({@code .txt}) file into a string.
 *
 * <p>The charset reported by {@code UniversalDetector} is tried first. If no charset is
 * detected, the detected name is not supported by this JVM, or the decoded text is rejected
 * by {@code isValidContent}, each charset in {@code FALLBACK_ENCODINGS} is tried in order.
 * If none is accepted, the bytes are decoded as UTF-8.
 *
 * @param fileBytes the raw bytes of the file; may be {@code null} or empty
 * @return the decoded text; an empty string when {@code fileBytes} is {@code null} or empty
 */
private String readTextFile(byte[] fileBytes) {
    if (fileBytes == null || fileBytes.length == 0) {
        return "";
    }

    // Ask UniversalDetector to guess the encoding from the whole byte array.
    UniversalDetector detector = new UniversalDetector(null);
    detector.handleData(fileBytes, 0, fileBytes.length);
    detector.dataEnd();
    String encoding = detector.getDetectedCharset();

    Charset detectedCharset = null;
    if (encoding != null) {
        try {
            detectedCharset = Charset.forName(encoding);
        } catch (IllegalArgumentException ignored) {
            // Covers UnsupportedCharsetException and IllegalCharsetNameException: the
            // detector reported a name this JVM cannot map to a Charset, so fall through
            // to the fallback list instead of failing the whole read.
        }
    }
    if (detectedCharset != null) {
        String content = new String(fileBytes, detectedCharset);
        if (isValidContent(content)) {
            return content;
        }
    }

    // Try the common encodings in their declared order.
    for (Charset charset : FALLBACK_ENCODINGS) {
        String content = new String(fileBytes, charset);
        if (isValidContent(content)) {
            return content;
        }
    }

    // Nothing was accepted: fall back to a plain UTF-8 decode.
    return new String(fileBytes, StandardCharsets.UTF_8);
}
/**
 * Extracts the text of a legacy Word ({@code .doc}) upload.
 *
 * <p>The upload's stream is wrapped in an {@code HWPFDocument}, and a {@code WordExtractor}
 * built on that document supplies the text. Both the stream and the document are closed
 * before the method returns.
 *
 * @param file the uploaded {@code .doc} file
 * @return the text reported by the extractor
 * @throws IOException if the upload's stream cannot be opened or the document cannot be read
 */
private static String readDocFile(MultipartFile file) throws IOException {
    try (InputStream uploadStream = file.getInputStream();
         HWPFDocument wordDocument = new HWPFDocument(uploadStream)) {
        return new WordExtractor(wordDocument).getText();
    }
}
/**
 * Extracts the text of a Word ({@code .docx}) upload.
 *
 * <p>The upload's stream is wrapped in an {@code XWPFDocument}, and an
 * {@code XWPFWordExtractor} built on that document supplies the text. The document and the
 * stream are managed by try-with-resources, so both are closed even when opening the
 * document or extracting the text throws.
 *
 * @param file the uploaded {@code .docx} file
 * @return the text reported by the extractor
 * @throws IOException if the upload's stream cannot be opened or the document cannot be read
 */
private String readDocxFile(MultipartFile file) throws IOException {
    // Resources close in reverse declaration order: the document first, then the stream,
    // which matches the order of the explicit close() calls this replaces.
    try (InputStream inputStream = file.getInputStream();
         XWPFDocument docx = new XWPFDocument(inputStream)) {
        XWPFWordExtractor extractor = new XWPFWordExtractor(docx);
        return extractor.getText();
    }
}