序
本文主要研究一下Spring AI Alibaba的DocumentParser
DocumentParser
spring-ai-alibaba-core/src/main/java/com/alibaba/cloud/ai/document/DocumentParser.java
/**
* Contract for components that turn the raw content of an {@link InputStream} into
* {@link Document} instances.
*/
public interface DocumentParser {
/**
* Parses a given {@link InputStream} into a list of {@link Document}s. The specific
* implementation of this method will depend on the type of the document being parsed.
* <p>
* Note: This method does not close the provided {@link InputStream} - it is the
* caller's responsibility to manage the lifecycle of the stream.
* @param inputStream The {@link InputStream} that contains the content to be parsed
* into {@link Document}s.
* @return The parsed {@link Document}s, as a {@link List}.
*/
List<Document> parse(InputStream inputStream);
}
DocumentParser接口定义了parse方法,将inputStream解析为org.springframework.ai.document.Document的列表(List<Document>),它有TextDocumentParser、JsonDocumentParser等实现
TextDocumentParser
spring-ai-alibaba-core/src/main/java/com/alibaba/cloud/ai/document/TextDocumentParser.java
/**
 * {@link DocumentParser} that reads the entire stream as plain text and wraps it in a
 * single {@link Document}.
 * <p>
 * The bytes are decoded with the charset given at construction time; the no-arg
 * constructor uses UTF-8.
 */
public class TextDocumentParser implements DocumentParser {

    /** Charset used to decode the bytes read from the stream. */
    private final Charset charset;

    /**
     * Creates a parser that decodes its input as UTF-8.
     */
    public TextDocumentParser() {
        this(UTF_8);
    }

    /**
     * Creates a parser that decodes its input with the given charset.
     * @param charset the charset used to decode the stream; must not be {@code null}
     */
    public TextDocumentParser(Charset charset) {
        Assert.notNull(charset, "charset");
        this.charset = charset;
    }

    /**
     * Reads all remaining bytes of the stream, decodes them with this parser's charset
     * and returns the text as a single-element list.
     * <p>
     * Every failure surfaces as a {@link RuntimeException} (or a subclass), so callers
     * that catch {@code RuntimeException} observe the same contract as before; the
     * difference is that each failure now carries a message describing its cause.
     * @param inputStream the stream to read; must not be {@code null}
     * @return a list containing exactly one {@link Document} holding the decoded text
     * @throws IllegalArgumentException if {@code inputStream} is {@code null} or the
     * decoded text is blank
     * @throws RuntimeException if reading the stream fails; the cause is the underlying
     * {@link java.io.IOException}
     */
    @Override
    public List<Document> parse(InputStream inputStream) {
        if (inputStream == null) {
            throw new IllegalArgumentException("inputStream must not be null");
        }

        final String text;
        try {
            text = new String(inputStream.readAllBytes(), this.charset);
        }
        // Fully qualified so this block does not depend on an extra import.
        catch (java.io.IOException e) {
            throw new RuntimeException("Failed to read text document from input stream", e);
        }

        // Validate outside the try block: a blank document is an input problem, not an
        // I/O failure, and must not be reported through a wrapped placeholder exception.
        if (text.isBlank()) {
            throw new IllegalArgumentException("Text document must not be blank");
        }
        return Collections.singletonList(new Document(text));
    }
}
TextDocumentParser实现了DocumentParser接口,其parse方法按指定的charset(默认UTF-8)将inputStream读取为String,并包装成只含一个Document的列表返回;若文本为空白则抛出RuntimeException
JsonDocumentParser
spring-ai-alibaba-core/src/main/java/com/alibaba/cloud/ai/document/JsonDocumentParser.java
/**
 * {@link DocumentParser} that turns JSON input into {@link Document}s.
 * <p>
 * A top-level JSON array produces one document per element, in array order; any other
 * root node produces a single document. The text of each document is built from the
 * configured keys, and its metadata comes from the configured
 * {@link JsonMetadataGenerator}.
 */
public class JsonDocumentParser implements DocumentParser {

    /** Generates the metadata attached to each document from the parsed JSON entries. */
    private final JsonMetadataGenerator jsonMetadataGenerator;

    private final ObjectMapper objectMapper = new ObjectMapper();

    /**
     * The keys from the JSON whose values are used, in this order, to build the
     * Document text
     */
    private final List<String> jsonKeysToUse;

    /**
     * Creates a parser that uses an {@link EmptyJsonMetadataGenerator} for metadata.
     * @param jsonKeysToUse the JSON keys whose values make up the document text
     */
    public JsonDocumentParser(String... jsonKeysToUse) {
        this(new EmptyJsonMetadataGenerator(), jsonKeysToUse);
    }

    /**
     * Creates a parser with an explicit metadata generator.
     * @param jsonMetadataGenerator generator for each document's metadata; must not be
     * {@code null}
     * @param jsonKeysToUse the JSON keys whose values make up the document text; must
     * not be {@code null}
     */
    public JsonDocumentParser(JsonMetadataGenerator jsonMetadataGenerator, String... jsonKeysToUse) {
        Objects.requireNonNull(jsonKeysToUse, "keys must not be null");
        Objects.requireNonNull(jsonMetadataGenerator, "jsonMetadataGenerator must not be null");
        this.jsonMetadataGenerator = jsonMetadataGenerator;
        this.jsonKeysToUse = List.of(jsonKeysToUse);
    }

    /**
     * Reads the stream as a JSON tree and converts it into documents.
     * @param inputStream the stream containing the JSON content
     * @return one document per element when the root is an array, otherwise a
     * single-element list
     * @throws RuntimeException if the JSON cannot be read; the cause is the underlying
     * {@link IOException}
     */
    @Override
    public List<Document> parse(InputStream inputStream) {
        try {
            JsonNode rootNode = this.objectMapper.readTree(inputStream);
            if (rootNode.isArray()) {
                // Deliberately sequential: the node's spliterator is not built for
                // efficient splitting, and a parallel stream would invoke the
                // caller-supplied metadata generator concurrently on pool threads.
                return StreamSupport.stream(rootNode.spliterator(), false)
                    .map(jsonNode -> parseJsonNode(jsonNode, this.objectMapper))
                    .toList();
            }
            return Collections.singletonList(parseJsonNode(rootNode, this.objectMapper));
        }
        catch (IOException e) {
            throw new RuntimeException("Failed to parse JSON document from input stream", e);
        }
    }

    //......

    /**
     * Converts one JSON node into a {@link Document}.
     * <p>
     * The node is first converted to a map. For every configured key present in that
     * map a {@code "key: value"} line is appended; when none of the configured keys is
     * present, the map's {@code toString()} is used as the text instead.
     * @param jsonNode the node to convert
     * @param objectMapper the mapper used to convert the node into a map
     * @return the document built from the node's text and generated metadata
     */
    private Document parseJsonNode(JsonNode jsonNode, ObjectMapper objectMapper) {
        Map<String, Object> item = objectMapper.convertValue(jsonNode, new TypeReference<Map<String, Object>>() {
        });

        var sb = new StringBuilder();
        for (String key : this.jsonKeysToUse) {
            if (item.containsKey(key)) {
                sb.append(key).append(": ").append(item.get(key)).append(System.lineSeparator());
            }
        }

        Map<String, Object> metadata = this.jsonMetadataGenerator.generate(item);
        String content = sb.isEmpty() ? item.toString() : sb.toString();
        return new Document(content, metadata);
    }

    //......

}
JsonDocumentParser使用ObjectMapper来解析json字符串,它先转为map形式,再根据jsonKeysToUse将key和value通过`: `拼接在一起(每个key一行),另外根据jsonMetadataGenerator生成metadata,最后一起构建Document
示例
/**
 * Tests for {@link JsonDocumentParser}: single objects, arrays, JSON-pointer lookups
 * and empty input.
 */
class JsonDocumentParserTests {

    private JsonDocumentParser parser;

    @BeforeEach
    void setUp() {
        // Each test gets a fresh parser keyed on the "text" and "description" fields.
        parser = new JsonDocumentParser("text", "description");
    }

    @Test
    void testParseSingleJsonObject() {
        // A lone JSON object should become exactly one document built from the
        // configured keys.
        String payload = """
                {
                "text": "Sample text",
                "description": "Sample description",
                "other": "Other field"
                }
                """;

        List<Document> parsed = parser.parse(toInputStream(payload));

        assertThat(parsed).hasSize(1);
        String content = parsed.get(0).getText();
        assertThat(content).contains("Sample text").contains("Sample description");
    }

    @Test
    void testParseJsonArray() {
        // Every element of a top-level array should become its own document, in order.
        String payload = """
                [
                {
                "text": "First text",
                "description": "First description"
                },
                {
                "text": "Second text",
                "description": "Second description"
                }
                ]
                """;

        List<Document> parsed = parser.parse(toInputStream(payload));

        assertThat(parsed).hasSize(2);
        String firstContent = parsed.get(0).getText();
        String secondContent = parsed.get(1).getText();
        assertThat(firstContent).contains("First text");
        assertThat(secondContent).contains("Second text");
    }

    @Test
    void testJsonPointerParsing() {
        // A JSON pointer selects the nested array that should be parsed.
        String payload = """
                {
                "data": {
                "items": [
                {
                "text": "Pointer text",
                "description": "Pointer description"
                }
                ]
                }
                }
                """;

        List<Document> parsed = parser.get("/data/items", toInputStream(payload));

        assertThat(parsed).hasSize(1);
        String content = parsed.get(0).getText();
        assertThat(content).contains("Pointer text").contains("Pointer description");
    }

    @Test
    void testEmptyJsonInput() {
        // An empty object has none of the configured keys, so the text is expected to
        // be the string form of the empty object.
        String payload = "{}";

        List<Document> parsed = parser.parse(toInputStream(payload));

        assertThat(parsed).hasSize(1);
        String content = parsed.get(0).getText();
        assertThat(content).isEqualTo("{}");
    }

    @Test
    void testInvalidJsonPointer() {
        // A pointer that does not resolve inside the document must be rejected.
        String payload = """
                {
                "data": {}
                }
                """;

        assertThrows(IllegalArgumentException.class,
                () -> parser.get("/invalid/pointer", toInputStream(payload)));
    }

    /**
     * Wraps the given string in an in-memory stream of its UTF-8 bytes.
     */
    private InputStream toInputStream(String content) {
        byte[] utf8Bytes = content.getBytes(StandardCharsets.UTF_8);
        return new ByteArrayInputStream(utf8Bytes);
    }

}
小结
Spring AI Alibaba定义了com.alibaba.cloud.ai.document.DocumentParser,然后部分org.springframework.ai.document.DocumentReader的实现是委托给了相应的parser。spring-ai-alibaba-core默认提供了TextDocumentParser、JsonDocumentParser这两种DocumentParser。