Browse Source

支持多种文件格式的上传和解析, 实现自定义知识库的功能

master
hygl 12 months ago
parent
commit
f49e186f8a
  1. 6
      pom.xml
  2. 3
      src/main/java/com/wok/supportbot/controller/AiController.java
  3. 232
      src/main/java/com/wok/supportbot/controller/DocumentController.java
  4. 59
      src/main/java/com/wok/supportbot/document/extract/JsonDocumentLoader.java
  5. 52
      src/main/java/com/wok/supportbot/document/extract/MarkdownDocumentLoader.java
  6. 37
      src/main/java/com/wok/supportbot/document/extract/MyJsonReader.java
  7. 20
      src/main/java/com/wok/supportbot/document/extract/SimpleStringDocumentReader.java
  8. 44
      src/main/java/com/wok/supportbot/document/extract/TikaDocumentReader.java
  9. 2
      src/main/java/com/wok/supportbot/document/transform/MyTokenTextSplitter.java

6
pom.xml

@ -101,6 +101,12 @@
<artifactId>spring-ai-starter-vector-store-pgvector</artifactId> <artifactId>spring-ai-starter-vector-store-pgvector</artifactId>
<version>1.0.0-M7</version> <version>1.0.0-M7</version>
</dependency>--> </dependency>-->
<!-- spring-ai-tika-document-reader -->
<dependency>
<groupId>org.springframework.ai</groupId>
<artifactId>spring-ai-tika-document-reader</artifactId>
<version>1.0.0</version>
</dependency>
<dependency> <dependency>
<groupId>com.baomidou</groupId> <groupId>com.baomidou</groupId>
<artifactId>mybatis-plus-spring-boot3-starter</artifactId> <artifactId>mybatis-plus-spring-boot3-starter</artifactId>

3
src/main/java/com/wok/supportbot/controller/AiController.java

@ -18,7 +18,8 @@ import reactor.core.publisher.Flux;
import java.io.IOException; import java.io.IOException;
@RestController
@RequestMapping("/ai")
public class AiController { public class AiController {
@Resource @Resource

232
src/main/java/com/wok/supportbot/controller/DocumentController.java

@ -0,0 +1,232 @@
package com.wok.supportbot.controller;
import com.wok.supportbot.document.extract.JsonDocumentLoader;
import com.wok.supportbot.document.extract.MarkdownDocumentLoader;
import com.wok.supportbot.document.extract.SimpleStringDocumentReader;
import com.wok.supportbot.document.extract.TikaDocumentReader;
import com.wok.supportbot.document.transform.MyKeywordEnricher;
import com.wok.supportbot.document.transform.MyTokenTextSplitter;
import org.springframework.ai.document.Document;
import org.springframework.ai.vectorstore.VectorStore;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.http.ResponseEntity;
import org.springframework.web.bind.annotation.*;
import org.springframework.web.multipart.MultipartFile;
import java.util.List;
import java.util.Map;
/**
 * REST endpoints under {@code /document} that turn uploaded content into
 * {@link Document}s and store them in the vector store.
 *
 * <p>Every endpoint follows the same flow: read the upload with the matching
 * reader/loader, split the documents, enrich them with metadata, and add them
 * to the vector store. On success the response body carries {@code success},
 * {@code message} and {@code documentCount}; on any exception the endpoint
 * answers with HTTP 500 and a body carrying {@code success=false} and a
 * {@code message} that includes the exception message.
 */
@RestController
@RequestMapping("/document")
public class DocumentController {

    @Autowired
    private TikaDocumentReader tikaDocumentReader;

    @Autowired
    private SimpleStringDocumentReader simpleStringDocumentReader;

    @Autowired
    private MarkdownDocumentLoader markdownDocumentLoader;

    @Autowired
    private JsonDocumentLoader jsonDocumentLoader;

    @Autowired
    private MyTokenTextSplitter myTokenTextSplitter;

    @Autowired
    private MyKeywordEnricher myKeywordEnricher;

    @Autowired
    private VectorStore pgVectorVectorStore;

    /**
     * Uploads a general file; its text is extracted by {@link TikaDocumentReader}.
     *
     * @param file the uploaded file (multipart field {@code file})
     * @return 200 with the stored document count, or 500 with an error message
     */
    @PostMapping("/upload/file")
    public ResponseEntity<Map<String, Object>> uploadFile(@RequestParam("file") MultipartFile file) {
        try {
            List<Document> stored = splitEnrichAndStore(tikaDocumentReader.read(file));
            return ResponseEntity.ok(Map.of(
                    "success", true,
                    "message", "文件上传并向量化成功",
                    "documentCount", stored.size()
            ));
        } catch (Exception e) {
            return failure(e);
        }
    }

    /**
     * Uploads raw text sent as the request body.
     *
     * @param content the request body, used verbatim as the document text
     * @return 200 with the stored document count, or 500 with an error message
     */
    @PostMapping("/upload/string")
    public ResponseEntity<Map<String, Object>> uploadString(@RequestBody String content) {
        try {
            List<Document> stored = splitEnrichAndStore(simpleStringDocumentReader.read(content));
            return ResponseEntity.ok(Map.of(
                    "success", true,
                    "message", "文本内容上传并向量化成功",
                    "documentCount", stored.size()
            ));
        } catch (Exception e) {
            return failure(e);
        }
    }

    /**
     * Uploads a Markdown file.
     *
     * @param file the uploaded Markdown file (multipart field {@code file})
     * @return 200 with the stored document count, or 500 with an error message
     */
    @PostMapping("/upload/markdown")
    public ResponseEntity<Map<String, Object>> uploadMarkdown(@RequestParam("file") MultipartFile file) {
        try {
            List<Document> stored = splitEnrichAndStore(markdownDocumentLoader.loadMarkdownFromFile(file));
            return ResponseEntity.ok(Map.of(
                    "success", true,
                    "message", "Markdown文件上传并向量化成功",
                    "documentCount", stored.size()
            ));
        } catch (Exception e) {
            return failure(e);
        }
    }

    /**
     * Uploads a JSON file using the basic loader (no field selection, no pointer).
     *
     * @param file the uploaded JSON file (multipart field {@code file})
     * @return 200 with the stored document count, or 500 with an error message
     */
    @PostMapping("/upload/json/basic")
    public ResponseEntity<Map<String, Object>> uploadJsonBasic(@RequestParam("file") MultipartFile file) {
        try {
            List<Document> stored = splitEnrichAndStore(jsonDocumentLoader.loadBasicJson(file));
            return ResponseEntity.ok(Map.of(
                    "success", true,
                    "message", "JSON文件(基本方式)上传并向量化成功",
                    "documentCount", stored.size()
            ));
        } catch (Exception e) {
            return failure(e);
        }
    }

    /**
     * Uploads a JSON file and extracts only the named fields as document text.
     *
     * @param file   the uploaded JSON file (multipart field {@code file})
     * @param fields the JSON field names to extract (request parameter {@code fields})
     * @return 200 with the stored document count and the requested fields,
     *         or 500 with an error message
     */
    @PostMapping("/upload/json/fields")
    public ResponseEntity<Map<String, Object>> uploadJsonWithFields(
            @RequestParam("file") MultipartFile file,
            @RequestParam("fields") List<String> fields) {
        try {
            List<Document> stored = splitEnrichAndStore(
                    jsonDocumentLoader.loadJsonByFields(file, fields.toArray(new String[0])));
            return ResponseEntity.ok(Map.of(
                    "success", true,
                    "message", "JSON文件(按字段)上传并向量化成功",
                    "documentCount", stored.size(),
                    "extractedFields", fields
            ));
        } catch (Exception e) {
            return failure(e);
        }
    }

    /**
     * Uploads a JSON file and reads the content addressed by a JSON Pointer,
     * typically used to split an array into multiple documents.
     *
     * @param file    the uploaded JSON file (multipart field {@code file})
     * @param pointer the JSON Pointer passed to the loader (request parameter {@code pointer})
     * @return 200 with the stored document count and the pointer used,
     *         or 500 with an error message
     */
    @PostMapping("/upload/json/pointer")
    public ResponseEntity<Map<String, Object>> uploadJsonWithPointer(
            @RequestParam("file") MultipartFile file,
            @RequestParam("pointer") String pointer) {
        try {
            List<Document> stored = splitEnrichAndStore(jsonDocumentLoader.loadJsonByPointer(file, pointer));
            return ResponseEntity.ok(Map.of(
                    "success", true,
                    "message", "JSON文件(按指针)上传并向量化成功",
                    "documentCount", stored.size(),
                    "pointer", pointer
            ));
        } catch (Exception e) {
            return failure(e);
        }
    }

    /**
     * Runs the shared ingestion pipeline: split, enrich with metadata, then add to the vector store.
     *
     * @param documents the documents produced by one of the readers/loaders
     * @return the enriched documents that were handed to the vector store
     */
    private List<Document> splitEnrichAndStore(List<Document> documents) {
        List<Document> splitDocuments = myTokenTextSplitter.splitDocuments(documents);
        List<Document> enrichedDocuments = myKeywordEnricher.enrichDocuments(splitDocuments);
        pgVectorVectorStore.add(enrichedDocuments);
        return enrichedDocuments;
    }

    /**
     * Builds the common HTTP 500 response used by every endpoint when ingestion fails.
     *
     * @param e the exception that aborted the upload
     * @return a 500 response whose message embeds {@code e.getMessage()}
     */
    private ResponseEntity<Map<String, Object>> failure(Exception e) {
        return ResponseEntity.status(500).body(Map.of(
                "success", false,
                "message", "上传失败:" + e.getMessage()
        ));
    }
}

59
src/main/java/com/wok/supportbot/document/extract/JsonDocumentLoader.java

@ -0,0 +1,59 @@
package com.wok.supportbot.document.extract;
import lombok.extern.slf4j.Slf4j;
import org.springframework.ai.document.Document;
import org.springframework.ai.reader.JsonReader;
import org.springframework.core.io.FileSystemResource;
import org.springframework.core.io.Resource;
import org.springframework.stereotype.Component;
import org.springframework.web.multipart.MultipartFile;
import java.io.File;
import java.io.IOException;
import java.util.List;
/**
 * Loads uploaded JSON files into {@link Document}s using {@link JsonReader}.
 *
 * <p>The upload is exposed to the reader through {@link MultipartFile#getResource()},
 * so no temporary file is written to disk. The earlier approach created a temp file
 * per upload and never deleted it, and also used the client-supplied original
 * filename as the temp-file suffix.
 */
@Component
@Slf4j
public class JsonDocumentLoader {

    /**
     * Reads the JSON file with a default {@link JsonReader} (no field selection).
     *
     * @param file the uploaded JSON file
     * @return the documents produced by {@link JsonReader#get()}
     */
    public List<Document> loadBasicJson(MultipartFile file) {
        JsonReader reader = new JsonReader(toResource(file));
        return reader.get();
    }

    /**
     * Reads the JSON file using only the given fields (for example
     * {@code description} and {@code features}) as document content.
     *
     * @param file   the uploaded JSON file
     * @param fields the JSON field names handed to {@link JsonReader}
     * @return the documents produced by {@link JsonReader#get()}
     */
    public List<Document> loadJsonByFields(MultipartFile file, String... fields) {
        JsonReader reader = new JsonReader(toResource(file), fields);
        return reader.get();
    }

    /**
     * Reads the content addressed by a JSON Pointer (for example {@code /items}).
     *
     * @param file    the uploaded JSON file
     * @param pointer the JSON Pointer passed to the reader
     * @return the documents produced by the reader for that pointer
     */
    public List<Document> loadJsonByPointer(MultipartFile file, String pointer) {
        JsonReader reader = new JsonReader(toResource(file));
        return reader.get(pointer);
    }

    /**
     * Adapts the upload to a {@link Resource} without copying it to a temp file.
     *
     * @param file the uploaded file
     * @return a resource view of the upload
     */
    private Resource toResource(MultipartFile file) {
        return file.getResource();
    }
}

52
src/main/java/com/wok/supportbot/document/extract/MarkdownDocumentLoader.java

@ -2,52 +2,46 @@ package com.wok.supportbot.document.extract;
import lombok.extern.slf4j.Slf4j; import lombok.extern.slf4j.Slf4j;
import org.springframework.ai.document.Document; import org.springframework.ai.document.Document;
import org.springframework.ai.reader.markdown.MarkdownDocumentReader; import org.springframework.ai.reader.markdown.MarkdownDocumentReader;
import org.springframework.ai.reader.markdown.config.MarkdownDocumentReaderConfig; import org.springframework.ai.reader.markdown.config.MarkdownDocumentReaderConfig;
import org.springframework.core.io.FileSystemResource;
import org.springframework.core.io.Resource; import org.springframework.core.io.Resource;
import org.springframework.core.io.support.ResourcePatternResolver;
import org.springframework.stereotype.Component; import org.springframework.stereotype.Component;
import org.springframework.web.multipart.MultipartFile;
import java.io.File;
import java.io.IOException; import java.io.IOException;
import java.util.ArrayList;
import java.util.List; import java.util.List;
/** /**
* 文档加载
* Markdown 文件上传解析
*/ */
@Component @Component
@Slf4j @Slf4j
public class MarkdownDocumentLoader { public class MarkdownDocumentLoader {
private final ResourcePatternResolver resourcePatternResolver;
public List<Document> loadMarkdownFromFile(MultipartFile file) {
try {
// MultipartFile 保存为临时文件
File temp = File.createTempFile("upload-", file.getOriginalFilename());
file.transferTo(temp);
Resource resource = new FileSystemResource(temp);
public MarkdownDocumentLoader(ResourcePatternResolver resourcePatternResolver) {
this.resourcePatternResolver = resourcePatternResolver;
}
// 配置文档解析
MarkdownDocumentReaderConfig config = MarkdownDocumentReaderConfig.builder()
.withHorizontalRuleCreateDocument(true)
.withIncludeCodeBlock(false)
.withIncludeBlockquote(false)
.withAdditionalMetadata("filename", file.getOriginalFilename())
.build();
// 读取文档内容
MarkdownDocumentReader reader = new MarkdownDocumentReader(resource, config);
return reader.get();
/**
* 加载多篇 Markdown 文档
* @return
*/
public List<Document> loadMarkdowns() {
List<Document> allDocuments = new ArrayList<>();
try {
Resource[] resources = resourcePatternResolver.getResources("classpath:document/*.md");
for (Resource resource : resources) {
String filename = resource.getFilename();
MarkdownDocumentReaderConfig config = MarkdownDocumentReaderConfig.builder()
.withHorizontalRuleCreateDocument(true)
.withIncludeCodeBlock(false)
.withIncludeBlockquote(false)
.withAdditionalMetadata("filename", filename)
.build();
MarkdownDocumentReader markdownDocumentReader = new MarkdownDocumentReader(resource, config);
allDocuments.addAll(markdownDocumentReader.get());
}
} catch (IOException e) { } catch (IOException e) {
log.error("Markdown 文档加载失败", e);
log.error("Markdown 文件解析失败", e);
throw new RuntimeException("Markdown 文件解析失败", e);
} }
return allDocuments;
} }
} }

37
src/main/java/com/wok/supportbot/document/extract/MyJsonReader.java

@ -1,37 +0,0 @@
package com.wok.supportbot.document.extract;
import org.springframework.ai.document.Document;
import org.springframework.ai.reader.JsonReader;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.core.io.Resource;
import org.springframework.stereotype.Component;
import java.util.List;
/**
 * Reads documents from a JSON file on the classpath
 * (injected from {@code classpath:products.json}).
 */
@Component
class MyJsonReader {

    private final Resource resource;

    MyJsonReader(@Value("classpath:products.json") Resource resource) {
        this.resource = resource;
    }

    /** Basic usage: read with a default {@link JsonReader}. */
    List<Document> loadBasicJsonDocuments() {
        return new JsonReader(this.resource).get();
    }

    /** Use only the {@code description} and {@code features} fields as document content. */
    List<Document> loadJsonWithSpecificFields() {
        return new JsonReader(this.resource, "description", "features").get();
    }

    /** Use a JSON Pointer to extract the content under {@code /items}. */
    List<Document> loadJsonWithPointer() {
        return new JsonReader(this.resource).get("/items");
    }
}

20
src/main/java/com/wok/supportbot/document/extract/SimpleStringDocumentReader.java

@ -0,0 +1,20 @@
package com.wok.supportbot.document.extract;
import org.springframework.ai.document.Document;
import org.springframework.stereotype.Component;
import java.util.Collections;
import java.util.List;
import java.util.UUID;
/**
 * Wraps a plain string in a single {@link Document} with a random UUID as its id.
 */
@Component
public class SimpleStringDocumentReader {

    /**
     * Builds one document whose text is the given content.
     *
     * @param content the text to use as the document body
     * @return a single-element list holding the new document
     */
    public List<Document> read(String content) {
        String documentId = UUID.randomUUID().toString();
        return Collections.singletonList(
                Document.builder()
                        .id(documentId)
                        .text(content)
                        .build());
    }
}

44
src/main/java/com/wok/supportbot/document/extract/TikaDocumentReader.java

@ -0,0 +1,44 @@
package com.wok.supportbot.document.extract;
import lombok.extern.slf4j.Slf4j;
import org.apache.tika.Tika;
import org.apache.tika.exception.TikaException;
import org.springframework.ai.document.Document;
import org.springframework.core.io.Resource;
import org.springframework.core.io.FileSystemResource;
import org.springframework.stereotype.Component;
import org.springframework.web.multipart.MultipartFile;
import java.io.File;
import java.io.IOException;
import java.util.Collections;
import java.util.List;
import java.util.UUID;
/**
 * Extracts plain text from an uploaded file with Apache Tika and wraps it in a
 * single {@link Document}.
 */
@Component
@Slf4j
public class TikaDocumentReader {

    /**
     * Parses the upload to text and returns it as one document with a random UUID id.
     *
     * <p>The upload's input stream is parsed directly. The earlier approach copied
     * the upload to a temp file that was never deleted, and used the client-supplied
     * original filename as the temp-file suffix.
     *
     * @param file the uploaded file
     * @return a single-element list holding the extracted document
     * @throws RuntimeException if reading the upload or parsing it with Tika fails
     */
    public List<Document> read(MultipartFile file) {
        // try-with-resources guarantees the upload stream is closed even if parsing fails.
        try (var uploadStream = file.getInputStream()) {
            // NOTE(review): the Tika facade appears to cap parseToString output at a
            // default maximum string length, which would silently truncate large files —
            // confirm and consider setMaxStringLength if so.
            Tika tika = new Tika();
            String text = tika.parseToString(uploadStream);

            Document doc = Document.builder()
                    .id(UUID.randomUUID().toString())
                    .text(text)
                    .build();
            return Collections.singletonList(doc);
        } catch (IOException | TikaException e) {
            log.error("Tika 文件解析失败", e);
            throw new RuntimeException("Tika 文件解析失败", e);
        }
    }
}

2
src/main/java/com/wok/supportbot/document/transform/MyTokenTextSplitter.java

@ -10,7 +10,7 @@ import java.util.List;
* 自定义基于 Token 的切词器 * 自定义基于 Token 的切词器
*/ */
@Component @Component
class MyTokenTextSplitter {
public class MyTokenTextSplitter {
/** /**
* 使用默认设置创建分割器 * 使用默认设置创建分割器

Loading…
Cancel
Save