From f49e186f8a594a199a448c8db8d23a3bd0e0309c Mon Sep 17 00:00:00 2001 From: hygl <3154803225@qq.com> Date: Mon, 30 Jun 2025 18:25:58 +0800 Subject: [PATCH] =?UTF-8?q?=E6=94=AF=E6=8C=81=E5=A4=9A=E7=A7=8D=E6=96=87?= =?UTF-8?q?=E4=BB=B6=E6=A0=BC=E5=BC=8F=E7=9A=84=E4=B8=8A=E4=BC=A0=E5=92=8C?= =?UTF-8?q?=E8=A7=A3=E6=9E=90,=20=E5=AE=9E=E7=8E=B0=E8=87=AA=E5=AE=9A?= =?UTF-8?q?=E4=B9=89=E7=9F=A5=E8=AF=86=E5=BA=93=E7=9A=84=E5=8A=9F=E8=83=BD?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- pom.xml | 6 + .../supportbot/controller/AiController.java | 3 +- .../controller/DocumentController.java | 232 ++++++++++++++++++ .../document/extract/JsonDocumentLoader.java | 59 +++++ .../extract/MarkdownDocumentLoader.java | 52 ++-- .../document/extract/MyJsonReader.java | 37 --- .../extract/SimpleStringDocumentReader.java | 20 ++ .../document/extract/TikaDocumentReader.java | 44 ++++ .../transform/MyTokenTextSplitter.java | 2 +- 9 files changed, 387 insertions(+), 68 deletions(-) create mode 100644 src/main/java/com/wok/supportbot/controller/DocumentController.java create mode 100644 src/main/java/com/wok/supportbot/document/extract/JsonDocumentLoader.java delete mode 100644 src/main/java/com/wok/supportbot/document/extract/MyJsonReader.java create mode 100644 src/main/java/com/wok/supportbot/document/extract/SimpleStringDocumentReader.java create mode 100644 src/main/java/com/wok/supportbot/document/extract/TikaDocumentReader.java diff --git a/pom.xml b/pom.xml index a5c01d4..d79948a 100644 --- a/pom.xml +++ b/pom.xml @@ -101,6 +101,12 @@ spring-ai-starter-vector-store-pgvector 1.0.0-M7 --> + + + org.springframework.ai + spring-ai-tika-document-reader + 1.0.0 + com.baomidou mybatis-plus-spring-boot3-starter diff --git a/src/main/java/com/wok/supportbot/controller/AiController.java b/src/main/java/com/wok/supportbot/controller/AiController.java index ef6c15b..1bd6f7e 100644 --- a/src/main/java/com/wok/supportbot/controller/AiController.java +++ b/src/main/java/com/wok/supportbot/controller/AiController.java @@ -18,7 +18,8 @@ import reactor.core.publisher.Flux; import java.io.IOException; - +@RestController +@RequestMapping("/ai") public class AiController { @Resource diff --git a/src/main/java/com/wok/supportbot/controller/DocumentController.java b/src/main/java/com/wok/supportbot/controller/DocumentController.java new file mode 100644 index 0000000..bcc7d34 --- /dev/null +++ b/src/main/java/com/wok/supportbot/controller/DocumentController.java @@ -0,0 +1,232 @@ +package com.wok.supportbot.controller; + +import com.wok.supportbot.document.extract.JsonDocumentLoader; +import com.wok.supportbot.document.extract.MarkdownDocumentLoader; +import com.wok.supportbot.document.extract.SimpleStringDocumentReader; +import com.wok.supportbot.document.extract.TikaDocumentReader; +import com.wok.supportbot.document.transform.MyKeywordEnricher; +import com.wok.supportbot.document.transform.MyTokenTextSplitter; +import org.springframework.ai.document.Document; +import org.springframework.ai.vectorstore.VectorStore; +import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.http.ResponseEntity; +import org.springframework.web.bind.annotation.*; +import org.springframework.web.multipart.MultipartFile; + +import java.util.List; +import java.util.Map; + +@RestController +@RequestMapping("/document") +public class DocumentController { + + @Autowired + private TikaDocumentReader tikaDocumentReader; + + @Autowired + private SimpleStringDocumentReader simpleStringDocumentReader; + + @Autowired + private MarkdownDocumentLoader markdownDocumentLoader; + + @Autowired + private JsonDocumentLoader jsonDocumentLoader; + + @Autowired + private MyTokenTextSplitter myTokenTextSplitter; + + @Autowired + private MyKeywordEnricher myKeywordEnricher; + + @Autowired + private VectorStore pgVectorVectorStore; + + /** + * 上传普通文件(支持多种格式),用 Tika 解析 + */ + @PostMapping("/upload/file") + public ResponseEntity> uploadFile(@RequestParam("file") MultipartFile file) { + try { + List documents = tikaDocumentReader.read(file); + + // 拆分文档 + List splitDocuments = myTokenTextSplitter.splitDocuments(documents); + + // 添加元数据 + List enrichedDocuments = myKeywordEnricher.enrichDocuments(splitDocuments); + + // 转成向量并存入数据库 + pgVectorVectorStore.add(enrichedDocuments); + + return ResponseEntity.ok(Map.of( + "success", true, + "message", "文件上传并向量化成功", + "documentCount", enrichedDocuments.size() + )); + } catch (Exception e) { + return ResponseEntity.status(500).body(Map.of( + "success", false, + "message", "上传失败:" + e.getMessage() + )); + } + } + + /** + * 上传字符串内容 + */ + @PostMapping("/upload/string") + public ResponseEntity> uploadString(@RequestBody String content) { + try { + List documents = simpleStringDocumentReader.read(content); + + // 拆分文档 + List splitDocuments = myTokenTextSplitter.splitDocuments(documents); + + // 添加元数据 + List enrichedDocuments = myKeywordEnricher.enrichDocuments(splitDocuments); + + // 转成向量并存入数据库 + pgVectorVectorStore.add(enrichedDocuments); + + return ResponseEntity.ok(Map.of( + "success", true, + "message", "文本内容上传并向量化成功", + "documentCount", enrichedDocuments.size() + )); + } catch (Exception e) { + return ResponseEntity.status(500).body(Map.of( + "success", false, + "message", "上传失败:" + e.getMessage() + )); + } + } + + /** + * 上传 Markdown 文件 + */ + @PostMapping("/upload/markdown") + public ResponseEntity> uploadMarkdown(@RequestParam("file") MultipartFile file) { + try { + List documents = markdownDocumentLoader.loadMarkdownFromFile(file); + + // 拆分文档 + List splitDocuments = myTokenTextSplitter.splitDocuments(documents); + + // 添加元数据 + List enrichedDocuments = myKeywordEnricher.enrichDocuments(splitDocuments); + + // 转成向量并存入数据库 + pgVectorVectorStore.add(enrichedDocuments); + + return ResponseEntity.ok(Map.of( + "success", true, + "message", "Markdown文件上传并向量化成功", + "documentCount", enrichedDocuments.size() + )); + } catch (Exception e) { + return ResponseEntity.status(500).body(Map.of( + "success", false, + "message", "上传失败:" + e.getMessage() + )); + } + } + + /** + * 上传 JSON 文件(基本方式) + * 把 JSON 根节点当成一个整体文档 + */ + @PostMapping("/upload/json/basic") + public ResponseEntity> uploadJsonBasic(@RequestParam("file") MultipartFile file) { + try { + List documents = jsonDocumentLoader.loadBasicJson(file); + + // 拆分文档 + List splitDocuments = myTokenTextSplitter.splitDocuments(documents); + + // 添加元数据 + List enrichedDocuments = myKeywordEnricher.enrichDocuments(splitDocuments); + + // 转成向量并存入数据库 + pgVectorVectorStore.add(enrichedDocuments); + + return ResponseEntity.ok(Map.of( + "success", true, + "message", "JSON文件(基本方式)上传并向量化成功", + "documentCount", enrichedDocuments.size() + )); + } catch (Exception e) { + return ResponseEntity.status(500).body(Map.of( + "success", false, + "message", "上传失败:" + e.getMessage() + )); + } + } + + /** + * 上传 JSON 文件(按字段提取) + * 用于提取指定字段文本 + */ + @PostMapping("/upload/json/fields") + public ResponseEntity> uploadJsonWithFields( + @RequestParam("file") MultipartFile file, + @RequestParam("fields") List fields) { + try { + List documents = jsonDocumentLoader.loadJsonByFields(file, fields.toArray(new String[0])); + + // 拆分文档 + List splitDocuments = myTokenTextSplitter.splitDocuments(documents); + + // 添加元数据 + List enrichedDocuments = myKeywordEnricher.enrichDocuments(splitDocuments); + + // 转成向量并存入数据库 + pgVectorVectorStore.add(enrichedDocuments); + + return ResponseEntity.ok(Map.of( + "success", true, + "message", "JSON文件(按字段)上传并向量化成功", + "documentCount", enrichedDocuments.size(), + "extractedFields", fields + )); + } catch (Exception e) { + return ResponseEntity.status(500).body(Map.of( + "success", false, + "message", "上传失败:" + e.getMessage() + )); + } + } + + /** + * 上传 JSON 文件(按指针拆分) + * 用于拆分数组元素,常用来分段成多文档 + */ + @PostMapping("/upload/json/pointer") + public ResponseEntity> uploadJsonWithPointer( + @RequestParam("file") MultipartFile file, + @RequestParam("pointer") String pointer) { + try { + List documents = jsonDocumentLoader.loadJsonByPointer(file, pointer); + + // 拆分文档 + List splitDocuments = myTokenTextSplitter.splitDocuments(documents); + + // 添加元数据 + List enrichedDocuments = myKeywordEnricher.enrichDocuments(splitDocuments); + + // 转成向量并存入数据库 + pgVectorVectorStore.add(enrichedDocuments); + + return ResponseEntity.ok(Map.of( + "success", true, + "message", "JSON文件(按指针)上传并向量化成功", + "documentCount", enrichedDocuments.size(), + "pointer", pointer + )); + } catch (Exception e) { + return ResponseEntity.status(500).body(Map.of( + "success", false, + "message", "上传失败:" + e.getMessage() + )); + } + } +} diff --git a/src/main/java/com/wok/supportbot/document/extract/JsonDocumentLoader.java b/src/main/java/com/wok/supportbot/document/extract/JsonDocumentLoader.java new file mode 100644 index 0000000..3fcdcd7 --- /dev/null +++ b/src/main/java/com/wok/supportbot/document/extract/JsonDocumentLoader.java @@ -0,0 +1,59 @@ +package com.wok.supportbot.document.extract; + +import lombok.extern.slf4j.Slf4j; +import org.springframework.ai.document.Document; +import org.springframework.ai.reader.JsonReader; +import org.springframework.core.io.FileSystemResource; +import org.springframework.core.io.Resource; +import org.springframework.stereotype.Component; +import org.springframework.web.multipart.MultipartFile; + +import java.io.File; +import java.io.IOException; +import java.util.List; + +@Component +@Slf4j +public class JsonDocumentLoader { + + /** + * 基本读取方式 + */ + public List loadBasicJson(MultipartFile file) { + Resource resource = toResource(file); + JsonReader reader = new JsonReader(resource); + return reader.get(); + } + + /** + * 指定字段读取方式(例如 description、features 字段) + */ + public List loadJsonByFields(MultipartFile file, String... fields) { + Resource resource = toResource(file); + JsonReader reader = new JsonReader(resource, fields); + return reader.get(); + } + + /** + * 使用 JSON Pointer 提取数组路径内容(如 /items) + */ + public List loadJsonByPointer(MultipartFile file, String pointer) { + Resource resource = toResource(file); + JsonReader reader = new JsonReader(resource); + return reader.get(pointer); + } + + /** + * 将 MultipartFile 转换为 Resource + */ + private Resource toResource(MultipartFile file) { + try { + File temp = File.createTempFile("upload-", file.getOriginalFilename()); + file.transferTo(temp); + return new FileSystemResource(temp); + } catch (IOException e) { + log.error("JSON 文件转换失败", e); + throw new RuntimeException("JSON 文件转换失败", e); + } + } +} diff --git a/src/main/java/com/wok/supportbot/document/extract/MarkdownDocumentLoader.java b/src/main/java/com/wok/supportbot/document/extract/MarkdownDocumentLoader.java index 494943d..e981088 100644 --- a/src/main/java/com/wok/supportbot/document/extract/MarkdownDocumentLoader.java +++ b/src/main/java/com/wok/supportbot/document/extract/MarkdownDocumentLoader.java @@ -2,52 +2,46 @@ package com.wok.supportbot.document.extract; import lombok.extern.slf4j.Slf4j; import org.springframework.ai.document.Document; - import org.springframework.ai.reader.markdown.MarkdownDocumentReader; import org.springframework.ai.reader.markdown.config.MarkdownDocumentReaderConfig; +import org.springframework.core.io.FileSystemResource; import org.springframework.core.io.Resource; -import org.springframework.core.io.support.ResourcePatternResolver; import org.springframework.stereotype.Component; +import org.springframework.web.multipart.MultipartFile; +import java.io.File; import java.io.IOException; -import java.util.ArrayList; import java.util.List; /** - * 文档加载器 + * Markdown 文件上传解析器 */ @Component @Slf4j public class MarkdownDocumentLoader { - private final ResourcePatternResolver resourcePatternResolver; + public List loadMarkdownFromFile(MultipartFile file) { + try { + // 将 MultipartFile 保存为临时文件 + File temp = File.createTempFile("upload-", file.getOriginalFilename()); + file.transferTo(temp); + Resource resource = new FileSystemResource(temp); - public MarkdownDocumentLoader(ResourcePatternResolver resourcePatternResolver) { - this.resourcePatternResolver = resourcePatternResolver; - } + // 配置文档解析 + MarkdownDocumentReaderConfig config = MarkdownDocumentReaderConfig.builder() + .withHorizontalRuleCreateDocument(true) + .withIncludeCodeBlock(false) + .withIncludeBlockquote(false) + .withAdditionalMetadata("filename", file.getOriginalFilename()) + .build(); + + // 读取文档内容 + MarkdownDocumentReader reader = new MarkdownDocumentReader(resource, config); + return reader.get(); - /** - * 加载多篇 Markdown 文档 - * @return - */ - public List loadMarkdowns() { - List allDocuments = new ArrayList<>(); - try { - Resource[] resources = resourcePatternResolver.getResources("classpath:document/*.md"); - for (Resource resource : resources) { - String filename = resource.getFilename(); - MarkdownDocumentReaderConfig config = MarkdownDocumentReaderConfig.builder() - .withHorizontalRuleCreateDocument(true) - .withIncludeCodeBlock(false) - .withIncludeBlockquote(false) - .withAdditionalMetadata("filename", filename) - .build(); - MarkdownDocumentReader markdownDocumentReader = new MarkdownDocumentReader(resource, config); - allDocuments.addAll(markdownDocumentReader.get()); - } } catch (IOException e) { - log.error("Markdown 文档加载失败", e); + log.error("Markdown 文件解析失败", e); + throw new RuntimeException("Markdown 文件解析失败", e); } - return allDocuments; } } diff --git a/src/main/java/com/wok/supportbot/document/extract/MyJsonReader.java b/src/main/java/com/wok/supportbot/document/extract/MyJsonReader.java deleted file mode 100644 index c7be227..0000000 --- a/src/main/java/com/wok/supportbot/document/extract/MyJsonReader.java +++ /dev/null @@ -1,37 +0,0 @@ -package com.wok.supportbot.document.extract; - -import org.springframework.ai.document.Document; -import org.springframework.ai.reader.JsonReader; -import org.springframework.beans.factory.annotation.Value; -import org.springframework.core.io.Resource; -import org.springframework.stereotype.Component; - -import java.util.List; - -// 从 classpath 下的 JSON 文件中读取文档 - @Component - class MyJsonReader { - private final Resource resource; - - MyJsonReader(@Value("classpath:products.json") Resource resource) { - this.resource = resource; - } - - // 基本用法 - List loadBasicJsonDocuments() { - JsonReader jsonReader = new JsonReader(this.resource); - return jsonReader.get(); - } - - // 指定使用哪些 JSON 字段作为文档内容 - List loadJsonWithSpecificFields() { - JsonReader jsonReader = new JsonReader(this.resource, "description", "features"); - return jsonReader.get(); - } - - // 使用 JSON 指针精确提取文档内容 - List loadJsonWithPointer() { - JsonReader jsonReader = new JsonReader(this.resource); - return jsonReader.get("/items"); // 提取 items 数组内的内容 - } - } \ No newline at end of file diff --git a/src/main/java/com/wok/supportbot/document/extract/SimpleStringDocumentReader.java b/src/main/java/com/wok/supportbot/document/extract/SimpleStringDocumentReader.java new file mode 100644 index 0000000..b9d29cc --- /dev/null +++ b/src/main/java/com/wok/supportbot/document/extract/SimpleStringDocumentReader.java @@ -0,0 +1,20 @@ +package com.wok.supportbot.document.extract; + +import org.springframework.ai.document.Document; +import org.springframework.stereotype.Component; + +import java.util.Collections; +import java.util.List; +import java.util.UUID; + +@Component +public class SimpleStringDocumentReader { + + public List read(String content) { + Document doc = Document.builder() + .id(UUID.randomUUID().toString()) + .text(content) + .build(); + return Collections.singletonList(doc); + } +} diff --git a/src/main/java/com/wok/supportbot/document/extract/TikaDocumentReader.java b/src/main/java/com/wok/supportbot/document/extract/TikaDocumentReader.java new file mode 100644 index 0000000..7ee7d2c --- /dev/null +++ b/src/main/java/com/wok/supportbot/document/extract/TikaDocumentReader.java @@ -0,0 +1,44 @@ +package com.wok.supportbot.document.extract; + +import lombok.extern.slf4j.Slf4j; +import org.apache.tika.Tika; +import org.apache.tika.exception.TikaException; +import org.springframework.ai.document.Document; +import org.springframework.core.io.Resource; +import org.springframework.core.io.FileSystemResource; +import org.springframework.stereotype.Component; +import org.springframework.web.multipart.MultipartFile; + +import java.io.File; +import java.io.IOException; +import java.util.Collections; +import java.util.List; +import java.util.UUID; + +@Component +@Slf4j +public class TikaDocumentReader { + + public List read(MultipartFile file) { + try { + // MultipartFile 转 Resource + File tempFile = File.createTempFile("upload-", file.getOriginalFilename()); + file.transferTo(tempFile); + Resource resource = new FileSystemResource(tempFile); + + Tika tika = new Tika(); + String text = tika.parseToString(resource.getInputStream()); + + Document doc = Document.builder() + .id(UUID.randomUUID().toString()) + .text(text) + .build(); + + return Collections.singletonList(doc); + + } catch (IOException | TikaException e) { + log.error("Tika 文件解析失败", e); + throw new RuntimeException("Tika 文件解析失败", e); + } + } +} diff --git a/src/main/java/com/wok/supportbot/document/transform/MyTokenTextSplitter.java b/src/main/java/com/wok/supportbot/document/transform/MyTokenTextSplitter.java index 0584b96..3e13b15 100644 --- a/src/main/java/com/wok/supportbot/document/transform/MyTokenTextSplitter.java +++ b/src/main/java/com/wok/supportbot/document/transform/MyTokenTextSplitter.java @@ -10,7 +10,7 @@ import java.util.List; * 自定义基于 Token 的切词器 */ @Component -class MyTokenTextSplitter { +public class MyTokenTextSplitter { /** * 使用默认设置创建分割器。