9 changed files with 387 additions and 68 deletions
-
6pom.xml
-
3src/main/java/com/wok/supportbot/controller/AiController.java
-
232src/main/java/com/wok/supportbot/controller/DocumentController.java
-
59src/main/java/com/wok/supportbot/document/extract/JsonDocumentLoader.java
-
44src/main/java/com/wok/supportbot/document/extract/MarkdownDocumentLoader.java
-
37src/main/java/com/wok/supportbot/document/extract/MyJsonReader.java
-
20src/main/java/com/wok/supportbot/document/extract/SimpleStringDocumentReader.java
-
44src/main/java/com/wok/supportbot/document/extract/TikaDocumentReader.java
-
2src/main/java/com/wok/supportbot/document/transform/MyTokenTextSplitter.java
@ -0,0 +1,232 @@ |
|||
package com.wok.supportbot.controller; |
|||
|
|||
import com.wok.supportbot.document.extract.JsonDocumentLoader; |
|||
import com.wok.supportbot.document.extract.MarkdownDocumentLoader; |
|||
import com.wok.supportbot.document.extract.SimpleStringDocumentReader; |
|||
import com.wok.supportbot.document.extract.TikaDocumentReader; |
|||
import com.wok.supportbot.document.transform.MyKeywordEnricher; |
|||
import com.wok.supportbot.document.transform.MyTokenTextSplitter; |
|||
import org.springframework.ai.document.Document; |
|||
import org.springframework.ai.vectorstore.VectorStore; |
|||
import org.springframework.beans.factory.annotation.Autowired; |
|||
import org.springframework.http.ResponseEntity; |
|||
import org.springframework.web.bind.annotation.*; |
|||
import org.springframework.web.multipart.MultipartFile; |
|||
|
|||
import java.util.List; |
|||
import java.util.Map; |
|||
|
|||
@RestController |
|||
@RequestMapping("/document") |
|||
public class DocumentController { |
|||
|
|||
@Autowired |
|||
private TikaDocumentReader tikaDocumentReader; |
|||
|
|||
@Autowired |
|||
private SimpleStringDocumentReader simpleStringDocumentReader; |
|||
|
|||
@Autowired |
|||
private MarkdownDocumentLoader markdownDocumentLoader; |
|||
|
|||
@Autowired |
|||
private JsonDocumentLoader jsonDocumentLoader; |
|||
|
|||
@Autowired |
|||
private MyTokenTextSplitter myTokenTextSplitter; |
|||
|
|||
@Autowired |
|||
private MyKeywordEnricher myKeywordEnricher; |
|||
|
|||
@Autowired |
|||
private VectorStore pgVectorVectorStore; |
|||
|
|||
/** |
|||
* 上传普通文件(支持多种格式),用 Tika 解析 |
|||
*/ |
|||
@PostMapping("/upload/file") |
|||
public ResponseEntity<Map<String, Object>> uploadFile(@RequestParam("file") MultipartFile file) { |
|||
try { |
|||
List<Document> documents = tikaDocumentReader.read(file); |
|||
|
|||
// 拆分文档 |
|||
List<Document> splitDocuments = myTokenTextSplitter.splitDocuments(documents); |
|||
|
|||
// 添加元数据 |
|||
List<Document> enrichedDocuments = myKeywordEnricher.enrichDocuments(splitDocuments); |
|||
|
|||
// 转成向量并存入数据库 |
|||
pgVectorVectorStore.add(enrichedDocuments); |
|||
|
|||
return ResponseEntity.ok(Map.of( |
|||
"success", true, |
|||
"message", "文件上传并向量化成功", |
|||
"documentCount", enrichedDocuments.size() |
|||
)); |
|||
} catch (Exception e) { |
|||
return ResponseEntity.status(500).body(Map.of( |
|||
"success", false, |
|||
"message", "上传失败:" + e.getMessage() |
|||
)); |
|||
} |
|||
} |
|||
|
|||
/** |
|||
* 上传字符串内容 |
|||
*/ |
|||
@PostMapping("/upload/string") |
|||
public ResponseEntity<Map<String, Object>> uploadString(@RequestBody String content) { |
|||
try { |
|||
List<Document> documents = simpleStringDocumentReader.read(content); |
|||
|
|||
// 拆分文档 |
|||
List<Document> splitDocuments = myTokenTextSplitter.splitDocuments(documents); |
|||
|
|||
// 添加元数据 |
|||
List<Document> enrichedDocuments = myKeywordEnricher.enrichDocuments(splitDocuments); |
|||
|
|||
// 转成向量并存入数据库 |
|||
pgVectorVectorStore.add(enrichedDocuments); |
|||
|
|||
return ResponseEntity.ok(Map.of( |
|||
"success", true, |
|||
"message", "文本内容上传并向量化成功", |
|||
"documentCount", enrichedDocuments.size() |
|||
)); |
|||
} catch (Exception e) { |
|||
return ResponseEntity.status(500).body(Map.of( |
|||
"success", false, |
|||
"message", "上传失败:" + e.getMessage() |
|||
)); |
|||
} |
|||
} |
|||
|
|||
/** |
|||
* 上传 Markdown 文件 |
|||
*/ |
|||
@PostMapping("/upload/markdown") |
|||
public ResponseEntity<Map<String, Object>> uploadMarkdown(@RequestParam("file") MultipartFile file) { |
|||
try { |
|||
List<Document> documents = markdownDocumentLoader.loadMarkdownFromFile(file); |
|||
|
|||
// 拆分文档 |
|||
List<Document> splitDocuments = myTokenTextSplitter.splitDocuments(documents); |
|||
|
|||
// 添加元数据 |
|||
List<Document> enrichedDocuments = myKeywordEnricher.enrichDocuments(splitDocuments); |
|||
|
|||
// 转成向量并存入数据库 |
|||
pgVectorVectorStore.add(enrichedDocuments); |
|||
|
|||
return ResponseEntity.ok(Map.of( |
|||
"success", true, |
|||
"message", "Markdown文件上传并向量化成功", |
|||
"documentCount", enrichedDocuments.size() |
|||
)); |
|||
} catch (Exception e) { |
|||
return ResponseEntity.status(500).body(Map.of( |
|||
"success", false, |
|||
"message", "上传失败:" + e.getMessage() |
|||
)); |
|||
} |
|||
} |
|||
|
|||
/** |
|||
* 上传 JSON 文件(基本方式) |
|||
* 把 JSON 根节点当成一个整体文档 |
|||
*/ |
|||
@PostMapping("/upload/json/basic") |
|||
public ResponseEntity<Map<String, Object>> uploadJsonBasic(@RequestParam("file") MultipartFile file) { |
|||
try { |
|||
List<Document> documents = jsonDocumentLoader.loadBasicJson(file); |
|||
|
|||
// 拆分文档 |
|||
List<Document> splitDocuments = myTokenTextSplitter.splitDocuments(documents); |
|||
|
|||
// 添加元数据 |
|||
List<Document> enrichedDocuments = myKeywordEnricher.enrichDocuments(splitDocuments); |
|||
|
|||
// 转成向量并存入数据库 |
|||
pgVectorVectorStore.add(enrichedDocuments); |
|||
|
|||
return ResponseEntity.ok(Map.of( |
|||
"success", true, |
|||
"message", "JSON文件(基本方式)上传并向量化成功", |
|||
"documentCount", enrichedDocuments.size() |
|||
)); |
|||
} catch (Exception e) { |
|||
return ResponseEntity.status(500).body(Map.of( |
|||
"success", false, |
|||
"message", "上传失败:" + e.getMessage() |
|||
)); |
|||
} |
|||
} |
|||
|
|||
/** |
|||
* 上传 JSON 文件(按字段提取) |
|||
* 用于提取指定字段文本 |
|||
*/ |
|||
@PostMapping("/upload/json/fields") |
|||
public ResponseEntity<Map<String, Object>> uploadJsonWithFields( |
|||
@RequestParam("file") MultipartFile file, |
|||
@RequestParam("fields") List<String> fields) { |
|||
try { |
|||
List<Document> documents = jsonDocumentLoader.loadJsonByFields(file, fields.toArray(new String[0])); |
|||
|
|||
// 拆分文档 |
|||
List<Document> splitDocuments = myTokenTextSplitter.splitDocuments(documents); |
|||
|
|||
// 添加元数据 |
|||
List<Document> enrichedDocuments = myKeywordEnricher.enrichDocuments(splitDocuments); |
|||
|
|||
// 转成向量并存入数据库 |
|||
pgVectorVectorStore.add(enrichedDocuments); |
|||
|
|||
return ResponseEntity.ok(Map.of( |
|||
"success", true, |
|||
"message", "JSON文件(按字段)上传并向量化成功", |
|||
"documentCount", enrichedDocuments.size(), |
|||
"extractedFields", fields |
|||
)); |
|||
} catch (Exception e) { |
|||
return ResponseEntity.status(500).body(Map.of( |
|||
"success", false, |
|||
"message", "上传失败:" + e.getMessage() |
|||
)); |
|||
} |
|||
} |
|||
|
|||
/** |
|||
* 上传 JSON 文件(按指针拆分) |
|||
* 用于拆分数组元素,常用来分段成多文档 |
|||
*/ |
|||
@PostMapping("/upload/json/pointer") |
|||
public ResponseEntity<Map<String, Object>> uploadJsonWithPointer( |
|||
@RequestParam("file") MultipartFile file, |
|||
@RequestParam("pointer") String pointer) { |
|||
try { |
|||
List<Document> documents = jsonDocumentLoader.loadJsonByPointer(file, pointer); |
|||
|
|||
// 拆分文档 |
|||
List<Document> splitDocuments = myTokenTextSplitter.splitDocuments(documents); |
|||
|
|||
// 添加元数据 |
|||
List<Document> enrichedDocuments = myKeywordEnricher.enrichDocuments(splitDocuments); |
|||
|
|||
// 转成向量并存入数据库 |
|||
pgVectorVectorStore.add(enrichedDocuments); |
|||
|
|||
return ResponseEntity.ok(Map.of( |
|||
"success", true, |
|||
"message", "JSON文件(按指针)上传并向量化成功", |
|||
"documentCount", enrichedDocuments.size(), |
|||
"pointer", pointer |
|||
)); |
|||
} catch (Exception e) { |
|||
return ResponseEntity.status(500).body(Map.of( |
|||
"success", false, |
|||
"message", "上传失败:" + e.getMessage() |
|||
)); |
|||
} |
|||
} |
|||
} |
|||
@ -0,0 +1,59 @@ |
|||
package com.wok.supportbot.document.extract; |
|||
|
|||
import lombok.extern.slf4j.Slf4j; |
|||
import org.springframework.ai.document.Document; |
|||
import org.springframework.ai.reader.JsonReader; |
|||
import org.springframework.core.io.FileSystemResource; |
|||
import org.springframework.core.io.Resource; |
|||
import org.springframework.stereotype.Component; |
|||
import org.springframework.web.multipart.MultipartFile; |
|||
|
|||
import java.io.File; |
|||
import java.io.IOException; |
|||
import java.util.List; |
|||
|
|||
@Component |
|||
@Slf4j |
|||
public class JsonDocumentLoader { |
|||
|
|||
/** |
|||
* 基本读取方式 |
|||
*/ |
|||
public List<Document> loadBasicJson(MultipartFile file) { |
|||
Resource resource = toResource(file); |
|||
JsonReader reader = new JsonReader(resource); |
|||
return reader.get(); |
|||
} |
|||
|
|||
/** |
|||
* 指定字段读取方式(例如 description、features 字段) |
|||
*/ |
|||
public List<Document> loadJsonByFields(MultipartFile file, String... fields) { |
|||
Resource resource = toResource(file); |
|||
JsonReader reader = new JsonReader(resource, fields); |
|||
return reader.get(); |
|||
} |
|||
|
|||
/** |
|||
* 使用 JSON Pointer 提取数组路径内容(如 /items) |
|||
*/ |
|||
public List<Document> loadJsonByPointer(MultipartFile file, String pointer) { |
|||
Resource resource = toResource(file); |
|||
JsonReader reader = new JsonReader(resource); |
|||
return reader.get(pointer); |
|||
} |
|||
|
|||
/** |
|||
* 将 MultipartFile 转换为 Resource |
|||
*/ |
|||
private Resource toResource(MultipartFile file) { |
|||
try { |
|||
File temp = File.createTempFile("upload-", file.getOriginalFilename()); |
|||
file.transferTo(temp); |
|||
return new FileSystemResource(temp); |
|||
} catch (IOException e) { |
|||
log.error("JSON 文件转换失败", e); |
|||
throw new RuntimeException("JSON 文件转换失败", e); |
|||
} |
|||
} |
|||
} |
|||
@ -1,37 +0,0 @@ |
|||
package com.wok.supportbot.document.extract; |
|||
|
|||
import org.springframework.ai.document.Document; |
|||
import org.springframework.ai.reader.JsonReader; |
|||
import org.springframework.beans.factory.annotation.Value; |
|||
import org.springframework.core.io.Resource; |
|||
import org.springframework.stereotype.Component; |
|||
|
|||
import java.util.List; |
|||
|
|||
// 从 classpath 下的 JSON 文件中读取文档 |
|||
@Component |
|||
class MyJsonReader { |
|||
private final Resource resource; |
|||
|
|||
MyJsonReader(@Value("classpath:products.json") Resource resource) { |
|||
this.resource = resource; |
|||
} |
|||
|
|||
// 基本用法 |
|||
List<Document> loadBasicJsonDocuments() { |
|||
JsonReader jsonReader = new JsonReader(this.resource); |
|||
return jsonReader.get(); |
|||
} |
|||
|
|||
// 指定使用哪些 JSON 字段作为文档内容 |
|||
List<Document> loadJsonWithSpecificFields() { |
|||
JsonReader jsonReader = new JsonReader(this.resource, "description", "features"); |
|||
return jsonReader.get(); |
|||
} |
|||
|
|||
// 使用 JSON 指针精确提取文档内容 |
|||
List<Document> loadJsonWithPointer() { |
|||
JsonReader jsonReader = new JsonReader(this.resource); |
|||
return jsonReader.get("/items"); // 提取 items 数组内的内容 |
|||
} |
|||
} |
|||
@ -0,0 +1,20 @@ |
|||
package com.wok.supportbot.document.extract; |
|||
|
|||
import org.springframework.ai.document.Document; |
|||
import org.springframework.stereotype.Component; |
|||
|
|||
import java.util.Collections; |
|||
import java.util.List; |
|||
import java.util.UUID; |
|||
|
|||
@Component |
|||
public class SimpleStringDocumentReader { |
|||
|
|||
public List<Document> read(String content) { |
|||
Document doc = Document.builder() |
|||
.id(UUID.randomUUID().toString()) |
|||
.text(content) |
|||
.build(); |
|||
return Collections.singletonList(doc); |
|||
} |
|||
} |
|||
@ -0,0 +1,44 @@ |
|||
package com.wok.supportbot.document.extract; |
|||
|
|||
import lombok.extern.slf4j.Slf4j; |
|||
import org.apache.tika.Tika; |
|||
import org.apache.tika.exception.TikaException; |
|||
import org.springframework.ai.document.Document; |
|||
import org.springframework.core.io.Resource; |
|||
import org.springframework.core.io.FileSystemResource; |
|||
import org.springframework.stereotype.Component; |
|||
import org.springframework.web.multipart.MultipartFile; |
|||
|
|||
import java.io.File; |
|||
import java.io.IOException; |
|||
import java.util.Collections; |
|||
import java.util.List; |
|||
import java.util.UUID; |
|||
|
|||
@Component |
|||
@Slf4j |
|||
public class TikaDocumentReader { |
|||
|
|||
public List<Document> read(MultipartFile file) { |
|||
try { |
|||
// MultipartFile 转 Resource |
|||
File tempFile = File.createTempFile("upload-", file.getOriginalFilename()); |
|||
file.transferTo(tempFile); |
|||
Resource resource = new FileSystemResource(tempFile); |
|||
|
|||
Tika tika = new Tika(); |
|||
String text = tika.parseToString(resource.getInputStream()); |
|||
|
|||
Document doc = Document.builder() |
|||
.id(UUID.randomUUID().toString()) |
|||
.text(text) |
|||
.build(); |
|||
|
|||
return Collections.singletonList(doc); |
|||
|
|||
} catch (IOException | TikaException e) { |
|||
log.error("Tika 文件解析失败", e); |
|||
throw new RuntimeException("Tika 文件解析失败", e); |
|||
} |
|||
} |
|||
} |
|||
Write
Preview
Loading…
Cancel
Save
Reference in new issue