9 changed files with 387 additions and 68 deletions
-
6pom.xml
-
3src/main/java/com/wok/supportbot/controller/AiController.java
-
232src/main/java/com/wok/supportbot/controller/DocumentController.java
-
59src/main/java/com/wok/supportbot/document/extract/JsonDocumentLoader.java
-
44src/main/java/com/wok/supportbot/document/extract/MarkdownDocumentLoader.java
-
37src/main/java/com/wok/supportbot/document/extract/MyJsonReader.java
-
20src/main/java/com/wok/supportbot/document/extract/SimpleStringDocumentReader.java
-
44src/main/java/com/wok/supportbot/document/extract/TikaDocumentReader.java
-
2src/main/java/com/wok/supportbot/document/transform/MyTokenTextSplitter.java
@ -0,0 +1,232 @@ |
|||||
|
package com.wok.supportbot.controller; |
||||
|
|
||||
|
import com.wok.supportbot.document.extract.JsonDocumentLoader; |
||||
|
import com.wok.supportbot.document.extract.MarkdownDocumentLoader; |
||||
|
import com.wok.supportbot.document.extract.SimpleStringDocumentReader; |
||||
|
import com.wok.supportbot.document.extract.TikaDocumentReader; |
||||
|
import com.wok.supportbot.document.transform.MyKeywordEnricher; |
||||
|
import com.wok.supportbot.document.transform.MyTokenTextSplitter; |
||||
|
import org.springframework.ai.document.Document; |
||||
|
import org.springframework.ai.vectorstore.VectorStore; |
||||
|
import org.springframework.beans.factory.annotation.Autowired; |
||||
|
import org.springframework.http.ResponseEntity; |
||||
|
import org.springframework.web.bind.annotation.*; |
||||
|
import org.springframework.web.multipart.MultipartFile; |
||||
|
|
||||
|
import java.util.List; |
||||
|
import java.util.Map; |
||||
|
|
||||
|
@RestController |
||||
|
@RequestMapping("/document") |
||||
|
public class DocumentController { |
||||
|
|
||||
|
@Autowired |
||||
|
private TikaDocumentReader tikaDocumentReader; |
||||
|
|
||||
|
@Autowired |
||||
|
private SimpleStringDocumentReader simpleStringDocumentReader; |
||||
|
|
||||
|
@Autowired |
||||
|
private MarkdownDocumentLoader markdownDocumentLoader; |
||||
|
|
||||
|
@Autowired |
||||
|
private JsonDocumentLoader jsonDocumentLoader; |
||||
|
|
||||
|
@Autowired |
||||
|
private MyTokenTextSplitter myTokenTextSplitter; |
||||
|
|
||||
|
@Autowired |
||||
|
private MyKeywordEnricher myKeywordEnricher; |
||||
|
|
||||
|
@Autowired |
||||
|
private VectorStore pgVectorVectorStore; |
||||
|
|
||||
|
/** |
||||
|
* 上传普通文件(支持多种格式),用 Tika 解析 |
||||
|
*/ |
||||
|
@PostMapping("/upload/file") |
||||
|
public ResponseEntity<Map<String, Object>> uploadFile(@RequestParam("file") MultipartFile file) { |
||||
|
try { |
||||
|
List<Document> documents = tikaDocumentReader.read(file); |
||||
|
|
||||
|
// 拆分文档 |
||||
|
List<Document> splitDocuments = myTokenTextSplitter.splitDocuments(documents); |
||||
|
|
||||
|
// 添加元数据 |
||||
|
List<Document> enrichedDocuments = myKeywordEnricher.enrichDocuments(splitDocuments); |
||||
|
|
||||
|
// 转成向量并存入数据库 |
||||
|
pgVectorVectorStore.add(enrichedDocuments); |
||||
|
|
||||
|
return ResponseEntity.ok(Map.of( |
||||
|
"success", true, |
||||
|
"message", "文件上传并向量化成功", |
||||
|
"documentCount", enrichedDocuments.size() |
||||
|
)); |
||||
|
} catch (Exception e) { |
||||
|
return ResponseEntity.status(500).body(Map.of( |
||||
|
"success", false, |
||||
|
"message", "上传失败:" + e.getMessage() |
||||
|
)); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
/** |
||||
|
* 上传字符串内容 |
||||
|
*/ |
||||
|
@PostMapping("/upload/string") |
||||
|
public ResponseEntity<Map<String, Object>> uploadString(@RequestBody String content) { |
||||
|
try { |
||||
|
List<Document> documents = simpleStringDocumentReader.read(content); |
||||
|
|
||||
|
// 拆分文档 |
||||
|
List<Document> splitDocuments = myTokenTextSplitter.splitDocuments(documents); |
||||
|
|
||||
|
// 添加元数据 |
||||
|
List<Document> enrichedDocuments = myKeywordEnricher.enrichDocuments(splitDocuments); |
||||
|
|
||||
|
// 转成向量并存入数据库 |
||||
|
pgVectorVectorStore.add(enrichedDocuments); |
||||
|
|
||||
|
return ResponseEntity.ok(Map.of( |
||||
|
"success", true, |
||||
|
"message", "文本内容上传并向量化成功", |
||||
|
"documentCount", enrichedDocuments.size() |
||||
|
)); |
||||
|
} catch (Exception e) { |
||||
|
return ResponseEntity.status(500).body(Map.of( |
||||
|
"success", false, |
||||
|
"message", "上传失败:" + e.getMessage() |
||||
|
)); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
/** |
||||
|
* 上传 Markdown 文件 |
||||
|
*/ |
||||
|
@PostMapping("/upload/markdown") |
||||
|
public ResponseEntity<Map<String, Object>> uploadMarkdown(@RequestParam("file") MultipartFile file) { |
||||
|
try { |
||||
|
List<Document> documents = markdownDocumentLoader.loadMarkdownFromFile(file); |
||||
|
|
||||
|
// 拆分文档 |
||||
|
List<Document> splitDocuments = myTokenTextSplitter.splitDocuments(documents); |
||||
|
|
||||
|
// 添加元数据 |
||||
|
List<Document> enrichedDocuments = myKeywordEnricher.enrichDocuments(splitDocuments); |
||||
|
|
||||
|
// 转成向量并存入数据库 |
||||
|
pgVectorVectorStore.add(enrichedDocuments); |
||||
|
|
||||
|
return ResponseEntity.ok(Map.of( |
||||
|
"success", true, |
||||
|
"message", "Markdown文件上传并向量化成功", |
||||
|
"documentCount", enrichedDocuments.size() |
||||
|
)); |
||||
|
} catch (Exception e) { |
||||
|
return ResponseEntity.status(500).body(Map.of( |
||||
|
"success", false, |
||||
|
"message", "上传失败:" + e.getMessage() |
||||
|
)); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
/** |
||||
|
* 上传 JSON 文件(基本方式) |
||||
|
* 把 JSON 根节点当成一个整体文档 |
||||
|
*/ |
||||
|
@PostMapping("/upload/json/basic") |
||||
|
public ResponseEntity<Map<String, Object>> uploadJsonBasic(@RequestParam("file") MultipartFile file) { |
||||
|
try { |
||||
|
List<Document> documents = jsonDocumentLoader.loadBasicJson(file); |
||||
|
|
||||
|
// 拆分文档 |
||||
|
List<Document> splitDocuments = myTokenTextSplitter.splitDocuments(documents); |
||||
|
|
||||
|
// 添加元数据 |
||||
|
List<Document> enrichedDocuments = myKeywordEnricher.enrichDocuments(splitDocuments); |
||||
|
|
||||
|
// 转成向量并存入数据库 |
||||
|
pgVectorVectorStore.add(enrichedDocuments); |
||||
|
|
||||
|
return ResponseEntity.ok(Map.of( |
||||
|
"success", true, |
||||
|
"message", "JSON文件(基本方式)上传并向量化成功", |
||||
|
"documentCount", enrichedDocuments.size() |
||||
|
)); |
||||
|
} catch (Exception e) { |
||||
|
return ResponseEntity.status(500).body(Map.of( |
||||
|
"success", false, |
||||
|
"message", "上传失败:" + e.getMessage() |
||||
|
)); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
/** |
||||
|
* 上传 JSON 文件(按字段提取) |
||||
|
* 用于提取指定字段文本 |
||||
|
*/ |
||||
|
@PostMapping("/upload/json/fields") |
||||
|
public ResponseEntity<Map<String, Object>> uploadJsonWithFields( |
||||
|
@RequestParam("file") MultipartFile file, |
||||
|
@RequestParam("fields") List<String> fields) { |
||||
|
try { |
||||
|
List<Document> documents = jsonDocumentLoader.loadJsonByFields(file, fields.toArray(new String[0])); |
||||
|
|
||||
|
// 拆分文档 |
||||
|
List<Document> splitDocuments = myTokenTextSplitter.splitDocuments(documents); |
||||
|
|
||||
|
// 添加元数据 |
||||
|
List<Document> enrichedDocuments = myKeywordEnricher.enrichDocuments(splitDocuments); |
||||
|
|
||||
|
// 转成向量并存入数据库 |
||||
|
pgVectorVectorStore.add(enrichedDocuments); |
||||
|
|
||||
|
return ResponseEntity.ok(Map.of( |
||||
|
"success", true, |
||||
|
"message", "JSON文件(按字段)上传并向量化成功", |
||||
|
"documentCount", enrichedDocuments.size(), |
||||
|
"extractedFields", fields |
||||
|
)); |
||||
|
} catch (Exception e) { |
||||
|
return ResponseEntity.status(500).body(Map.of( |
||||
|
"success", false, |
||||
|
"message", "上传失败:" + e.getMessage() |
||||
|
)); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
/** |
||||
|
* 上传 JSON 文件(按指针拆分) |
||||
|
* 用于拆分数组元素,常用来分段成多文档 |
||||
|
*/ |
||||
|
@PostMapping("/upload/json/pointer") |
||||
|
public ResponseEntity<Map<String, Object>> uploadJsonWithPointer( |
||||
|
@RequestParam("file") MultipartFile file, |
||||
|
@RequestParam("pointer") String pointer) { |
||||
|
try { |
||||
|
List<Document> documents = jsonDocumentLoader.loadJsonByPointer(file, pointer); |
||||
|
|
||||
|
// 拆分文档 |
||||
|
List<Document> splitDocuments = myTokenTextSplitter.splitDocuments(documents); |
||||
|
|
||||
|
// 添加元数据 |
||||
|
List<Document> enrichedDocuments = myKeywordEnricher.enrichDocuments(splitDocuments); |
||||
|
|
||||
|
// 转成向量并存入数据库 |
||||
|
pgVectorVectorStore.add(enrichedDocuments); |
||||
|
|
||||
|
return ResponseEntity.ok(Map.of( |
||||
|
"success", true, |
||||
|
"message", "JSON文件(按指针)上传并向量化成功", |
||||
|
"documentCount", enrichedDocuments.size(), |
||||
|
"pointer", pointer |
||||
|
)); |
||||
|
} catch (Exception e) { |
||||
|
return ResponseEntity.status(500).body(Map.of( |
||||
|
"success", false, |
||||
|
"message", "上传失败:" + e.getMessage() |
||||
|
)); |
||||
|
} |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,59 @@ |
|||||
|
package com.wok.supportbot.document.extract; |
||||
|
|
||||
|
import lombok.extern.slf4j.Slf4j; |
||||
|
import org.springframework.ai.document.Document; |
||||
|
import org.springframework.ai.reader.JsonReader; |
||||
|
import org.springframework.core.io.FileSystemResource; |
||||
|
import org.springframework.core.io.Resource; |
||||
|
import org.springframework.stereotype.Component; |
||||
|
import org.springframework.web.multipart.MultipartFile; |
||||
|
|
||||
|
import java.io.File; |
||||
|
import java.io.IOException; |
||||
|
import java.util.List; |
||||
|
|
||||
|
@Component |
||||
|
@Slf4j |
||||
|
public class JsonDocumentLoader { |
||||
|
|
||||
|
/** |
||||
|
* 基本读取方式 |
||||
|
*/ |
||||
|
public List<Document> loadBasicJson(MultipartFile file) { |
||||
|
Resource resource = toResource(file); |
||||
|
JsonReader reader = new JsonReader(resource); |
||||
|
return reader.get(); |
||||
|
} |
||||
|
|
||||
|
/** |
||||
|
* 指定字段读取方式(例如 description、features 字段) |
||||
|
*/ |
||||
|
public List<Document> loadJsonByFields(MultipartFile file, String... fields) { |
||||
|
Resource resource = toResource(file); |
||||
|
JsonReader reader = new JsonReader(resource, fields); |
||||
|
return reader.get(); |
||||
|
} |
||||
|
|
||||
|
/** |
||||
|
* 使用 JSON Pointer 提取数组路径内容(如 /items) |
||||
|
*/ |
||||
|
public List<Document> loadJsonByPointer(MultipartFile file, String pointer) { |
||||
|
Resource resource = toResource(file); |
||||
|
JsonReader reader = new JsonReader(resource); |
||||
|
return reader.get(pointer); |
||||
|
} |
||||
|
|
||||
|
/** |
||||
|
* 将 MultipartFile 转换为 Resource |
||||
|
*/ |
||||
|
private Resource toResource(MultipartFile file) { |
||||
|
try { |
||||
|
File temp = File.createTempFile("upload-", file.getOriginalFilename()); |
||||
|
file.transferTo(temp); |
||||
|
return new FileSystemResource(temp); |
||||
|
} catch (IOException e) { |
||||
|
log.error("JSON 文件转换失败", e); |
||||
|
throw new RuntimeException("JSON 文件转换失败", e); |
||||
|
} |
||||
|
} |
||||
|
} |
||||
@ -1,37 +0,0 @@ |
|||||
package com.wok.supportbot.document.extract; |
|
||||
|
|
||||
import org.springframework.ai.document.Document; |
|
||||
import org.springframework.ai.reader.JsonReader; |
|
||||
import org.springframework.beans.factory.annotation.Value; |
|
||||
import org.springframework.core.io.Resource; |
|
||||
import org.springframework.stereotype.Component; |
|
||||
|
|
||||
import java.util.List; |
|
||||
|
|
||||
// 从 classpath 下的 JSON 文件中读取文档 |
|
||||
@Component |
|
||||
class MyJsonReader { |
|
||||
private final Resource resource; |
|
||||
|
|
||||
MyJsonReader(@Value("classpath:products.json") Resource resource) { |
|
||||
this.resource = resource; |
|
||||
} |
|
||||
|
|
||||
// 基本用法 |
|
||||
List<Document> loadBasicJsonDocuments() { |
|
||||
JsonReader jsonReader = new JsonReader(this.resource); |
|
||||
return jsonReader.get(); |
|
||||
} |
|
||||
|
|
||||
// 指定使用哪些 JSON 字段作为文档内容 |
|
||||
List<Document> loadJsonWithSpecificFields() { |
|
||||
JsonReader jsonReader = new JsonReader(this.resource, "description", "features"); |
|
||||
return jsonReader.get(); |
|
||||
} |
|
||||
|
|
||||
// 使用 JSON 指针精确提取文档内容 |
|
||||
List<Document> loadJsonWithPointer() { |
|
||||
JsonReader jsonReader = new JsonReader(this.resource); |
|
||||
return jsonReader.get("/items"); // 提取 items 数组内的内容 |
|
||||
} |
|
||||
} |
|
||||
@ -0,0 +1,20 @@ |
|||||
|
package com.wok.supportbot.document.extract; |
||||
|
|
||||
|
import org.springframework.ai.document.Document; |
||||
|
import org.springframework.stereotype.Component; |
||||
|
|
||||
|
import java.util.Collections; |
||||
|
import java.util.List; |
||||
|
import java.util.UUID; |
||||
|
|
||||
|
@Component |
||||
|
public class SimpleStringDocumentReader { |
||||
|
|
||||
|
public List<Document> read(String content) { |
||||
|
Document doc = Document.builder() |
||||
|
.id(UUID.randomUUID().toString()) |
||||
|
.text(content) |
||||
|
.build(); |
||||
|
return Collections.singletonList(doc); |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,44 @@ |
|||||
|
package com.wok.supportbot.document.extract; |
||||
|
|
||||
|
import lombok.extern.slf4j.Slf4j; |
||||
|
import org.apache.tika.Tika; |
||||
|
import org.apache.tika.exception.TikaException; |
||||
|
import org.springframework.ai.document.Document; |
||||
|
import org.springframework.core.io.Resource; |
||||
|
import org.springframework.core.io.FileSystemResource; |
||||
|
import org.springframework.stereotype.Component; |
||||
|
import org.springframework.web.multipart.MultipartFile; |
||||
|
|
||||
|
import java.io.File; |
||||
|
import java.io.IOException; |
||||
|
import java.util.Collections; |
||||
|
import java.util.List; |
||||
|
import java.util.UUID; |
||||
|
|
||||
|
@Component |
||||
|
@Slf4j |
||||
|
public class TikaDocumentReader { |
||||
|
|
||||
|
public List<Document> read(MultipartFile file) { |
||||
|
try { |
||||
|
// MultipartFile 转 Resource |
||||
|
File tempFile = File.createTempFile("upload-", file.getOriginalFilename()); |
||||
|
file.transferTo(tempFile); |
||||
|
Resource resource = new FileSystemResource(tempFile); |
||||
|
|
||||
|
Tika tika = new Tika(); |
||||
|
String text = tika.parseToString(resource.getInputStream()); |
||||
|
|
||||
|
Document doc = Document.builder() |
||||
|
.id(UUID.randomUUID().toString()) |
||||
|
.text(text) |
||||
|
.build(); |
||||
|
|
||||
|
return Collections.singletonList(doc); |
||||
|
|
||||
|
} catch (IOException | TikaException e) { |
||||
|
log.error("Tika 文件解析失败", e); |
||||
|
throw new RuntimeException("Tika 文件解析失败", e); |
||||
|
} |
||||
|
} |
||||
|
} |
||||
Write
Preview
Loading…
Cancel
Save
Reference in new issue