本地 RAG 知识库
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 
 

630 lines
24 KiB

package com.wok.supportbot.service;
import com.baomidou.mybatisplus.core.conditions.query.QueryWrapper;
import com.wok.supportbot.dao.KnowledgeCategoryMapper;
import com.wok.supportbot.dao.KnowledgeDocumentMapper;
import com.wok.supportbot.document.extract.JsonDocumentLoader;
import com.wok.supportbot.document.extract.MarkdownDocumentLoader;
import com.wok.supportbot.document.extract.SimpleStringDocumentReader;
import com.wok.supportbot.document.extract.TikaDocumentReader;
import com.wok.supportbot.document.transform.MyKeywordEnricher;
import com.wok.supportbot.document.transform.MyTokenTextSplitter;
import com.wok.supportbot.entity.CategoryNode;
import com.wok.supportbot.entity.KnowledgeCategory;
import com.wok.supportbot.entity.KnowledgeDocument;
import com.wok.supportbot.entity.SearchResult;
import lombok.extern.slf4j.Slf4j;
import org.springframework.ai.document.Document;
import org.springframework.ai.vectorstore.SearchRequest;
import org.springframework.ai.vectorstore.VectorStore;
import org.springframework.ai.vectorstore.filter.Filter;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.jdbc.core.JdbcTemplate;
import org.springframework.stereotype.Service;
import org.springframework.transaction.annotation.Transactional;
import org.springframework.web.multipart.MultipartFile;
import java.util.*;
import java.util.stream.Collectors;
/**
* 知识库文档管理服务
* 统一管理文档的上传、删除、搜索、统计、分类等操作
*/
@Service
@Slf4j
public class DocumentService {
// Mapper used to insert, select, count, update and delete KnowledgeDocument records.
@Autowired
private KnowledgeDocumentMapper documentMapper;
// Mapper used to list, insert, update and delete KnowledgeCategory records.
@Autowired
private KnowledgeCategoryMapper categoryMapper;
// Used for the raw SQL queries against vector_store and the knowledge_* tables.
@Autowired
private JdbcTemplate jdbcTemplate;
// Vector store that chunk documents are added to, deleted from and searched in.
@Autowired
private VectorStore pgVectorVectorStore;
// Splits parsed documents into chunks before they are stored.
@Autowired
private MyTokenTextSplitter myTokenTextSplitter;
// Enriches the chunks after splitting (keyword step of the upload pipeline).
@Autowired
private MyKeywordEnricher myKeywordEnricher;
// Reader used by uploadFile to parse an uploaded MultipartFile.
@Autowired
private TikaDocumentReader tikaDocumentReader;
// Reader used to turn a plain string into documents (uploadString, reprocessDocument).
@Autowired
private SimpleStringDocumentReader simpleStringDocumentReader;
// Loader used by uploadMarkdown.
@Autowired
private MarkdownDocumentLoader markdownDocumentLoader;
// Loader used by the uploadJson* methods (basic, by fields, by pointer).
@Autowired
private JsonDocumentLoader jsonDocumentLoader;
// ==================== 文档上传 ====================
/**
 * Runs the shared ingestion pipeline for already-parsed documents: inserts a
 * {@code PROCESSING} record, splits the documents into chunks, tags every chunk with
 * ownership metadata, enriches the chunks with keywords, adds them to the vector store
 * and finally marks the record {@code READY}.
 *
 * <p>NOTE(review): the {@code FAILED} update in the catch block runs inside the same
 * {@code @Transactional} scope that the rethrown exception rolls back, so the failure
 * status is presumably not persisted when a transaction is active — confirm.
 *
 * @param documents parsed documents to ingest
 * @param title document title; {@code sourceName} is used when {@code null}
 * @param sourceName name of the source file
 * @param fileType file type label stored on the record
 * @param fileSize file size; stored as 0 when {@code null}
 * @param content original text; only the first 2000 characters are stored as a preview
 * @param categoryId category id; stored as 0 when {@code null}
 * @param tags tag list, may be {@code null}
 * @return the document record after it has been marked {@code READY}
 * @throws RuntimeException wrapping any failure raised while chunking, enriching or storing
 */
@Transactional(rollbackFor = Exception.class)
public KnowledgeDocument uploadDocument(List<Document> documents, String title, String sourceName,
        String fileType, Long fileSize, String content,
        Long categoryId, List<String> tags) {
    // Resolve the values that are reused below exactly once.
    final String effectiveTitle = (title == null) ? sourceName : title;
    String preview = content;
    if (content != null && content.length() > 2000) {
        preview = content.substring(0, 2000);
    }

    // Step 1: persist the record in PROCESSING state so it gets an id.
    KnowledgeDocument record = KnowledgeDocument.builder()
            .title(effectiveTitle)
            .sourceName(sourceName)
            .fileType(fileType)
            .fileSize(fileSize != null ? fileSize : 0L)
            .content(preview)
            .categoryId(categoryId != null ? categoryId : 0L)
            .tags(tags != null ? Map.of("tags", tags) : null)
            .status("PROCESSING")
            .chunkCount(0)
            .build();
    documentMapper.insert(record);

    try {
        // Step 2: split into chunks.
        List<Document> chunks = myTokenTextSplitter.splitDocuments(documents);

        // Step 3: replace each chunk in place with a copy carrying the ownership metadata.
        ListIterator<Document> cursor = chunks.listIterator();
        while (cursor.hasNext()) {
            int position = cursor.nextIndex();
            Document chunk = cursor.next();
            Map<String, Object> chunkMeta = new HashMap<>(chunk.getMetadata());
            chunkMeta.put("documentId", String.valueOf(record.getId()));
            chunkMeta.put("chunkIndex", position);
            chunkMeta.put("sourceName", sourceName);
            chunkMeta.put("title", effectiveTitle);
            if (categoryId != null) {
                chunkMeta.put("categoryId", String.valueOf(categoryId));
            }
            if (tags != null && !tags.isEmpty()) {
                chunkMeta.put("tags", tags);
            }
            cursor.set(new Document(chunk.getId(), chunk.getText(), chunkMeta));
        }

        // Step 4: keyword enrichment.
        List<Document> enriched = myKeywordEnricher.enrichDocuments(chunks);

        // Step 5: add the enriched chunks to the vector store.
        pgVectorVectorStore.add(enriched);

        // Step 6: mark the record READY and store the chunk count.
        record.setStatus("READY");
        record.setChunkCount(enriched.size());
        documentMapper.updateById(record);
        log.info("文档上传成功: id={}, title={}, chunks={}", record.getId(), record.getTitle(), enriched.size());
    } catch (Exception e) {
        // Record the failure on the document, then surface it to the caller.
        record.setStatus("FAILED");
        record.setErrorMessage(e.getMessage());
        documentMapper.updateById(record);
        log.error("文档上传失败: id={}, title={}", record.getId(), record.getTitle(), e);
        throw new RuntimeException("文档处理失败: " + e.getMessage(), e);
    }
    return record;
}
/**
 * Parses an uploaded file with the Tika reader and runs it through the upload pipeline.
 *
 * <p>The stored content is built by joining the text of every parsed document with
 * newlines, the same way the other upload methods in this class do. This also avoids the
 * {@code IndexOutOfBoundsException} that reading only the first element raised when the
 * reader returned an empty list.
 *
 * @param file uploaded file
 * @param title document title; the original file name is used when {@code null}
 * @param categoryId category id, may be {@code null}
 * @param tags tag list, may be {@code null}
 * @return the document record returned by {@code uploadDocument}
 */
public KnowledgeDocument uploadFile(MultipartFile file, String title, Long categoryId, List<String> tags) {
    List<Document> documents = tikaDocumentReader.read(file);
    String originalName = file.getOriginalFilename();
    String fileType = getFileExtension(originalName);
    // Join all parsed parts instead of taking documents.get(0) only.
    String content = documents.stream().map(Document::getText).collect(Collectors.joining("\n"));
    return uploadDocument(documents,
            title != null ? title : originalName,
            originalName,
            fileType,
            file.getSize(),
            content,
            categoryId,
            tags);
}
/**
 * Parses a raw string into documents and runs it through the upload pipeline.
 *
 * <p>The title doubles as the source name, the file type is fixed to {@code "txt"} and the
 * reported size is the character count of {@code content}.
 *
 * @param content text to ingest
 * @param title document title, also used as the source name
 * @param categoryId category id, may be {@code null}
 * @param tags tag list, may be {@code null}
 * @return the document record returned by {@code uploadDocument}
 */
public KnowledgeDocument uploadString(String content, String title, Long categoryId, List<String> tags) {
    final List<Document> parsed = simpleStringDocumentReader.read(content);
    final long charCount = content.length();
    return uploadDocument(parsed, title, title, "txt", charCount, content, categoryId, tags);
}
/**
 * Loads a Markdown file into documents and runs it through the upload pipeline.
 *
 * @param file uploaded Markdown file
 * @param title document title; the original file name is used when {@code null}
 * @param categoryId category id, may be {@code null}
 * @param tags tag list, may be {@code null}
 * @return the document record returned by {@code uploadDocument}
 */
public KnowledgeDocument uploadMarkdown(MultipartFile file, String title, Long categoryId, List<String> tags) {
    final List<Document> sections = markdownDocumentLoader.loadMarkdownFromFile(file);

    // Concatenate the text of every section, one per line.
    StringJoiner joined = new StringJoiner("\n");
    for (Document section : sections) {
        joined.add(section.getText());
    }

    final String originalName = file.getOriginalFilename();
    final String effectiveTitle = (title == null) ? originalName : title;
    return uploadDocument(sections, effectiveTitle, originalName, "md",
            file.getSize(), joined.toString(), categoryId, tags);
}
/**
 * Loads a JSON file with the basic loader and runs it through the upload pipeline.
 *
 * @param file uploaded JSON file
 * @param title document title; the original file name is used when {@code null}
 * @param categoryId category id, may be {@code null}
 * @param tags tag list, may be {@code null}
 * @return the document record returned by {@code uploadDocument}
 */
public KnowledgeDocument uploadJsonBasic(MultipartFile file, String title, Long categoryId, List<String> tags) {
    final List<Document> parsed = jsonDocumentLoader.loadBasicJson(file);

    // Newline-joined text of all parsed documents, used as the stored content.
    StringJoiner joined = new StringJoiner("\n");
    for (Document part : parsed) {
        joined.add(part.getText());
    }

    final String originalName = file.getOriginalFilename();
    String effectiveTitle = title;
    if (effectiveTitle == null) {
        effectiveTitle = originalName;
    }
    return uploadDocument(parsed, effectiveTitle, originalName, "json",
            file.getSize(), joined.toString(), categoryId, tags);
}
/**
 * Loads a JSON file restricted to the given fields and runs it through the upload pipeline.
 *
 * @param file uploaded JSON file
 * @param fields names of the JSON fields passed to the loader
 * @param title document title; the original file name is used when {@code null}
 * @param categoryId category id, may be {@code null}
 * @param tags tag list, may be {@code null}
 * @return the document record returned by {@code uploadDocument}
 */
public KnowledgeDocument uploadJsonFields(MultipartFile file, List<String> fields, String title, Long categoryId, List<String> tags) {
    final String[] fieldNames = fields.toArray(new String[0]);
    final List<Document> parsed = jsonDocumentLoader.loadJsonByFields(file, fieldNames);

    // Newline-joined text of all parsed documents, used as the stored content.
    final String joinedText = parsed.stream()
            .map(Document::getText)
            .collect(Collectors.joining("\n"));

    final String originalName = file.getOriginalFilename();
    final String effectiveTitle = (title == null) ? originalName : title;
    return uploadDocument(parsed, effectiveTitle, originalName, "json",
            file.getSize(), joinedText, categoryId, tags);
}
/**
 * Loads the part of a JSON file addressed by a pointer and runs it through the upload
 * pipeline.
 *
 * @param file uploaded JSON file
 * @param pointer pointer expression passed to the loader
 * @param title document title; the original file name is used when {@code null}
 * @param categoryId category id, may be {@code null}
 * @param tags tag list, may be {@code null}
 * @return the document record returned by {@code uploadDocument}
 */
public KnowledgeDocument uploadJsonPointer(MultipartFile file, String pointer, String title, Long categoryId, List<String> tags) {
    final List<Document> parsed = jsonDocumentLoader.loadJsonByPointer(file, pointer);

    // Newline-joined text of all parsed documents, used as the stored content.
    StringJoiner joined = new StringJoiner("\n");
    for (Document part : parsed) {
        joined.add(part.getText());
    }

    final String originalName = file.getOriginalFilename();
    final String effectiveTitle = (title == null) ? originalName : title;
    return uploadDocument(parsed, effectiveTitle, originalName, "json",
            file.getSize(), joined.toString(), categoryId, tags);
}
// ==================== 文档管理 ====================
/**
 * Returns one page of documents (manual LIMIT/OFFSET paging), newest first.
 *
 * <p>{@code page} and {@code size} are clamped to a minimum of 1. Without the clamp a
 * non-positive {@code size} caused a division by zero in the page count and an invalid
 * LIMIT, and a {@code page} below 1 produced a negative OFFSET. The offset is computed in
 * {@code long} so large page numbers cannot overflow.
 *
 * @param page 1-based page number; values below 1 are treated as 1
 * @param size page size; values below 1 are treated as 1
 * @param categoryId optional category filter; ignored when {@code null} or not positive
 * @param status optional status filter; ignored when {@code null} or empty
 * @return map with {@code records}, {@code total}, {@code page}, {@code size} and
 *         {@code pages}; {@code page} and {@code size} are the clamped values
 */
public Map<String, Object> listDocuments(int page, int size, Long categoryId, String status) {
    int safePage = Math.max(page, 1);
    int safeSize = Math.max(size, 1);
    long offset = (long) (safePage - 1) * safeSize;

    // Count first, without ORDER BY / LIMIT.
    Long total = documentMapper.selectCount(buildDocumentFilter(categoryId, status));
    long totalCount = total != null ? total : 0L;

    // Same filter again for the page query, plus ordering and paging.
    // Only int/long values are concatenated here, so the fragment cannot carry injected SQL.
    QueryWrapper<KnowledgeDocument> listWrapper = buildDocumentFilter(categoryId, status);
    listWrapper.orderByDesc("create_time");
    listWrapper.last("LIMIT " + safeSize + " OFFSET " + offset);
    List<KnowledgeDocument> records = documentMapper.selectList(listWrapper);

    Map<String, Object> result = new HashMap<>();
    result.put("records", records);
    result.put("total", total);
    result.put("page", safePage);
    result.put("size", safeSize);
    result.put("pages", (totalCount + safeSize - 1) / safeSize);
    return result;
}

/** Builds the category/status filter shared by the count query and the page query. */
private QueryWrapper<KnowledgeDocument> buildDocumentFilter(Long categoryId, String status) {
    QueryWrapper<KnowledgeDocument> wrapper = new QueryWrapper<>();
    if (categoryId != null && categoryId > 0) {
        wrapper.eq("category_id", categoryId);
    }
    if (status != null && !status.isEmpty()) {
        wrapper.eq("status", status);
    }
    return wrapper;
}
/**
 * Looks up a single document record by id.
 *
 * @param id document id
 * @return whatever {@code documentMapper.selectById} returns for that id
 */
public KnowledgeDocument getDocumentDetail(Long id) {
    final KnowledgeDocument found = documentMapper.selectById(id);
    return found;
}
/**
 * Lists every chunk stored in {@code vector_store} for a document, ordered by the
 * {@code chunkIndex} metadata value cast to an integer.
 *
 * @param id document id; matched against the {@code documentId} metadata value as text
 * @return one map per row with the columns {@code id}, {@code content}, {@code metadata}
 *         and {@code create_time}
 */
public List<Map<String, Object>> getDocumentChunks(Long id) {
    final String chunkQuery =
            "SELECT id::text as id, content, metadata, create_time FROM vector_store WHERE metadata->>'documentId' = ? ORDER BY (metadata->>'chunkIndex')::int";
    // The metadata stores the document id as a string, so bind it as one.
    final String documentKey = String.valueOf(id);
    return jdbcTemplate.queryForList(chunkQuery, documentKey);
}
/**
 * Deletes a document together with the vectors that belong to it.
 *
 * <p>The vectors are removed first, then the record is deleted through
 * {@code documentMapper.deleteById} (presumably a logical delete, given the
 * {@code is_delete} column queried elsewhere in this class — confirm).
 *
 * @param id document id
 * @return number of vectors that were removed
 * @throws RuntimeException if no document with that id exists
 */
@Transactional(rollbackFor = Exception.class)
public int deleteDocument(Long id) {
    final KnowledgeDocument existing = Optional.ofNullable(documentMapper.selectById(id))
            .orElseThrow(() -> new RuntimeException("文档不存在"));

    // Vectors first, then the owning record.
    final int removedVectors = deleteVectorsByDocumentId(String.valueOf(id));
    documentMapper.deleteById(id);

    log.info("删除文档: id={}, title={}, 删除向量数={}", id, existing.getTitle(), removedVectors);
    return removedVectors;
}
/**
 * Re-runs chunking, enrichment and vector storage for an existing document, using the
 * text stored in the record's {@code content} column.
 *
 * <p>NOTE(review): records created through {@code uploadDocument} keep at most the first
 * 2000 characters in {@code content}, so this presumably re-indexes only that preview —
 * confirm this is intended.
 *
 * <p>NOTE(review): the {@code FAILED} update in the catch block runs inside the same
 * {@code @Transactional} scope that the rethrown exception rolls back — confirm the
 * failure status is actually persisted.
 *
 * @param id document id
 * @return the document record after it has been marked {@code READY}
 * @throws RuntimeException if the document does not exist, has no content, or processing fails
 */
@Transactional(rollbackFor = Exception.class)
public KnowledgeDocument reprocessDocument(Long id) {
    final KnowledgeDocument record = documentMapper.selectById(id);
    if (record == null) {
        throw new RuntimeException("文档不存在");
    }
    final String storedText = record.getContent();
    if (storedText == null || storedText.isEmpty()) {
        throw new RuntimeException("文档无内容,无法重新处理");
    }

    // Drop the vectors from the previous run before producing new ones.
    deleteVectorsByDocumentId(String.valueOf(id));

    // Re-parse the stored text and reset the record to PROCESSING.
    List<Document> parsed = simpleStringDocumentReader.read(storedText);
    record.setStatus("PROCESSING");
    record.setChunkCount(0);
    record.setErrorMessage(null);
    documentMapper.updateById(record);

    try {
        List<Document> chunks = myTokenTextSplitter.splitDocuments(parsed);

        // Replace each chunk in place with a copy carrying the ownership metadata.
        ListIterator<Document> cursor = chunks.listIterator();
        while (cursor.hasNext()) {
            int position = cursor.nextIndex();
            Document chunk = cursor.next();
            Map<String, Object> chunkMeta = new HashMap<>(chunk.getMetadata());
            chunkMeta.put("documentId", String.valueOf(record.getId()));
            chunkMeta.put("chunkIndex", position);
            chunkMeta.put("sourceName", record.getSourceName());
            chunkMeta.put("title", record.getTitle());
            if (record.getCategoryId() != null && record.getCategoryId() > 0) {
                chunkMeta.put("categoryId", String.valueOf(record.getCategoryId()));
            }
            if (record.getTags() != null && record.getTags().containsKey("tags")) {
                chunkMeta.put("tags", record.getTags().get("tags"));
            }
            cursor.set(new Document(chunk.getId(), chunk.getText(), chunkMeta));
        }

        List<Document> enriched = myKeywordEnricher.enrichDocuments(chunks);
        pgVectorVectorStore.add(enriched);

        record.setStatus("READY");
        record.setChunkCount(enriched.size());
        documentMapper.updateById(record);
        log.info("重新处理文档成功: id={}, title={}, chunks={}", record.getId(), record.getTitle(), enriched.size());
    } catch (Exception e) {
        // Record the failure on the document, then surface it to the caller.
        record.setStatus("FAILED");
        record.setErrorMessage(e.getMessage());
        documentMapper.updateById(record);
        log.error("重新处理文档失败: id={}, title={}", record.getId(), record.getTitle(), e);
        throw new RuntimeException("重新处理失败: " + e.getMessage(), e);
    }
    return record;
}
/**
 * Updates the title, category and/or tags of a document record.
 *
 * <p>Only the record itself is changed; the metadata already stored on the document's
 * chunks in {@code vector_store} is not touched by this method.
 *
 * @param id document id
 * @param title new title; ignored when {@code null} or empty
 * @param categoryId new category id; ignored when {@code null}
 * @param tags new tag list; ignored when {@code null}
 * @throws RuntimeException if no document with that id exists
 */
public void updateDocumentMetadata(Long id, String title, Long categoryId, List<String> tags) {
    final KnowledgeDocument record = Optional.ofNullable(documentMapper.selectById(id))
            .orElseThrow(() -> new RuntimeException("文档不存在"));

    // Apply only the fields the caller actually supplied.
    boolean hasTitle = !(title == null || title.isEmpty());
    if (hasTitle) {
        record.setTitle(title);
    }
    if (categoryId != null) {
        record.setCategoryId(categoryId);
    }
    if (tags != null) {
        record.setTags(Map.of("tags", tags));
    }
    documentMapper.updateById(record);

    // The chunk metadata in vector_store is intentionally left as-is here; syncing it is
    // deferred to a later optimisation.
    log.info("更新文档元信息: id={}, title={}", id, record.getTitle());
}
// ==================== 语义搜索 ====================
/**
 * Runs a similarity search against the vector store and maps the hits to
 * {@link SearchResult} objects.
 *
 * <p>{@code categoryId} is accepted but not applied: no category filter is added to the
 * search request.
 *
 * <p>NOTE(review): the {@code score} field is filled from the {@code distance} metadata
 * value — confirm callers expect a distance rather than a similarity.
 *
 * @param query search text
 * @param topK maximum number of hits requested
 * @param similarityThreshold minimum similarity passed to the search request
 * @param categoryId currently unused
 * @return one result per hit, in the order returned by the vector store
 */
public List<SearchResult> searchDocuments(String query, int topK, double similarityThreshold, Long categoryId) {
    SearchRequest.Builder searchBuilder = SearchRequest.builder()
            .query(query)
            .topK(topK)
            .similarityThreshold(similarityThreshold);

    List<Document> hits = pgVectorVectorStore.similaritySearch(searchBuilder.build());
    List<SearchResult> searchResults = new ArrayList<>(hits.size());
    for (Document hit : hits) {
        searchResults.add(toSearchResult(hit));
    }
    return searchResults;
}

/** Maps one vector-store hit to a SearchResult, reading the optional fields from its metadata. */
private SearchResult toSearchResult(Document hit) {
    Map<String, Object> metadata = hit.getMetadata();
    // A present-but-null or non-numeric "distance" yields a null score instead of an
    // NPE / ClassCastException.
    Object rawDistance = metadata.get("distance");
    Double score = rawDistance instanceof Number ? ((Number) rawDistance).doubleValue() : null;
    return SearchResult.builder()
            .id(hit.getId())
            .content(hit.getText())
            .score(score)
            .sourceName(getStringFromMetadata(metadata, "sourceName"))
            .title(getStringFromMetadata(metadata, "title"))
            .chunkIndex(getIntegerFromMetadata(metadata, "chunkIndex"))
            .documentId(getStringFromMetadata(metadata, "documentId"))
            .metadata(metadata)
            .build();
}
// ==================== 统计 ====================
/**
 * Collects knowledge-base statistics: document count, vector count, time of the most
 * recent upload, document counts per file type and per category.
 *
 * <p>Rows whose {@code file_type} is NULL are counted under {@code "unknown"} so the
 * result never contains a null map key. Failures of the per-category and vector-count
 * queries are logged and replaced with an empty list / 0; those two queries stay
 * best-effort.
 *
 * @return ordered map with the keys {@code totalDocuments}, {@code totalVectors},
 *         {@code lastUploadTime}, {@code byFileType} and {@code byCategory}
 */
public Map<String, Object> getStats() {
    // Total number of documents.
    Long totalDocuments = documentMapper.selectCount(null);

    // Documents per file type.
    String typeSql = "SELECT file_type, COUNT(*) as count FROM knowledge_document WHERE is_delete = false GROUP BY file_type";
    Map<String, Long> byFileType = new LinkedHashMap<>();
    for (Map<String, Object> row : jdbcTemplate.queryForList(typeSql)) {
        Object type = row.get("file_type");
        Object count = row.get("count");
        String typeKey = type != null ? type.toString() : "unknown";
        long typeCount = count instanceof Number ? ((Number) count).longValue() : 0L;
        // merge() sums into an existing bucket, e.g. when NULL rows fold into "unknown".
        byFileType.merge(typeKey, typeCount, Long::sum);
    }

    // Documents per category (best-effort).
    String catSql = "SELECT c.name, COUNT(d.id) as count FROM knowledge_document d " +
            "LEFT JOIN knowledge_category c ON d.category_id = c.id " +
            "WHERE d.is_delete = false GROUP BY c.name";
    List<Map<String, Object>> catStats;
    try {
        catStats = jdbcTemplate.queryForList(catSql);
    } catch (Exception e) {
        log.warn("按分类统计文档失败,返回空结果", e);
        catStats = new ArrayList<>();
    }

    // Total number of stored vectors (best-effort).
    String vectorSql = "SELECT COUNT(*) FROM vector_store";
    Long totalVectors;
    try {
        totalVectors = jdbcTemplate.queryForObject(vectorSql, Long.class);
    } catch (Exception e) {
        log.warn("统计向量总数失败,按 0 处理", e);
        totalVectors = 0L;
    }

    // Time of the most recent upload.
    String lastUploadSql = "SELECT MAX(create_time) FROM knowledge_document WHERE is_delete = false";
    Date lastUploadTime = jdbcTemplate.queryForObject(lastUploadSql, Date.class);

    Map<String, Object> stats = new LinkedHashMap<>();
    stats.put("totalDocuments", totalDocuments);
    stats.put("totalVectors", totalVectors);
    stats.put("lastUploadTime", lastUploadTime);
    stats.put("byFileType", byFileType);
    stats.put("byCategory", catStats);
    return stats;
}
// ==================== 分类管理 ====================
/**
 * Builds the category tree from all categories, ordered by {@code sort_order}.
 *
 * <p>A category is a root when its parent id is {@code null} or 0. A category whose
 * parent id does not match any loaded category is also returned as a root.
 *
 * @return the root nodes, each with its children attached
 */
public List<CategoryNode> getCategoryTree() {
    QueryWrapper<KnowledgeCategory> bySortOrder = new QueryWrapper<KnowledgeCategory>().orderByAsc("sort_order");
    List<KnowledgeCategory> allCategories = categoryMapper.selectList(bySortOrder);

    // Pass 1: one node per category, indexed by id in query order.
    Map<Long, CategoryNode> nodesById = new LinkedHashMap<>();
    for (KnowledgeCategory category : allCategories) {
        nodesById.put(category.getId(), CategoryNode.builder()
                .id(category.getId())
                .name(category.getName())
                .description(category.getDescription())
                .parentId(category.getParentId())
                .sortOrder(category.getSortOrder())
                .documentCount(category.getDocumentCount())
                .children(new ArrayList<>())
                .build());
    }

    // Pass 2: attach each node to its parent; roots and orphans go to the top level.
    List<CategoryNode> roots = new ArrayList<>();
    for (CategoryNode node : nodesById.values()) {
        boolean declaredRoot = node.getParentId() == null || node.getParentId() == 0;
        CategoryNode parent = declaredRoot ? null : nodesById.get(node.getParentId());
        if (parent == null) {
            roots.add(node);
        } else {
            parent.getChildren().add(node);
        }
    }
    return roots;
}
/**
 * Returns all categories as a flat list, ordered by {@code sort_order} ascending.
 *
 * @return the categories in sort order
 */
public List<KnowledgeCategory> listCategories() {
    QueryWrapper<KnowledgeCategory> bySortOrder = new QueryWrapper<>();
    bySortOrder.orderByAsc("sort_order");
    return categoryMapper.selectList(bySortOrder);
}
/**
 * Creates a category with a document count of 0.
 *
 * @param name category name
 * @param description category description
 * @param parentId parent category id; stored as 0 when {@code null}
 * @param sortOrder sort position; stored as 0 when {@code null}
 * @return the inserted category
 */
public KnowledgeCategory createCategory(String name, String description, Long parentId, Integer sortOrder) {
    // Fall back to 0 for the optional numeric fields.
    final long resolvedParent = (parentId == null) ? 0L : parentId;
    final int resolvedOrder = (sortOrder == null) ? 0 : sortOrder;

    KnowledgeCategory created = KnowledgeCategory.builder()
            .name(name)
            .description(description)
            .parentId(resolvedParent)
            .sortOrder(resolvedOrder)
            .documentCount(0)
            .build();
    categoryMapper.insert(created);
    return created;
}
/**
 * Updates the name, description and/or sort order of a category.
 *
 * @param id category id
 * @param name new name; ignored when {@code null} or empty
 * @param description new description; ignored when {@code null}
 * @param sortOrder new sort position; ignored when {@code null}
 * @throws RuntimeException if no category with that id exists
 */
public void updateCategory(Long id, String name, String description, Integer sortOrder) {
    final KnowledgeCategory existing = Optional.ofNullable(categoryMapper.selectById(id))
            .orElseThrow(() -> new RuntimeException("分类不存在"));

    // Apply only the fields the caller actually supplied.
    boolean hasName = !(name == null || name.isEmpty());
    if (hasName) {
        existing.setName(name);
    }
    if (description != null) {
        existing.setDescription(description);
    }
    if (sortOrder != null) {
        existing.setSortOrder(sortOrder);
    }
    categoryMapper.updateById(existing);
}
/**
 * Deletes a category without deleting its documents: documents that reference the
 * category get their {@code category_id} set to 0, then the category is deleted through
 * {@code categoryMapper.deleteById} (presumably a logical delete — confirm against the
 * entity mapping).
 *
 * @param id category id
 */
@Transactional(rollbackFor = Exception.class)
public void deleteCategory(Long id) {
    // Detach every document that still points at this category.
    QueryWrapper<KnowledgeDocument> inCategory = new QueryWrapper<>();
    inCategory.eq("category_id", id);
    KnowledgeDocument detachPatch = new KnowledgeDocument();
    detachPatch.setCategoryId(0L);
    documentMapper.update(detachPatch, inCategory);

    // Then remove the category itself.
    categoryMapper.deleteById(id);
}
// ==================== 内部方法 ====================
/**
 * Removes every {@code vector_store} entry whose {@code documentId} metadata value equals
 * the given id.
 *
 * @param documentId document id as stored in the chunk metadata
 * @return number of vector entries found for the document (0 when there were none)
 */
private int deleteVectorsByDocumentId(String documentId) {
    final List<String> vectorIds = jdbcTemplate.queryForList(
            "SELECT id::text FROM vector_store WHERE metadata->>'documentId' = ?",
            String.class,
            documentId);
    if (vectorIds.isEmpty()) {
        // Nothing stored for this document; skip the delete call entirely.
        return 0;
    }
    pgVectorVectorStore.delete(vectorIds);
    log.debug("删除向量: documentId={}, count={}", documentId, vectorIds.size());
    return vectorIds.size();
}
/**
 * Returns the lower-cased extension of a file name, or {@code "unknown"} when there is
 * none.
 *
 * <p>Only the part after the last {@code /} or {@code \} is inspected, so a dot inside a
 * directory name is not mistaken for an extension separator. A name that ends with a dot
 * has no extension. Lower-casing uses {@link Locale#ROOT} so the result does not depend
 * on the JVM's default locale.
 *
 * @param filename file name, possibly including a path; may be {@code null}
 * @return the extension without the dot, in lower case, or {@code "unknown"}
 */
private String getFileExtension(String filename) {
    if (filename == null) {
        return "unknown";
    }
    // Strip any directory part first (some clients send a full path).
    int lastSeparator = Math.max(filename.lastIndexOf('/'), filename.lastIndexOf('\\'));
    String baseName = filename.substring(lastSeparator + 1);

    int dot = baseName.lastIndexOf('.');
    if (dot < 0 || dot == baseName.length() - 1) {
        // No dot at all, or nothing after the last dot.
        return "unknown";
    }
    return baseName.substring(dot + 1).toLowerCase(Locale.ROOT);
}
/**
 * Reads a metadata value as a string.
 *
 * @param metadata metadata map to read from
 * @param key metadata key
 * @return the value's {@code toString()} result, or {@code null} when the key is absent
 *         or mapped to {@code null}
 */
private String getStringFromMetadata(Map<String, Object> metadata, String key) {
    // Objects.toString(value, null) yields null for a null value, value.toString() otherwise.
    return Objects.toString(metadata.get(key), null);
}
/**
 * Reads a metadata value as an integer.
 *
 * <p>Numeric values are converted with {@code intValue()}; any other value is parsed from
 * its string form.
 *
 * @param metadata metadata map to read from
 * @param key metadata key
 * @return the integer value, or {@code null} when the key is absent, mapped to
 *         {@code null}, or its string form is not a valid integer
 */
private Integer getIntegerFromMetadata(Map<String, Object> metadata, String key) {
    final Object raw = metadata.get(key);
    // instanceof is false for null, so the numeric fast path can be checked first.
    if (raw instanceof Number) {
        return ((Number) raw).intValue();
    }
    if (raw == null) {
        return null;
    }
    Integer parsed;
    try {
        parsed = Integer.valueOf(raw.toString());
    } catch (NumberFormatException notAnInteger) {
        parsed = null;
    }
    return parsed;
}
}