package com.wok.supportbot.service; import com.baomidou.mybatisplus.core.conditions.query.QueryWrapper; import com.wok.supportbot.dao.KnowledgeCategoryMapper; import com.wok.supportbot.dao.KnowledgeDocumentMapper; import com.wok.supportbot.document.extract.JsonDocumentLoader; import com.wok.supportbot.document.extract.MarkdownDocumentLoader; import com.wok.supportbot.document.extract.SimpleStringDocumentReader; import com.wok.supportbot.document.extract.TikaDocumentReader; import com.wok.supportbot.document.transform.MyKeywordEnricher; import com.wok.supportbot.document.transform.MyTokenTextSplitter; import com.wok.supportbot.entity.CategoryNode; import com.wok.supportbot.entity.KnowledgeCategory; import com.wok.supportbot.entity.KnowledgeDocument; import com.wok.supportbot.entity.SearchResult; import lombok.extern.slf4j.Slf4j; import org.springframework.ai.document.Document; import org.springframework.ai.vectorstore.SearchRequest; import org.springframework.ai.vectorstore.VectorStore; import org.springframework.ai.vectorstore.filter.Filter; import org.springframework.beans.factory.annotation.Autowired; import org.springframework.jdbc.core.JdbcTemplate; import org.springframework.stereotype.Service; import org.springframework.transaction.annotation.Transactional; import org.springframework.web.multipart.MultipartFile; import java.util.*; import java.util.stream.Collectors; /** * 知识库文档管理服务 * 统一管理文档的上传、删除、搜索、统计、分类等操作 */ @Service @Slf4j public class DocumentService { @Autowired private KnowledgeDocumentMapper documentMapper; @Autowired private KnowledgeCategoryMapper categoryMapper; @Autowired private JdbcTemplate jdbcTemplate; @Autowired private VectorStore pgVectorVectorStore; @Autowired private MyTokenTextSplitter myTokenTextSplitter; @Autowired private MyKeywordEnricher myKeywordEnricher; @Autowired private TikaDocumentReader tikaDocumentReader; @Autowired private SimpleStringDocumentReader simpleStringDocumentReader; @Autowired private MarkdownDocumentLoader markdownDocumentLoader; @Autowired private JsonDocumentLoader jsonDocumentLoader; // ==================== 文档上传 ==================== /** * 统一文档上传流程:创建记录 -> 分块 -> 关键词 -> 向量化 -> 更新状态 * * @param documents 解析后的文档列表 * @param title 文档标题 * @param sourceName 源文件名 * @param fileType 文件类型 * @param fileSize 文件大小 * @param content 原文内容(截断预览) * @param categoryId 分类ID * @param tags 标签列表 * @return 创建完成的文档记录 */ @Transactional(rollbackFor = Exception.class) public KnowledgeDocument uploadDocument(List documents, String title, String sourceName, String fileType, Long fileSize, String content, Long categoryId, List tags) { // 1. 创建文档记录(状态 PROCESSING) KnowledgeDocument docRecord = KnowledgeDocument.builder() .title(title != null ? title : sourceName) .sourceName(sourceName) .fileType(fileType) .fileSize(fileSize != null ? fileSize : 0L) .content(content != null && content.length() > 2000 ? content.substring(0, 2000) : content) .categoryId(categoryId != null ? categoryId : 0L) .tags(tags != null ? Map.of("tags", tags) : null) .status("PROCESSING") .chunkCount(0) .build(); documentMapper.insert(docRecord); try { // 2. 分块处理 List splitDocuments = myTokenTextSplitter.splitDocuments(documents); // 3. 为每个分块设置 documentId 等元数据 for (int i = 0; i < splitDocuments.size(); i++) { Document doc = splitDocuments.get(i); Map meta = new HashMap<>(doc.getMetadata()); meta.put("documentId", String.valueOf(docRecord.getId())); meta.put("chunkIndex", i); meta.put("sourceName", sourceName); meta.put("title", title != null ? title : sourceName); if (categoryId != null) { meta.put("categoryId", String.valueOf(categoryId)); } if (tags != null && !tags.isEmpty()) { meta.put("tags", tags); } splitDocuments.set(i, new Document(doc.getId(), doc.getText(), meta)); } // 4. 关键词提取 List enrichedDocuments = myKeywordEnricher.enrichDocuments(splitDocuments); // 5. 向量化存储 pgVectorVectorStore.add(enrichedDocuments); // 6. 更新文档状态为 READY docRecord.setStatus("READY"); docRecord.setChunkCount(enrichedDocuments.size()); documentMapper.updateById(docRecord); log.info("文档上传成功: id={}, title={}, chunks={}", docRecord.getId(), docRecord.getTitle(), enrichedDocuments.size()); } catch (Exception e) { // 标记为失败 docRecord.setStatus("FAILED"); docRecord.setErrorMessage(e.getMessage()); documentMapper.updateById(docRecord); log.error("文档上传失败: id={}, title={}", docRecord.getId(), docRecord.getTitle(), e); throw new RuntimeException("文档处理失败: " + e.getMessage(), e); } return docRecord; } /** * 解析文件并上传 */ public KnowledgeDocument uploadFile(MultipartFile file, String title, Long categoryId, List tags) { List documents = tikaDocumentReader.read(file); String fileType = getFileExtension(file.getOriginalFilename()); return uploadDocument(documents, title != null ? title : file.getOriginalFilename(), file.getOriginalFilename(), fileType, file.getSize(), documents.get(0).getText(), categoryId, tags); } /** * 解析字符串并上传 */ public KnowledgeDocument uploadString(String content, String title, Long categoryId, List tags) { List documents = simpleStringDocumentReader.read(content); return uploadDocument(documents, title, title, "txt", (long) content.length(), content, categoryId, tags); } /** * 解析 Markdown 文件并上传 */ public KnowledgeDocument uploadMarkdown(MultipartFile file, String title, Long categoryId, List tags) { List documents = markdownDocumentLoader.loadMarkdownFromFile(file); String content = documents.stream().map(Document::getText).collect(Collectors.joining("\n")); return uploadDocument(documents, title != null ? title : file.getOriginalFilename(), file.getOriginalFilename(), "md", file.getSize(), content, categoryId, tags); } /** * 解析 JSON 文件(基本方式)并上传 */ public KnowledgeDocument uploadJsonBasic(MultipartFile file, String title, Long categoryId, List tags) { List documents = jsonDocumentLoader.loadBasicJson(file); String content = documents.stream().map(Document::getText).collect(Collectors.joining("\n")); return uploadDocument(documents, title != null ? title : file.getOriginalFilename(), file.getOriginalFilename(), "json", file.getSize(), content, categoryId, tags); } /** * 解析 JSON 文件(按字段)并上传 */ public KnowledgeDocument uploadJsonFields(MultipartFile file, List fields, String title, Long categoryId, List tags) { List documents = jsonDocumentLoader.loadJsonByFields(file, fields.toArray(new String[0])); String content = documents.stream().map(Document::getText).collect(Collectors.joining("\n")); return uploadDocument(documents, title != null ? title : file.getOriginalFilename(), file.getOriginalFilename(), "json", file.getSize(), content, categoryId, tags); } /** * 解析 JSON 文件(按指针)并上传 */ public KnowledgeDocument uploadJsonPointer(MultipartFile file, String pointer, String title, Long categoryId, List tags) { List documents = jsonDocumentLoader.loadJsonByPointer(file, pointer); String content = documents.stream().map(Document::getText).collect(Collectors.joining("\n")); return uploadDocument(documents, title != null ? title : file.getOriginalFilename(), file.getOriginalFilename(), "json", file.getSize(), content, categoryId, tags); } // ==================== 文档管理 ==================== /** * 分页查询文档列表(手动分页) */ public Map listDocuments(int page, int size, Long categoryId, String status) { // 构建基础条件(用于 count 和 list) QueryWrapper countWrapper = new QueryWrapper<>(); if (categoryId != null && categoryId > 0) { countWrapper.eq("category_id", categoryId); } if (status != null && !status.isEmpty()) { countWrapper.eq("status", status); } // 先查询总数(不加 ORDER BY) Long total = documentMapper.selectCount(countWrapper); // 构建列表查询条件 QueryWrapper listWrapper = new QueryWrapper<>(); if (categoryId != null && categoryId > 0) { listWrapper.eq("category_id", categoryId); } if (status != null && !status.isEmpty()) { listWrapper.eq("status", status); } listWrapper.orderByDesc("create_time"); listWrapper.last("LIMIT " + size + " OFFSET " + (page - 1) * size); List records = documentMapper.selectList(listWrapper); Map result = new HashMap<>(); result.put("records", records); result.put("total", total); result.put("page", page); result.put("size", size); result.put("pages", (total + size - 1) / size); return result; } /** * 获取文档详情 */ public KnowledgeDocument getDocumentDetail(Long id) { return documentMapper.selectById(id); } /** * 获取文档的所有分块 */ public List> getDocumentChunks(Long id) { String sql = "SELECT id::text as id, content, metadata, create_time FROM vector_store " + "WHERE metadata->>'documentId' = ? ORDER BY (metadata->>'chunkIndex')::int"; return jdbcTemplate.queryForList(sql, String.valueOf(id)); } /** * 删除文档(逻辑删除 + 级联删除向量) */ @Transactional(rollbackFor = Exception.class) public int deleteDocument(Long id) { KnowledgeDocument doc = documentMapper.selectById(id); if (doc == null) { throw new RuntimeException("文档不存在"); } // 删除关联的向量 int vectorCount = deleteVectorsByDocumentId(String.valueOf(id)); // 逻辑删除文档记录 documentMapper.deleteById(id); log.info("删除文档: id={}, title={}, 删除向量数={}", id, doc.getTitle(), vectorCount); return vectorCount; } /** * 重新处理文档(重新分块 + 向量化) */ @Transactional(rollbackFor = Exception.class) public KnowledgeDocument reprocessDocument(Long id) { KnowledgeDocument doc = documentMapper.selectById(id); if (doc == null) { throw new RuntimeException("文档不存在"); } if (doc.getContent() == null || doc.getContent().isEmpty()) { throw new RuntimeException("文档无内容,无法重新处理"); } // 删除旧向量 deleteVectorsByDocumentId(String.valueOf(id)); // 重新解析并处理 List documents = simpleStringDocumentReader.read(doc.getContent()); doc.setStatus("PROCESSING"); doc.setChunkCount(0); doc.setErrorMessage(null); documentMapper.updateById(doc); try { List splitDocuments = myTokenTextSplitter.splitDocuments(documents); for (int i = 0; i < splitDocuments.size(); i++) { Document d = splitDocuments.get(i); Map meta = new HashMap<>(d.getMetadata()); meta.put("documentId", String.valueOf(doc.getId())); meta.put("chunkIndex", i); meta.put("sourceName", doc.getSourceName()); meta.put("title", doc.getTitle()); if (doc.getCategoryId() != null && doc.getCategoryId() > 0) { meta.put("categoryId", String.valueOf(doc.getCategoryId())); } if (doc.getTags() != null && doc.getTags().containsKey("tags")) { meta.put("tags", doc.getTags().get("tags")); } splitDocuments.set(i, new Document(d.getId(), d.getText(), meta)); } List enrichedDocuments = myKeywordEnricher.enrichDocuments(splitDocuments); pgVectorVectorStore.add(enrichedDocuments); doc.setStatus("READY"); doc.setChunkCount(enrichedDocuments.size()); documentMapper.updateById(doc); log.info("重新处理文档成功: id={}, title={}, chunks={}", doc.getId(), doc.getTitle(), enrichedDocuments.size()); } catch (Exception e) { doc.setStatus("FAILED"); doc.setErrorMessage(e.getMessage()); documentMapper.updateById(doc); log.error("重新处理文档失败: id={}, title={}", doc.getId(), doc.getTitle(), e); throw new RuntimeException("重新处理失败: " + e.getMessage(), e); } return doc; } /** * 更新文档元信息 */ public void updateDocumentMetadata(Long id, String title, Long categoryId, List tags) { KnowledgeDocument doc = documentMapper.selectById(id); if (doc == null) { throw new RuntimeException("文档不存在"); } if (title != null && !title.isEmpty()) { doc.setTitle(title); } if (categoryId != null) { doc.setCategoryId(categoryId); } if (tags != null) { doc.setTags(Map.of("tags", tags)); } documentMapper.updateById(doc); // 同步更新 vector_store 中对应的 metadata // 注意:Spring AI 当前没有直接更新 metadata 的 API // 这里我们先更新文档记录,metadata 的同步留到后续优化 log.info("更新文档元信息: id={}, title={}", id, doc.getTitle()); } // ==================== 语义搜索 ==================== /** * 语义搜索 */ public List searchDocuments(String query, int topK, double similarityThreshold, Long categoryId) { SearchRequest.Builder searchBuilder = SearchRequest.builder() .query(query) .topK(topK) .similarityThreshold(similarityThreshold); // 如果指定了分类,添加过滤条件(当前 Spring AI 1.0.0-M6 的 filter 支持有限) // 这里先不做分类过滤,后续升级 Spring AI 版本后再完善 List results = pgVectorVectorStore.similaritySearch(searchBuilder.build()); List searchResults = new ArrayList<>(); for (Document doc : results) { Map metadata = doc.getMetadata(); SearchResult result = SearchResult.builder() .id(doc.getId()) .content(doc.getText()) .score(metadata.containsKey("distance") ? ((Number) metadata.get("distance")).doubleValue() : null) .sourceName(getStringFromMetadata(metadata, "sourceName")) .title(getStringFromMetadata(metadata, "title")) .chunkIndex(getIntegerFromMetadata(metadata, "chunkIndex")) .documentId(getStringFromMetadata(metadata, "documentId")) .metadata(metadata) .build(); searchResults.add(result); } return searchResults; } // ==================== 统计 ==================== /** * 获取知识库统计信息 */ public Map getStats() { // 文档统计 Long totalDocuments = documentMapper.selectCount(null); // 按文件类型统计 String typeSql = "SELECT file_type, COUNT(*) as count FROM knowledge_document WHERE is_delete = false GROUP BY file_type"; List> typeStats = jdbcTemplate.queryForList(typeSql); Map byFileType = typeStats.stream() .collect(Collectors.toMap( r -> (String) r.get("file_type"), r -> ((Number) r.get("count")).longValue() )); // 按分类统计 String catSql = "SELECT c.name, COUNT(d.id) as count FROM knowledge_document d " + "LEFT JOIN knowledge_category c ON d.category_id = c.id " + "WHERE d.is_delete = false GROUP BY c.name"; List> catStats; try { catStats = jdbcTemplate.queryForList(catSql); } catch (Exception e) { catStats = new ArrayList<>(); } // 向量总数 String vectorSql = "SELECT COUNT(*) FROM vector_store"; Long totalVectors; try { totalVectors = jdbcTemplate.queryForObject(vectorSql, Long.class); } catch (Exception e) { totalVectors = 0L; } // 最近上传时间 String lastUploadSql = "SELECT MAX(create_time) FROM knowledge_document WHERE is_delete = false"; Date lastUploadTime = jdbcTemplate.queryForObject(lastUploadSql, Date.class); Map stats = new LinkedHashMap<>(); stats.put("totalDocuments", totalDocuments); stats.put("totalVectors", totalVectors); stats.put("lastUploadTime", lastUploadTime); stats.put("byFileType", byFileType); stats.put("byCategory", catStats); return stats; } // ==================== 分类管理 ==================== /** * 获取分类树 */ public List getCategoryTree() { List categories = categoryMapper.selectList( new QueryWrapper().orderByAsc("sort_order")); Map nodeMap = new LinkedHashMap<>(); List rootNodes = new ArrayList<>(); for (KnowledgeCategory cat : categories) { CategoryNode node = CategoryNode.builder() .id(cat.getId()) .name(cat.getName()) .description(cat.getDescription()) .parentId(cat.getParentId()) .sortOrder(cat.getSortOrder()) .documentCount(cat.getDocumentCount()) .children(new ArrayList<>()) .build(); nodeMap.put(cat.getId(), node); } for (CategoryNode node : nodeMap.values()) { if (node.getParentId() == null || node.getParentId() == 0) { rootNodes.add(node); } else { CategoryNode parent = nodeMap.get(node.getParentId()); if (parent != null) { parent.getChildren().add(node); } else { rootNodes.add(node); } } } return rootNodes; } /** * 获取分类列表 */ public List listCategories() { return categoryMapper.selectList( new QueryWrapper().orderByAsc("sort_order")); } /** * 创建分类 */ public KnowledgeCategory createCategory(String name, String description, Long parentId, Integer sortOrder) { KnowledgeCategory category = KnowledgeCategory.builder() .name(name) .description(description) .parentId(parentId != null ? parentId : 0L) .sortOrder(sortOrder != null ? sortOrder : 0) .documentCount(0) .build(); categoryMapper.insert(category); return category; } /** * 更新分类 */ public void updateCategory(Long id, String name, String description, Integer sortOrder) { KnowledgeCategory category = categoryMapper.selectById(id); if (category == null) { throw new RuntimeException("分类不存在"); } if (name != null && !name.isEmpty()) { category.setName(name); } if (description != null) { category.setDescription(description); } if (sortOrder != null) { category.setSortOrder(sortOrder); } categoryMapper.updateById(category); } /** * 删除分类(不删除文档,仅清空关联) */ @Transactional(rollbackFor = Exception.class) public void deleteCategory(Long id) { // 将关联的文档 category_id 设为 0 KnowledgeDocument updateDoc = new KnowledgeDocument(); updateDoc.setCategoryId(0L); documentMapper.update(updateDoc, new QueryWrapper().eq("category_id", id)); // 逻辑删除分类 categoryMapper.deleteById(id); } // ==================== 内部方法 ==================== /** * 根据文档ID删除 vector_store 中关联的所有向量 */ private int deleteVectorsByDocumentId(String documentId) { String sql = "SELECT id::text FROM vector_store WHERE metadata->>'documentId' = ?"; List ids = jdbcTemplate.queryForList(sql, String.class, documentId); if (!ids.isEmpty()) { pgVectorVectorStore.delete(ids); log.debug("删除向量: documentId={}, count={}", documentId, ids.size()); } return ids.size(); } /** * 获取文件扩展名 */ private String getFileExtension(String filename) { if (filename == null || !filename.contains(".")) { return "unknown"; } return filename.substring(filename.lastIndexOf(".") + 1).toLowerCase(); } /** * 从 metadata 中安全获取字符串值 */ private String getStringFromMetadata(Map metadata, String key) { Object value = metadata.get(key); return value != null ? value.toString() : null; } /** * 从 metadata 中安全获取整数值 */ private Integer getIntegerFromMetadata(Map metadata, String key) { Object value = metadata.get(key); if (value == null) return null; if (value instanceof Number) { return ((Number) value).intValue(); } try { return Integer.parseInt(value.toString()); } catch (NumberFormatException e) { return null; } } }