RAG/src/main/java/com/wok/supportbot/service/DocumentService.java


								package com.wok.supportbot.service;


								import com.baomidou.mybatisplus.core.conditions.query.QueryWrapper;

								import com.wok.supportbot.dao.KnowledgeCategoryMapper;

								import com.wok.supportbot.dao.KnowledgeDocumentMapper;

								import com.wok.supportbot.document.extract.JsonDocumentLoader;

								import com.wok.supportbot.document.extract.MarkdownDocumentLoader;

								import com.wok.supportbot.document.extract.SimpleStringDocumentReader;

								import com.wok.supportbot.document.extract.TikaDocumentReader;

								import com.wok.supportbot.document.transform.MyKeywordEnricher;

								import com.wok.supportbot.document.transform.MyTokenTextSplitter;

								import com.wok.supportbot.entity.CategoryNode;

								import com.wok.supportbot.entity.KnowledgeCategory;

								import com.wok.supportbot.entity.KnowledgeDocument;

								import com.wok.supportbot.entity.SearchResult;

								import lombok.extern.slf4j.Slf4j;

								import org.springframework.ai.document.Document;

								import org.springframework.ai.vectorstore.SearchRequest;

								import org.springframework.ai.vectorstore.VectorStore;

								import org.springframework.ai.vectorstore.filter.Filter;

								import org.springframework.beans.factory.annotation.Autowired;

								import org.springframework.jdbc.core.JdbcTemplate;

								import org.springframework.stereotype.Service;

								import org.springframework.transaction.annotation.Transactional;

								import org.springframework.web.multipart.MultipartFile;


								import java.util.*;

								import java.util.stream.Collectors;


								/**
								 * Knowledge-base document management service.
								 * Centralizes document upload, deletion, search, statistics and category operations.
								 */

								@Service

								@Slf4j

								public class DocumentService {


								    @Autowired

								    private KnowledgeDocumentMapper documentMapper;


								    @Autowired

								    private KnowledgeCategoryMapper categoryMapper;


								    @Autowired

								    private JdbcTemplate jdbcTemplate;


								    @Autowired

								    private VectorStore pgVectorVectorStore;


								    @Autowired

								    private MyTokenTextSplitter myTokenTextSplitter;


								    @Autowired

								    private MyKeywordEnricher myKeywordEnricher;


								    @Autowired

								    private TikaDocumentReader tikaDocumentReader;


								    @Autowired

								    private SimpleStringDocumentReader simpleStringDocumentReader;


								    @Autowired

								    private MarkdownDocumentLoader markdownDocumentLoader;


								    @Autowired

								    private JsonDocumentLoader jsonDocumentLoader;


								    // ==================== 文档上传 ====================


								    /**
								     * Runs the shared ingestion pipeline for already-parsed documents: insert a tracking
								     * record (status PROCESSING), split into chunks, tag every chunk with metadata, enrich
								     * with keywords, store the vectors, then mark the record READY.
								     *
								     * <p>NOTE(review): the method is {@code @Transactional(rollbackFor = Exception.class)} and
								     * the catch block rethrows, so the FAILED status written there is presumably rolled back
								     * together with the insert — confirm against the transaction configuration.
								     *
								     * @param documents  parsed documents to ingest
								     * @param title      document title; {@code sourceName} is used when this is {@code null}
								     * @param sourceName name of the source file
								     * @param fileType   file type stored on the record
								     * @param fileSize   file size; stored as 0 when {@code null}
								     * @param content    original text; only the first 2000 characters are kept on the record
								     * @param categoryId category id; stored as 0 when {@code null}
								     * @param tags       tag list; stored under the {@code "tags"} key when not {@code null}
								     * @return the document record after processing
								     * @throws RuntimeException wrapping any failure raised while splitting, enriching or storing
								     */
								    @Transactional(rollbackFor = Exception.class)
								    public KnowledgeDocument uploadDocument(List<Document> documents, String title, String sourceName,
								                                             String fileType, Long fileSize, String content,
								                                             Long categoryId, List<String> tags) {
								        final String effectiveTitle = (title == null) ? sourceName : title;

								        // Keep only a bounded preview of the original text on the record.
								        String preview = content;
								        if (content != null && content.length() > 2000) {
								            preview = content.substring(0, 2000);
								        }

								        // Step 1: persist the tracking record in PROCESSING state.
								        KnowledgeDocument docRecord = KnowledgeDocument.builder()
								                .title(effectiveTitle)
								                .sourceName(sourceName)
								                .fileType(fileType)
								                .fileSize(fileSize == null ? 0L : fileSize)
								                .content(preview)
								                .categoryId(categoryId == null ? 0L : categoryId)
								                .tags(tags != null ? Map.of("tags", tags) : null)
								                .status("PROCESSING")
								                .chunkCount(0)
								                .build();
								        documentMapper.insert(docRecord);

								        try {
								            // Step 2: split into chunks.
								            List<Document> chunks = myTokenTextSplitter.splitDocuments(documents);

								            // Step 3: rebuild every chunk with the owning document's metadata attached.
								            ListIterator<Document> cursor = chunks.listIterator();
								            while (cursor.hasNext()) {
								                int chunkIndex = cursor.nextIndex();
								                Document chunk = cursor.next();

								                Map<String, Object> chunkMeta = new HashMap<>(chunk.getMetadata());
								                chunkMeta.put("documentId", String.valueOf(docRecord.getId()));
								                chunkMeta.put("chunkIndex", chunkIndex);
								                chunkMeta.put("sourceName", sourceName);
								                chunkMeta.put("title", effectiveTitle);
								                if (categoryId != null) {
								                    chunkMeta.put("categoryId", String.valueOf(categoryId));
								                }
								                if (tags != null && !tags.isEmpty()) {
								                    chunkMeta.put("tags", tags);
								                }

								                cursor.set(new Document(chunk.getId(), chunk.getText(), chunkMeta));
								            }

								            // Step 4: keyword enrichment.
								            List<Document> enriched = myKeywordEnricher.enrichDocuments(chunks);

								            // Step 5: embed and store.
								            pgVectorVectorStore.add(enriched);

								            // Step 6: flip the record to READY with the final chunk count.
								            int chunkTotal = enriched.size();
								            docRecord.setStatus("READY");
								            docRecord.setChunkCount(chunkTotal);
								            documentMapper.updateById(docRecord);

								            log.info("文档上传成功: id={}, title={}, chunks={}", docRecord.getId(), docRecord.getTitle(), chunkTotal);
								        } catch (Exception failure) {
								            // Record the failure on the document row before propagating.
								            docRecord.setStatus("FAILED");
								            docRecord.setErrorMessage(failure.getMessage());
								            documentMapper.updateById(docRecord);
								            log.error("文档上传失败: id={}, title={}", docRecord.getId(), docRecord.getTitle(), failure);
								            throw new RuntimeException("文档处理失败: " + failure.getMessage(), failure);
								        }

								        return docRecord;
								    }


								    /**
								     * Parses an uploaded file with the Tika reader and sends the result through
								     * {@link #uploadDocument}.
								     *
								     * <p>The text of the first parsed document is used as the stored content preview. When
								     * the reader yields no documents the preview is {@code null} instead of failing with an
								     * {@code IndexOutOfBoundsException} before the upload pipeline even starts.
								     *
								     * @param file       uploaded file
								     * @param title      document title; the original file name is used when {@code null}
								     * @param categoryId category id, may be {@code null}
								     * @param tags       tag list, may be {@code null}
								     * @return the document record produced by {@code uploadDocument}
								     */
								    public KnowledgeDocument uploadFile(MultipartFile file, String title, Long categoryId, List<String> tags) {
								        List<Document> documents = tikaDocumentReader.read(file);
								        String originalFilename = file.getOriginalFilename();
								        String fileType = getFileExtension(originalFilename);

								        // Guard against an empty parse result; uploadDocument handles the rest of the flow.
								        String preview = (documents == null || documents.isEmpty())
								                ? null
								                : documents.get(0).getText();

								        return uploadDocument(documents,
								                title != null ? title : originalFilename,
								                originalFilename,
								                fileType,
								                file.getSize(),
								                preview,
								                categoryId,
								                tags);
								    }


								    /**
								     * Parses raw text and sends it through {@link #uploadDocument} as a {@code "txt"} document.
								     * The title doubles as the source name and the character count is passed as the file size.
								     *
								     * @param content    raw text to ingest
								     * @param title      document title, also used as source name
								     * @param categoryId category id, may be {@code null}
								     * @param tags       tag list, may be {@code null}
								     * @return the document record produced by {@code uploadDocument}
								     */
								    public KnowledgeDocument uploadString(String content, String title, Long categoryId, List<String> tags) {
								        final List<Document> parsed = simpleStringDocumentReader.read(content);
								        final long charCount = content.length();
								        return uploadDocument(parsed, title, title, "txt", charCount, content, categoryId, tags);
								    }


								    /**
								     * Parses a Markdown file and sends it through {@link #uploadDocument} with file type
								     * {@code "md"}. The texts of all parsed documents, joined by newlines, form the content.
								     *
								     * @param file       uploaded Markdown file
								     * @param title      document title; the original file name is used when {@code null}
								     * @param categoryId category id, may be {@code null}
								     * @param tags       tag list, may be {@code null}
								     * @return the document record produced by {@code uploadDocument}
								     */
								    public KnowledgeDocument uploadMarkdown(MultipartFile file, String title, Long categoryId, List<String> tags) {
								        List<Document> parsed = markdownDocumentLoader.loadMarkdownFromFile(file);

								        StringJoiner joinedText = new StringJoiner("\n");
								        for (Document part : parsed) {
								            joinedText.add(part.getText());
								        }

								        String resolvedTitle = title;
								        if (resolvedTitle == null) {
								            resolvedTitle = file.getOriginalFilename();
								        }

								        return uploadDocument(parsed, resolvedTitle, file.getOriginalFilename(), "md",
								                file.getSize(), joinedText.toString(), categoryId, tags);
								    }


								    /**
								     * Parses a JSON file with the basic loader and sends it through {@link #uploadDocument}
								     * with file type {@code "json"}. The texts of all parsed documents, joined by newlines,
								     * form the content.
								     *
								     * @param file       uploaded JSON file
								     * @param title      document title; the original file name is used when {@code null}
								     * @param categoryId category id, may be {@code null}
								     * @param tags       tag list, may be {@code null}
								     * @return the document record produced by {@code uploadDocument}
								     */
								    public KnowledgeDocument uploadJsonBasic(MultipartFile file, String title, Long categoryId, List<String> tags) {
								        List<Document> parsed = jsonDocumentLoader.loadBasicJson(file);

								        StringJoiner joinedText = new StringJoiner("\n");
								        for (Document part : parsed) {
								            joinedText.add(part.getText());
								        }

								        String resolvedTitle = title;
								        if (resolvedTitle == null) {
								            resolvedTitle = file.getOriginalFilename();
								        }

								        return uploadDocument(parsed, resolvedTitle, file.getOriginalFilename(), "json",
								                file.getSize(), joinedText.toString(), categoryId, tags);
								    }


								    /**
								     * Parses a JSON file by the given field names and sends it through
								     * {@link #uploadDocument} with file type {@code "json"}. The texts of all parsed
								     * documents, joined by newlines, form the content.
								     *
								     * @param file       uploaded JSON file
								     * @param fields     field names handed to the JSON loader
								     * @param title      document title; the original file name is used when {@code null}
								     * @param categoryId category id, may be {@code null}
								     * @param tags       tag list, may be {@code null}
								     * @return the document record produced by {@code uploadDocument}
								     */
								    public KnowledgeDocument uploadJsonFields(MultipartFile file, List<String> fields, String title, Long categoryId, List<String> tags) {
								        String[] fieldNames = fields.toArray(new String[0]);
								        List<Document> parsed = jsonDocumentLoader.loadJsonByFields(file, fieldNames);

								        StringJoiner joinedText = new StringJoiner("\n");
								        for (Document part : parsed) {
								            joinedText.add(part.getText());
								        }

								        String resolvedTitle = title;
								        if (resolvedTitle == null) {
								            resolvedTitle = file.getOriginalFilename();
								        }

								        return uploadDocument(parsed, resolvedTitle, file.getOriginalFilename(), "json",
								                file.getSize(), joinedText.toString(), categoryId, tags);
								    }


								    /**
								     * Parses a JSON file at the given pointer and sends it through {@link #uploadDocument}
								     * with file type {@code "json"}. The texts of all parsed documents, joined by newlines,
								     * form the content.
								     *
								     * @param file       uploaded JSON file
								     * @param pointer    pointer expression handed to the JSON loader
								     * @param title      document title; the original file name is used when {@code null}
								     * @param categoryId category id, may be {@code null}
								     * @param tags       tag list, may be {@code null}
								     * @return the document record produced by {@code uploadDocument}
								     */
								    public KnowledgeDocument uploadJsonPointer(MultipartFile file, String pointer, String title, Long categoryId, List<String> tags) {
								        List<Document> parsed = jsonDocumentLoader.loadJsonByPointer(file, pointer);

								        StringJoiner joinedText = new StringJoiner("\n");
								        for (Document part : parsed) {
								            joinedText.add(part.getText());
								        }

								        String resolvedTitle = title;
								        if (resolvedTitle == null) {
								            resolvedTitle = file.getOriginalFilename();
								        }

								        return uploadDocument(parsed, resolvedTitle, file.getOriginalFilename(), "json",
								                file.getSize(), joinedText.toString(), categoryId, tags);
								    }


								    // ==================== 文档管理 ====================


								    /**
								     * Returns one page of documents, newest first, optionally filtered by category and status.
								     *
								     * <p>Pagination is done manually with {@code LIMIT/OFFSET}. {@code page} and {@code size}
								     * values below 1 are raised to 1: a size of 0 would otherwise cause a division by zero
								     * when computing the page count, and a page below 1 would produce a negative OFFSET. The
								     * offset is computed as a {@code long} so large page numbers cannot overflow {@code int}.
								     *
								     * @param page       1-based page number; values below 1 are treated as 1
								     * @param size       page size; values below 1 are treated as 1
								     * @param categoryId category filter, applied only when non-null and greater than 0
								     * @param status     status filter, applied only when non-empty
								     * @return a map with the keys {@code records}, {@code total}, {@code page}, {@code size}
								     *         and {@code pages}; {@code page} and {@code size} are the values actually used
								     */
								    public Map<String, Object> listDocuments(int page, int size, Long categoryId, String status) {
								        int safePage = Math.max(page, 1);
								        int safeSize = Math.max(size, 1);
								        long offset = (long) (safePage - 1) * safeSize;

								        // Count first, without ORDER BY / LIMIT.
								        Long counted = documentMapper.selectCount(buildDocumentFilter(categoryId, status));
								        long total = counted != null ? counted : 0L;

								        // Same filter, plus ordering and the page window. Only numeric values are concatenated.
								        QueryWrapper<KnowledgeDocument> listWrapper = buildDocumentFilter(categoryId, status);
								        listWrapper.orderByDesc("create_time");
								        listWrapper.last("LIMIT " + safeSize + " OFFSET " + offset);
								        List<KnowledgeDocument> records = documentMapper.selectList(listWrapper);

								        Map<String, Object> result = new HashMap<>();
								        result.put("records", records);
								        result.put("total", total);
								        result.put("page", safePage);
								        result.put("size", safeSize);
								        result.put("pages", (total + safeSize - 1) / safeSize);
								        return result;
								    }

								    /**
								     * Builds the category/status filter shared by the count and list queries of
								     * {@link #listDocuments}.
								     */
								    private QueryWrapper<KnowledgeDocument> buildDocumentFilter(Long categoryId, String status) {
								        QueryWrapper<KnowledgeDocument> wrapper = new QueryWrapper<>();
								        if (categoryId != null && categoryId > 0) {
								            wrapper.eq("category_id", categoryId);
								        }
								        if (status != null && !status.isEmpty()) {
								            wrapper.eq("status", status);
								        }
								        return wrapper;
								    }


								    /**
								     * Looks up a single document record by primary key.
								     *
								     * @param id document id
								     * @return whatever the mapper returns for that id
								     */
								    public KnowledgeDocument getDocumentDetail(Long id) {
								        final KnowledgeDocument found = documentMapper.selectById(id);
								        return found;
								    }


								    /**
								     * Lists the {@code vector_store} rows whose metadata {@code documentId} matches the given
								     * document, ordered by their {@code chunkIndex} metadata value.
								     *
								     * @param id document id
								     * @return one map per row with the columns id, content, metadata and create_time
								     */
								    public List<Map<String, Object>> getDocumentChunks(Long id) {
								        final String documentKey = String.valueOf(id);
								        final String chunkQuery =
								                "SELECT id::text as id, content, metadata, create_time FROM vector_store "
								                        + "WHERE metadata->>'documentId' = ? ORDER BY (metadata->>'chunkIndex')::int";
								        return jdbcTemplate.queryForList(chunkQuery, documentKey);
								    }


								    /**
								     * Deletes a document: first removes the vectors tied to it, then deletes the document
								     * record through the mapper.
								     *
								     * @param id document id
								     * @return the number of vectors that were removed
								     * @throws RuntimeException when no document exists for {@code id}
								     */
								    @Transactional(rollbackFor = Exception.class)
								    public int deleteDocument(Long id) {
								        final KnowledgeDocument existing = documentMapper.selectById(id);
								        if (existing == null) {
								            throw new RuntimeException("文档不存在");
								        }

								        // Vectors go first, then the document row itself.
								        final int removedVectors = deleteVectorsByDocumentId(String.valueOf(id));
								        documentMapper.deleteById(id);

								        log.info("删除文档: id={}, title={}, 删除向量数={}", id, existing.getTitle(), removedVectors);
								        return removedVectors;
								    }


								    /**
								     * Re-runs chunking, keyword enrichment and vector storage for an existing document,
								     * using the content stored on its record. Old vectors are removed first.
								     *
								     * <p>NOTE(review): {@code uploadDocument} persists at most the first 2000 characters of the
								     * content, so reprocessing presumably works on that preview rather than the full
								     * original — confirm.
								     *
								     * <p>NOTE(review): depending on the MyBatis-Plus field strategy, {@code updateById} may
								     * skip {@code null} fields, in which case the previous error message is not cleared —
								     * confirm.
								     *
								     * @param id document id
								     * @return the updated document record
								     * @throws RuntimeException when the document does not exist, has no stored content, or
								     *                          processing fails (the cause is attached)
								     */
								    @Transactional(rollbackFor = Exception.class)
								    public KnowledgeDocument reprocessDocument(Long id) {
								        KnowledgeDocument doc = documentMapper.selectById(id);
								        if (doc == null) {
								            throw new RuntimeException("文档不存在");
								        }
								        String storedContent = doc.getContent();
								        if (storedContent == null || storedContent.isEmpty()) {
								            throw new RuntimeException("文档无内容，无法重新处理");
								        }

								        // Drop the vectors produced by the previous run.
								        deleteVectorsByDocumentId(String.valueOf(id));

								        // Parse the stored content again.
								        List<Document> parsed = simpleStringDocumentReader.read(storedContent);

								        // Reset the record to PROCESSING before the pipeline starts.
								        doc.setStatus("PROCESSING");
								        doc.setChunkCount(0);
								        doc.setErrorMessage(null);
								        documentMapper.updateById(doc);

								        try {
								            List<Document> chunks = myTokenTextSplitter.splitDocuments(parsed);

								            // Rebuild every chunk with the owning document's metadata attached.
								            ListIterator<Document> cursor = chunks.listIterator();
								            while (cursor.hasNext()) {
								                int chunkIndex = cursor.nextIndex();
								                Document chunk = cursor.next();

								                Map<String, Object> chunkMeta = new HashMap<>(chunk.getMetadata());
								                chunkMeta.put("documentId", String.valueOf(doc.getId()));
								                chunkMeta.put("chunkIndex", chunkIndex);
								                chunkMeta.put("sourceName", doc.getSourceName());
								                chunkMeta.put("title", doc.getTitle());
								                if (doc.getCategoryId() != null && doc.getCategoryId() > 0) {
								                    chunkMeta.put("categoryId", String.valueOf(doc.getCategoryId()));
								                }
								                if (doc.getTags() != null && doc.getTags().containsKey("tags")) {
								                    chunkMeta.put("tags", doc.getTags().get("tags"));
								                }

								                cursor.set(new Document(chunk.getId(), chunk.getText(), chunkMeta));
								            }

								            List<Document> enriched = myKeywordEnricher.enrichDocuments(chunks);
								            pgVectorVectorStore.add(enriched);

								            int chunkTotal = enriched.size();
								            doc.setStatus("READY");
								            doc.setChunkCount(chunkTotal);
								            documentMapper.updateById(doc);

								            log.info("重新处理文档成功: id={}, title={}, chunks={}", doc.getId(), doc.getTitle(), chunkTotal);
								        } catch (Exception failure) {
								            // Record the failure on the document row before propagating.
								            doc.setStatus("FAILED");
								            doc.setErrorMessage(failure.getMessage());
								            documentMapper.updateById(doc);
								            log.error("重新处理文档失败: id={}, title={}", doc.getId(), doc.getTitle(), failure);
								            throw new RuntimeException("重新处理失败: " + failure.getMessage(), failure);
								        }

								        return doc;
								    }


								    /**
								     * Updates title, category and tags on a document record. Only the arguments that are
								     * provided are applied. The metadata copies held in {@code vector_store} are not touched
								     * by this method.
								     *
								     * @param id         document id
								     * @param title      new title; ignored when {@code null} or empty
								     * @param categoryId new category id; ignored when {@code null}
								     * @param tags       new tag list; ignored when {@code null}
								     * @throws RuntimeException when no document exists for {@code id}
								     */
								    public void updateDocumentMetadata(Long id, String title, Long categoryId, List<String> tags) {
								        final KnowledgeDocument target = documentMapper.selectById(id);
								        if (target == null) {
								            throw new RuntimeException("文档不存在");
								        }

								        boolean hasTitle = !(title == null || title.isEmpty());
								        if (hasTitle) {
								            target.setTitle(title);
								        }
								        if (categoryId != null) {
								            target.setCategoryId(categoryId);
								        }
								        if (tags != null) {
								            target.setTags(Map.of("tags", tags));
								        }
								        documentMapper.updateById(target);

								        // Only the document record is updated here; syncing the chunk metadata stored in
								        // vector_store is left for a later improvement.
								        log.info("更新文档元信息: id={}, title={}", id, target.getTitle());
								    }


								    // ==================== 语义搜索 ====================


								    /**
								     * Runs a similarity search against the vector store and maps every hit to a
								     * {@link SearchResult}.
								     *
								     * <p>{@code categoryId} is accepted but not applied as a filter yet.
								     *
								     * @param query               search text
								     * @param topK                maximum number of hits requested
								     * @param similarityThreshold similarity threshold passed to the search request
								     * @param categoryId          currently unused
								     * @return search results in the order returned by the vector store
								     */
								    public List<SearchResult> searchDocuments(String query, int topK, double similarityThreshold, Long categoryId) {
								        // No category filter is attached to the request for now.
								        SearchRequest request = SearchRequest.builder()
								                .query(query)
								                .topK(topK)
								                .similarityThreshold(similarityThreshold)
								                .build();

								        return pgVectorVectorStore.similaritySearch(request).stream()
								                .map(this::toSearchResult)
								                .collect(Collectors.toCollection(ArrayList::new));
								    }

								    /**
								     * Converts one vector-store hit into a {@link SearchResult}, reading the well-known
								     * metadata keys.
								     */
								    private SearchResult toSearchResult(Document hit) {
								        Map<String, Object> metadata = hit.getMetadata();

								        // The score is taken from the "distance" metadata entry when that key is present.
								        Double score = null;
								        if (metadata.containsKey("distance")) {
								            score = ((Number) metadata.get("distance")).doubleValue();
								        }

								        return SearchResult.builder()
								                .id(hit.getId())
								                .content(hit.getText())
								                .score(score)
								                .sourceName(getStringFromMetadata(metadata, "sourceName"))
								                .title(getStringFromMetadata(metadata, "title"))
								                .chunkIndex(getIntegerFromMetadata(metadata, "chunkIndex"))
								                .documentId(getStringFromMetadata(metadata, "documentId"))
								                .metadata(metadata)
								                .build();
								    }


								    // ==================== 统计 ====================


								    /**
								     * Collects knowledge-base statistics: total documents, total vectors, most recent upload
								     * time, document counts per file type and per category.
								     *
								     * <p>The per-category and vector-count queries are best-effort: when they fail, the
								     * failure is logged and an empty list / 0 is reported instead. Rows with a NULL
								     * {@code file_type} are counted under {@code "unknown"} so the resulting map never
								     * contains a null key.
								     *
								     * @return an insertion-ordered map with the keys {@code totalDocuments},
								     *         {@code totalVectors}, {@code lastUploadTime}, {@code byFileType} and
								     *         {@code byCategory}
								     */
								    public Map<String, Object> getStats() {
								        // Total number of documents.
								        Long totalDocuments = documentMapper.selectCount(null);

								        // Documents per file type.
								        String typeSql = "SELECT file_type, COUNT(*) as count FROM knowledge_document WHERE is_delete = false GROUP BY file_type";
								        List<Map<String, Object>> typeStats = jdbcTemplate.queryForList(typeSql);
								        Map<String, Long> byFileType = typeStats.stream()
								                .collect(Collectors.toMap(
								                        r -> r.get("file_type") != null ? r.get("file_type").toString() : "unknown",
								                        r -> ((Number) r.get("count")).longValue(),
								                        // A NULL group may collapse onto an existing "unknown" group.
								                        Long::sum
								                ));

								        // Documents per category (best-effort).
								        String catSql = "SELECT c.name, COUNT(d.id) as count FROM knowledge_document d " +
								                "LEFT JOIN knowledge_category c ON d.category_id = c.id " +
								                "WHERE d.is_delete = false GROUP BY c.name";
								        List<Map<String, Object>> catStats;
								        try {
								            catStats = jdbcTemplate.queryForList(catSql);
								        } catch (Exception e) {
								            log.warn("按分类统计失败，返回空结果", e);
								            catStats = new ArrayList<>();
								        }

								        // Total number of vectors (best-effort).
								        String vectorSql = "SELECT COUNT(*) FROM vector_store";
								        Long totalVectors;
								        try {
								            totalVectors = jdbcTemplate.queryForObject(vectorSql, Long.class);
								        } catch (Exception e) {
								            log.warn("统计向量总数失败，按 0 处理", e);
								            totalVectors = 0L;
								        }

								        // Most recent upload time.
								        String lastUploadSql = "SELECT MAX(create_time) FROM knowledge_document WHERE is_delete = false";
								        Date lastUploadTime = jdbcTemplate.queryForObject(lastUploadSql, Date.class);

								        Map<String, Object> stats = new LinkedHashMap<>();
								        stats.put("totalDocuments", totalDocuments);
								        stats.put("totalVectors", totalVectors);
								        stats.put("lastUploadTime", lastUploadTime);
								        stats.put("byFileType", byFileType);
								        stats.put("byCategory", catStats);

								        return stats;
								    }


								    // ==================== 分类管理 ====================


								    /**
								     * Builds the category tree from all categories ordered by {@code sort_order}.
								     * A category becomes a root when its parent id is {@code null} or 0, or when its parent
								     * is not among the loaded categories; otherwise it is appended to its parent's children.
								     *
								     * @return the root nodes of the tree
								     */
								    public List<CategoryNode> getCategoryTree() {
								        QueryWrapper<KnowledgeCategory> bySortOrder = new QueryWrapper<KnowledgeCategory>().orderByAsc("sort_order");
								        List<KnowledgeCategory> allCategories = categoryMapper.selectList(bySortOrder);

								        // Pass 1: one node per category, indexed by id in query order.
								        Map<Long, CategoryNode> nodesById = new LinkedHashMap<>();
								        for (KnowledgeCategory category : allCategories) {
								            nodesById.put(category.getId(), CategoryNode.builder()
								                    .id(category.getId())
								                    .name(category.getName())
								                    .description(category.getDescription())
								                    .parentId(category.getParentId())
								                    .sortOrder(category.getSortOrder())
								                    .documentCount(category.getDocumentCount())
								                    .children(new ArrayList<>())
								                    .build());
								        }

								        // Pass 2: attach each node to its parent, or treat it as a root.
								        List<CategoryNode> roots = new ArrayList<>();
								        for (CategoryNode node : nodesById.values()) {
								            boolean topLevel = node.getParentId() == null || node.getParentId() == 0;
								            CategoryNode parent = topLevel ? null : nodesById.get(node.getParentId());
								            if (parent == null) {
								                roots.add(node);
								            } else {
								                parent.getChildren().add(node);
								            }
								        }

								        return roots;
								    }


								    /**
								     * Returns all categories as a flat list ordered by {@code sort_order} ascending.
								     *
								     * @return the categories returned by the mapper
								     */
								    public List<KnowledgeCategory> listCategories() {
								        QueryWrapper<KnowledgeCategory> bySortOrder = new QueryWrapper<>();
								        bySortOrder.orderByAsc("sort_order");
								        return categoryMapper.selectList(bySortOrder);
								    }


								    /**
								     * Creates a category with a document count of 0 and inserts it through the mapper.
								     *
								     * @param name        category name
								     * @param description category description
								     * @param parentId    parent category id; 0 is stored when {@code null}
								     * @param sortOrder   sort position; 0 is stored when {@code null}
								     * @return the category object that was inserted
								     */
								    public KnowledgeCategory createCategory(String name, String description, Long parentId, Integer sortOrder) {
								        final long resolvedParentId = (parentId == null) ? 0L : parentId;
								        final int resolvedSortOrder = (sortOrder == null) ? 0 : sortOrder;

								        KnowledgeCategory created = KnowledgeCategory.builder()
								                .name(name)
								                .description(description)
								                .parentId(resolvedParentId)
								                .sortOrder(resolvedSortOrder)
								                .documentCount(0)
								                .build();

								        categoryMapper.insert(created);
								        return created;
								    }


								    /**
								     * Updates name, description and sort order of a category. Only the arguments that are
								     * provided are applied.
								     *
								     * @param id          category id
								     * @param name        new name; ignored when {@code null} or empty
								     * @param description new description; ignored when {@code null}
								     * @param sortOrder   new sort position; ignored when {@code null}
								     * @throws RuntimeException when no category exists for {@code id}
								     */
								    public void updateCategory(Long id, String name, String description, Integer sortOrder) {
								        final KnowledgeCategory target = categoryMapper.selectById(id);
								        if (target == null) {
								            throw new RuntimeException("分类不存在");
								        }

								        boolean hasName = !(name == null || name.isEmpty());
								        if (hasName) {
								            target.setName(name);
								        }
								        if (description != null) {
								            target.setDescription(description);
								        }
								        if (sortOrder != null) {
								            target.setSortOrder(sortOrder);
								        }

								        categoryMapper.updateById(target);
								    }


								    /**
								     * Deletes a category without deleting its documents: documents that reference the
								     * category are first updated to category id 0, then the category is deleted through
								     * the mapper.
								     *
								     * @param id category id
								     */
								    @Transactional(rollbackFor = Exception.class)
								    public void deleteCategory(Long id) {
								        // Detach documents from the category by pointing them at category 0.
								        QueryWrapper<KnowledgeDocument> inCategory = new QueryWrapper<KnowledgeDocument>().eq("category_id", id);
								        KnowledgeDocument detached = new KnowledgeDocument();
								        detached.setCategoryId(0L);
								        documentMapper.update(detached, inCategory);

								        // Then remove the category itself.
								        categoryMapper.deleteById(id);
								    }


								    // ==================== 内部方法 ====================


								    /**
								     * Removes every {@code vector_store} entry whose metadata {@code documentId} equals the
								     * given id. Ids are looked up via SQL and then deleted through the vector store.
								     *
								     * @param documentId document id as stored in the chunk metadata
								     * @return the number of vector ids that were found (and deleted)
								     */
								    private int deleteVectorsByDocumentId(String documentId) {
								        final String lookupSql = "SELECT id::text FROM vector_store WHERE metadata->>'documentId' = ?";
								        final List<String> vectorIds = jdbcTemplate.queryForList(lookupSql, String.class, documentId);

								        if (vectorIds.isEmpty()) {
								            return 0;
								        }

								        pgVectorVectorStore.delete(vectorIds);
								        log.debug("删除向量: documentId={}, count={}", documentId, vectorIds.size());
								        return vectorIds.size();
								    }


								    /**
								     * Extracts the lower-cased extension (text after the last dot) from a file name.
								     *
								     * <p>Lower-casing uses {@link Locale#ROOT} so the result does not depend on the JVM's
								     * default locale. A name that ends with a dot has no usable extension and is reported
								     * as {@code "unknown"}, just like a name without any dot.
								     *
								     * @param filename file name, may be {@code null}
								     * @return the extension in lower case, or {@code "unknown"} when none can be determined
								     */
								    private String getFileExtension(String filename) {
								        if (filename == null) {
								            return "unknown";
								        }
								        int lastDot = filename.lastIndexOf('.');
								        if (lastDot < 0 || lastDot == filename.length() - 1) {
								            return "unknown";
								        }
								        return filename.substring(lastDot + 1).toLowerCase(Locale.ROOT);
								    }


								    /**
								     * Reads a metadata entry as a string.
								     *
								     * @param metadata metadata map to read from
								     * @param key      entry key
								     * @return the entry's {@code toString()} value, or {@code null} when the entry is absent
								     *         or {@code null}
								     */
								    private String getStringFromMetadata(Map<String, Object> metadata, String key) {
								        return Objects.toString(metadata.get(key), null);
								    }


								    /**
								     * Reads a metadata entry as an integer.
								     *
								     * @param metadata metadata map to read from
								     * @param key      entry key
								     * @return the {@code intValue()} of a numeric entry, the parsed value of any other entry
								     *         whose string form is a valid integer, or {@code null} when the entry is
								     *         absent, {@code null} or not parseable
								     */
								    private Integer getIntegerFromMetadata(Map<String, Object> metadata, String key) {
								        final Object raw = metadata.get(key);
								        if (raw instanceof Number) {
								            return ((Number) raw).intValue();
								        }
								        if (raw == null) {
								            return null;
								        }
								        try {
								            return Integer.valueOf(raw.toString());
								        } catch (NumberFormatException ignored) {
								            // Not an integer representation: report as absent.
								            return null;
								        }
								    }

								}