import { v4 as uuidv4 } from 'uuid';
import db from './db';
import { NormalizedDocument, ClauseChunk, DiffEvent } from './models';

/**
 * Keyword lists used for simple topic tagging. A topic is assigned to a text
 * when the text contains at least one of the topic's keywords (case-sensitive
 * substring match). Entry order determines the order of the returned tags.
 */
const TOPIC_KEYWORDS: Record<string, readonly string[]> = {
  'sdk_disclosure': ['SDK', '第三方', '软件开发工具包'],
  'permission_usage': ['权限', '相机', '相册', '定位', '麦克风', '通讯录'],
  'sensitive_personal_info': ['敏感', '身份证', '生物识别', '人脸'],
  'minor_protection': ['未成年', '儿童', '监护人'],
  'data_retention': ['存储', '保留', '期限'],
  'third_party_sharing': ['共享', '转让', '提供给第三方'],
  'cross_border_transfer': ['出境', '境外', '跨境'],
  'user_rights': ['注销', '更正', '访问', '撤回', '权利'],
  'biometric_info': ['指纹', '人脸', '虹膜', '生物识别']
};

/** Maximum length of a chunk's section title before it is truncated. */
const SECTION_TITLE_MAX_LENGTH = 50;

/** Maximum length of the excerpt stored on a first-version diff event. */
const FIRST_VERSION_EXCERPT_MAX_LENGTH = 200;

/** Columns of `normalized_document` that this module reads. */
interface StoredDocumentRow {
  doc_id: string;
}

/** Columns of `clause_chunk` that this module reads when diffing. */
interface StoredChunkRow {
  section_title: string;
  clause_text: string;
  /** JSON-encoded string array, as written by `insertChunks`. */
  topic_tags: string | null;
}

/**
 * Returns the topics whose keyword list has at least one match in `text`.
 *
 * @param text - Text to scan.
 * @returns Matching topic names, in the order they are declared in `TOPIC_KEYWORDS`.
 */
function tagTopics(text: string): string[] {
  const topics: string[] = [];
  for (const [topic, keywords] of Object.entries(TOPIC_KEYWORDS)) {
    if (keywords.some(kw => text.includes(kw))) {
      topics.push(topic);
    }
  }
  return topics;
}

/** Builds an id of the form `<prefix>_<16 hex chars>` from a fresh UUID v4. */
function shortId(prefix: string): string {
  return `${prefix}_${uuidv4().replace(/-/g, '').substring(0, 16)}`;
}

/** Cuts `text` to `maxLength` characters, appending `...` only when something was cut. */
function truncate(text: string, maxLength: number): string {
  return text.length > maxLength ? text.substring(0, maxLength) + '...' : text;
}

/**
 * Decodes a stored `topic_tags` column. A malformed JSON string still throws
 * (as `JSON.parse` does); a well-formed value that is not an array yields `[]`,
 * and non-string array members are dropped.
 */
function parseStoredTags(raw: string | null | undefined): string[] {
  const parsed: unknown = JSON.parse(raw || '[]');
  if (!Array.isArray(parsed)) {
    return [];
  }
  return parsed.filter((tag): tag is string => typeof tag === 'string');
}

/**
 * Splits the document text into clause chunks: one chunk per run of text
 * between line breaks, keeping only lines whose trimmed length exceeds 10.
 */
function buildChunks(newDoc: NormalizedDocument): ClauseChunk[] {
  const lines = newDoc.normalized_text
    .split(/(?:\r?\n)+/)
    .filter(line => line.trim().length > 10);

  return lines.map((text, idx): ClauseChunk => ({
    chunk_id: shortId('chunk'),
    doc_id: newDoc.doc_id,
    section_path: `Paragraph ${idx + 1}`,
    section_title: truncate(text, SECTION_TITLE_MAX_LENGTH),
    clause_text: text,
    topic_tags: tagTopics(text),
    embedding_status: 'ready',
    chunk_order: idx,
    created_at: new Date().toISOString()
  }));
}

/** Inserts the given chunks into `clause_chunk`, storing `topic_tags` as JSON. */
function insertChunks(chunks: ClauseChunk[]): void {
  const stmt = db.prepare(`
    INSERT INTO clause_chunk (
      chunk_id, doc_id, section_path, section_title, clause_text,
      topic_tags, embedding_status, chunk_order, created_at
    ) VALUES (
      @chunk_id, @doc_id, @section_path, @section_title, @clause_text,
      @topic_tags, @embedding_status, @chunk_order, @created_at
    )
  `);
  for (const chunk of chunks) {
    stmt.run({ ...chunk, topic_tags: JSON.stringify(chunk.topic_tags) });
  }
}

/** Completes a diff event with a fresh event id and the current timestamp. */
function makeDiffEvent(fields: Omit<DiffEvent, 'event_id' | 'detected_at'>): DiffEvent {
  return {
    ...fields,
    event_id: shortId('evt'),
    detected_at: new Date().toISOString()
  };
}

/**
 * Builds the single "everything was added" event emitted when there is no
 * earlier document to compare against.
 */
function buildFirstVersionEvent(newDoc: NormalizedDocument): DiffEvent {
  const newExcerpt = truncate(newDoc.normalized_text, FIRST_VERSION_EXCERPT_MAX_LENGTH);
  return makeDiffEvent({
    source_id: newDoc.source_id,
    from_doc_id: '',
    to_doc_id: newDoc.doc_id,
    change_type: 'added',
    section_title: 'All Content',
    old_excerpt: '',
    new_excerpt: newExcerpt,
    // NOTE(review): tags are derived from the excerpt only, not from the whole
    // document text — confirm this is intended.
    topic_tags: tagTopics(newExcerpt),
    impact_level: 'medium'
  });
}

/**
 * Compares the new chunks with the stored chunks of `oldDocId` by exact clause
 * text. Chunks whose text exists only in the new document become 'added'
 * events; chunks whose text exists only in the old document become 'removed'
 * events. This is a simplified diff: an edited clause shows up as one removal
 * plus one addition, and reordering is not detected.
 */
function buildChunkDiffEvents(
  newDoc: NormalizedDocument,
  oldDocId: string,
  newChunks: ClauseChunk[]
): DiffEvent[] {
  const oldChunks = db.prepare(`
    SELECT * FROM clause_chunk
    WHERE doc_id = ?
    ORDER BY chunk_order ASC
  `).all(oldDocId) as StoredChunkRow[];

  // Set lookups keep the comparison linear in the number of chunks.
  const oldTexts = new Set(oldChunks.map(c => c.clause_text));
  const newTexts = new Set(newChunks.map(c => c.clause_text));

  const events: DiffEvent[] = [];

  for (const chunk of newChunks) {
    if (oldTexts.has(chunk.clause_text)) {
      continue;
    }
    events.push(makeDiffEvent({
      source_id: newDoc.source_id,
      from_doc_id: oldDocId,
      to_doc_id: newDoc.doc_id,
      change_type: 'added',
      section_title: chunk.section_title,
      old_excerpt: '',
      new_excerpt: chunk.clause_text,
      topic_tags: chunk.topic_tags,
      impact_level: 'low'
    }));
  }

  for (const chunk of oldChunks) {
    if (newTexts.has(chunk.clause_text)) {
      continue;
    }
    events.push(makeDiffEvent({
      source_id: newDoc.source_id,
      from_doc_id: oldDocId,
      to_doc_id: newDoc.doc_id,
      change_type: 'removed',
      section_title: chunk.section_title,
      old_excerpt: chunk.clause_text,
      new_excerpt: '',
      topic_tags: parseStoredTags(chunk.topic_tags),
      impact_level: 'low'
    }));
  }

  return events;
}

/**
 * Slices a normalized document into clause chunks, stores them, and records
 * diff events against an earlier document of the same source.
 *
 * When no earlier document is found, a single 'added' event covering
 * "All Content" is recorded. Otherwise one event is recorded per chunk that
 * was added or removed (matched by exact clause text).
 *
 * All writes (chunks and diff events) happen inside one transaction, so a
 * failure leaves neither chunks nor events behind.
 *
 * @param newDoc - The newly ingested document to slice and compare.
 */
export function sliceAndDiff(newDoc: NormalizedDocument): void {
  // Baseline for the diff: the most recently created document of this source
  // whose doc_status is 'archived'.
  // NOTE(review): presumably the previous version is archived before this
  // function runs — confirm; the query does not exclude newDoc itself.
  const oldDoc = db.prepare(`
    SELECT * FROM normalized_document
    WHERE source_id = ? AND doc_status = 'archived'
    ORDER BY created_at DESC LIMIT 1
  `).get(newDoc.source_id) as StoredDocumentRow | undefined;

  const newChunks = buildChunks(newDoc);

  const persist = db.transaction(() => {
    // Chunks are written before the old chunks are read, matching the
    // original statement order.
    insertChunks(newChunks);

    const events = oldDoc
      ? buildChunkDiffEvents(newDoc, oldDoc.doc_id, newChunks)
      : [buildFirstVersionEvent(newDoc)];

    insertDiffEvents(events);
  });

  persist();
}

/**
 * Inserts the given events into `diff_event`, preparing the statement once
 * for the whole batch. `topic_tags` is stored as a JSON string.
 */
function insertDiffEvents(events: DiffEvent[]): void {
  if (events.length === 0) {
    return;
  }
  const stmt = db.prepare(`
    INSERT INTO diff_event (
      event_id, source_id, from_doc_id, to_doc_id, change_type,
      section_title, old_excerpt, new_excerpt, topic_tags,
      impact_level, detected_at
    ) VALUES (
      @event_id, @source_id, @from_doc_id, @to_doc_id, @change_type,
      @section_title, @old_excerpt, @new_excerpt, @topic_tags,
      @impact_level, @detected_at
    )
  `);
  for (const event of events) {
    stmt.run({ ...event, topic_tags: JSON.stringify(event.topic_tags) });
  }
}

/** Inserts a single diff event into `diff_event`. */
function insertDiffEvent(event: DiffEvent): void {
  insertDiffEvents([event]);
}