| import { v4 as uuidv4 } from 'uuid'; |
| import db from './db'; |
| import { NormalizedDocument, ClauseChunk, DiffEvent } from './models'; |
|
|
| |
/**
 * Labels a piece of policy text with the compliance topics it mentions.
 *
 * A topic is reported when the text contains at least one of that topic's
 * keywords as an exact, case-sensitive substring. Topics are returned in the
 * order they are declared in the table below, and a single text may match
 * several topics (for example, '人脸' belongs to two of them).
 *
 * @param text - The text to scan.
 * @returns The matching topic identifiers; an empty array when nothing matches.
 */
function tagTopics(text: string): string[] {
  // Ordered [topic, keywords] pairs; declaration order determines output order.
  const topicKeywords: ReadonlyArray<readonly [string, readonly string[]]> = [
    ['sdk_disclosure', ['SDK', '第三方', '软件开发工具包']],
    ['permission_usage', ['权限', '相机', '相册', '定位', '麦克风', '通讯录']],
    ['sensitive_personal_info', ['敏感', '身份证', '生物识别', '人脸']],
    ['minor_protection', ['未成年', '儿童', '监护人']],
    ['data_retention', ['存储', '保留', '期限']],
    ['third_party_sharing', ['共享', '转让', '提供给第三方']],
    ['cross_border_transfer', ['出境', '境外', '跨境']],
    ['user_rights', ['注销', '更正', '访问', '撤回', '权利']],
    ['biometric_info', ['指纹', '人脸', '虹膜', '生物识别']],
  ];

  return topicKeywords
    .filter(([, needles]) => needles.some((needle) => text.includes(needle)))
    .map(([label]) => label);
}
|
|
/**
 * Splits a normalized document into clause chunks, stores them, and records
 * diff events against the most recent archived document of the same source.
 *
 * Steps:
 *  1. Look up the latest `archived` row in `normalized_document` for
 *     `newDoc.source_id`.
 *  2. Split `newDoc.normalized_text` on runs of line breaks and keep the lines
 *     whose trimmed length exceeds 10 characters; each becomes a `ClauseChunk`.
 *  3. Insert the chunks into `clause_chunk`.
 *  4. If there is no archived predecessor, record one `added` event covering
 *     the whole document (impact `medium`). Otherwise compare clause texts by
 *     exact string equality and record one `added` event per new-only clause
 *     and one `removed` event per old-only clause (impact `low`). An edited
 *     clause therefore shows up as a removed/added pair.
 *
 * Chunk and event writes run inside a single `db.transaction`, so a failure
 * while writing events rolls the chunks back as well instead of leaving a
 * partially recorded diff.
 *
 * @param newDoc - The freshly normalized document to slice and compare.
 * @throws Whatever the database layer throws, or a `SyntaxError` if a stored
 *   `topic_tags` value of the previous document is not valid JSON; in both
 *   cases nothing written by this call is kept.
 */
export function sliceAndDiff(newDoc: NormalizedDocument): void {
  /** Columns of a stored `clause_chunk` row that the diff reads. */
  type StoredChunkRow = {
    section_title: string;
    clause_text: string;
    // Persisted as JSON text (see the insert below), possibly NULL.
    topic_tags: string | null;
  };

  // `<prefix>_` followed by the first 16 hex characters of a v4 UUID.
  const makeId = (prefix: string): string =>
    `${prefix}_${uuidv4().replace(/-/g, '').substring(0, 16)}`;

  // Appends an ellipsis only when the text was actually cut.
  const truncate = (text: string, max: number): string =>
    text.length > max ? `${text.substring(0, max)}...` : text;

  // Decodes a stored topic_tags value, tolerating NULL/empty and non-array JSON.
  const parseStoredTags = (raw: string | null): string[] => {
    const parsed: unknown = JSON.parse(raw || '[]');
    return Array.isArray(parsed)
      ? parsed.filter((tag): tag is string => typeof tag === 'string')
      : [];
  };

  const previousDoc = db.prepare(`
    SELECT * FROM normalized_document
    WHERE source_id = ? AND doc_status = 'archived'
    ORDER BY created_at DESC LIMIT 1
  `).get(newDoc.source_id) as { doc_id: string } | undefined;

  const clauseTexts = newDoc.normalized_text
    .split(/(?:\r?\n)+/)
    .filter((line) => line.trim().length > 10);

  const newChunks: ClauseChunk[] = clauseTexts.map((text, idx) => ({
    chunk_id: makeId('chunk'),
    doc_id: newDoc.doc_id,
    section_path: `Paragraph ${idx + 1}`,
    section_title: truncate(text, 50),
    clause_text: text,
    topic_tags: tagTopics(text),
    embedding_status: 'ready',
    chunk_order: idx,
    created_at: new Date().toISOString(),
  }));

  const insertChunkStmt = db.prepare(`
    INSERT INTO clause_chunk (
      chunk_id, doc_id, section_path, section_title, clause_text,
      topic_tags, embedding_status, chunk_order, created_at
    ) VALUES (
      @chunk_id, @doc_id, @section_path, @section_title, @clause_text,
      @topic_tags, @embedding_status, @chunk_order, @created_at
    )
  `);

  const persistChunksAndEvents = db.transaction(() => {
    for (const chunk of newChunks) {
      insertChunkStmt.run({
        ...chunk,
        topic_tags: JSON.stringify(chunk.topic_tags),
      });
    }

    if (!previousDoc) {
      // First version seen for this source: one summary event for everything.
      const newExcerpt = truncate(newDoc.normalized_text, 200);
      const firstVersionEvent: DiffEvent = {
        event_id: makeId('evt'),
        source_id: newDoc.source_id,
        from_doc_id: '',
        to_doc_id: newDoc.doc_id,
        change_type: 'added',
        section_title: 'All Content',
        old_excerpt: '',
        new_excerpt: newExcerpt,
        topic_tags: tagTopics(newExcerpt),
        impact_level: 'medium',
        detected_at: new Date().toISOString(),
      };
      insertDiffEvent(firstVersionEvent);
      return;
    }

    const oldChunks = db.prepare(`
      SELECT * FROM clause_chunk WHERE doc_id = ? ORDER BY chunk_order ASC
    `).all(previousDoc.doc_id) as StoredChunkRow[];

    // Set membership keeps the comparison linear in the number of clauses.
    const oldTexts = new Set(oldChunks.map((row) => row.clause_text));
    const newTexts = new Set(newChunks.map((chunk) => chunk.clause_text));

    const added = newChunks.filter((chunk) => !oldTexts.has(chunk.clause_text));
    const removed = oldChunks.filter((row) => !newTexts.has(row.clause_text));

    for (const chunk of added) {
      const addedEvent: DiffEvent = {
        event_id: makeId('evt'),
        source_id: newDoc.source_id,
        from_doc_id: previousDoc.doc_id,
        to_doc_id: newDoc.doc_id,
        change_type: 'added',
        section_title: chunk.section_title,
        old_excerpt: '',
        new_excerpt: chunk.clause_text,
        topic_tags: chunk.topic_tags,
        impact_level: 'low',
        detected_at: new Date().toISOString(),
      };
      insertDiffEvent(addedEvent);
    }

    for (const row of removed) {
      const removedEvent: DiffEvent = {
        event_id: makeId('evt'),
        source_id: newDoc.source_id,
        from_doc_id: previousDoc.doc_id,
        to_doc_id: newDoc.doc_id,
        change_type: 'removed',
        section_title: row.section_title,
        old_excerpt: row.clause_text,
        new_excerpt: '',
        topic_tags: parseStoredTags(row.topic_tags),
        impact_level: 'low',
        detected_at: new Date().toISOString(),
      };
      insertDiffEvent(removedEvent);
    }
  });

  persistChunksAndEvents();
}
|
|
/**
 * Writes one diff event as a row in the `diff_event` table.
 *
 * All fields of the event are bound by name to the insert statement.
 * `topic_tags` is serialized with `JSON.stringify` before binding, so the
 * column holds JSON text rather than an array.
 *
 * @param event - The diff event to persist.
 */
function insertDiffEvent(event: DiffEvent) {
  const insertSql = `
    INSERT INTO diff_event (
      event_id, source_id, from_doc_id, to_doc_id, change_type,
      section_title, old_excerpt, new_excerpt, topic_tags, impact_level, detected_at
    ) VALUES (
      @event_id, @source_id, @from_doc_id, @to_doc_id, @change_type,
      @section_title, @old_excerpt, @new_excerpt, @topic_tags, @impact_level, @detected_at
    )
  `;
  const insertEvent = db.prepare(insertSql);

  // Same shape as the event, with the tag array swapped for its JSON encoding.
  const boundRow = {
    ...event,
    topic_tags: JSON.stringify(event.topic_tags),
  };
  insertEvent.run(boundRow);
}
|
|