import { v4 as uuidv4 } from 'uuid';
import db from './db';
import { NormalizedDocument, ClauseChunk, DiffEvent } from './models';
/**
 * Tags a piece of clause text with topic identifiers using simple
 * keyword matching. A topic is attached as soon as any one of its
 * keywords appears as a substring of the text.
 */
function tagTopics(text: string): string[] {
  const keywordMap: Record<string, string[]> = {
    'sdk_disclosure': ['SDK', '第三方', '软件开发工具包'],
    'permission_usage': ['权限', '相机', '相册', '定位', '麦克风', '通讯录'],
    'sensitive_personal_info': ['敏感', '身份证', '生物识别', '人脸'],
    'minor_protection': ['未成年', '儿童', '监护人'],
    'data_retention': ['存储', '保留', '期限'],
    'third_party_sharing': ['共享', '转让', '提供给第三方'],
    'cross_border_transfer': ['出境', '境外', '跨境'],
    'user_rights': ['注销', '更正', '访问', '撤回', '权利'],
    'biometric_info': ['指纹', '人脸', '虹膜', '生物识别']
  };
  // Keep every topic whose keyword list has at least one hit in the text.
  return Object.entries(keywordMap)
    .filter(([, keywords]) => keywords.some(kw => text.includes(kw)))
    .map(([topic]) => topic);
}
/**
 * Slices a normalized document into clause chunks, persists them, and emits
 * diff events against the most recent archived revision of the same source.
 *
 * Diffing is set-based on exact clause text: clauses present only in the new
 * document become 'added' events, clauses present only in the old one become
 * 'removed' events. A modified clause therefore surfaces as an added/removed
 * pair. Real sequence-alignment diffing is intentionally out of scope here.
 */
export function sliceAndDiff(newDoc: NormalizedDocument): void {
  // Most recent archived revision of this source (if any) to diff against.
  const oldDoc = db.prepare(`
    SELECT * FROM normalized_document
    WHERE source_id = ? AND doc_status = 'archived'
    ORDER BY created_at DESC LIMIT 1
  `).get(newDoc.source_id) as any;

  // Rudimentary clause slicing: paragraphs separated by (possibly repeated)
  // newlines, dropping very short fragments such as headings or stray marks.
  const paragraphs = newDoc.normalized_text
    .split(/(?:\r?\n)+/)
    .filter(line => line.trim().length > 10);

  const newChunks: ClauseChunk[] = paragraphs.map((text, idx) => ({
    chunk_id: shortId('chunk'),
    doc_id: newDoc.doc_id,
    section_path: `Paragraph ${idx + 1}`,
    section_title: truncate(text, 50),
    clause_text: text,
    topic_tags: tagTopics(text),
    embedding_status: 'ready',
    chunk_order: idx,
    created_at: new Date().toISOString()
  }));

  const insertChunkStmt = db.prepare(`
    INSERT INTO clause_chunk (
      chunk_id, doc_id, section_path, section_title, clause_text,
      topic_tags, embedding_status, chunk_order, created_at
    ) VALUES (
      @chunk_id, @doc_id, @section_path, @section_title, @clause_text,
      @topic_tags, @embedding_status, @chunk_order, @created_at
    )
  `);

  // Insert all chunks atomically in one transaction.
  const insertManyChunks = db.transaction((chunksToInsert: ClauseChunk[]) => {
    for (const chunk of chunksToInsert) {
      insertChunkStmt.run({
        ...chunk,
        // SQLite has no array type; tags are stored as a JSON string.
        topic_tags: JSON.stringify(chunk.topic_tags)
      });
    }
  });
  insertManyChunks(newChunks);

  if (!oldDoc) {
    // First revision for this source: record a single 'added' event that
    // covers the whole document.
    insertDiffEvent({
      event_id: shortId('evt'),
      source_id: newDoc.source_id,
      from_doc_id: '',
      to_doc_id: newDoc.doc_id,
      change_type: 'added',
      section_title: 'All Content',
      // Fix: '...' is now only appended when the text is actually truncated
      // (the original always appended it, even for short documents).
      old_excerpt: '',
      new_excerpt: truncate(newDoc.normalized_text, 200),
      // Fix: tag the full text, not just the 200-char excerpt, so topics
      // appearing later in the document are not silently dropped.
      topic_tags: tagTopics(newDoc.normalized_text),
      impact_level: 'medium',
      detected_at: new Date().toISOString()
    });
    return;
  }

  const oldChunks = db.prepare(`
    SELECT * FROM clause_chunk WHERE doc_id = ? ORDER BY chunk_order ASC
  `).all(oldDoc.doc_id) as any[];

  // Set-based membership gives O(n) lookups instead of the original
  // O(n^2) Array.prototype.includes scans.
  const oldTexts = new Set<string>(oldChunks.map(c => c.clause_text));
  const newTexts = new Set<string>(newChunks.map(c => c.clause_text));

  // Clauses present only in the new document → 'added' events.
  for (const chunk of newChunks) {
    if (oldTexts.has(chunk.clause_text)) continue;
    insertDiffEvent({
      event_id: shortId('evt'),
      source_id: newDoc.source_id,
      from_doc_id: oldDoc.doc_id,
      to_doc_id: newDoc.doc_id,
      change_type: 'added',
      section_title: chunk.section_title,
      old_excerpt: '',
      new_excerpt: chunk.clause_text,
      topic_tags: chunk.topic_tags,
      impact_level: 'low',
      detected_at: new Date().toISOString()
    });
  }

  // Clauses present only in the old document → 'removed' events.
  for (const chunk of oldChunks) {
    if (newTexts.has(chunk.clause_text)) continue;
    // Tags come back from SQLite as a JSON string; guard against a
    // malformed or missing column value instead of throwing mid-diff.
    let tags: string[];
    try {
      tags = JSON.parse(chunk.topic_tags || '[]');
    } catch {
      tags = [];
    }
    insertDiffEvent({
      event_id: shortId('evt'),
      source_id: newDoc.source_id,
      from_doc_id: oldDoc.doc_id,
      to_doc_id: newDoc.doc_id,
      change_type: 'removed',
      section_title: chunk.section_title,
      old_excerpt: chunk.clause_text,
      new_excerpt: '',
      topic_tags: tags,
      impact_level: 'low',
      detected_at: new Date().toISOString()
    });
  }
}

/** Generates a short prefixed identifier, e.g. "evt_3f9a0c...". */
function shortId(prefix: string): string {
  return `${prefix}_${uuidv4().replace(/-/g, '').substring(0, 16)}`;
}

/** Truncates text to maxLen characters, appending '...' only when shortened. */
function truncate(text: string, maxLen: number): string {
  return text.length > maxLen ? text.substring(0, maxLen) + '...' : text;
}
/**
 * Persists a single diff event row. Topic tags are serialized to a JSON
 * string because SQLite has no native array type.
 */
function insertDiffEvent(event: DiffEvent) {
  const row = {
    ...event,
    topic_tags: JSON.stringify(event.topic_tags)
  };
  db.prepare(`
    INSERT INTO diff_event (
      event_id, source_id, from_doc_id, to_doc_id, change_type,
      section_title, old_excerpt, new_excerpt, topic_tags, impact_level, detected_at
    ) VALUES (
      @event_id, @source_id, @from_doc_id, @to_doc_id, @change_type,
      @section_title, @old_excerpt, @new_excerpt, @topic_tags, @impact_level, @detected_at
    )
  `).run(row);
}