// agent01 / src /crawler /differ.ts
// Auto Deployer
// Deploy compliance agent services
// f39c319
import { v4 as uuidv4 } from 'uuid';
import db from './db';
import { NormalizedDocument, ClauseChunk, DiffEvent } from './models';
// Simple logic for topic tagging based on keywords
function tagTopics(text: string): string[] {
const topics: string[] = [];
const keywordMap: Record<string, string[]> = {
'sdk_disclosure': ['SDK', '第三方', '软件开发工具包'],
'permission_usage': ['权限', '相机', '相册', '定位', '麦克风', '通讯录'],
'sensitive_personal_info': ['敏感', '身份证', '生物识别', '人脸'],
'minor_protection': ['未成年', '儿童', '监护人'],
'data_retention': ['存储', '保留', '期限'],
'third_party_sharing': ['共享', '转让', '提供给第三方'],
'cross_border_transfer': ['出境', '境外', '跨境'],
'user_rights': ['注销', '更正', '访问', '撤回', '权利'],
'biometric_info': ['指纹', '人脸', '虹膜', '生物识别']
};
for (const [topic, keywords] of Object.entries(keywordMap)) {
if (keywords.some(kw => text.includes(kw))) {
topics.push(topic);
}
}
return topics;
}
export function sliceAndDiff(newDoc: NormalizedDocument) {
// Find the previous active document to diff against
const oldDoc = db.prepare(`
SELECT * FROM normalized_document
WHERE source_id = ? AND doc_status = 'archived'
ORDER BY created_at DESC LIMIT 1
`).get(newDoc.source_id) as any;
// Simple clause slicing: splitting by newlines or rudimentary sections
const chunks = newDoc.normalized_text.split(/(?:\r?\n)+/).filter(line => line.trim().length > 10);
const newChunks: ClauseChunk[] = chunks.map((text, idx) => {
const chunk_id = `chunk_${uuidv4().replace(/-/g, '').substring(0, 16)}`;
const sectionTitle = text.length > 50 ? text.substring(0, 50) + '...' : text;
const topicTags = tagTopics(text);
return {
chunk_id,
doc_id: newDoc.doc_id,
section_path: `Paragraph ${idx + 1}`,
section_title: sectionTitle,
clause_text: text,
topic_tags: topicTags,
embedding_status: 'ready',
chunk_order: idx,
created_at: new Date().toISOString()
};
});
const insertChunkStmt = db.prepare(`
INSERT INTO clause_chunk (
chunk_id, doc_id, section_path, section_title, clause_text,
topic_tags, embedding_status, chunk_order, created_at
) VALUES (
@chunk_id, @doc_id, @section_path, @section_title, @clause_text,
@topic_tags, @embedding_status, @chunk_order, @created_at
)
`);
const insertManyChunks = db.transaction((chunksToInsert: ClauseChunk[]) => {
for (const chunk of chunksToInsert) {
insertChunkStmt.run({
...chunk,
topic_tags: JSON.stringify(chunk.topic_tags)
});
}
});
insertManyChunks(newChunks);
// Version Comparison (Diff Event)
if (!oldDoc) {
// If no old doc, all sections are 'added'
const event_id = `evt_${uuidv4().replace(/-/g, '').substring(0, 16)}`;
const newExcerpt = newDoc.normalized_text.substring(0, 200) + '...';
const event: DiffEvent = {
event_id,
source_id: newDoc.source_id,
from_doc_id: '',
to_doc_id: newDoc.doc_id,
change_type: 'added',
section_title: 'All Content',
old_excerpt: '',
new_excerpt: newExcerpt,
topic_tags: tagTopics(newExcerpt),
impact_level: 'medium',
detected_at: new Date().toISOString()
};
insertDiffEvent(event);
} else {
// Diffing
const oldChunks = db.prepare(`
SELECT * FROM clause_chunk WHERE doc_id = ? ORDER BY chunk_order ASC
`).all(oldDoc.doc_id) as any[];
// Simplified Diffing logic:
// If old text contains a chunk that new doesn't, it's removed.
// If new text contains a chunk that old doesn't, it's added.
// In a real scenario, more sophisticated diffing (e.g. sequence alignment) is needed.
const oldTexts = oldChunks.map(c => c.clause_text);
const newTexts = newChunks.map(c => c.clause_text);
const added = newChunks.filter(c => !oldTexts.includes(c.clause_text));
const removed = oldChunks.filter(c => !newTexts.includes(c.clause_text));
// Combine added and removed into diff events
for (const chunk of added) {
const event: DiffEvent = {
event_id: `evt_${uuidv4().replace(/-/g, '').substring(0, 16)}`,
source_id: newDoc.source_id,
from_doc_id: oldDoc.doc_id,
to_doc_id: newDoc.doc_id,
change_type: 'added',
section_title: chunk.section_title,
old_excerpt: '',
new_excerpt: chunk.clause_text,
topic_tags: chunk.topic_tags,
impact_level: 'low',
detected_at: new Date().toISOString()
};
insertDiffEvent(event);
}
for (const chunk of removed) {
const tags = JSON.parse(chunk.topic_tags || '[]');
const event: DiffEvent = {
event_id: `evt_${uuidv4().replace(/-/g, '').substring(0, 16)}`,
source_id: newDoc.source_id,
from_doc_id: oldDoc.doc_id,
to_doc_id: newDoc.doc_id,
change_type: 'removed',
section_title: chunk.section_title,
old_excerpt: chunk.clause_text,
new_excerpt: '',
topic_tags: tags,
impact_level: 'low',
detected_at: new Date().toISOString()
};
insertDiffEvent(event);
}
}
}
function insertDiffEvent(event: DiffEvent) {
const stmt = db.prepare(`
INSERT INTO diff_event (
event_id, source_id, from_doc_id, to_doc_id, change_type,
section_title, old_excerpt, new_excerpt, topic_tags, impact_level, detected_at
) VALUES (
@event_id, @source_id, @from_doc_id, @to_doc_id, @change_type,
@section_title, @old_excerpt, @new_excerpt, @topic_tags, @impact_level, @detected_at
)
`);
stmt.run({
...event,
topic_tags: JSON.stringify(event.topic_tags)
});
}