Spaces:

luoleyuan
/

agent01

Running

Auto Deployer

Deploy compliance agent services

f39c319 14 days ago

5.88 kB

	import { v4 as uuidv4 } from 'uuid';
	import db from './db';
	import { NormalizedDocument, ClauseChunk, DiffEvent } from './models';

	/**
	 * Assigns compliance topic tags to a piece of text via keyword lookup.
	 *
	 * A topic is reported when the text contains at least one of its keywords as
	 * an exact (case-sensitive) substring. Topics are returned in the order in
	 * which the lookup table below lists them.
	 *
	 * @param text - The text to scan.
	 * @returns The identifiers of every matching topic; empty when nothing matches.
	 */
	function tagTopics(text: string): string[] {
		// Each entry pairs a topic identifier with the keywords that trigger it.
		const topicKeywords: ReadonlyArray<readonly [string, readonly string[]]> = [
			['sdk_disclosure', ['SDK', '第三方', '软件开发工具包']],
			['permission_usage', ['权限', '相机', '相册', '定位', '麦克风', '通讯录']],
			['sensitive_personal_info', ['敏感', '身份证', '生物识别', '人脸']],
			['minor_protection', ['未成年', '儿童', '监护人']],
			['data_retention', ['存储', '保留', '期限']],
			['third_party_sharing', ['共享', '转让', '提供给第三方']],
			['cross_border_transfer', ['出境', '境外', '跨境']],
			['user_rights', ['注销', '更正', '访问', '撤回', '权利']],
			['biometric_info', ['指纹', '人脸', '虹膜', '生物识别']]
		];

		return topicKeywords
			.filter(([, keywords]) => keywords.some(keyword => text.includes(keyword)))
			.map(([topic]) => topic);
	}

	/**
	 * Slices a normalized document into clause chunks, stores them, and records
	 * diff events against the most recent archived version of the same source.
	 *
	 * Steps:
	 * 1. Look up the newest `normalized_document` row with `doc_status = 'archived'`
	 *    for `newDoc.source_id`.
	 * 2. Split `newDoc.normalized_text` on line breaks, keep the lines whose
	 *    trimmed length exceeds 10 characters, tag each one with `tagTopics`, and
	 *    insert it into `clause_chunk`.
	 * 3. If no archived version exists, record a single `added` event titled
	 *    "All Content" (impact `medium`) carrying the first 200 characters of the
	 *    text. Otherwise record one low-impact `added` event for every new clause
	 *    whose text is absent from the old version and one low-impact `removed`
	 *    event for every old clause whose text is absent from the new version.
	 *
	 * Clauses are compared by exact text, so an edited clause shows up as one
	 * `removed` plus one `added` event rather than as a modification.
	 *
	 * All writes are wrapped in a single `db.transaction` so a failure part-way
	 * through does not leave chunks stored without their diff events (this relies
	 * on `db.transaction` rolling back on throw, as better-sqlite3 does).
	 *
	 * @param newDoc - The document version to process; `doc_id`, `source_id` and
	 *   `normalized_text` are read.
	 */
	export function sliceAndDiff(newDoc: NormalizedDocument): void {
		// Columns this function reads back from stored rows.
		type StoredDocRow = { doc_id: string };
		type StoredChunkRow = {
			section_title: string;
			clause_text: string;
			topic_tags: string | null;
		};

		// Prefixed identifier built from the first 16 hex digits of a v4 UUID.
		const shortId = (prefix: string): string =>
			`${prefix}_${uuidv4().replace(/-/g, '').substring(0, 16)}`;

		// NOTE(review): only rows with doc_status = 'archived' are considered;
		// presumably the caller archives the previous version before calling this — confirm.
		const oldDoc = db.prepare(`
	SELECT * FROM normalized_document
	WHERE source_id = ? AND doc_status = 'archived'
	ORDER BY created_at DESC LIMIT 1
	`).get(newDoc.source_id) as StoredDocRow | undefined;

		// Simple clause slicing: one chunk per non-trivial line of text.
		const clauseTexts = newDoc.normalized_text
			.split(/(?:\r?\n)+/)
			.filter(line => line.trim().length > 10);

		const newChunks: ClauseChunk[] = clauseTexts.map((text, idx): ClauseChunk => ({
			chunk_id: shortId('chunk'),
			doc_id: newDoc.doc_id,
			section_path: `Paragraph ${idx + 1}`,
			// The title is the clause itself, shortened to 50 characters when longer.
			section_title: text.length > 50 ? text.substring(0, 50) + '...' : text,
			clause_text: text,
			topic_tags: tagTopics(text),
			embedding_status: 'ready',
			chunk_order: idx,
			created_at: new Date().toISOString()
		}));

		const insertChunkStmt = db.prepare(`
	INSERT INTO clause_chunk (
	chunk_id, doc_id, section_path, section_title, clause_text,
	topic_tags, embedding_status, chunk_order, created_at
	) VALUES (
	@chunk_id, @doc_id, @section_path, @section_title, @clause_text,
	@topic_tags, @embedding_status, @chunk_order, @created_at
	)
	`);

		const persistChunksAndEvents = db.transaction(() => {
			for (const chunk of newChunks) {
				insertChunkStmt.run({
					...chunk,
					// Tags are stored as a JSON string.
					topic_tags: JSON.stringify(chunk.topic_tags)
				});
			}

			if (!oldDoc) {
				// No earlier version: report the whole document as one 'added' event.
				const fullText = newDoc.normalized_text;
				// Only mark the excerpt as truncated when something was actually cut off.
				const newExcerpt = fullText.length > 200 ? fullText.substring(0, 200) + '...' : fullText;

				const event: DiffEvent = {
					event_id: shortId('evt'),
					source_id: newDoc.source_id,
					from_doc_id: '',
					to_doc_id: newDoc.doc_id,
					change_type: 'added',
					section_title: 'All Content',
					old_excerpt: '',
					new_excerpt: newExcerpt,
					// Tags are derived from the excerpt only, not the full text.
					topic_tags: tagTopics(newExcerpt),
					impact_level: 'medium',
					detected_at: new Date().toISOString()
				};
				insertDiffEvent(event);
				return;
			}

			const oldChunks = db.prepare(`
	SELECT * FROM clause_chunk WHERE doc_id = ? ORDER BY chunk_order ASC
	`).all(oldDoc.doc_id) as StoredChunkRow[];

			// Simplified diffing by exact clause text; sets keep each lookup O(1).
			// A real implementation would need sequence alignment to detect edits.
			const oldTexts = new Set(oldChunks.map(c => c.clause_text));
			const newTexts = new Set(newChunks.map(c => c.clause_text));

			for (const chunk of newChunks) {
				if (oldTexts.has(chunk.clause_text)) {
					continue;
				}
				const event: DiffEvent = {
					event_id: shortId('evt'),
					source_id: newDoc.source_id,
					from_doc_id: oldDoc.doc_id,
					to_doc_id: newDoc.doc_id,
					change_type: 'added',
					section_title: chunk.section_title,
					old_excerpt: '',
					new_excerpt: chunk.clause_text,
					topic_tags: chunk.topic_tags,
					impact_level: 'low',
					detected_at: new Date().toISOString()
				};
				insertDiffEvent(event);
			}

			for (const chunk of oldChunks) {
				if (newTexts.has(chunk.clause_text)) {
					continue;
				}
				// Stored tags are a JSON string; a missing or empty value means "no tags".
				const tags: string[] = JSON.parse(chunk.topic_tags || '[]');
				const event: DiffEvent = {
					event_id: shortId('evt'),
					source_id: newDoc.source_id,
					from_doc_id: oldDoc.doc_id,
					to_doc_id: newDoc.doc_id,
					change_type: 'removed',
					section_title: chunk.section_title,
					old_excerpt: chunk.clause_text,
					new_excerpt: '',
					topic_tags: tags,
					impact_level: 'low',
					detected_at: new Date().toISOString()
				};
				insertDiffEvent(event);
			}
		});

		persistChunksAndEvents();
	}

	/**
	 * Writes a single diff event into the `diff_event` table.
	 *
	 * The event's fields are bound to the statement's named parameters as they
	 * are, except `topic_tags`, which is serialized to a JSON string first.
	 *
	 * @param event - The diff event to persist.
	 */
	function insertDiffEvent(event: DiffEvent) {
		const insertEvent = db.prepare(`
	INSERT INTO diff_event (
	event_id, source_id, from_doc_id, to_doc_id, change_type,
	section_title, old_excerpt, new_excerpt, topic_tags, impact_level, detected_at
	) VALUES (
	@event_id, @source_id, @from_doc_id, @to_doc_id, @change_type,
	@section_title, @old_excerpt, @new_excerpt, @topic_tags, @impact_level, @detected_at
	)
	`);

		// The tag list is stored as JSON text; every other field is passed through.
		const serializedTags = JSON.stringify(event.topic_tags);
		insertEvent.run({ ...event, topic_tags: serializedTags });
	}