import { v4 as uuidv4 } from 'uuid';
import db from './db';
import { NormalizedDocument, ClauseChunk, DiffEvent } from './models';
/**
 * Tags a piece of clause text with topic identifiers using simple
 * keyword matching. A topic is attached as soon as any one of its
 * keywords appears as a substring of the text.
 */
function tagTopics(text: string): string[] {
  const keywordMap: Record<string, string[]> = {
    'sdk_disclosure': ['SDK', '第三方', '软件开发工具包'],
    'permission_usage': ['权限', '相机', '相册', '定位', '麦克风', '通讯录'],
    'sensitive_personal_info': ['敏感', '身份证', '生物识别', '人脸'],
    'minor_protection': ['未成年', '儿童', '监护人'],
    'data_retention': ['存储', '保留', '期限'],
    'third_party_sharing': ['共享', '转让', '提供给第三方'],
    'cross_border_transfer': ['出境', '境外', '跨境'],
    'user_rights': ['注销', '更正', '访问', '撤回', '权利'],
    'biometric_info': ['指纹', '人脸', '虹膜', '生物识别']
  };
  // Keep every topic whose keyword list has at least one hit in the text.
  return Object.entries(keywordMap)
    .filter(([, keywords]) => keywords.some(kw => text.includes(kw)))
    .map(([topic]) => topic);
}
/**
 * Slices a normalized document into clause chunks, persists them, and emits
 * diff events against the most recent archived revision of the same source.
 *
 * Diffing is set-based on exact clause text: clauses present only in the new
 * document become 'added' events, clauses present only in the old one become
 * 'removed' events. A modified clause therefore surfaces as an added/removed
 * pair. Real sequence-alignment diffing is intentionally out of scope here.
 */
export function sliceAndDiff(newDoc: NormalizedDocument): void {
  // Most recent archived revision of this source (if any) to diff against.
  const oldDoc = db.prepare(`
    SELECT * FROM normalized_document
    WHERE source_id = ? AND doc_status = 'archived'
    ORDER BY created_at DESC LIMIT 1
  `).get(newDoc.source_id) as any;

  // Rudimentary clause slicing: paragraphs separated by (possibly repeated)
  // newlines, dropping very short fragments such as headings or stray marks.
  const paragraphs = newDoc.normalized_text
    .split(/(?:\r?\n)+/)
    .filter(line => line.trim().length > 10);

  const newChunks: ClauseChunk[] = paragraphs.map((text, idx) => ({
    chunk_id: shortId('chunk'),
    doc_id: newDoc.doc_id,
    section_path: `Paragraph ${idx + 1}`,
    section_title: truncate(text, 50),
    clause_text: text,
    topic_tags: tagTopics(text),
    embedding_status: 'ready',
    chunk_order: idx,
    created_at: new Date().toISOString()
  }));

  const insertChunkStmt = db.prepare(`
    INSERT INTO clause_chunk (
      chunk_id, doc_id, section_path, section_title, clause_text,
      topic_tags, embedding_status, chunk_order, created_at
    ) VALUES (
      @chunk_id, @doc_id, @section_path, @section_title, @clause_text,
      @topic_tags, @embedding_status, @chunk_order, @created_at
    )
  `);

  // Insert all chunks atomically in one transaction.
  const insertManyChunks = db.transaction((chunksToInsert: ClauseChunk[]) => {
    for (const chunk of chunksToInsert) {
      insertChunkStmt.run({
        ...chunk,
        // SQLite has no array type; tags are stored as a JSON string.
        topic_tags: JSON.stringify(chunk.topic_tags)
      });
    }
  });
  insertManyChunks(newChunks);

  if (!oldDoc) {
    // First revision for this source: record a single 'added' event that
    // covers the whole document.
    insertDiffEvent({
      event_id: shortId('evt'),
      source_id: newDoc.source_id,
      from_doc_id: '',
      to_doc_id: newDoc.doc_id,
      change_type: 'added',
      section_title: 'All Content',
      // Fix: '...' is now only appended when the text is actually truncated
      // (the original always appended it, even for short documents).
      old_excerpt: '',
      new_excerpt: truncate(newDoc.normalized_text, 200),
      // Fix: tag the full text, not just the 200-char excerpt, so topics
      // appearing later in the document are not silently dropped.
      topic_tags: tagTopics(newDoc.normalized_text),
      impact_level: 'medium',
      detected_at: new Date().toISOString()
    });
    return;
  }

  const oldChunks = db.prepare(`
    SELECT * FROM clause_chunk WHERE doc_id = ? ORDER BY chunk_order ASC
  `).all(oldDoc.doc_id) as any[];

  // Set-based membership gives O(n) lookups instead of the original
  // O(n^2) Array.prototype.includes scans.
  const oldTexts = new Set<string>(oldChunks.map(c => c.clause_text));
  const newTexts = new Set<string>(newChunks.map(c => c.clause_text));

  // Clauses present only in the new document → 'added' events.
  for (const chunk of newChunks) {
    if (oldTexts.has(chunk.clause_text)) continue;
    insertDiffEvent({
      event_id: shortId('evt'),
      source_id: newDoc.source_id,
      from_doc_id: oldDoc.doc_id,
      to_doc_id: newDoc.doc_id,
      change_type: 'added',
      section_title: chunk.section_title,
      old_excerpt: '',
      new_excerpt: chunk.clause_text,
      topic_tags: chunk.topic_tags,
      impact_level: 'low',
      detected_at: new Date().toISOString()
    });
  }

  // Clauses present only in the old document → 'removed' events.
  for (const chunk of oldChunks) {
    if (newTexts.has(chunk.clause_text)) continue;
    // Tags come back from SQLite as a JSON string; guard against a
    // malformed or missing column value instead of throwing mid-diff.
    let tags: string[];
    try {
      tags = JSON.parse(chunk.topic_tags || '[]');
    } catch {
      tags = [];
    }
    insertDiffEvent({
      event_id: shortId('evt'),
      source_id: newDoc.source_id,
      from_doc_id: oldDoc.doc_id,
      to_doc_id: newDoc.doc_id,
      change_type: 'removed',
      section_title: chunk.section_title,
      old_excerpt: chunk.clause_text,
      new_excerpt: '',
      topic_tags: tags,
      impact_level: 'low',
      detected_at: new Date().toISOString()
    });
  }
}

/** Generates a short prefixed identifier, e.g. "evt_3f9a0c...". */
function shortId(prefix: string): string {
  return `${prefix}_${uuidv4().replace(/-/g, '').substring(0, 16)}`;
}

/** Truncates text to maxLen characters, appending '...' only when shortened. */
function truncate(text: string, maxLen: number): string {
  return text.length > maxLen ? text.substring(0, maxLen) + '...' : text;
}
/**
 * Persists a single diff event row. Topic tags are serialized to a JSON
 * string because SQLite has no native array type.
 */
function insertDiffEvent(event: DiffEvent) {
  const row = {
    ...event,
    topic_tags: JSON.stringify(event.topic_tags)
  };
  db.prepare(`
    INSERT INTO diff_event (
      event_id, source_id, from_doc_id, to_doc_id, change_type,
      section_title, old_excerpt, new_excerpt, topic_tags, impact_level, detected_at
    ) VALUES (
      @event_id, @source_id, @from_doc_id, @to_doc_id, @change_type,
      @section_title, @old_excerpt, @new_excerpt, @topic_tags, @impact_level, @detected_at
    )
  `).run(row);
}