File size: 5,877 Bytes
f39c319
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
import { v4 as uuidv4 } from 'uuid';
import db from './db';
import { NormalizedDocument, ClauseChunk, DiffEvent } from './models';

/**
 * Assigns topic tags to a piece of text by plain substring matching.
 *
 * A topic is included when at least one of its keywords occurs in `text`.
 * Matching is case-sensitive (`String.prototype.includes`), and a keyword
 * shared by several topics tags all of them.
 *
 * @param text - The text to scan.
 * @returns Matching topic identifiers, in the order the topics are listed
 *          below; an empty array when nothing matches.
 */
function tagTopics(text: string): string[] {
  // Ordered (topic, keywords) pairs; the order here is the output order.
  const topicKeywords: ReadonlyArray<readonly [string, readonly string[]]> = [
    ['sdk_disclosure', ['SDK', '第三方', '软件开发工具包']],
    ['permission_usage', ['权限', '相机', '相册', '定位', '麦克风', '通讯录']],
    ['sensitive_personal_info', ['敏感', '身份证', '生物识别', '人脸']],
    ['minor_protection', ['未成年', '儿童', '监护人']],
    ['data_retention', ['存储', '保留', '期限']],
    ['third_party_sharing', ['共享', '转让', '提供给第三方']],
    ['cross_border_transfer', ['出境', '境外', '跨境']],
    ['user_rights', ['注销', '更正', '访问', '撤回', '权利']],
    ['biometric_info', ['指纹', '人脸', '虹膜', '生物识别']]
  ];

  return topicKeywords
    .filter(([, keywords]) => keywords.some(keyword => text.includes(keyword)))
    .map(([topic]) => topic);
}

/**
 * Slices a normalized document into clause chunks, stores them, and records
 * diff events against the most recently created archived document that has
 * the same `source_id`.
 *
 * Behavior:
 * - The text is split on runs of line breaks; fragments whose trimmed length
 *   is 10 characters or fewer are dropped. Each remaining fragment becomes one
 *   `clause_chunk` row, tagged via `tagTopics`.
 * - When no archived document exists for the source, a single `added` event
 *   titled "All Content" is recorded, carrying the first 200 characters of the
 *   text (with a trailing `...` only when the text was actually truncated).
 * - Otherwise chunks are compared by exact `clause_text`: new texts absent
 *   from the old document yield `added` events, old texts absent from the new
 *   document yield `removed` events. This is not a sequence alignment, so an
 *   edited paragraph shows up as one removal plus one addition.
 *
 * All chunk and event inserts run inside one transaction, so a failure leaves
 * neither chunks nor events behind.
 *
 * NOTE(review): the comparison baseline is selected by `doc_status = 'archived'`;
 * presumably the caller archives the previous version before calling — confirm.
 *
 * @param newDoc - The document to slice; `doc_id`, `source_id` and
 *                 `normalized_text` are read.
 */
export function sliceAndDiff(newDoc: NormalizedDocument): void {
  // Minimal shapes of the rows read back from the database in this function.
  type StoredDocRow = { doc_id: string };
  type StoredChunkRow = {
    clause_text: string;
    section_title: string;
    topic_tags: string | null;
  };

  // `<prefix>_` followed by the first 16 hex characters of a fresh UUID v4.
  const shortId = (prefix: string): string =>
    `${prefix}_${uuidv4().replace(/-/g, '').substring(0, 16)}`;

  // Baseline for the comparison: latest archived document of the same source.
  const oldDoc = db.prepare(`
    SELECT * FROM normalized_document
    WHERE source_id = ? AND doc_status = 'archived'
    ORDER BY created_at DESC LIMIT 1
  `).get(newDoc.source_id) as StoredDocRow | undefined;

  // Simple clause slicing: one chunk per non-trivial line.
  const paragraphs = newDoc.normalized_text
    .split(/(?:\r?\n)+/)
    .filter(line => line.trim().length > 10);

  const newChunks: ClauseChunk[] = paragraphs.map((text, idx): ClauseChunk => ({
    chunk_id: shortId('chunk'),
    doc_id: newDoc.doc_id,
    section_path: `Paragraph ${idx + 1}`,
    section_title: text.length > 50 ? text.substring(0, 50) + '...' : text,
    clause_text: text,
    topic_tags: tagTopics(text),
    embedding_status: 'ready',
    chunk_order: idx,
    created_at: new Date().toISOString()
  }));

  const insertChunkStmt = db.prepare(`
    INSERT INTO clause_chunk (
      chunk_id, doc_id, section_path, section_title, clause_text,
      topic_tags, embedding_status, chunk_order, created_at
    ) VALUES (
      @chunk_id, @doc_id, @section_path, @section_title, @clause_text,
      @topic_tags, @embedding_status, @chunk_order, @created_at
    )
  `);

  // Builds the diff events for this document without writing anything.
  const buildEvents = (): DiffEvent[] => {
    if (!oldDoc) {
      // First version seen for this source: report everything as added.
      const fullText = newDoc.normalized_text;
      const newExcerpt =
        fullText.length > 200 ? fullText.substring(0, 200) + '...' : fullText;

      const firstVersionEvent: DiffEvent = {
        event_id: shortId('evt'),
        source_id: newDoc.source_id,
        from_doc_id: '',
        to_doc_id: newDoc.doc_id,
        change_type: 'added',
        section_title: 'All Content',
        old_excerpt: '',
        new_excerpt: newExcerpt,
        topic_tags: tagTopics(newExcerpt),
        impact_level: 'medium',
        detected_at: new Date().toISOString()
      };
      return [firstVersionEvent];
    }

    const oldChunks = db.prepare(`
      SELECT * FROM clause_chunk WHERE doc_id = ? ORDER BY chunk_order ASC
    `).all(oldDoc.doc_id) as StoredChunkRow[];

    // Sets give constant-time membership checks instead of a scan per chunk.
    const oldTexts = new Set(oldChunks.map(c => c.clause_text));
    const newTexts = new Set(newChunks.map(c => c.clause_text));

    const addedEvents = newChunks
      .filter(chunk => !oldTexts.has(chunk.clause_text))
      .map((chunk): DiffEvent => ({
        event_id: shortId('evt'),
        source_id: newDoc.source_id,
        from_doc_id: oldDoc.doc_id,
        to_doc_id: newDoc.doc_id,
        change_type: 'added',
        section_title: chunk.section_title,
        old_excerpt: '',
        new_excerpt: chunk.clause_text,
        topic_tags: chunk.topic_tags,
        impact_level: 'low',
        detected_at: new Date().toISOString()
      }));

    const removedEvents = oldChunks
      .filter(chunk => !newTexts.has(chunk.clause_text))
      .map((chunk): DiffEvent => {
        // Stored tags are JSON text; `||` (not `??`) so an empty string also
        // falls back to an empty list instead of failing to parse.
        const parsed: unknown = JSON.parse(chunk.topic_tags || '[]');
        const tags = Array.isArray(parsed)
          ? parsed.filter((tag): tag is string => typeof tag === 'string')
          : [];

        return {
          event_id: shortId('evt'),
          source_id: newDoc.source_id,
          from_doc_id: oldDoc.doc_id,
          to_doc_id: newDoc.doc_id,
          change_type: 'removed',
          section_title: chunk.section_title,
          old_excerpt: chunk.clause_text,
          new_excerpt: '',
          topic_tags: tags,
          impact_level: 'low',
          detected_at: new Date().toISOString()
        };
      });

    return [...addedEvents, ...removedEvents];
  };

  // Chunks and events are written together so a failure cannot leave a
  // document with chunks but no (or only some) diff events.
  const persistAll = db.transaction(() => {
    for (const chunk of newChunks) {
      insertChunkStmt.run({
        ...chunk,
        topic_tags: JSON.stringify(chunk.topic_tags)
      });
    }

    for (const event of buildEvents()) {
      insertDiffEvent(event);
    }
  });
  persistAll();
}

/**
 * Writes one diff event to the `diff_event` table.
 *
 * The event's fields are bound as named parameters; `topic_tags` is
 * serialized to a JSON string before binding.
 *
 * @param event - The diff event to store.
 */
function insertDiffEvent(event: DiffEvent) {
  // Same fields as the event, with the tag list replaced by its JSON text.
  const row = {
    ...event,
    topic_tags: JSON.stringify(event.topic_tags)
  };

  db.prepare(`
    INSERT INTO diff_event (
      event_id, source_id, from_doc_id, to_doc_id, change_type,
      section_title, old_excerpt, new_excerpt, topic_tags, impact_level, detected_at
    ) VALUES (
      @event_id, @source_id, @from_doc_id, @to_doc_id, @change_type,
      @section_title, @old_excerpt, @new_excerpt, @topic_tags, @impact_level, @detected_at
    )
  `).run(row);
}