File size: 2,479 Bytes
c9955a9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
{
  "current_model": {
    "name": "Granite-107M-Multilingual",
    "repo": "ibm-granite/granite-embedding-107m-multilingual",
    "params": "107M",
    "pros": [
      "Already integrated and working",
      "Fast (107M parameters)",
      "Proven in production tests",
      "Correctly deduplicated Gemma-3 (47.8% dupes)",
      "0% false positives with Qwen2.5 1.5B"
    ],
    "cons": [
      "Smaller model (107M vs 500M+)",
      "May miss nuanced similarities"
    ],
    "test_results": {
      "qwen2.5_1.5b_extraction": {
        "duplicate_rate": "0%",
        "deduplication_accuracy": "100%",
        "note": "Extraction already unique per window"
      },
      "gemma3_1b_extraction": {
        "duplicate_rate": "47.8%",
        "deduplication_accuracy": "100%",
        "note": "Correctly identified all duplicates"
      }
    }
  },
  "alternatives": {
    "bge_m3": {
      "name": "BGE-M3",
      "repo": "BAAI/bge-m3",
      "gguf_repo": "lm-kit/bge-m3-gguf",
      "params": "568M",
      "pros": [
        "SOTA on MTEB Chinese benchmarks",
        "Larger model (568M vs 107M)",
        "Better semantic understanding"
      ],
      "cons": [
        "5x larger (slower)",
        "Official weights require sentence-transformers (GGUF is a third-party conversion)",
        "Unknown if GGUF version works with llama-cpp"
      ],
      "recommendation": "Worth testing if accuracy issues arise"
    },
    "multilingual_e5": {
      "name": "Multilingual-E5-Large",
      "repo": "intfloat/multilingual-e5-large",
      "params": "560M",
      "pros": [
        "Microsoft-backed, widely tested",
        "Excellent for multilingual text",
        "Good for Chinese text"
      ],
      "cons": [
        "5x larger than Granite-107M",
        "Requires sentence-transformers",
        "No GGUF version readily available"
      ],
      "recommendation": "Consider if switching to sentence-transformers"
    }
  },
  "recommendation": {
    "current_status": "KEEP Granite-107M",
    "rationale": [
      "Working correctly in production",
      "Fast enough for real-time use",
      "Zero false positives in tests",
      "Simple GGUF integration"
    ],
    "when_to_upgrade": [
      "If false positives/negatives appear in production",
      "If better semantic matching is needed (not just exact duplicates)",
      "If processing very long texts (need better context understanding)"
    ],
    "suggested_thresholds": {
      "strict": 0.9,
      "default": 0.85,
      "lenient": 0.8
    }
  }
}