File size: 5,309 Bytes
1fed70a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
#!/usr/bin/env python3
"""Quick evaluation of ByT5 on Indo NLP test sets - simplified version."""

import sys
from pathlib import Path

# Directory two levels above this file (the parent of the folder holding the
# script). It is put at the front of sys.path before the remaining imports;
# presumably so the project-local `core` package resolves when the script is
# launched directly -- confirm against the repository layout.
project_root = Path(__file__).parent.parent
sys.path.insert(0, str(project_root))

import torch
import pandas as pd

from core.decoder import BeamSearchDecoder

def load_test_set(filepath, max_samples=None):
    """Read an Indo NLP test file into a list of sample dicts.

    The file is treated as alternating lines: a Singlish input line followed
    by its Sinhala reference. Lines are whitespace-stripped and blank lines
    are discarded before pairing, so an unpaired trailing line is dropped.

    Args:
        filepath: Path of the UTF-8 encoded test file.
        max_samples: Stop after this many pairs. Any falsy value (``None``
            or ``0``) means "no limit".

    Returns:
        A list of ``{'singlish': ..., 'expected': ...}`` dicts in file order.
    """
    with open(filepath, 'r', encoding='utf-8') as handle:
        stripped = (raw.strip() for raw in handle)
        content = [text for text in stripped if text]

    pairs = []
    # Even positions hold the inputs, odd positions the references; zip()
    # silently ignores a final input that has no reference.
    for source_text, target_text in zip(content[0::2], content[1::2]):
        pairs.append({'singlish': source_text, 'expected': target_text})
        if max_samples and len(pairs) >= max_samples:
            break
    return pairs

def compute_metrics(predicted, expected):
    """Score one prediction against its reference string.

    The metrics are lightweight approximations, not the canonical
    edit-distance / n-gram definitions:

    * ``cer``: ``1 - SequenceMatcher.ratio()`` over characters.
    * ``wer``: ``1 - SequenceMatcher.ratio()`` over whitespace-split tokens.
    * ``bleu``: clipped unigram overlap divided by the reference length.
    * ``em``: ``1`` when the two strings are identical, else ``0``.

    When the reference is empty, ``cer``/``wer`` are ``0.0`` for an empty
    prediction and ``1.0`` otherwise; ``bleu`` is ``1.0`` / ``0.0``
    respectively.

    Args:
        predicted: Model output string.
        expected: Reference string.

    Returns:
        Dict with keys ``'cer'``, ``'wer'``, ``'bleu'`` (floats in [0, 1])
        and ``'em'`` (int).
    """
    from collections import Counter
    from difflib import SequenceMatcher

    def _dissimilarity(hypothesis, reference):
        # An empty reference has no meaningful ratio; score it by whether
        # the hypothesis is empty too.
        if not reference:
            return 1.0 if hypothesis else 0.0
        # autojunk=False: the default heuristic ignores any item that makes
        # up more than 1% of a reference of 200+ items, which for long
        # sentences throws away common characters and can drive the ratio
        # to 0 even for near-identical strings.
        matcher = SequenceMatcher(None, hypothesis, reference, autojunk=False)
        return 1.0 - matcher.ratio()

    pred_words = predicted.split()
    exp_words = expected.split()

    # CER (character level) and WER (token level)
    cer = _dissimilarity(predicted, expected)
    wer = _dissimilarity(pred_words, exp_words)

    # BLEU (simple unigram overlap). Counts are clipped to the reference
    # counts (Counter intersection keeps the per-token minimum) so repeated
    # or surplus predicted tokens cannot push the score above 1.0.
    if exp_words:
        overlap = Counter(pred_words) & Counter(exp_words)
        bleu = sum(overlap.values()) / len(exp_words)
    else:
        bleu = 0.0 if pred_words else 1.0

    # EM (Exact Match)
    em = 1 if predicted == expected else 0

    return {'cer': cer, 'wer': wer, 'bleu': bleu, 'em': em}

def _evaluate_subset(decoder, samples, subset):
    """Decode and score every sample of one subset, printing per-sample progress.

    Args:
        decoder: Object whose ``decode(text)`` returns a 3-tuple; only the
            first element (the predicted string) is used.
        samples: List of ``{'singlish': ..., 'expected': ...}`` dicts.
        subset: Label stored in each result row's ``'subset'`` field.

    Returns:
        One result dict per sample (sample fields + ``'predicted'``,
        ``'subset'`` and the metric values), in input order.
    """
    results = []
    total = len(samples)
    for position, sample in enumerate(samples, start=1):
        try:
            predicted, _, _ = decoder.decode(sample['singlish'])
            metrics = compute_metrics(predicted, sample['expected'])
        except Exception as e:
            # Deliberate best-effort: one failing sample is reported and
            # recorded with worst-case scores instead of aborting the run.
            print(f"{position}/{total}: ERROR - {e}")
            results.append({**sample, 'predicted': '[ERROR]', 'subset': subset,
                            'cer': 1.0, 'wer': 1.0, 'bleu': 0.0, 'em': 0})
            continue
        results.append({**sample, 'predicted': predicted, 'subset': subset, **metrics})
        print(f"{position}/{total}: EM={metrics['em']} CER={metrics['cer']:.3f} WER={metrics['wer']:.3f}")
    return results


def _print_summary(name, df):
    """Print mean/std of each metric for one result DataFrame."""
    print(f"\n{name} (n={len(df)}):")
    if df.empty:
        # A DataFrame built from an empty list has no metric columns, so
        # indexing df['cer'] would raise KeyError.
        print("  (no samples evaluated)")
        return
    print(f"  CER:  {df['cer'].mean():.4f} ± {df['cer'].std():.4f}")
    print(f"  WER:  {df['wer'].mean():.4f} ± {df['wer'].std():.4f}")
    print(f"  BLEU: {df['bleu'].mean():.4f} ± {df['bleu'].std():.4f}")
    print(f"  EM:   {df['em'].mean():.4f} ({int(df['em'].sum())}/{len(df)})")


def main():
    """Evaluate the decoder on the formal and informal Sinhala test sets.

    The optional first command-line argument is the maximum number of
    samples to load from each test set (default 5). Per-sample results are
    printed, followed by a per-subset and overall summary, and all rows are
    written to ``misc/quick_eval_results.csv``. Input and output paths are
    relative to the current working directory.

    Raises:
        ValueError: If the command-line argument is not an integer.
        OSError: If a test-set file cannot be opened.
    """
    device = "cuda" if torch.cuda.is_available() else "cpu"
    print(f"Device: {device}\n")

    # Parse command line
    max_samples = int(sys.argv[1]) if len(sys.argv) > 1 else 5

    print("Loading BeamSearchDecoder...")
    decoder = BeamSearchDecoder(device=device)
    print("Decoder loaded!\n")

    # Load test sets
    test_dir = Path("IndoNLP-2025-Shared-Task/Test Dataset/Sinhala")

    print(f"Loading test sets (max {max_samples} samples each)...")
    formal_samples = load_test_set(test_dir / "Sinhala Test set 1.txt", max_samples=max_samples)
    informal_samples = load_test_set(test_dir / "Sinhala Test set 2.txt", max_samples=max_samples)
    print(f"Formal: {len(formal_samples)}, Informal: {len(informal_samples)}\n")

    # Evaluate formal
    print("=" * 60)
    print(f"FORMAL SUBSET ({len(formal_samples)} samples)")
    print("=" * 60)
    formal_results = _evaluate_subset(decoder, formal_samples, 'formal')

    # Evaluate informal
    print("\n" + "=" * 60)
    print(f"INFORMAL SUBSET ({len(informal_samples)} samples)")
    print("=" * 60)
    informal_results = _evaluate_subset(decoder, informal_samples, 'informal')

    all_results = formal_results + informal_results

    # Summary
    print("\n" + "=" * 60)
    print("SUMMARY")
    print("=" * 60)

    all_df = pd.DataFrame(all_results)
    summaries = [
        ("Formal", pd.DataFrame(formal_results)),
        ("Informal", pd.DataFrame(informal_results)),
        ("Overall", all_df),
    ]
    for name, df in summaries:
        _print_summary(name, df)

    # Save. Create the output directory first so a missing "misc" folder
    # does not discard the results of a finished evaluation run.
    output_file = "misc/quick_eval_results.csv"
    Path(output_file).parent.mkdir(parents=True, exist_ok=True)
    all_df.to_csv(output_file, index=False)
    print(f"\nResults saved to: {output_file}")

# Run the evaluation only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    main()