#!/usr/bin/env python3
"""Quick evaluation of ByT5 on Indo NLP test sets - simplified version."""
import sys
from pathlib import Path
project_root = Path(__file__).parent.parent
sys.path.insert(0, str(project_root))
import torch
import pandas as pd
from core.decoder import BeamSearchDecoder
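
# NOTE: BeamSearchDecoder.decode is assumed (from its use in main() below)
# to return a 3-tuple whose first element is the predicted Sinhala string;
# the other two return values are unused in this script.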

def load_test_set(filepath, max_samples=None):
    """Load an IndoNLP test set (pairs of lines: Singlish, Sinhala)."""
    samples = []
    with open(filepath, 'r', encoding='utf-8') as f:
        lines = [line.strip() for line in f if line.strip()]
    for i in range(0, len(lines), 2):
        if i + 1 < len(lines):
            samples.append({
                'singlish': lines[i],
                'expected': lines[i + 1],
            })
        if max_samples and len(samples) >= max_samples:
            break
    return samples
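
# Each test file is assumed to hold alternating non-empty lines (romanized
# "Singlish" text followed by its Sinhala reference); a trailing unpaired
# line is silently dropped by the i + 1 check above.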

def compute_metrics(predicted, expected):
    """Compute CER, WER, a unigram-overlap BLEU proxy, and exact match."""
    from collections import Counter
    from difflib import SequenceMatcher

    # CER (Character Error Rate), approximated as 1 minus difflib's
    # character-level similarity ratio (not a true edit-distance CER).
    matcher_char = SequenceMatcher(None, predicted, expected)
    cer = (1.0 - matcher_char.ratio()) if expected else (1.0 if predicted else 0.0)

    # WER (Word Error Rate), same approximation over whitespace tokens.
    pred_words = predicted.split()
    exp_words = expected.split()
    matcher_word = SequenceMatcher(None, pred_words, exp_words)
    wer = (1.0 - matcher_word.ratio()) if exp_words else (1.0 if pred_words else 0.0)

    # BLEU proxy: clipped unigram overlap against the reference. Clipping
    # repeated tokens keeps the score bounded in [0, 1].
    if exp_words:
        pred_counts, exp_counts = Counter(pred_words), Counter(exp_words)
        matches = sum(min(count, exp_counts[token]) for token, count in pred_counts.items())
        bleu = matches / len(exp_words)
    else:
        bleu = 1.0 if not pred_words else 0.0

    # EM (Exact Match)
    em = 1 if predicted == expected else 0

    return {'cer': cer, 'wer': wer, 'bleu': bleu, 'em': em}
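
# The difflib-based CER/WER above measure (1 - similarity ratio), which only
# approximates true edit-distance error rates. For cross-checking, here is a
# minimal exact-CER sketch; `levenshtein_cer` is an added helper for
# illustration and is not called by the evaluation below.
def levenshtein_cer(predicted, expected):
    """Exact CER: Levenshtein distance divided by reference length."""
    if not expected:
        return 1.0 if predicted else 0.0
    # prev[j] holds the edit distance between the processed prefix of
    # `predicted` and expected[:j].
    prev = list(range(len(expected) + 1))
    for i, p_char in enumerate(predicted, 1):
        curr = [i]  # distance from predicted[:i] to the empty string
        for j, e_char in enumerate(expected, 1):
            curr.append(min(
                prev[j] + 1,                       # delete p_char
                curr[j - 1] + 1,                   # insert e_char
                prev[j - 1] + (p_char != e_char),  # substitute / match
            ))
        prev = curr
    return prev[-1] / len(expected)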

def main():
    device = "cuda" if torch.cuda.is_available() else "cpu"
    print(f"Device: {device}\n")

    # Optional CLI argument: cap on sample pairs per test set (default 5).
    max_samples = int(sys.argv[1]) if len(sys.argv) > 1 else 5

    print("Loading BeamSearchDecoder...")
    decoder = BeamSearchDecoder(device=device)
    print("Decoder loaded!\n")

    # Load test sets (path is relative to the current working directory,
    # so run this from the project root).
    test_dir = Path("IndoNLP-2025-Shared-Task/Test Dataset/Sinhala")
    print(f"Loading test sets (max {max_samples} samples each)...")
    formal_samples = load_test_set(test_dir / "Sinhala Test set 1.txt", max_samples=max_samples)
    informal_samples = load_test_set(test_dir / "Sinhala Test set 2.txt", max_samples=max_samples)
    print(f"Formal: {len(formal_samples)}, Informal: {len(informal_samples)}\n")
    all_results = []

    # Evaluate formal
    print("="*60)
    print(f"FORMAL SUBSET ({len(formal_samples)} samples)")
    print("="*60)
    formal_results = []
    for idx, sample in enumerate(formal_samples):
        try:
            predicted, _, _ = decoder.decode(sample['singlish'])
            metrics = compute_metrics(predicted, sample['expected'])
            result = {**sample, 'predicted': predicted, 'subset': 'formal', **metrics}
            formal_results.append(result)
            all_results.append(result)
            print(f"{idx+1}/{len(formal_samples)}: EM={metrics['em']} CER={metrics['cer']:.3f} WER={metrics['wer']:.3f}")
        except Exception as e:
            print(f"{idx+1}/{len(formal_samples)}: ERROR - {e}")
            result = {**sample, 'predicted': '[ERROR]', 'subset': 'formal', 'cer': 1.0, 'wer': 1.0, 'bleu': 0.0, 'em': 0}
            formal_results.append(result)
            all_results.append(result)
    # Evaluate informal
    print("\n" + "="*60)
    print(f"INFORMAL SUBSET ({len(informal_samples)} samples)")
    print("="*60)
    informal_results = []
    for idx, sample in enumerate(informal_samples):
        try:
            predicted, _, _ = decoder.decode(sample['singlish'])
            metrics = compute_metrics(predicted, sample['expected'])
            result = {**sample, 'predicted': predicted, 'subset': 'informal', **metrics}
            informal_results.append(result)
            all_results.append(result)
            print(f"{idx+1}/{len(informal_samples)}: EM={metrics['em']} CER={metrics['cer']:.3f} WER={metrics['wer']:.3f}")
        except Exception as e:
            print(f"{idx+1}/{len(informal_samples)}: ERROR - {e}")
            result = {**sample, 'predicted': '[ERROR]', 'subset': 'informal', 'cer': 1.0, 'wer': 1.0, 'bleu': 0.0, 'em': 0}
            informal_results.append(result)
            all_results.append(result)
    # Summary
    print("\n" + "="*60)
    print("SUMMARY")
    print("="*60)
    formal_df = pd.DataFrame(formal_results)
    informal_df = pd.DataFrame(informal_results)
    all_df = pd.DataFrame(all_results)
    for name, df in [("Formal", formal_df), ("Informal", informal_df), ("Overall", all_df)]:
        print(f"\n{name} (n={len(df)}):")
        print(f"  CER:  {df['cer'].mean():.4f} ± {df['cer'].std():.4f}")
        print(f"  WER:  {df['wer'].mean():.4f} ± {df['wer'].std():.4f}")
        print(f"  BLEU: {df['bleu'].mean():.4f} ± {df['bleu'].std():.4f}")
        print(f"  EM:   {df['em'].mean():.4f} ({int(df['em'].sum())}/{len(df)})")

    # Save (create the output directory first in case it is missing)
    out_path = Path("misc/quick_eval_results.csv")
    out_path.parent.mkdir(parents=True, exist_ok=True)
    all_df.to_csv(out_path, index=False)
    print(f"\nResults saved to: {out_path}")
if __name__ == "__main__":
main()
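
# Example invocation (script path assumed, run from the repository root):
#   python misc/quick_eval.py 20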