| import json
|
| from pathlib import Path
|
| import logging
|
| import torch
|
| from transformers import AutoModelForCausalLM, AutoTokenizer
|
| import numpy as np
|
| from typing import List, Dict, Any
|
| from tqdm import tqdm
|
| import pandas as pd
|
| from rouge_score import rouge_scorer
|
| from sacrebleu.metrics import BLEU
|
| import wandb
|
|
|
|
|
| logging.basicConfig(
|
| level=logging.INFO,
|
| format='%(asctime)s - %(levelname)s - %(message)s'
|
| )
|
| logger = logging.getLogger(__name__)
|
|
|
class ModelEvaluator:
    """Evaluate a fine-tuned causal LM on a small Bengali coding benchmark.

    Loads the trained model/tokenizer from ``outputs/model/final``, generates
    a response for each built-in test prompt, scores it against a reference
    answer with BLEU and ROUGE, logs per-prompt and average metrics to
    Weights & Biases, and writes the results to ``outputs/evaluation``
    (``evaluation_results.json`` + ``average_metrics.csv``).
    """

    def __init__(self):
        # Where the training pipeline saved the final model/tokenizer.
        self.model_dir = Path('outputs/model/final')
        self.output_dir = Path('outputs/evaluation')
        self.output_dir.mkdir(parents=True, exist_ok=True)

        # Fixed benchmark: one prompt per task category, each paired with a
        # reference answer used for BLEU/ROUGE scoring.
        self.test_prompts = [
            {
                "type": "code_generation",
                "prompt": "একটি পাইথন ফাংশন লিখুন যা একটি সংখ্যার ফ্যাক্টরিয়াল বের করে।",
                "expected": """def factorial(n):
    if n == 0 or n == 1:
        return 1
    return n * factorial(n - 1)"""
            },
            {
                "type": "code_explanation",
                "prompt": "নিচের কোডটি ব্যাখ্যা করুন:\ndef bubble_sort(arr):\n n = len(arr)\n for i in range(n):\n for j in range(0, n-i-1):\n if arr[j] > arr[j+1]:\n arr[j], arr[j+1] = arr[j+1], arr[j]",
                "expected": "এই কোডটি বাবল সর্ট অ্যালগরিদম বাস্তবায়ন করে। এটি একটি অ্যারেকে ক্রমানুসারে সাজায়।"
            },
            {
                "type": "error_fix",
                "prompt": "এই কোডে ভুল আছে, ঠিক করুন:\ndef calculate_sum(numbers)\n total = 0\n for num in numbers\n total += num\n return total",
                "expected": """def calculate_sum(numbers):
    total = 0
    for num in numbers:
        total += num
    return total"""
            },
            {
                "type": "algorithm_explanation",
                "prompt": "বাইনারি সার্চ অ্যালগরিদম কীভাবে কাজ করে সেটি ব্যাখ্যা করুন।",
                "expected": "বাইনারি সার্চ একটি দক্ষ অ্যালগরিদম যা সর্টেড অ্যারেতে একটি এলিমেন্ট খোঁজে। এটি প্রতিবার অ্যারের মধ্যবর্তী এলিমেন্ট চেক করে এবং সার্চ স্পেস অর্ধেক করে কমিয়ে ফেলে।"
            }
        ]

        self.bleu = BLEU()
        # NOTE(review): use_stemmer applies an English Porter stemmer, which
        # is effectively a no-op on Bengali text but harmless for code tokens.
        self.rouge_scorer = rouge_scorer.RougeScorer(
            ['rouge1', 'rouge2', 'rougeL'], use_stemmer=True
        )

    def load_model_and_tokenizer(self):
        """Load the trained model and tokenizer from ``self.model_dir``.

        Returns:
            Tuple of ``(model, tokenizer)``. The model is put in eval mode
            and moved to CUDA when a GPU is available.
        """
        logger.info("Loading model and tokenizer")

        tokenizer = AutoTokenizer.from_pretrained(self.model_dir)
        # Many causal-LM tokenizers ship without a pad token; padding=True in
        # generate_response would then fail. Reuse EOS as the pad token.
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token

        model = AutoModelForCausalLM.from_pretrained(
            self.model_dir,
            trust_remote_code=True,
            # bf16 only makes sense on GPU; keep fp32 for CPU inference.
            torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32
        )
        # Disable dropout etc. — we are only doing inference.
        model.eval()

        if torch.cuda.is_available():
            model = model.to('cuda')

        return model, tokenizer

    def generate_response(self, model, tokenizer, prompt: str, max_length: int = 512) -> str:
        """Generate a completion for *prompt* and return only the new text.

        Args:
            model: The loaded causal LM.
            tokenizer: Its matching tokenizer.
            prompt: Input text (Bengali instruction, possibly with code).
            max_length: Upper bound on total sequence length (prompt +
                generated tokens), as interpreted by ``generate``.

        Returns:
            The generated continuation with the prompt removed, or ``""``
            on any generation error (best-effort: one failed prompt should
            not abort the whole evaluation run).
        """
        try:
            inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True)

            if torch.cuda.is_available():
                inputs = {k: v.to('cuda') for k, v in inputs.items()}

            with torch.no_grad():
                outputs = model.generate(
                    **inputs,
                    max_length=max_length,
                    num_return_sequences=1,
                    temperature=0.7,
                    top_p=0.95,
                    do_sample=True,
                    # Fall back to EOS when the tokenizer defines no pad token.
                    pad_token_id=(tokenizer.pad_token_id
                                  if tokenizer.pad_token_id is not None
                                  else tokenizer.eos_token_id),
                    eos_token_id=tokenizer.eos_token_id,
                    repetition_penalty=1.2
                )

            # Strip the prompt by slicing its *tokens* off the output before
            # decoding. The previous str.replace(prompt, "") approach breaks
            # whenever decode(encode(prompt)) is not byte-identical to the
            # original prompt string (whitespace/unicode normalization).
            prompt_token_count = inputs['input_ids'].shape[1]
            response = tokenizer.decode(
                outputs[0][prompt_token_count:], skip_special_tokens=True
            )
            return response.strip()

        except Exception as e:
            logger.error(f"Error generating response: {str(e)}")
            return ""

    def calculate_metrics(self, generated: str, expected: str) -> Dict[str, float]:
        """Score *generated* against the reference *expected*.

        Returns:
            Dict with ``bleu`` (normalized to 0-1), ``rouge1_f``,
            ``rouge2_f`` and ``rougeL_f``. All zeros on scorer failure
            (e.g. empty generation).
        """
        try:
            # sacrebleu returns 0-100; normalize to 0-1 like the ROUGE scores.
            bleu_score = self.bleu.corpus_score(
                [generated],
                [[expected]]
            ).score / 100.0

            # RougeScorer.score(target, prediction): reference first.
            # (Only f-measures are reported, which are symmetric, but keep
            # the documented argument order so precision/recall stay valid.)
            rouge_scores = self.rouge_scorer.score(expected, generated)

            return {
                'bleu': bleu_score,
                'rouge1_f': rouge_scores['rouge1'].fmeasure,
                'rouge2_f': rouge_scores['rouge2'].fmeasure,
                'rougeL_f': rouge_scores['rougeL'].fmeasure
            }
        except Exception as e:
            logger.error(f"Error calculating metrics: {str(e)}")
            return {
                'bleu': 0.0,
                'rouge1_f': 0.0,
                'rouge2_f': 0.0,
                'rougeL_f': 0.0
            }

    def evaluate(self):
        """Run the full evaluation loop.

        Generates and scores a response for every test prompt, logs metrics
        to wandb, and persists per-prompt results (JSON) and per-type
        averages (CSV) under ``self.output_dir``.

        Returns:
            Dict form of the per-type average-metrics DataFrame.

        Raises:
            Re-raises any unexpected failure after logging it; the wandb
            run is always closed in ``finally``.
        """
        try:
            wandb.init(project="bengali-code-llm", name="model-evaluation")

            model, tokenizer = self.load_model_and_tokenizer()

            results = []

            for prompt_data in tqdm(self.test_prompts, desc="Evaluating prompts"):
                prompt_type = prompt_data["type"]
                prompt = prompt_data["prompt"]
                expected = prompt_data["expected"]

                generated = self.generate_response(model, tokenizer, prompt)
                metrics = self.calculate_metrics(generated, expected)

                result = {
                    "type": prompt_type,
                    "prompt": prompt,
                    "generated": generated,
                    "expected": expected,
                    **metrics
                }
                results.append(result)

                # Per-prompt metrics, keyed by task category.
                wandb.log({
                    f"{prompt_type}_bleu": metrics['bleu'],
                    f"{prompt_type}_rouge1": metrics['rouge1_f'],
                    f"{prompt_type}_rouge2": metrics['rouge2_f'],
                    f"{prompt_type}_rougeL": metrics['rougeL_f']
                })

            df = pd.DataFrame(results)
            avg_metrics = df.groupby('type')[['bleu', 'rouge1_f', 'rouge2_f', 'rougeL_f']].mean()

            # ensure_ascii=False keeps the Bengali text human-readable on disk.
            results_path = self.output_dir / 'evaluation_results.json'
            with open(results_path, 'w', encoding='utf-8') as f:
                json.dump(results, f, ensure_ascii=False, indent=2)

            metrics_path = self.output_dir / 'average_metrics.csv'
            avg_metrics.to_csv(metrics_path)

            # Overall averages across all prompt types.
            wandb.log({
                "avg_bleu": df['bleu'].mean(),
                "avg_rouge1": df['rouge1_f'].mean(),
                "avg_rouge2": df['rouge2_f'].mean(),
                "avg_rougeL": df['rougeL_f'].mean()
            })

            logger.info(f"Evaluation completed. Results saved to {self.output_dir}")

            return avg_metrics.to_dict()

        except Exception as e:
            logger.error(f"Evaluation failed: {str(e)}")
            raise
        finally:
            # Close the wandb run exactly once, on both success and failure.
            if wandb.run is not None:
                wandb.finish()
|
|
|
def main() -> None:
    """Entry point: run the full model evaluation."""
    ModelEvaluator().evaluate()


if __name__ == "__main__":
    main()
|
|
|