""" Quality metrics and evaluation for summaries """ import logging from typing import Dict, Tuple from rouge_score import rouge_scorer import torch logger = logging.getLogger(__name__) class SummaryEvaluator: """Evaluate summary quality using ROUGE scores and confidence metrics.""" def __init__(self): """Initialize evaluator with ROUGE scorer.""" self.rouge_scorer = rouge_scorer.RougeScorer( ['rouge1', 'rouge2', 'rougeL'], use_stemmer=True ) def calculate_rouge_scores( self, summary: str, reference: str = None ) -> Dict[str, float]: """ Calculate ROUGE scores (optional reference). Args: summary: Generated summary reference: Reference summary (optional) Returns: Dictionary with ROUGE scores """ if not reference: # Self-evaluation based on length and complexity words = summary.split() unique_words = len(set(words)) avg_word_length = sum(len(w) for w in words) / len(words) if words else 0 # Simple heuristics return { 'length_score': min(len(words) / 150, 1.0), 'diversity_score': unique_words / len(words) if words else 0, 'complexity_score': min(avg_word_length / 6, 1.0) } # Calculate ROUGE against reference scores = self.rouge_scorer.score(reference, summary) return { 'rouge1': scores['rouge1'].fmeasure, 'rouge2': scores['rouge2'].fmeasure, 'rougeL': scores['rougeL'].fmeasure } def get_confidence_score( self, model_output: torch.Tensor, summary: str ) -> float: """ Calculate confidence score (0-1). Args: model_output: Raw model output logits (may be None when called without direct access to model outputs, e.g. from the REST API path or main.py single-doc mode). summary: Generated summary Returns: Confidence score (0-1) """ # ── Guard: no model output available ────────────────────────────────── if model_output is None: confidence = 0.5 # Neutral default when tensor not provided elif hasattr(model_output, 'sequences_scores'): scores = model_output.sequences_scores confidence = torch.sigmoid(scores).item() if len(scores) > 0 else 0.5 else: confidence = 0.5 # Adjust based on summary characteristics words = summary.split() if 5 <= len(words) <= 200: # Reasonable length confidence *= 1.1 return min(confidence, 1.0) def evaluate_summary( self, summary: str, reference: str = None, model_output: torch.Tensor = None ) -> Dict[str, any]: """ Complete evaluation of summary. Args: summary: Generated summary reference: Reference summary model_output: Model output for confidence Returns: Comprehensive evaluation metrics """ rouge_scores = self.calculate_rouge_scores(summary, reference) confidence = self.get_confidence_score(model_output, summary) return { 'summary': summary, 'rouge_scores': rouge_scores, 'confidence_score': confidence, 'length': len(summary.split()), 'quality': 'high' if confidence > 0.7 else 'medium' if confidence > 0.5 else 'low' }