| | import difflib |
| | from dataclasses import dataclass |
| | from html import escape |
| | from typing import List, Tuple |
| |
|
| | from utils import preprocess_chinese_text |
| |
|
| |
|
@dataclass
class DiffResult:
    """Outcome of aligning a reference text against a hypothesis text.

    The two display strings carry inline ``[DEL]``/``[INS]``/``[SUB]`` bracket
    markers around every token that takes part in an error, with ``_`` used
    as the placeholder on the side where a token is absent.
    """

    # Reference side of the alignment, with error markers embedded.
    reference_display: str
    # Hypothesis side of the alignment, with error markers embedded.
    hypothesis_display: str
    # One (reference_token, hypothesis_token) tuple per recorded error.
    error_pairs: List[Tuple[str, str]]
| |
|
| |
|
def visualize_differences(
    ref_text: str, hyp_text: str, include_punctuation: bool = False
) -> DiffResult:
    """
    Create a visualization of the differences between reference and hypothesis texts.

    Both texts are passed through ``preprocess_chinese_text`` and the result is
    split on whitespace into tokens (presumably one token per character --
    confirm against ``utils``). The two token lists are aligned with
    ``difflib.SequenceMatcher`` and every non-matching token is wrapped in a
    bracket marker:

    * ``[DEL]x[/DEL]`` -- token present only in the reference
    * ``[INS]x[/INS]`` -- token present only in the hypothesis
    * ``[SUB]x[/SUB]`` -- token replaced by a different one

    On the side where a token is missing, ``_`` is shown as a placeholder so
    that both display strings stay aligned token by token.

    Args:
        ref_text (str): Reference text
        hyp_text (str): Hypothesis text
        include_punctuation (bool): Whether to include punctuation

    Returns:
        DiffResult: Containing formatted reference and hypothesis texts with error highlighting
    """
    ref_chars = preprocess_chinese_text(ref_text, include_punctuation).split()
    hyp_chars = preprocess_chinese_text(hyp_text, include_punctuation).split()

    # NOTE(review): SequenceMatcher's default ``autojunk`` heuristic applies to
    # sequences of 200+ items and may affect alignment of long texts -- confirm
    # whether that is acceptable for this report.
    matcher = difflib.SequenceMatcher(None, ref_chars, hyp_chars)

    ref_formatted: List[str] = []
    hyp_formatted: List[str] = []
    error_pairs: List[Tuple[str, str]] = []

    def mark_deletion(char: str) -> None:
        # Reference token with no counterpart in the hypothesis.
        ref_formatted.append(f"[DEL]{char}[/DEL]")
        hyp_formatted.append("[DEL]_[/DEL]")
        error_pairs.append((char, "_"))

    def mark_insertion(char: str) -> None:
        # Hypothesis token with no counterpart in the reference.
        ref_formatted.append("[INS]_[/INS]")
        hyp_formatted.append(f"[INS]{char}[/INS]")
        error_pairs.append(("_", char))

    for op, ref_start, ref_end, hyp_start, hyp_end in matcher.get_opcodes():
        ref_span = ref_chars[ref_start:ref_end]
        hyp_span = hyp_chars[hyp_start:hyp_end]

        if op == "equal":
            ref_formatted.extend(ref_span)
            hyp_formatted.extend(hyp_span)
        elif op == "delete":
            for char in ref_span:
                mark_deletion(char)
        elif op == "insert":
            for char in hyp_span:
                mark_insertion(char)
        elif op == "replace":
            # A "replace" opcode may cover spans of different lengths. Pair up
            # the overlapping part as substitutions ...
            for ref_char, hyp_char in zip(ref_span, hyp_span):
                ref_formatted.append(f"[SUB]{ref_char}[/SUB]")
                hyp_formatted.append(f"[SUB]{hyp_char}[/SUB]")
                error_pairs.append((ref_char, hyp_char))
            # ... and report the surplus on the longer side instead of
            # silently dropping it (a bare zip() would truncate it away).
            paired = min(len(ref_span), len(hyp_span))
            for char in ref_span[paired:]:
                mark_deletion(char)
            for char in hyp_span[paired:]:
                mark_insertion(char)

    return DiffResult(
        reference_display="".join(ref_formatted),
        hypothesis_display="".join(hyp_formatted),
        error_pairs=error_pairs,
    )
| |
|
| |
|
def generate_html_report(
    ref_text: str, hyp_text: str, metrics_no_punct: dict, metrics_with_punct: dict
) -> str:
    """
    Build a standalone HTML page with error highlighting and metrics.

    The page has two side-by-side sections, one computed without punctuation
    and one with it. Each section shows a metrics table, the highlighted
    reference and hypothesis texts, and a table of error pairs.

    Args:
        ref_text (str): Reference text
        hyp_text (str): Hypothesis text
        metrics_no_punct (dict): Metrics for the punctuation-free comparison;
            the keys 'wer', 'substitutions', 'deletions' and 'insertions' are
            read. The 'wer' value is shown in the "CER" column with three
            decimals.
        metrics_with_punct (dict): Same keys, for the comparison that keeps
            punctuation.

    Returns:
        str: The complete HTML document
    """
    plain_diff = visualize_differences(ref_text, hyp_text, False)
    punct_diff = visualize_differences(ref_text, hyp_text, True)

    def format_text_for_html(text: str) -> str:
        """Escape the text, then turn the bracket markers into styled spans."""
        markup = escape(text)
        for marker, css_class in (
            ("DEL", "deletion"),
            ("INS", "insertion"),
            ("SUB", "substitution"),
        ):
            markup = markup.replace(f"[{marker}]", f'<span class="{css_class}">')
            markup = markup.replace(f"[/{marker}]", "</span>")
        return markup

    def format_error_pairs(pairs: List[Tuple[str, str]]) -> str:
        """Render each (reference, hypothesis) pair as one HTML table row."""
        return "\n".join(
            f"<tr><td>{escape(ref_char)}</td><td>{escape(hyp_char)}</td></tr>"
            for ref_char, hyp_char in pairs
        )

    def metric_fields(metrics: dict, tag: str) -> dict:
        """Map one metrics dict onto the template fields ending in `tag`."""
        cer = metrics['wer']
        substitutions = metrics['substitutions']
        deletions = metrics['deletions']
        insertions = metrics['insertions']
        return {
            f"cer_{tag}": cer,
            f"total_errors_{tag}": substitutions + deletions + insertions,
            f"substitutions_{tag}": substitutions,
            f"deletions_{tag}": deletions,
            f"insertions_{tag}": insertions,
        }

    fields = {}

    # "Total Chars" is the token count of the preprocessed reference text.
    fields["total_chars_no_punct"] = len(
        preprocess_chinese_text(ref_text, False).split()
    )
    fields.update(metric_fields(metrics_no_punct, "no_punct"))

    fields["total_chars_punct"] = len(
        preprocess_chinese_text(ref_text, True).split()
    )
    fields.update(metric_fields(metrics_with_punct, "punct"))

    # Literal CSS braces are doubled because the template goes through
    # str.format().
    html_template = """
    <!DOCTYPE html>
    <html>
    <head>
        <meta charset="UTF-8">
        <title>CER Analysis Report</title>
        <style>
            body {{ font-family: Arial, sans-serif; margin: 20px; }}
            .container {{ max-width: 100%; margin: 0 auto; }}
            .metrics {{ margin: 20px 0; padding: 10px; background: #f5f5f5; }}
            .visualization {{ margin: 20px 0; }}
            .deletion {{ background-color: #ffd7d7; text-decoration: line-through; }}
            .insertion {{ background-color: #d7ffd7; }}
            .substitution {{ background-color: #fff3d7; }}
            .text-display {{ font-size: 16px; line-height: 1.6; white-space: pre-wrap; }}
            table {{ border-collapse: collapse; margin: 10px 0; }}
            th, td {{ border: 1px solid #ddd; padding: 8px; text-align: left; }}
            th {{ background-color: #f5f5f5; }}
            .legend {{ margin: 20px 0; }}
            .legend span {{ padding: 2px 5px; margin-right: 10px; }}
            h2 {{ margin-top: 30px; }}
            .grid-container {{ display: grid; grid-template-columns: auto auto; column-gap: 24px;}}
            .grid-item {{ }}
        </style>
    </head>
    <body>
        <div class="container">
            <h1>Character Error Rate Analysis Report</h1>

            <div class="legend">
                <h3>Legend:</h3>
                <span class="deletion">Deletion</span>
                <span class="insertion">Insertion</span>
                <span class="substitution">Substitution</span>
            </div>

            <div class="grid-container">
                <div class="grid-item">
                    <h2>Without Punctuation</h2>
                    <table class="metrics">
                        <thead>
                            <tr>
                                <th>Total Chars</th>
                                <th>CER</th>
                                <th>Total Errors</th>
                                <th>Substitutions</th>
                                <th>Deletions</th>
                                <th>Insertions</th>
                            </tr>
                        </thead>
                        <tbody>
                            <tr>
                                <td>{total_chars_no_punct}</td>
                                <td>{cer_no_punct:.3f}</td>
                                <td>{total_errors_no_punct}</td>
                                <td>{substitutions_no_punct}</td>
                                <td>{deletions_no_punct}</td>
                                <td>{insertions_no_punct}</td>
                            </tr>
                        </tbody>
                    </table>
                    <div class="visualization">
                        <h3>Reference Text:</h3>
                        <div class="text-display">{ref_no_punct}</div>
                        <h3>Hypothesis Text:</h3>
                        <div class="text-display">{hyp_no_punct}</div>

                        <h3>Error Pairs:</h3>
                        <table>
                            <tr><th>Reference</th><th>Hypothesis</th></tr>
                            {pairs_no_punct}
                        </table>
                    </div>
                </div>

                <div class="grid-item">
                    <h2>With Punctuation</h2>
                    <table class="metrics">
                        <thead>
                            <tr>
                                <th>Total Chars</th>
                                <th>CER</th>
                                <th>Total Errors</th>
                                <th>Substitutions</th>
                                <th>Deletions</th>
                                <th>Insertions</th>
                            </tr>
                        </thead>
                        <tbody>
                            <tr>
                                <td>{total_chars_punct}</td>
                                <td>{cer_punct:.3f}</td>
                                <td>{total_errors_punct}</td>
                                <td>{substitutions_punct}</td>
                                <td>{deletions_punct}</td>
                                <td>{insertions_punct}</td>
                            </tr>
                        </tbody>
                    </table>
                    <div class="visualization">
                        <h3>Reference Text:</h3>
                        <div class="text-display">{ref_with_punct}</div>
                        <h3>Hypothesis Text:</h3>
                        <div class="text-display">{hyp_with_punct}</div>

                        <h3>Error Pairs:</h3>
                        <table>
                            <tr><th>Reference</th><th>Hypothesis</th></tr>
                            {pairs_with_punct}
                        </table>
                    </div>
                </div>
            </div>
        </div>
    </body>
    </html>
    """

    fields.update(
        ref_no_punct=format_text_for_html(plain_diff.reference_display),
        hyp_no_punct=format_text_for_html(plain_diff.hypothesis_display),
        pairs_no_punct=format_error_pairs(plain_diff.error_pairs),
        ref_with_punct=format_text_for_html(punct_diff.reference_display),
        hyp_with_punct=format_text_for_html(punct_diff.hypothesis_display),
        pairs_with_punct=format_error_pairs(punct_diff.error_pairs),
    )
    return html_template.format(**fields)
| |
|