Spaces:
Running
Running
Add tokenization details
Browse files
app.py
CHANGED
|
@@ -146,12 +146,40 @@ def make_html_block(student_tokenizer, teacher_tokenizer, text, idx):
|
|
| 146 |
)
|
| 147 |
|
| 148 |
highlighted = highlight_groups(student_tokenizer, teacher_tokenizer, s_ids, t_ids, s_groups, t_groups)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 149 |
return (
|
| 150 |
f'<div style="border:1px solid #ccc; padding:10px; margin:10px 0; '
|
| 151 |
f'border-radius:5px; white-space:pre-wrap; font-family:monospace; font-size:13px;">'
|
| 152 |
f"<strong>Text {idx + 1}</strong> "
|
| 153 |
f"(student tokens: {len(s_ids)}, teacher tokens: {len(t_ids)})<br><br>"
|
| 154 |
-
f"{highlighted}</div>"
|
|
|
|
|
|
|
| 155 |
)
|
| 156 |
|
| 157 |
|
|
|
|
| 146 |
)
|
| 147 |
|
| 148 |
highlighted = highlight_groups(student_tokenizer, teacher_tokenizer, s_ids, t_ids, s_groups, t_groups)
|
| 149 |
+
|
| 150 |
+
# Build tokenized views
|
| 151 |
+
s_tokens = [student_tokenizer.decode([tid], skip_special_tokens=False, clean_up_tokenization_spaces=False) for tid in s_ids]
|
| 152 |
+
t_tokens = [teacher_tokenizer.decode([tid], skip_special_tokens=False, clean_up_tokenization_spaces=False) for tid in t_ids]
|
| 153 |
+
|
| 154 |
+
s_tokens_html = " | ".join(html.escape(f'"{t}"') for t in s_tokens)
|
| 155 |
+
t_tokens_html = " | ".join(html.escape(f'"{t}"') for t in t_tokens)
|
| 156 |
+
|
| 157 |
+
tokenized_section = f'''
|
| 158 |
+
<div style="margin-bottom:15px;">
|
| 159 |
+
<details style="margin-bottom:10px;">
|
| 160 |
+
<summary style="cursor:pointer; font-weight:bold; user-select:none;">Show tokenization details</summary>
|
| 161 |
+
<div style="display:grid; grid-template-columns:1fr 1fr; gap:15px; margin-top:10px;">
|
| 162 |
+
<div style="border:1px solid #ddd; padding:10px; border-radius:5px;">
|
| 163 |
+
<strong style="color:#f57c00;">Student Tokens ({len(s_ids)})</strong>
|
| 164 |
+
<div style="margin-top:8px; font-size:12px; word-break:break-word;">{s_tokens_html}</div>
|
| 165 |
+
</div>
|
| 166 |
+
<div style="border:1px solid #ddd; padding:10px; border-radius:5px;">
|
| 167 |
+
<strong style="color:#1976d2;">Teacher Tokens ({len(t_ids)})</strong>
|
| 168 |
+
<div style="margin-top:8px; font-size:12px; word-break:break-word;">{t_tokens_html}</div>
|
| 169 |
+
</div>
|
| 170 |
+
</div>
|
| 171 |
+
</details>
|
| 172 |
+
</div>
|
| 173 |
+
'''
|
| 174 |
+
|
| 175 |
return (
|
| 176 |
f'<div style="border:1px solid #ccc; padding:10px; margin:10px 0; '
|
| 177 |
f'border-radius:5px; white-space:pre-wrap; font-family:monospace; font-size:13px;">'
|
| 178 |
f"<strong>Text {idx + 1}</strong> "
|
| 179 |
f"(student tokens: {len(s_ids)}, teacher tokens: {len(t_ids)})<br><br>"
|
| 180 |
+
f"{tokenized_section}"
|
| 181 |
+
f"{highlighted}"
|
| 182 |
+
f"</div>"
|
| 183 |
)
|
| 184 |
|
| 185 |
|