cmpatino HF Staff committed on
Commit
042c934
·
1 Parent(s): 2597b43

Add tokenization details

Browse files
Files changed (1) hide show
  1. app.py +29 -1
app.py CHANGED
@@ -146,12 +146,40 @@ def make_html_block(student_tokenizer, teacher_tokenizer, text, idx):
146
  )
147
 
148
  highlighted = highlight_groups(student_tokenizer, teacher_tokenizer, s_ids, t_ids, s_groups, t_groups)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
149
  return (
150
  f'<div style="border:1px solid #ccc; padding:10px; margin:10px 0; '
151
  f'border-radius:5px; white-space:pre-wrap; font-family:monospace; font-size:13px;">'
152
  f"<strong>Text {idx + 1}</strong> "
153
  f"(student tokens: {len(s_ids)}, teacher tokens: {len(t_ids)})<br><br>"
154
- f"{highlighted}</div>"
 
 
155
  )
156
 
157
 
 
146
  )
147
 
148
  highlighted = highlight_groups(student_tokenizer, teacher_tokenizer, s_ids, t_ids, s_groups, t_groups)
149
+
150
+ # Build tokenized views
151
+ s_tokens = [student_tokenizer.decode([tid], skip_special_tokens=False, clean_up_tokenization_spaces=False) for tid in s_ids]
152
+ t_tokens = [teacher_tokenizer.decode([tid], skip_special_tokens=False, clean_up_tokenization_spaces=False) for tid in t_ids]
153
+
154
+ s_tokens_html = " | ".join(html.escape(f'"{t}"') for t in s_tokens)
155
+ t_tokens_html = " | ".join(html.escape(f'"{t}"') for t in t_tokens)
156
+
157
+ tokenized_section = f'''
158
+ <div style="margin-bottom:15px;">
159
+ <details style="margin-bottom:10px;">
160
+ <summary style="cursor:pointer; font-weight:bold; user-select:none;">Show tokenization details</summary>
161
+ <div style="display:grid; grid-template-columns:1fr 1fr; gap:15px; margin-top:10px;">
162
+ <div style="border:1px solid #ddd; padding:10px; border-radius:5px;">
163
+ <strong style="color:#f57c00;">Student Tokens ({len(s_ids)})</strong>
164
+ <div style="margin-top:8px; font-size:12px; word-break:break-word;">{s_tokens_html}</div>
165
+ </div>
166
+ <div style="border:1px solid #ddd; padding:10px; border-radius:5px;">
167
+ <strong style="color:#1976d2;">Teacher Tokens ({len(t_ids)})</strong>
168
+ <div style="margin-top:8px; font-size:12px; word-break:break-word;">{t_tokens_html}</div>
169
+ </div>
170
+ </div>
171
+ </details>
172
+ </div>
173
+ '''
174
+
175
  return (
176
  f'<div style="border:1px solid #ccc; padding:10px; margin:10px 0; '
177
  f'border-radius:5px; white-space:pre-wrap; font-family:monospace; font-size:13px;">'
178
  f"<strong>Text {idx + 1}</strong> "
179
  f"(student tokens: {len(s_ids)}, teacher tokens: {len(t_ids)})<br><br>"
180
+ f"{tokenized_section}"
181
+ f"{highlighted}"
182
+ f"</div>"
183
  )
184
 
185