# Captured page header (not part of the program): Spaces status "Runtime error";
# file size 3,653 bytes; revision id e92dc53; original listing numbered lines 1-92.
import gradio as gr
import json
import re
def tokenize_text(text: str, method: str = "wordpiece") -> str:
    """Tokenize *text* with a simplified scheme and report the result as JSON.

    The schemes are illustrative approximations, not trained tokenizers:

    * ``"whitespace"`` -- ``str.split()``.
    * ``"character"`` -- one token per character.
    * ``"wordpiece"`` -- every run of non-whitespace or whitespace is cut
      into 4-character chunks; chunks after the first get a ``"##"`` prefix.
    * ``"bpe"`` -- ``\\w+`` runs and single non-word characters; purely
      alphabetic runs longer than 6 characters are split in half, the second
      half prefixed with ``"##"``.

    Args:
        text: The text to tokenize.
        method: Name of the scheme to apply (see above).

    Returns:
        A JSON string. On success it holds ``method``, ``token_count``,
        ``char_count``, ``tokens`` (at most the first 50) and
        ``avg_token_length``. If *text* is empty or *method* is not
        recognized it holds a single ``error`` key instead.
    """
    if not text:
        return json.dumps({"error": "No text provided"})

    if method == "whitespace":
        tokens = text.split()
    elif method == "character":
        tokens = list(text)
    elif method == "wordpiece":
        tokens = []
        for run in re.findall(r"\S+|\s+", text):
            # A run of up to 4 characters yields exactly one unprefixed chunk,
            # so short and long runs share the same chunking loop.
            tokens.extend(
                ("##" if start else "") + run[start:start + 4]
                for start in range(0, len(run), 4)
            )
    elif method == "bpe":
        tokens = []
        for piece in re.findall(r"\w+|\W", text):
            if piece.isalpha() and len(piece) > 6:
                mid = len(piece) // 2
                tokens.extend((piece[:mid], "##" + piece[mid:]))
            else:
                tokens.append(piece)
    else:
        # Without this branch `tokens` would be unbound and the function
        # would raise instead of reporting the problem to the caller.
        return json.dumps({"error": f"Unknown method: {method}"})

    # Token lengths include any "##" prefix. The guard covers whitespace-only
    # text under "whitespace", which produces no tokens at all.
    if tokens:
        avg_length = round(sum(len(tok) for tok in tokens) / len(tokens), 1)
    else:
        avg_length = 0

    return json.dumps({
        "method": method,
        "token_count": len(tokens),
        "char_count": len(text),
        "tokens": tokens[:50],  # cap the listing; token_count still reports the total
        "avg_token_length": avg_length,
    }, indent=2)
def _count_tokens(text: str, method: str) -> int:
    """Return how many tokens *method* yields for *text*.

    Uses the same splitting rules as ``tokenize_text`` but only counts,
    so no token list is built.

    Raises:
        ValueError: If *method* is not one of the four known names.
    """
    if method == "whitespace":
        return len(text.split())
    if method == "character":
        return len(text)
    if method == "wordpiece":
        # Each run is cut into 4-character chunks: ceil(len / 4) tokens.
        return sum(-(-len(run) // 4) for run in re.findall(r"\S+|\s+", text))
    if method == "bpe":
        # Alphabetic runs longer than 6 characters are split into two tokens;
        # everything else stays a single token.
        return sum(
            2 if piece.isalpha() and len(piece) > 6 else 1
            for piece in re.findall(r"\w+|\W", text)
        )
    raise ValueError(f"Unknown method: {method}")


def compare_methods(text: str) -> str:
    """Compare token counts of all four tokenization methods on *text*.

    Args:
        text: The text to analyze. ``None`` is treated like an empty string.

    Returns:
        A JSON string with ``char_count`` and a ``comparison`` mapping from
        method name to ``token_count`` and ``tokens_per_char`` (rounded to
        3 places; 0 when the text is empty).
    """
    # Normalize None up front so a missing value yields zero counts instead
    # of an AttributeError on text.split().
    text = text or ""
    char_count = len(text)

    comparison = {}
    for method in ("whitespace", "character", "wordpiece", "bpe"):
        count = _count_tokens(text, method)
        comparison[method] = {
            "token_count": count,
            "tokens_per_char": round(count / char_count, 3) if char_count else 0,
        }

    return json.dumps({"char_count": char_count, "comparison": comparison}, indent=2)
# Two-tab Gradio interface: "Visualize" runs one tokenization method via
# tokenize_text, "Compare" runs all of them via compare_methods. Components are
# created in display order, so the statement order below is the page layout.
with gr.Blocks(
    title="dispatchAI Tokenizer Visualizer",
    theme=gr.themes.Soft(primary_hue="blue"),
) as demo:
    gr.Markdown("# 🔤 dispatchAI Tokenizer Visualizer (MCP)")

    # Tab 1: tokenize the text with the method picked in the dropdown.
    with gr.Tab("Visualize"):
        v_text = gr.Textbox(
            value="The quick brown fox jumps over the lazy dog.",
            label="Text",
            lines=3,
        )
        v_method = gr.Dropdown(
            ["whitespace", "character", "wordpiece", "bpe"],
            label="Method",
            value="wordpiece",
        )
        v_btn = gr.Button("Tokenize", variant="primary")
        v_out = gr.Textbox(lines=15, label="Tokens (JSON)")
        v_btn.click(
            fn=tokenize_text,
            outputs=v_out,
            inputs=[v_text, v_method],
        )

    # Tab 2: token counts for every method on the same text.
    with gr.Tab("Compare"):
        c_text = gr.Textbox(
            value="The quick brown fox.",
            label="Text",
            lines=3,
        )
        c_btn = gr.Button("Compare Methods", variant="primary")
        c_out = gr.Textbox(lines=12, label="Comparison (JSON)")
        c_btn.click(
            fn=compare_methods,
            outputs=c_out,
            inputs=c_text,
        )

    gr.Markdown("---\n🚀 [dispatchAI](https://huggingface.co/dispatchAI)")

# Start the app at import/run time; mcp_server=True is passed through to
# Gradio's launch() to enable its MCP server option.
demo.launch(mcp_server=True)
|