# Spaces: dispatchAI / tokenizer-visualizer
# Runtime error
# File size: 3,653 Bytes
# e92dc53

import gradio as gr
import json
import re

def _split_wordpiece(text: str) -> list:
    """Split *text* into pseudo-WordPiece tokens.

    The text is first cut into alternating runs of non-whitespace and
    whitespace (whitespace runs are kept as tokens). Each run is then sliced
    into pieces of at most four characters; every piece after the first in a
    run is prefixed with ``"##"``.
    """
    pieces = []
    for run in re.findall(r"\S+|\s+", text):
        # A run of <= 4 characters yields exactly one un-prefixed piece.
        for start in range(0, len(run), 4):
            marker = "##" if start > 0 else ""
            pieces.append(marker + run[start:start + 4])
    return pieces


def _split_bpe(text: str) -> list:
    """Split *text* into pseudo-BPE tokens.

    The text is cut into ``\\w+`` runs and single non-word characters.
    Purely alphabetic runs longer than six characters are halved, with the
    second half prefixed by ``"##"``; everything else is kept whole.
    """
    pieces = []
    for unit in re.findall(r"\w+|\W", text):
        if unit.isalpha() and len(unit) > 6:
            mid = len(unit) // 2
            pieces.append(unit[:mid])
            pieces.append("##" + unit[mid:])
        else:
            pieces.append(unit)
    return pieces


# Method name -> tokenizer callable. Insertion order is the order in which
# compare_methods reports the methods.
_TOKENIZERS = {
    "whitespace": str.split,
    "character": list,
    "wordpiece": _split_wordpiece,
    "bpe": _split_bpe,
}


def tokenize_text(text: str, method: str = "wordpiece") -> str:
    """Tokenize text and return token count and details as a JSON string.

    Args:
        text: The text to tokenize.
        method: One of "whitespace", "character", "wordpiece" or "bpe".

    Returns:
        A JSON document. On success it contains ``method``, ``token_count``,
        ``char_count``, the first 50 ``tokens`` and ``avg_token_length``
        (rounded to one decimal, 0 when there are no tokens). If *text* is
        empty or *method* is not recognised it contains a single ``error``
        key instead.
    """
    if not text:
        return json.dumps({"error": "No text provided"})

    tokenizer = _TOKENIZERS.get(method)
    if tokenizer is None:
        # Previously an unknown method left `tokens` unassigned and crashed
        # with UnboundLocalError; report it like the empty-text case instead.
        supported = ", ".join(_TOKENIZERS)
        return json.dumps(
            {"error": f"Unknown method {method!r}; expected one of: {supported}"}
        )

    tokens = tokenizer(text)
    # Whitespace-only text under the "whitespace" method yields no tokens.
    avg_length = round(sum(map(len, tokens)) / len(tokens), 1) if tokens else 0

    return json.dumps({
        "method": method,
        "token_count": len(tokens),
        "char_count": len(text),
        "tokens": tokens[:50],  # cap the listing; token_count is the full total
        "avg_token_length": avg_length,
    }, indent=2)


def compare_methods(text: str) -> str:
    """Compare tokenization across all supported methods.

    Args:
        text: The text to tokenize. ``None`` is treated as empty text.

    Returns:
        A JSON document with ``char_count`` and a ``comparison`` mapping of
        method name to ``token_count`` and ``tokens_per_char`` (rounded to
        three decimals, 0 for empty text).
    """
    text = text or ""
    char_count = len(text)

    results = {}
    for name, tokenizer in _TOKENIZERS.items():
        token_count = len(tokenizer(text))
        results[name] = {
            "token_count": token_count,
            # Guard the division so empty text reports 0 instead of raising.
            "tokens_per_char": round(token_count / char_count, 3) if char_count else 0,
        }

    return json.dumps({"char_count": char_count, "comparison": results}, indent=2)

# Build the two-tab UI. Components are declared in display order inside the
# nested context managers, so statement order here determines the layout.
with gr.Blocks(theme=gr.themes.Soft(primary_hue="blue"), title="dispatchAI Tokenizer Visualizer") as demo:
    gr.Markdown("# 🔤 dispatchAI Tokenizer Visualizer (MCP)")
    # Tab 1: tokenize the text with a single, user-selected method.
    with gr.Tab("Visualize"):
        v_text = gr.Textbox(label="Text", value="The quick brown fox jumps over the lazy dog.", lines=3)
        # The dropdown choices are the method names tokenize_text branches on.
        v_method = gr.Dropdown(["whitespace", "character", "wordpiece", "bpe"], value="wordpiece", label="Method")
        v_btn = gr.Button("Tokenize", variant="primary")
        v_out = gr.Textbox(label="Tokens (JSON)", lines=15)
        # Clicking passes (text, method) to tokenize_text and shows its JSON string.
        v_btn.click(fn=tokenize_text, inputs=[v_text, v_method], outputs=v_out)
    # Tab 2: run every method on the same text and show the per-method counts.
    with gr.Tab("Compare"):
        c_text = gr.Textbox(label="Text", value="The quick brown fox.", lines=3)
        c_btn = gr.Button("Compare Methods", variant="primary")
        c_out = gr.Textbox(label="Comparison (JSON)", lines=12)
        c_btn.click(fn=compare_methods, inputs=c_text, outputs=c_out)
    gr.Markdown("---\n🚀 [dispatchAI](https://huggingface.co/dispatchAI)")

# Start the app at import/run time.
# NOTE(review): mcp_server=True presumably also exposes the wired functions as
# MCP tools and may require the `gradio[mcp]` extra and a recent Gradio
# release — confirm against the Space's pinned Gradio version.
demo.launch(mcp_server=True)