File size: 4,973 Bytes
0ce308c
3e47970
0ce308c
3e47970
 
0ce308c
 
 
 
 
 
 
 
3e47970
0ce308c
 
 
 
3e47970
0ce308c
 
3e47970
0ce308c
 
 
 
 
 
 
 
 
 
3e47970
0ce308c
 
 
 
 
 
3e47970
0ce308c
 
 
3e47970
 
 
 
0ce308c
 
 
 
3e47970
 
0ce308c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6549bb3
 
834cd01
6549bb3
 
3e47970
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
import os
import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
import torch

# ── Token Resolution (Open Source friendly) ──────────────────────────────────
# Probe the commonly-used env-var names in priority order; first non-empty wins.
_TOKEN_ENV_VARS = ("HF_TOKEN", "TEST_TOKEN", "HUGGINGFACE_TOKEN", "HF_API_TOKEN")
token = next((os.environ[name] for name in _TOKEN_ENV_VARS if os.environ.get(name)), None)

if token:
    print("βœ…  HF token loaded")
else:
    print("⚠️  No HF token found β€” running unauthenticated (rate limits apply)")

# ── Model ─────────────────────────────────────────────────────────────────────
# Small instruct model so the Space also runs on plain CPU hardware.
MODEL = "HuggingFaceTB/SmolLM2-135M-Instruct"

# Prefer GPU when one is visible; fall back to CPU otherwise.
device = "cpu"
if torch.cuda.is_available():
    device = "cuda"
print(f"πŸ”§  Device: {device}")

# Tokenizer and weights are fetched with the (possibly None) token resolved above.
tokenizer = AutoTokenizer.from_pretrained(MODEL, token=token)
model = AutoModelForCausalLM.from_pretrained(MODEL, token=token)
model = model.to(device)
print(f"βœ…  Model loaded: {MODEL}")

# ── Inference ─────────────────────────────────────────────────────────────────
def generate(prompt: str, max_new_tokens: int, temperature: float, system_prompt: str):
    """Run one chat-template completion on the module-level model.

    Args:
        prompt: User message; a blank/whitespace prompt short-circuits.
        max_new_tokens: Upper bound on generated tokens.
        temperature: 0 means greedy decoding; >0 enables sampling.
        system_prompt: Optional system message prepended to the chat.

    Returns:
        Tuple of (generated text, one-line stats string).
    """
    if not prompt.strip():
        return "⚠️ Empty prompt", ""

    messages = []
    if system_prompt.strip():
        messages.append({"role": "system", "content": system_prompt})
    messages.append({"role": "user", "content": prompt})

    text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    # Use the full tokenizer call (not .encode) so we also get an attention
    # mask; generate() warns — and can mis-handle padding — when only raw
    # input ids are supplied.
    encoded = tokenizer(text, return_tensors="pt").to(device)
    input_tokens = encoded["input_ids"].shape[-1]

    sampling = temperature > 0  # temperature == 0 -> greedy decode
    with torch.no_grad():
        outputs = model.generate(
            **encoded,  # input_ids + attention_mask
            max_new_tokens=max_new_tokens,
            temperature=temperature if sampling else None,
            do_sample=sampling,
            top_p=0.9 if sampling else None,
            pad_token_id=tokenizer.eos_token_id,
        )

    # generate() returns prompt + completion; slice off the prompt tokens.
    new_tokens = outputs[0][input_tokens:]
    result = tokenizer.decode(new_tokens, skip_special_tokens=True)

    stats = f"Input tokens: {input_tokens} | Output tokens: {len(new_tokens)} | Device: {device}"
    return result, stats

# ── UI ────────────────────────────────────────────────────────────────────────
# Declarative UI: two columns (inputs left, outputs right), example table,
# and a static footer. Event wiring happens at the bottom of the block.
with gr.Blocks(title="SmolLM2 Pipeline Test", theme=gr.themes.Monochrome()) as demo:
    gr.Markdown("""
    # πŸ§ͺ SmolLM2-135M Pipeline Test
    `HuggingFaceTB/SmolLM2-135M-Instruct` β€” CPU/ZeroGPU fallback
    """)

    with gr.Row():
        # Left column: prompt inputs and generation controls.
        with gr.Column(scale=2):
            system_prompt = gr.Textbox(
                label="System Prompt (optional)",
                placeholder="You are a helpful assistant.",
                lines=2,
            )
            prompt = gr.Textbox(
                label="User Prompt",
                placeholder="Was ist die Hauptstadt von Deutschland?",
                lines=4,
            )
            with gr.Row():
                max_tokens = gr.Slider(10, 300, value=150, step=10, label="Max New Tokens")
                temperature = gr.Slider(0.0, 1.5, value=0.2, step=0.05, label="Temperature (0 = greedy)")
            btn = gr.Button("β–Ά Generate", variant="primary")

        # Right column: generated text plus a one-line token/device summary.
        with gr.Column(scale=2):
            output = gr.Textbox(label="Output", lines=10, interactive=False)
            stats = gr.Textbox(label="Stats", lines=1, interactive=False)

    # Quick test examples
    # NOTE: `inputs` order here maps example columns onto the listed widgets
    # (system_prompt, prompt, max_tokens, temperature); it intentionally
    # differs from the click-handler argument order below.
    gr.Examples(
        examples=[
            ["You are a helpful assistant.", "What is 2+2? Answer in one sentence.", 50, 0.0],
            ["", "Summarize in one sentence: The Eiffel Tower is a wrought-iron lattice tower in Paris, built in 1889.", 80, 0.2],
            ["You are a JSON API. Respond only with valid JSON.", 'Extract name and age from: "I am Klaus, 34 years old."', 100, 0.0],
            ["", "Write a Python function that reverses a string.", 150, 0.3],
        ],
        inputs=[system_prompt, prompt, max_tokens, temperature],
        label="Quick Tests",
    )

    # Button click and Enter-in-prompt both route to generate(); the inputs
    # list matches generate()'s positional signature.
    btn.click(fn=generate, inputs=[prompt, max_tokens, temperature, system_prompt], outputs=[output, stats])
    prompt.submit(fn=generate, inputs=[prompt, max_tokens, temperature, system_prompt], outputs=[output, stats])

    # Static footer: token/model/device status rendered once at startup.
    gr.Markdown(f"""
    ---
    **Token:** `{'βœ… loaded' if token else '⚠️ not set'}` | 
    **Model:** `{MODEL}` | 
    **Device:** `{device}`
    """)

    gr.Markdown("""
    ### πŸ”— Links & Ressourcen
    [WoS](https://www.github.com/wall-of-shames) | [CodeyLab@HF](https://hf.co/codey-lab) | **BadTin & VolkanSah**
    """)

# Start the web server (blocking call).
demo.launch()