File size: 8,333 Bytes
b37fe93
 
 
 
 
 
 
 
 
 
 
b419a89
 
 
 
b37fe93
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b419a89
 
 
 
 
 
 
 
 
b37fe93
b419a89
 
 
b37fe93
b419a89
b37fe93
 
 
 
 
 
 
 
 
 
b419a89
 
 
 
 
 
 
 
 
 
 
 
b37fe93
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b419a89
b37fe93
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b419a89
 
b37fe93
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b419a89
 
 
 
 
 
 
b37fe93
 
 
b419a89
b37fe93
 
 
 
b419a89
b37fe93
 
b419a89
b37fe93
b419a89
b37fe93
 
7c937ef
b37fe93
 
 
 
 
b419a89
 
b37fe93
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b419a89
 
 
b37fe93
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
"""
QuickSilver Pro Chat — Hugging Face Space.

A zero-friction try-it demo for QuickSilver Pro. Anyone on HF can chat with
DeepSeek V3 / R1 / Qwen 3.5 through our OpenAI-compatible endpoint, without
creating an account first. The goal is top-of-funnel discoverability: the
banner at the bottom sends them to quicksilverpro.io for their own key.

Single-tenant QSP key (stored as the `QSP_KEY` Space secret) with a monthly
budget cap configured on the QSP side. In-process per-session rate-limit
keeps casual spam from spiking the bill.

Outbound links to quicksilverpro.io all carry `?ref=GHKN4L37` — the reserved
REFERRAL_CODES entry earmarked for HF-sourced signups. This lets us attribute
signup volume from this Space separately from other channels (Discord,
Twitter, direct).
"""

from __future__ import annotations

import os
import time
from collections import deque
from typing import Iterable

import gradio as gr
from openai import OpenAI

# ────────────────────────── Configuration ──────────────────────────

# Both values are whitespace-stripped so a stray space or trailing newline in
# the Space settings does not end up in the credential or the URL. A QSP_BASE
# that is set but blank falls back to the public endpoint rather than handing
# the client an empty base URL.
QSP_KEY = os.environ.get("QSP_KEY", "").strip()
QSP_BASE = (
    os.environ.get("QSP_BASE", "").strip() or "https://api.quicksilverpro.io/v1"
)

# Attribution code for this Space — reserved from REFERRAL_CODES private pool
# per growth/PROMO.md. Hardcoded here rather than env-configured because it
# never changes (a single Space = a single attribution bucket).
REF_CODE = "GHKN4L37"
SIGNUP_URL = f"https://quicksilverpro.io/?ref={REF_CODE}"
CLI_URL = "https://github.com/machinefi/qspro-cli"

# Gradio Dropdown accepts (display_label, value) tuples; the callback
# receives the value string, so we don't need to parse it back.
MODELS = [
    ("DeepSeek V3  —  general-purpose, fast", "deepseek-v3"),
    ("DeepSeek R1  —  reasoning, slower, deeper", "deepseek-r1"),
    ("Qwen 3.5-35B-A3B  —  262K context, multilingual", "qwen3.5-35b"),
]
# Value (not label) of the first entry, used as the dropdown's initial value.
DEFAULT_MODEL_VALUE = MODELS[0][1]

DEFAULT_SYSTEM_PROMPT = "You are a helpful assistant."

# Per-session soft rate limit. Not a security boundary — the QSP-side budget
# cap on the shared key is. This just keeps one noisy session from burning
# through the shared budget in a burst.
RATE_WINDOW_SEC = 60
RATE_MAX_MSGS = 8

# session hash -> timestamps of that session's recent accepted messages.
_session_buckets: dict[str, deque] = {}
# Tracks which sessions have already received the first-response nudge, so
# we only attach it once per session instead of on every assistant message.
_session_nudged: set[str] = set()

# Appended to the first assistant response per session. Markdown-safe. The
# "---" horizontal rule visually separates the nudge from real model output
# so users don't confuse it with generated content.
NUDGE_MD = (
    "\n\n---\n\n"
    f"💡 *Liked this? [Get your own key]({SIGNUP_URL}) — $5 in free credits, "
    f"no card required. Or `pip install quicksilverpro` for the [CLI]({CLI_URL}).*"
)


def _rate_limited(session_hash: str) -> bool:
    """Count one message for *session_hash* and say whether it is over quota.

    Sliding-window limiter: each session keeps the timestamps of its accepted
    messages; entries older than ``RATE_WINDOW_SEC`` are discarded on every
    call.

    Args:
        session_hash: Key identifying the caller's session bucket.

    Returns:
        ``True`` if the session already has ``RATE_MAX_MSGS`` messages inside
        the window (the current message is *not* recorded), otherwise
        ``False`` after recording the current message.
    """
    # Monotonic clock rather than wall-clock time: the window is an elapsed
    # interval, and a system clock adjustment must neither freeze expiry
    # (clock stepped back) nor wipe the window (clock stepped forward).
    now = time.monotonic()
    # NOTE: this function only ever adds keys to `_session_buckets`; a bucket
    # is trimmed but never removed here.
    bucket = _session_buckets.setdefault(session_hash, deque())
    # Timestamps are appended in order, so the oldest is always at the left.
    while bucket and now - bucket[0] > RATE_WINDOW_SEC:
        bucket.popleft()
    if len(bucket) >= RATE_MAX_MSGS:
        return True
    bucket.append(now)
    return False


# ────────────────────────── OpenAI client ──────────────────────────

# A missing key must not crash the import. Leaving `client` as None lets the
# UI render, and the chat callback then shows a clear "QSP_KEY secret is not
# set" message to the Space owner rather than the Space failing with a 500.
client = OpenAI(base_url=QSP_BASE, api_key=QSP_KEY) if QSP_KEY else None


def _content_to_text(content: object) -> str:
    """Reduce one chat message's content to plain text.

    Accepts a plain string, a single ``{"text": ...}`` part, or a list of
    such parts / strings. Anything else (``None``, file payloads, unknown
    part types) contributes no text.
    """
    if isinstance(content, str):
        return content
    if isinstance(content, dict):
        text = content.get("text")
        return text if isinstance(text, str) else ""
    if isinstance(content, list):
        return "".join(_content_to_text(part) for part in content)
    return ""


def _history_to_messages(history) -> list[dict[str, str]]:
    """Convert Gradio chat history into OpenAI-style chat messages.

    Two history shapes are understood: legacy ``(user, assistant)`` pairs and
    messages-format dicts carrying ``role`` / ``content`` keys. Entries of any
    other shape, empty turns, and roles other than user/assistant are skipped.
    """
    converted: list[dict[str, str]] = []
    for entry in history or []:
        if isinstance(entry, dict):
            turns = [(entry.get("role"), entry.get("content"))]
        elif isinstance(entry, (list, tuple)) and len(entry) == 2:
            turns = [("user", entry[0]), ("assistant", entry[1])]
        else:
            continue
        for role, content in turns:
            if role not in ("user", "assistant"):
                continue
            text = _content_to_text(content)
            # The signup nudge is UI decoration appended by `respond`; keep it
            # out of the context that is sent back to the model.
            if role == "assistant" and text.endswith(NUDGE_MD):
                text = text[: -len(NUDGE_MD)]
            if text:
                converted.append({"role": role, "content": text})
    return converted


def respond(
    message: str,
    history: list[tuple[str, str]],
    model: str,
    system_prompt: str,
    temperature: float,
    max_tokens: int,
    request: gr.Request | None = None,
) -> Iterable[str]:
    """Stream a chat completion for the chat UI.

    This is a generator: every yielded string is the full assistant message
    so far (not a delta). Configuration problems, rate limiting and API
    failures are reported as a single yielded message instead of raising.

    Args:
        message: The user's new message.
        history: Earlier turns. Annotated as ``(user, assistant)`` pairs, but
            role/content dict entries are accepted as well.
        model: Model identifier forwarded to the API.
        system_prompt: Optional system message; blank or ``None`` means none.
        temperature: Sampling temperature, coerced to ``float``.
        max_tokens: Completion token cap, coerced to ``int``.
        request: Request object whose ``session_hash`` keys the rate limit
            and the once-per-session nudge; without it the shared ``"anon"``
            bucket is used.

    Yields:
        The accumulated response text after each non-empty streamed chunk,
        then (first turn of a session only) the same text with ``NUDGE_MD``
        appended.
    """
    if client is None:
        yield (
            "⚠️ Space misconfigured: `QSP_KEY` secret is not set. "
            "Owner: configure it in Settings → Variables and secrets."
        )
        return

    session_hash = (request.session_hash if request else "anon") or "anon"
    if _rate_limited(session_hash):
        yield (
            f"⏳ Rate limit reached ({RATE_MAX_MSGS} messages / "
            f"{RATE_WINDOW_SEC}s). Take a breath, then try again."
        )
        return

    is_first_response = not history

    messages: list[dict[str, str]] = []
    system_text = (system_prompt or "").strip()
    if system_text:
        messages.append({"role": "system", "content": system_text})
    messages.extend(_history_to_messages(history))
    messages.append({"role": "user", "content": message})

    try:
        stream = client.chat.completions.create(
            model=model,
            messages=messages,
            temperature=float(temperature),
            max_tokens=int(max_tokens),
            stream=True,
        )
    except Exception as e:
        yield f"❌ API error: {type(e).__name__}: {str(e)[:300]}"
        return

    accumulated = ""
    try:
        for chunk in stream:
            # Some chunks carry no choices or no text delta; treat as empty.
            try:
                delta = chunk.choices[0].delta.content or ""
            except (AttributeError, IndexError):
                delta = ""
            if delta:
                accumulated += delta
                yield accumulated
    except Exception as e:
        # The stream can fail after it was opened (e.g. a dropped
        # connection). Keep whatever text already arrived and report the
        # failure in the chat instead of letting the exception escape.
        prefix = f"{accumulated}\n\n" if accumulated else ""
        yield f"{prefix}❌ Stream interrupted: {type(e).__name__}: {str(e)[:300]}"
        return

    # Append the signup nudge to the first assistant response of the session
    # only — a persistent nudge on every turn would feel spammy. Guarded by a
    # set of session hashes so a fast re-click doesn't double-attach.
    if is_first_response and session_hash not in _session_nudged:
        _session_nudged.add(session_hash)
        yield accumulated + NUDGE_MD


# ────────────────────────── UI ──────────────────────────

# Scheme-less form of the signup link, shown as the visible link text.
_SIGNUP_LABEL = SIGNUP_URL.replace("https://", "")

# Markdown rendered above the chat: title, one-line pitch, and a small-print
# row with the signup link and the CLI install hint.
HEADER_MD = f"""
# ⚡ QuickSilver Pro Chat

Try **DeepSeek V3 / R1** and **Qwen 3.5-35B-A3B** via an OpenAI-compatible API — no signup needed here.

<sub>Running on [QuickSilver Pro]({SIGNUP_URL}) · Get your own key ($5 free credits): [{_SIGNUP_LABEL}]({SIGNUP_URL}) · CLI: `pip install quicksilverpro`</sub>
"""

# Markdown rendered below the chat: a horizontal rule followed by the
# small-print attribution line; both anchors point at the signup URL.
FOOTER_MD = (
    "\n---\n"
    f'<sub>Powered by <a href="{SIGNUP_URL}">QuickSilver Pro</a> — open-source LLM inference, OpenAI-compatible, ~20% below OpenRouter / Together / Fireworks. Built by <a href="{SIGNUP_URL}">MachineFi Labs</a>.</sub>'
    "\n"
)

# Page layout: header markdown, then one row with a narrow settings column
# (scale=1) next to a wider chat column (scale=3), then footer markdown.
# Components are created in the order they should appear.
with gr.Blocks(title="QuickSilver Pro Chat") as demo:
    gr.Markdown(HEADER_MD)

    with gr.Row():
        with gr.Column(scale=1):
            # Generation settings. These four components are wired into the
            # chat below through `additional_inputs`.
            model_dropdown = gr.Dropdown(
                choices=MODELS,
                value=DEFAULT_MODEL_VALUE,
                label="Model",
                interactive=True,
            )
            system_prompt = gr.Textbox(
                label="System prompt",
                value=DEFAULT_SYSTEM_PROMPT,
                lines=3,
                max_lines=8,
            )
            temperature = gr.Slider(
                label="Temperature", minimum=0.0, maximum=2.0, step=0.1, value=0.7
            )
            max_tokens = gr.Slider(
                label="Max tokens", minimum=64, maximum=4096, step=64, value=1024
            )
        with gr.Column(scale=3):
            # Gradio 6.0 removed the submit_btn / retry_btn / undo_btn / clear_btn
            # args in favor of a more opinionated default layout; dropping them
            # keeps this compatible with both 5.x and 6.x.
            gr.ChatInterface(
                fn=respond,
                # Listed in the same order as `respond`'s parameters after
                # `message` and `history`: model, system_prompt, temperature,
                # max_tokens.
                additional_inputs=[model_dropdown, system_prompt, temperature, max_tokens],
                examples=[
                    ["Write a concise git commit message for: fixed off-by-one error in pagination"],
                    ["Explain closures in JavaScript in 2 sentences"],
                    ["What's the fastest sorting algorithm for 100k integers and why?"],
                    ["Translate 'Hello, how are you?' into formal Japanese, Hindi, and Russian"],
                ],
                # Example outputs are not pre-computed (presumably so that
                # starting the Space does not spend API budget — confirm).
                cache_examples=False,
            )

    gr.Markdown(FOOTER_MD)


if __name__ == "__main__":
    # Configure the request queue first (default concurrency limit of 4,
    # at most 64 queued requests), then launch whatever `queue()` returned.
    queued_app = demo.queue(default_concurrency_limit=4, max_size=64)
    queued_app.launch()