Spaces:

MachineFi
/

QuickSilverPro-Chat

Running

File size: 8,333 Bytes

"""
QuickSilver Pro Chat — Hugging Face Space.

A zero-friction try-it demo for QuickSilver Pro. Anyone on HF can chat with
DeepSeek V3 / R1 / Qwen 3.5 through our OpenAI-compatible endpoint, without
creating an account first. The goal is top-of-funnel discoverability: the
banner at the bottom sends them to quicksilverpro.io for their own key.

Single-tenant QSP key (stored as the `QSP_KEY` Space secret) with a monthly
budget cap configured on the QSP side. In-process per-session rate-limit
keeps casual spam from spiking the bill.

Outbound links all carry `?ref=GHKN4L37` — the reserved REFERRAL_CODES entry
earmarked for HF-sourced signups. This lets us attribute signup volume from
this Space separately from other channels (Discord, Twitter, direct).
"""

from __future__ import annotations

import os
import time
from collections import deque
from typing import Iterable

import gradio as gr
from openai import OpenAI

# ────────────────────────── Configuration ──────────────────────────

# Shared API key, read from the `QSP_KEY` environment variable (the Space
# secret). Whitespace is stripped so a pasted value with a trailing newline
# still works; an empty result means "not configured".
QSP_KEY = os.environ.get("QSP_KEY", "").strip()

# API endpoint, overridable through the `QSP_BASE` environment variable.
# Normalized the same way as QSP_KEY: surrounding whitespace is stripped, and
# a blank value falls back to the default URL instead of handing the client
# an unusable empty base URL.
_DEFAULT_QSP_BASE = "https://api.quicksilverpro.io/v1"
QSP_BASE = os.environ.get("QSP_BASE", "").strip() or _DEFAULT_QSP_BASE

# Attribution code for this Space — reserved from REFERRAL_CODES private pool
# per growth/PROMO.md. Hardcoded here rather than env-configured because it
# never changes (a single Space = a single attribution bucket).
REF_CODE = "GHKN4L37"
SIGNUP_URL = f"https://quicksilverpro.io/?ref={REF_CODE}"
CLI_URL = "https://github.com/machinefi/qspro-cli"

# Gradio Dropdown accepts (display_label, value) tuples; the callback
# receives the value string, so we don't need to parse it back.
MODELS = [
    ("DeepSeek V3  —  general-purpose, fast", "deepseek-v3"),
    ("DeepSeek R1  —  reasoning, slower, deeper", "deepseek-r1"),
    ("Qwen 3.5-35B-A3B  —  262K context, multilingual", "qwen3.5-35b"),
]
# Value (not label) of the first entry; used as the dropdown's initial choice.
DEFAULT_MODEL_VALUE = MODELS[0][1]

DEFAULT_SYSTEM_PROMPT = "You are a helpful assistant."

# Per-session soft rate limit. Not a security boundary — the QSP-side budget
# cap on the shared key is. This just keeps one noisy session from blowing
# through the daily allowance in 90 seconds.
RATE_WINDOW_SEC = 60  # length of the sliding window, in seconds
RATE_MAX_MSGS = 8  # messages allowed per session within one window

# session hash -> timestamps of that session's recently accepted messages.
_session_buckets: dict[str, deque] = {}
# Tracks which sessions have already received the first-response nudge, so
# we only attach it once per session instead of on every assistant message.
_session_nudged: set[str] = set()

# Appended to the first assistant response per session. Markdown-safe. The
# "---" horizontal rule visually separates the nudge from real model output
# so users don't confuse it with generated content.
NUDGE_MD = (
    f"\n\n---\n\n"
    f"💡 *Liked this? [Get your own key]({SIGNUP_URL}) — $5 in free credits, "
    f"no card required. Or `pip install quicksilverpro` for the [CLI]({CLI_URL}).*"
)


# Once more than this many sessions are being tracked, buckets whose newest
# timestamp has aged out of the window are dropped. Without this the
# module-level dict gains one entry per visitor and never shrinks.
_MAX_TRACKED_SESSIONS = 1024


def _rate_limited(session_hash: str) -> bool:
    """Record a message attempt and report whether it exceeds the limit.

    Implements a per-session sliding window: at most ``RATE_MAX_MSGS``
    accepted messages within any ``RATE_WINDOW_SEC``-second span.

    Args:
        session_hash: Key identifying the caller's session.

    Returns:
        ``True`` if the session is over its quota (the attempt is NOT
        recorded), ``False`` if the attempt was accepted and recorded.
    """
    # Monotonic clock: the timestamps are only ever compared with each other
    # inside this function, and a wall-clock adjustment must not be able to
    # expire every bucket at once or freeze a session in the limited state.
    now = time.monotonic()

    # Occasional sweep of idle sessions. A bucket whose newest entry is
    # outside the window would be emptied on its next use anyway, so removing
    # it does not change any rate-limit decision.
    if len(_session_buckets) > _MAX_TRACKED_SESSIONS:
        for key, stamps in list(_session_buckets.items()):
            if not stamps or now - stamps[-1] > RATE_WINDOW_SEC:
                _session_buckets.pop(key, None)

    bucket = _session_buckets.setdefault(session_hash, deque())
    # Timestamps are appended in order, so expired ones are always at the left.
    while bucket and now - bucket[0] > RATE_WINDOW_SEC:
        bucket.popleft()
    if len(bucket) >= RATE_MAX_MSGS:
        return True
    bucket.append(now)
    return False


# ────────────────────────── OpenAI client ──────────────────────────

# Build the API client only when a key is configured. With no key, `client`
# stays None so that importing the module still succeeds and the chat handler
# can show a "`QSP_KEY` secret is not set" message instead of the Space
# failing at startup.
client = OpenAI(base_url=QSP_BASE, api_key=QSP_KEY) if QSP_KEY else None


def _content_to_text(content: object) -> str:
    """Flatten one chat message's content into plain text.

    Accepts a plain string, a dict carrying a ``"text"`` entry, or a list of
    such parts. Anything else (``None``, file references, ...) yields ``""``.
    """
    if isinstance(content, str):
        return content
    if isinstance(content, dict):
        text = content.get("text")
        return text if isinstance(text, str) else ""
    if isinstance(content, list):
        parts = (_content_to_text(part) for part in content)
        return "\n".join(part for part in parts if part)
    return ""


def _strip_nudge(text: str) -> str:
    """Remove a trailing signup nudge from a previous assistant message."""
    # Compare on stripped text so the match survives whitespace trimming of
    # the stored message between turns.
    marker = NUDGE_MD.strip()
    trimmed = text.rstrip()
    if marker and trimmed.endswith(marker):
        return trimmed[: -len(marker)].rstrip()
    return text


def _history_to_messages(history) -> list[dict[str, str]]:
    """Convert chat history into OpenAI-style ``role``/``content`` dicts.

    Two history shapes are understood: ``(user, assistant)`` pairs, and
    message dicts with ``"role"`` and ``"content"`` keys. Which one arrives
    depends on the installed Gradio version (NOTE(review): confirm against
    the Gradio release pinned for this Space). Entries of any other shape,
    and messages with no text, are skipped.
    """
    messages: list[dict[str, str]] = []
    for entry in history or []:
        if isinstance(entry, dict):
            turns = [(entry.get("role"), entry.get("content"))]
        else:
            try:
                user_msg, assistant_msg = entry
            except (TypeError, ValueError):
                continue
            turns = [("user", user_msg), ("assistant", assistant_msg)]

        for role, raw in turns:
            if role not in ("user", "assistant"):
                continue
            text = _content_to_text(raw)
            if role == "assistant":
                # The nudge is UI decoration, not model output; do not feed
                # it back to the model as something the assistant said.
                text = _strip_nudge(text)
            if text:
                messages.append({"role": role, "content": text})
    return messages


def _format_api_error(exc: Exception) -> str:
    """Render an exception as the short error line shown in the chat."""
    return f"❌ API error: {type(exc).__name__}: {str(exc)[:300]}"


def respond(
    message: str,
    history: list[tuple[str, str]] | list[dict],
    model: str,
    system_prompt: str,
    temperature: float,
    max_tokens: int,
    request: gr.Request | None = None,
) -> Iterable[str]:
    """Stream a chat completion for the chat UI.

    Args:
        message: The user's new message.
        history: Earlier turns, as ``(user, assistant)`` pairs or as
            ``role``/``content`` dicts; may be empty or ``None``.
        model: Model identifier sent to the API.
        system_prompt: Optional system prompt; blank means none is sent.
        temperature: Sampling temperature (coerced to ``float``).
        max_tokens: Completion token limit (coerced to ``int``).
        request: Request object used to read ``session_hash``; when absent,
            the session key falls back to ``"anon"``.

    Yields:
        The assistant text accumulated so far (each yield is the full text,
        not a delta). Configuration, rate-limit and API errors are yielded as
        a single human-readable message instead of being raised.
    """
    if client is None:
        yield (
            "⚠️ Space misconfigured: `QSP_KEY` secret is not set. "
            "Owner: configure it in Settings → Variables and secrets."
        )
        return

    session_hash = (request.session_hash if request else "anon") or "anon"
    if _rate_limited(session_hash):
        yield (
            f"⏳ Rate limit reached ({RATE_MAX_MSGS} messages / "
            f"{RATE_WINDOW_SEC}s). Take a breath, then try again."
        )
        return

    is_first_response = not history

    messages: list[dict[str, str]] = []
    system_text = (system_prompt or "").strip()
    if system_text:
        messages.append({"role": "system", "content": system_text})
    messages.extend(_history_to_messages(history))
    messages.append({"role": "user", "content": message})

    try:
        stream = client.chat.completions.create(
            model=model,
            messages=messages,
            temperature=float(temperature),
            max_tokens=int(max_tokens),
            stream=True,
        )
    except Exception as e:
        yield _format_api_error(e)
        return

    accumulated = ""
    try:
        for chunk in stream:
            # Chunks without choices or without a text delta count as empty.
            try:
                delta = chunk.choices[0].delta.content or ""
            except (AttributeError, IndexError):
                delta = ""
            if delta:
                accumulated += delta
                yield accumulated
    except Exception as e:
        # A failure part-way through the stream: keep whatever text already
        # arrived and report the error in the chat rather than raising.
        error_line = _format_api_error(e)
        yield f"{accumulated}\n\n{error_line}" if accumulated else error_line
        return

    # Append the signup nudge to the first assistant response of the session
    # only — a persistent nudge on every turn would feel spammy. Guarded by a
    # set of session hashes so a fast re-click doesn't double-attach.
    if is_first_response and session_hash not in _session_nudged:
        _session_nudged.add(session_hash)
        yield accumulated + NUDGE_MD


# ────────────────────────── UI ──────────────────────────

# Signup link without its scheme, used as the visible link text in the header.
_SIGNUP_LABEL = SIGNUP_URL.replace("https://", "")

# Markdown shown at the top of the page: title, one-line pitch, and a small
# line of links that all point at the referral-tagged signup URL.
HEADER_MD = (
    "\n"
    "# ⚡ QuickSilver Pro Chat\n"
    "\n"
    "Try **DeepSeek V3 / R1** and **Qwen 3.5-35B-A3B** via an OpenAI-compatible API — no signup needed here.\n"
    "\n"
    f"<sub>Running on [QuickSilver Pro]({SIGNUP_URL}) · Get your own key ($5 free credits): "
    f"[{_SIGNUP_LABEL}]({SIGNUP_URL}) · CLI: `pip install quicksilverpro`</sub>\n"
)

# Markdown/HTML shown at the bottom of the page, below a horizontal rule.
# Both anchors use the referral-tagged signup URL.
FOOTER_MD = (
    "\n"
    "---\n"
    f'<sub>Powered by <a href="{SIGNUP_URL}">QuickSilver Pro</a> — open-source LLM inference, '
    "OpenAI-compatible, ~20% below OpenRouter / Together / Fireworks. "
    f'Built by <a href="{SIGNUP_URL}">MachineFi Labs</a>.</sub>\n'
)

# Starter prompts offered in the chat panel; each inner list is one example
# (just the message text).
_EXAMPLE_PROMPTS = [
    ["Write a concise git commit message for: fixed off-by-one error in pagination"],
    ["Explain closures in JavaScript in 2 sentences"],
    ["What's the fastest sorting algorithm for 100k integers and why?"],
    ["Translate 'Hello, how are you?' into formal Japanese, Hindi, and Russian"],
]

# Page layout: header, then a settings column (scale 1) beside the chat
# column (scale 3), then the footer.
with gr.Blocks(title="QuickSilver Pro Chat") as demo:
    gr.Markdown(HEADER_MD)

    with gr.Row():
        # Settings column. These four components are handed to the chat
        # handler as extra inputs, in the order listed further down.
        with gr.Column(scale=1):
            model_dropdown = gr.Dropdown(
                label="Model",
                choices=MODELS,
                value=DEFAULT_MODEL_VALUE,
                interactive=True,
            )
            system_prompt = gr.Textbox(
                label="System prompt",
                value=DEFAULT_SYSTEM_PROMPT,
                lines=3,
                max_lines=8,
            )
            temperature = gr.Slider(
                label="Temperature",
                minimum=0.0,
                maximum=2.0,
                step=0.1,
                value=0.7,
            )
            max_tokens = gr.Slider(
                label="Max tokens",
                minimum=64,
                maximum=4096,
                step=64,
                value=1024,
            )

        # Chat column.
        with gr.Column(scale=3):
            # No submit_btn / retry_btn / undo_btn / clear_btn arguments are
            # passed: per the original author's note, Gradio 6.0 removed them,
            # and leaving them out keeps this working on both 5.x and 6.x
            # (NOTE(review): not re-verified against the Gradio changelog).
            #
            # The order of `additional_inputs` must match the order of the
            # extra parameters of `respond` (model, system_prompt,
            # temperature, max_tokens).
            extra_inputs = [model_dropdown, system_prompt, temperature, max_tokens]
            gr.ChatInterface(
                fn=respond,
                additional_inputs=extra_inputs,
                examples=_EXAMPLE_PROMPTS,
                cache_examples=False,
            )

    gr.Markdown(FOOTER_MD)


if __name__ == "__main__":
    # Configure the request queue first, then start the server. The two
    # keyword values are passed straight to Gradio's `queue()`.
    demo.queue(default_concurrency_limit=4, max_size=64)
    demo.launch()