import logging
import os

import gradio as ui
from huggingface_hub import login
from vllm import LLM, SamplingParams

# Log in to the Hugging Face Hub only when a token is supplied through the
# HF_TOKEN environment variable; with no token, no login is attempted.
HF_TOKEN = os.getenv("HF_TOKEN")
if HF_TOKEN:
    login(token=HF_TOKEN)

# Hub repository id of the model handed to vLLM below.
MODEL_ID = "coder-vansh/cypher_llm_model"

print("🔥 Spawning ZeroGPU vLLM Layer for Gemma 4...")
# Build the vLLM engine once, at import time, so every predict() call reuses it.
# - trust_remote_code=True lets code shipped with the model repository run
#   while the model is loaded, so MODEL_ID must point at a trusted repo.
# - max_model_len=2048 caps the context length (prompt plus generated tokens).
# NOTE(review): the log line above mentions ZeroGPU and Gemma 4; nothing in
# this file establishes either — confirm against the deployment/model card.
llm = LLM(model=MODEL_ID, trust_remote_code=True, max_model_len=2048)

_logger = logging.getLogger(__name__)


def _content_text(content):
    """Flatten the ``content`` of a role/content history entry to plain text.

    Strings are returned unchanged. For a list/tuple of parts, plain strings
    and the ``"text"`` field of dict parts are concatenated and every other
    part is ignored. Any other value is converted with ``str()``.
    """
    if isinstance(content, str):
        return content
    if isinstance(content, (list, tuple)):
        pieces = []
        for part in content:
            if isinstance(part, str):
                pieces.append(part)
            elif isinstance(part, dict) and isinstance(part.get("text"), str):
                pieces.append(part["text"])
        return "".join(pieces)
    return str(content)


def _history_turns(history):
    """Yield ``(role, text)`` for each past turn in either history layout.

    Two layouts are accepted:

    * pairs -- ``[user_msg, bot_msg]`` sequences (the layout the original
      loop assumed);
    * message dicts -- ``{"role": ..., "content": ...}`` entries.

    Missing (``None``) messages are skipped so the literal text "None" never
    ends up in the prompt. Dict entries whose role is not ``"user"`` or
    ``"assistant"`` are ignored because the prompt template has no tag for
    them.
    """
    for entry in history or ():
        if isinstance(entry, dict):
            role = entry.get("role")
            content = entry.get("content")
            if role in ("user", "assistant") and content is not None:
                yield role, _content_text(content)
        else:
            user_msg, bot_msg = entry
            if user_msg is not None:
                yield "user", user_msg
            if bot_msg is not None:
                yield "assistant", bot_msg


def predict(message, history):
    """Generate the assistant reply for one chat turn.

    Builds a ``<|system|>`` / ``<|user|>`` / ``<|assistant|>`` tagged prompt
    from the fixed persona, the previous turns and the new message, then asks
    the module-level vLLM engine ``llm`` for a completion.

    Args:
        message: The text the user just sent.
        history: Previous turns of the conversation, either as
            ``[user, bot]`` pairs or as ``{"role", "content"}`` dicts.
            May be empty or ``None``.

    Returns:
        The generated text, or a string starting with
        ``"⚠️ Production Glitch: "`` followed by the error message when
        generation fails.

    NOTE(review): the prompt grows with every turn while the engine in this
    file is created with ``max_model_len=2048``; long conversations will
    presumably be rejected by vLLM and surface as the error string. Consider
    trimming old turns -- confirm the desired policy.
    """
    system_prompt = "You are CYPHER, a casual, witty Hinglish AI companion by Vansh & Aditya. Tagline: \"Not just an AI — YOUR AI.\""

    # Collect the segments and join once instead of repeated string +=.
    segments = [f"<|system|>\n{system_prompt}"]
    segments.extend(
        f"\n<|{role}|>\n{text}" for role, text in _history_turns(history)
    )
    # The trailing assistant tag cues the model to write the next reply.
    segments.append(f"\n<|user|>\n{message}\n<|assistant|>\n")
    formatted_prompt = "".join(segments)

    sampling_params = SamplingParams(temperature=0.7, max_tokens=250, top_p=0.95)

    try:
        outputs = llm.generate([formatted_prompt], sampling_params)
        return outputs[0].outputs[0].text
    except Exception as e:
        # UI boundary: keep the chat alive, but record the traceback so the
        # failure is diagnosable from the server logs.
        _logger.exception("Text generation failed")
        return f"⚠️ Production Glitch: {e}"

# Assemble the web UI: two Markdown headings followed by a chat interface whose
# replies come from predict(). The statements are left in their original order
# inside the Blocks context.
# NOTE(review): the heading advertises "ZeroGPU Enabled", but nothing visible
# in this file requests a ZeroGPU allocation — confirm the claim is accurate.
with ui.Blocks() as demo:
    ui.Markdown("# 🤖 CYPHER AI Live Production Space (ZeroGPU Enabled)")
    ui.Markdown("### *Not just an AI — YOUR AI.* | Developed by Vansh & Aditya")
    
    # Custom input box: Hinglish placeholder text, no container, scale=7.
    ui.ChatInterface(
        fn=predict,
        textbox=ui.Textbox(placeholder="Bhai se kuch bhi poocho...", container=False, scale=7),
    )

# Serve on all network interfaces (0.0.0.0) at port 7860. This runs at module
# top level, so merely importing this file also starts the server.
demo.launch(server_name="0.0.0.0", server_port=7860)