"""Gradio chat UI that streams code generation from a CPU-hosted causal LM."""

import os
from threading import Thread

import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

hf_token = os.getenv("HF_TOKEN")
model_id = "ZyperAI/Z-AI-0.1-1.1B-Code.web"

# Professional loading logic: Since Gradio 6.x runs as a persistent server,
# global variables are naturally 'cached' for the duration of the process.
print("Loading model and tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(
    model_id,
    token=hf_token,
    use_fast=False,
)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.float32,
    device_map="cpu",
    token=hf_token,
)
print("Model loaded successfully.")


def generate_code(prompt, history):
    """Stream a model reply for *prompt*, yielding the updated conversation.

    Args:
        prompt: The user's new message text.
        history: Prior conversation as a list of ``{"role", "content"}``
            dicts — the value format of ``gr.Chatbot(type="messages")``.
            May be ``None``/empty on the first turn.

    Yields:
        The full message list (history + user turn + partial assistant
        reply), which is what a messages-type Chatbot output expects.
        Yielding a bare string here would drop the user's turn and is not
        a valid value for ``type="messages"``.
    """
    messages = (history or []) + [{"role": "user", "content": prompt}]

    # Tokenize via the model's chat template; model is pinned to CPU above.
    inputs = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_tensors="pt",
    ).to("cpu")

    # Generation runs on a worker thread; the streamer hands decoded text
    # back to this generator as it is produced.
    streamer = TextIteratorStreamer(
        tokenizer, skip_prompt=True, skip_special_tokens=True
    )
    generation_kwargs = dict(
        inputs=inputs,
        streamer=streamer,
        max_new_tokens=1024,
        do_sample=True,
        temperature=0.7,
        top_p=0.9,
    )
    thread = Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()

    response = ""
    for new_text in streamer:
        response += new_text
        # Emit the whole conversation so the Chatbot shows both the user's
        # message and the growing assistant reply.
        yield messages + [{"role": "assistant", "content": response}]

    # Don't leave the generation thread dangling after streaming finishes.
    thread.join()


# Gradio 6.x UI setup
with gr.Blocks(theme=gr.themes.Soft(primary_hue="blue")) as demo:
    gr.Markdown("# ⚡ **Z-AI Web Coder**")

    # In Gradio 6, type="messages" is the standard for the chatbot component
    chatbot = gr.Chatbot(height=500, show_copy_button=True, type="messages")

    with gr.Row():
        msg = gr.Textbox(
            placeholder="E.g., Create a responsive navigation bar with CSS...",
            show_label=False,
            scale=9,
        )
        submit = gr.Button("Build", variant="primary", scale=1)

    msg.submit(generate_code, [msg, chatbot], [chatbot])
    submit.click(generate_code, [msg, chatbot], [chatbot])

    # Clear the textbox after either trigger fires.
    msg.submit(lambda: "", None, [msg])
    submit.click(lambda: "", None, [msg])

if __name__ == "__main__":
    demo.launch()