Update app.py
app.py CHANGED
Deletions (old code):

@@ -2,34 +2,28 @@ import gradio as gr
 import os
 from threading import Thread
 import torch
-from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
-from peft import PeftModel

-# --- 1. Load

-base_model_name = "unsloth/qwen2.5-math-7b-bnb-4bit"
-peft_model_name = "Hrushi02/Root_Math"

-model = AutoModelForCausalLM.from_pretrained(
-    base_model_name,
-    token=api_token
 )
-tokenizer = AutoTokenizer.from_pretrained(base_model_name, token=api_token)

-print("✅ Model loaded successfully!")

-# --- 2. Rewrite the Respond Function to Use YOUR Model ---
 def respond(
     message,
     history: list[tuple[str, str]],
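Several deleted lines in this hunk were cut off by the page extraction (the "--- 1." comment and the middle of the loading block are truncated above). For orientation only, here is a minimal sketch of what the removed loading code most plausibly did, assuming the standard peft adapter pattern; the api_token lookup, the HF_TOKEN variable name, and device_map="auto" are assumptions, not recovered text:

import os
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

base_model_name = "unsloth/qwen2.5-math-7b-bnb-4bit"
peft_model_name = "Hrushi02/Root_Math"

# Assumption: the Space read its access token from the environment.
api_token = os.environ.get("HF_TOKEN")

# Load the 4-bit base model, then attach the fine-tuned adapter on top.
model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    device_map="auto",  # assumption: device placement delegated to accelerate
    token=api_token,
)
model = PeftModel.from_pretrained(model, peft_model_name, token=api_token)
tokenizer = AutoTokenizer.from_pretrained(base_model_name, token=api_token)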
@@ -38,51 +32,38 @@ def respond(
     temperature,
     top_p,
 ):
     messages = [{"role": "system", "content": system_message}]
-    for val in history:
-        if val[0]:
-            messages.append({"role": "user", "content": val[0]})
-        if val[1]:
-            messages.append({"role": "assistant", "content": val[1]})
     messages.append({"role": "user", "content": message})

-    # Prepare for streaming
-    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

-    inputs = tokenizer.apply_chat_template(
-        messages,
-        add_generation_prompt=True,
-        return_tensors="pt"
-    ).to(model.device)

-    generation_kwargs = dict(
-        input_ids=inputs,
-        streamer=streamer,
         max_new_tokens=max_tokens,
         temperature=temperature,
         top_p=top_p,
         do_sample=True,
     )

-    thread = Thread(target=model.generate, kwargs=generation_kwargs)
-    thread.start()

-    # Yield the generated tokens
-    response = ""
-    for token in streamer:
-        response += token
-        yield response

-# --- 3. Launch the Gradio Interface (No Changes Here) ---
 demo = gr.ChatInterface(
     respond,
     additional_inputs=[
-        gr.Textbox(value="You are a
-        gr.Slider(minimum=
         gr.Slider(minimum=0.1, maximum=1.0, value=0.7, step=0.1, label="Temperature"),
         gr.Slider(
             minimum=0.1,

@@ -92,6 +73,8 @@ demo = gr.ChatInterface(
             label="Top-p (nucleus sampling)",
         ),
     ],
 )

 if __name__ == "__main__":
Additions (new code):

 import os
 from threading import Thread
 import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer, pipeline

+# --- 1. Load a Standard CPU-Friendly Model ---
+# No PEFT model needed. We are loading a pre-trained chat model directly.
+model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

+# Load the model and tokenizer.
+# No token is needed for a public model, and no special settings for CPU.
+model = AutoModelForCausalLM.from_pretrained(model_name)
+tokenizer = AutoTokenizer.from_pretrained(model_name)

+print(f"✅ Model '{model_name}' loaded successfully on CPU!")

+# --- 2. Create a Pipeline for Easy Inference ---
+pipe = pipeline(
+    "text-generation",
+    model=model,
+    tokenizer=tokenizer,
 )
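Not part of the commit, but a quick way to sanity-check the pipeline once it is built; the prompt text and token budget here are arbitrary choices:

# Hypothetical smoke test: generate a short, greedy completion on CPU.
result = pipe("The capital of France is", max_new_tokens=10, do_sample=False)
print(result[0]["generated_text"])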

+# --- 3. Define the Respond Function for the Chatbot ---
+# This function takes the user message and history, and generates a response using the pipeline.
 def respond(
     message,
     history: list[tuple[str, str]],
     system_message,
     max_tokens,
     temperature,
     top_p,
 ):
+    # Build the prompt using the specific chat template for TinyLlama
     messages = [{"role": "system", "content": system_message}]
+    for user_msg, assistant_msg in history:
+        messages.append({"role": "user", "content": user_msg})
+        messages.append({"role": "assistant", "content": assistant_msg})
     messages.append({"role": "user", "content": message})

+    prompt = pipe.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

+    # Generate the response.
+    # This will be slow on a CPU and waits for the full response.
+    outputs = pipe(
+        prompt,
         max_new_tokens=max_tokens,
         temperature=temperature,
         top_p=top_p,
         do_sample=True,
     )

+    # Extract only the generated text from the full output.
+    full_response = outputs[0]["generated_text"]
+    # The output includes the prompt, so split on it to keep only the new part.
+    new_response = full_response.split(prompt)[1]

+    return new_response
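One fragile spot worth noting: full_response.split(prompt)[1] breaks if the decoded output does not reproduce the prompt byte-for-byte. Assuming a reasonably recent transformers release, the text-generation pipeline's return_full_text flag avoids the string surgery entirely; a sketch of the tail of respond using it:

    # Sketch: ask the pipeline for only the newly generated tokens,
    # so no splitting on the prompt is needed.
    outputs = pipe(
        prompt,
        max_new_tokens=max_tokens,
        temperature=temperature,
        top_p=top_p,
        do_sample=True,
        return_full_text=False,
    )
    return outputs[0]["generated_text"]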

+# --- 4. Launch the Gradio Interface ---
 demo = gr.ChatInterface(
     respond,
     additional_inputs=[
+        gr.Textbox(value="You are a friendly and helpful chatbot.", label="System message"),
+        gr.Slider(minimum=10, maximum=512, value=128, step=1, label="Max new tokens"),
         gr.Slider(minimum=0.1, maximum=1.0, value=0.7, step=0.1, label="Temperature"),
         gr.Slider(
             minimum=0.1,
             maximum=1.0,
             value=0.95,
             step=0.05,
             label="Top-p (nucleus sampling)",
         ),
     ],
+    title="TinyLlama 1.1B Chat",
+    description="A simple chatbot running on a CPU-friendly model from Hugging Face."
 )

 if __name__ == "__main__":
     demo.launch()
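The new file keeps the Thread and TextIteratorStreamer imports but no longer uses them, so replies now arrive all at once instead of streaming. If token-by-token streaming is wanted back with the TinyLlama model, a minimal sketch following the pattern of the deleted code (not part of this commit; it reuses the module-level model and tokenizer above):

def respond_streaming(message, history, system_message, max_tokens, temperature, top_p):
    messages = [{"role": "system", "content": system_message}]
    for user_msg, assistant_msg in history:
        messages.append({"role": "user", "content": user_msg})
        messages.append({"role": "assistant", "content": assistant_msg})
    messages.append({"role": "user", "content": message})

    input_ids = tokenizer.apply_chat_template(
        messages, add_generation_prompt=True, return_tensors="pt"
    )
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

    # Run generation in a background thread; the streamer yields text as it decodes.
    Thread(
        target=model.generate,
        kwargs=dict(
            input_ids=input_ids,
            streamer=streamer,
            max_new_tokens=max_tokens,
            temperature=temperature,
            top_p=top_p,
            do_sample=True,
        ),
    ).start()

    response = ""
    for token in streamer:
        response += token
        yield response  # Gradio re-renders the growing partial reply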