import gradio as gr
from threading import Thread

from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer, pipeline
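
# Thread and TextIteratorStreamer are only used by the optional streaming
# variant sketched further down; the basic respond() handler below returns
# the reply as a single string.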

# A small chat-tuned model that is light enough to run on CPU.
model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

# Load the model and tokenizer once at startup. With no device argument,
# from_pretrained() keeps the weights on CPU.
model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

print(f"✅ Model '{model_name}' loaded successfully on CPU!")

# Wrap the model and tokenizer in a text-generation pipeline so that
# tokenizing, generation, and decoding happen in a single call.
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
)
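
# Optional smoke test before launching the UI; the prompt string here is
# just an illustration, not part of the app.
# print(pipe("Hello!", max_new_tokens=20)[0]["generated_text"])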


def respond(
    message,
    history: list[tuple[str, str]],
    system_message,
    max_tokens,
    temperature,
    top_p,
):
    # Rebuild the conversation in the chat-message format the chat template
    # expects: the system prompt first, then alternating user/assistant
    # turns from history, and finally the new user message.
    messages = [{"role": "system", "content": system_message}]
    for user_msg, assistant_msg in history:
        messages.append({"role": "user", "content": user_msg})
        messages.append({"role": "assistant", "content": assistant_msg})
    messages.append({"role": "user", "content": message})

    # Render the messages into the model's prompt format, ending with the
    # generation prompt so the model answers as the assistant.
    prompt = pipe.tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    # return_full_text=False makes the pipeline return only the newly
    # generated text, so the echoed prompt never has to be stripped off
    # by hand.
    outputs = pipe(
        prompt,
        max_new_tokens=max_tokens,
        temperature=temperature,
        top_p=top_p,
        do_sample=True,
        return_full_text=False,
    )

    return outputs[0]["generated_text"]
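

# A possible streaming variant (a sketch, not wired into the UI below): it
# reuses the module-level model/tokenizer and the Thread and
# TextIteratorStreamer imports above. gr.ChatInterface treats a generator
# that yields progressively longer strings as a streamed reply, so passing
# respond_stream instead of respond would stream tokens as they arrive.
# The name respond_stream is illustrative, not from the original app.
def respond_stream(message, history, system_message, max_tokens, temperature, top_p):
    messages = [{"role": "system", "content": system_message}]
    for user_msg, assistant_msg in history:
        messages.append({"role": "user", "content": user_msg})
        messages.append({"role": "assistant", "content": assistant_msg})
    messages.append({"role": "user", "content": message})

    input_ids = tokenizer.apply_chat_template(
        messages, add_generation_prompt=True, return_tensors="pt"
    )
    # skip_prompt=True keeps the echoed prompt out of the streamed text.
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

    # generate() blocks until done, so it runs in a background thread while
    # this function consumes the streamer and yields growing partial replies.
    Thread(
        target=model.generate,
        kwargs=dict(
            inputs=input_ids,
            streamer=streamer,
            max_new_tokens=max_tokens,
            temperature=temperature,
            top_p=top_p,
            do_sample=True,
        ),
    ).start()

    partial = ""
    for new_text in streamer:
        partial += new_text
        yield partial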


# ChatInterface wires respond() to a chat UI; each additional_inputs control
# is passed to respond() as an extra argument after (message, history), in
# the order listed here.
demo = gr.ChatInterface(
    respond,
    additional_inputs=[
        gr.Textbox(value="You are a friendly and helpful chatbot.", label="System message"),
        gr.Slider(minimum=10, maximum=512, value=128, step=1, label="Max new tokens"),
        gr.Slider(minimum=0.1, maximum=1.0, value=0.7, step=0.1, label="Temperature"),
        gr.Slider(
            minimum=0.1,
            maximum=1.0,
            value=0.95,
            step=0.05,
            label="Top-p (nucleus sampling)",
        ),
    ],
    title="TinyLlama 1.1B Chat",
    description="A simple chatbot running on a CPU-friendly model from Hugging Face.",
)
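
# Optional hardening: calling demo.queue() before launch() serializes
# generation requests, which helps when several users hit the CPU-bound
# model at once.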


if __name__ == "__main__":
    demo.launch()
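
# Run with `python app.py` (the filename is an assumption); Gradio serves
# the interface at http://127.0.0.1:7860 by default.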