Hrushi02 committed on
Commit
d004974
·
verified ·
1 Parent(s): 95908fb

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +37 -54
app.py CHANGED
@@ -2,34 +2,28 @@ import gradio as gr
2
  import os
3
  from threading import Thread
4
  import torch
5
- from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
6
- from peft import PeftModel
7
 
8
- # --- 1. Load your Fine-Tuned Model and Tokenizer ---
9
- # Make sure to set your HUGGINGFACEHUB_API_TOKEN in your Space's secrets
10
- api_token = os.getenv("HUGGINGFACEHUB_API_TOKEN")
11
 
12
- if not api_token:
13
- raise ValueError("❌ ERROR: Hugging Face API token is not set. Please set it in your Space secrets.")
 
 
14
 
15
- # Define model names
16
- base_model_name = "unsloth/qwen2.5-math-7b-bnb-4bit"
17
- peft_model_name = "Hrushi02/Root_Math"
18
 
19
- # Load base model and tokenizer
20
- base_model = AutoModelForCausalLM.from_pretrained(
21
- base_model_name,
22
- torch_dtype=torch.float16,
23
- device_map="auto",
24
- token=api_token
25
  )
26
- tokenizer = AutoTokenizer.from_pretrained(base_model_name, token=api_token)
27
 
28
- # Load your fine-tuned PEFT model
29
- model = PeftModel.from_pretrained(base_model, peft_model_name, token=api_token)
30
- print("✅ Model loaded successfully!")
31
-
32
- # --- 2. Rewrite the Respond Function to Use YOUR Model ---
33
  def respond(
34
  message,
35
  history: list[tuple[str, str]],
@@ -38,51 +32,38 @@ def respond(
38
  temperature,
39
  top_p,
40
  ):
41
- # Create the chat history format
42
  messages = [{"role": "system", "content": system_message}]
43
- for val in history:
44
- if val[0]:
45
- messages.append({"role": "user", "content": val[0]})
46
- if val[1]:
47
- messages.append({"role": "assistant", "content": val[1]})
48
  messages.append({"role": "user", "content": message})
49
-
50
- # Prepare for streaming
51
- streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
52
 
53
- # Tokenize the input
54
- inputs = tokenizer.apply_chat_template(
55
- messages,
56
- add_generation_prompt=True,
57
- return_tensors="pt"
58
- ).to(model.device)
59
 
60
- # Generation arguments
61
- generation_kwargs = dict(
62
- inputs=inputs,
63
- streamer=streamer,
64
  max_new_tokens=max_tokens,
65
  temperature=temperature,
66
  top_p=top_p,
67
  do_sample=True,
68
  )
 
 
 
 
 
 
 
69
 
70
- # Start generation in a separate thread
71
- thread = Thread(target=model.generate, kwargs=generation_kwargs)
72
- thread.start()
73
-
74
- # Yield the generated tokens
75
- response = ""
76
- for token in streamer:
77
- response += token
78
- yield response
79
-
80
- # --- 3. Launch the Gradio Interface (No Changes Here) ---
81
  demo = gr.ChatInterface(
82
  respond,
83
  additional_inputs=[
84
- gr.Textbox(value="You are a math assistant. Solve the following math problem.", label="System message"),
85
- gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
86
  gr.Slider(minimum=0.1, maximum=1.0, value=0.7, step=0.1, label="Temperature"),
87
  gr.Slider(
88
  minimum=0.1,
@@ -92,6 +73,8 @@ demo = gr.ChatInterface(
92
  label="Top-p (nucleus sampling)",
93
  ),
94
  ],
 
 
95
  )
96
 
97
  if __name__ == "__main__":
 
2
  import os
3
  from threading import Thread
4
  import torch
5
+ from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer, pipeline
 
6
 
7
+ # --- 1. Load a Standard CPU-Friendly Model ---
8
+ # No PEFT model needed. We are loading a pre-trained chat model directly.
9
+ model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
10
 
11
+ # Load the model and tokenizer
12
+ # No need for tokens if it's a public model. No special settings for CPU.
13
+ model = AutoModelForCausalLM.from_pretrained(model_name)
14
+ tokenizer = AutoTokenizer.from_pretrained(model_name)
15
 
16
+ print(f"✅ Model '{model_name}' loaded successfully on CPU!")
 
 
17
 
18
+ # --- 2. Create a Pipeline for Easy Inference ---
19
+ pipe = pipeline(
20
+ "text-generation",
21
+ model=model,
22
+ tokenizer=tokenizer,
 
23
  )
 
24
 
25
+ # --- 3. Define the Respond Function for the Chatbot ---
26
+ # This function takes the user message and history, and generates a response using the pipeline.
 
 
 
27
  def respond(
28
  message,
29
  history: list[tuple[str, str]],
 
32
  temperature,
33
  top_p,
34
  ):
35
+ # Build the prompt using the specific chat template for TinyLlama
36
  messages = [{"role": "system", "content": system_message}]
37
+ for user_msg, assistant_msg in history:
38
+ messages.append({"role": "user", "content": user_msg})
39
+ messages.append({"role": "assistant", "content": assistant_msg})
 
 
40
  messages.append({"role": "user", "content": message})
 
 
 
41
 
42
+ prompt = pipe.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
 
 
 
 
 
43
 
44
+ # Generate the response
45
+ # This will be slow on a CPU and will wait for the full response.
46
+ outputs = pipe(
47
+ prompt,
48
  max_new_tokens=max_tokens,
49
  temperature=temperature,
50
  top_p=top_p,
51
  do_sample=True,
52
  )
53
+
54
+ # Extract only the generated text from the full output
55
+ full_response = outputs[0]['generated_text']
56
+ # The response includes the prompt, so we split it to get only the new part
57
+ new_response = full_response.split(prompt)[1]
58
+
59
+ return new_response
60
 
61
+ # --- 4. Launch the Gradio Interface ---
 
 
 
 
 
 
 
 
 
 
62
  demo = gr.ChatInterface(
63
  respond,
64
  additional_inputs=[
65
+ gr.Textbox(value="You are a friendly and helpful chatbot.", label="System message"),
66
+ gr.Slider(minimum=10, maximum=512, value=128, step=1, label="Max new tokens"),
67
  gr.Slider(minimum=0.1, maximum=1.0, value=0.7, step=0.1, label="Temperature"),
68
  gr.Slider(
69
  minimum=0.1,
 
73
  label="Top-p (nucleus sampling)",
74
  ),
75
  ],
76
+ title="TinyLlama 1.1B Chat",
77
+ description="A simple chatbot running on a CPU-friendly model from Hugging Face."
78
  )
79
 
80
  if __name__ == "__main__":