saadkhi committed
Commit 22df2c5 · verified · 1 Parent(s): bf06a99

Update app.py

Files changed (1)
  1. app.py +17 -31
app.py CHANGED
@@ -1,26 +1,23 @@
-# app.py
+# app.py - Fully CPU-safe version for free Hugging Face Spaces
 
 import torch
 import gradio as gr
-import spaces
 from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
 from peft import PeftModel
 
 # ────────────────────────────────────────────────────────────────
 # Configuration
 # ────────────────────────────────────────────────────────────────
-
 BASE_MODEL = "unsloth/Phi-3-mini-4k-instruct-bnb-4bit"
-LORA_PATH = "saadkhi/SQL_Chat_finetuned_model"
+LORA_PATH = "saadkhi/SQL_Chat_finetuned_model"
 
 MAX_NEW_TOKENS = 180
-TEMPERATURE = 0.0
-DO_SAMPLE = False
+TEMPERATURE = 0.0
+DO_SAMPLE = False
 
 # ────────────────────────────────────────────────────────────────
-# Load model safely on CPU first
+# Load model safely on CPU
 # ────────────────────────────────────────────────────────────────
-
 print("Loading base model on CPU...")
 try:
     bnb_config = BitsAndBytesConfig(
@@ -32,14 +29,14 @@ try:
     model = AutoModelForCausalLM.from_pretrained(
         BASE_MODEL,
         quantization_config=bnb_config,
-        device_map="cpu", # Critical for ZeroGPU + CPU spaces
+        device_map="cpu",
         trust_remote_code=True,
         low_cpu_mem_usage=True
     )
 
     print("Loading and merging LoRA adapters...")
     model = PeftModel.from_pretrained(model, LORA_PATH)
-    model = model.merge_and_unload() # Merge once → faster inference
+    model = model.merge_and_unload()
 
     tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
     model.eval()
@@ -50,26 +47,20 @@ except Exception as e:
     raise
 
 # ────────────────────────────────────────────────────────────────
-# Inference function – GPU only here
+# Inference function (CPU only – no @spaces.GPU)
 # ────────────────────────────────────────────────────────────────
-
-@spaces.GPU(duration=60) # 60 seconds is usually enough
 def generate_sql(prompt: str):
     try:
         messages = [{"role": "user", "content": prompt.strip()}]
-
-        # Tokenize on CPU
+
         inputs = tokenizer.apply_chat_template(
             messages,
             tokenize=True,
            add_generation_prompt=True,
             return_tensors="pt"
         )
-
-        # Move to GPU only inside decorated function
-        if torch.cuda.is_available():
-            inputs = inputs.to("cuda")
-
+
+        # No .to("cuda") – stay on CPU
         with torch.inference_mode():
             outputs = model.generate(
                 input_ids=inputs,
@@ -81,7 +72,7 @@ def generate_sql(prompt: str):
             )
 
         response = tokenizer.decode(outputs[0], skip_special_tokens=True)
-
+
         # Clean output
         if "<|assistant|>" in response:
             response = response.split("<|assistant|>", 1)[-1].strip()
@@ -98,7 +89,6 @@ def generate_sql(prompt: str):
 # ────────────────────────────────────────────────────────────────
 # Gradio Interface
 # ────────────────────────────────────────────────────────────────
-
 demo = gr.Interface(
     fn=generate_sql,
     inputs=gr.Textbox(
@@ -111,10 +101,10 @@ demo = gr.Interface(
         label="Generated SQL / Answer",
         lines=6
     ),
-    title="SQL Chatbot – Phi-3-mini fine-tuned",
+    title="SQL Chatbot – Phi-3-mini fine-tuned (CPU)",
     description=(
-        "Ask questions about SQL queries.\n\n"
-        "Free CPU version – responses may take 30–120 seconds or more."
+        "Free CPU version – first response may take 60–180+ seconds.\n"
+        "Subsequent responses will be faster (model stays in memory)."
     ),
     examples=[
         ["Find duplicate emails in users table"],
@@ -122,20 +112,16 @@ demo = gr.Interface(
         ["Count total orders per customer in last 30 days"],
         ["Delete duplicate rows based on email column"]
     ],
-    cache_examples=False, # keep this
-    # allow_flagging="never" ← REMOVE THIS LINE COMPLETELY
+    cache_examples=False, # Very important on CPU
 )
 
 if __name__ == "__main__":
     print("Starting Gradio server...")
-    import time
-    time.sleep(15) # Give extra time for model/Gradio to settle
-
     demo.launch(
         server_name="0.0.0.0",
         server_port=7860,
         debug=False,
         quiet=False,
         show_error=True,
-        prevent_thread_lock=True # Helps in containers
+        prevent_thread_lock=True
    )
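
A minimal local smoke test is sketched below; it is hypothetical and not part of the commit. It assumes the new file is saved as app.py with its dependencies installed and the model weights reachable. Importing the module performs the CPU model load at import time, while demo.launch() stays skipped because it is guarded by if __name__ == "__main__":.

# smoke_test.py - hypothetical helper script, not included in this commit.
# Importing app triggers the module-level CPU model load in app.py;
# demo.launch() does not run because app.__name__ != "__main__".
from app import generate_sql

if __name__ == "__main__":
    # Reuses the first Gradio example prompt from app.py.
    print(generate_sql("Find duplicate emails in users table"))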