saadkhi committed
Commit 0fad5f5 · verified
1 Parent(s): 81f0e97

Update app.py

Files changed (1)
  1. app.py +31 -30
app.py CHANGED
@@ -1,58 +1,56 @@
-# app.py - CPU SAFE VERSION (No CUDA, No GPU)
+# app.py - ZeroGPU safe: no caching + CPU load + GPU only in inference
 
 import torch
 import gradio as gr
+import spaces
 from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
 from peft import PeftModel
 
-# ─────────────────────────────────────────────
+# ────────────────────────────────────────────────────────────────
 BASE_MODEL = "unsloth/Phi-3-mini-4k-instruct-bnb-4bit"
-LORA_PATH = "saadkhi/SQL_Chat_finetuned_model"
+LORA_PATH = "saadkhi/SQL_Chat_finetuned_model"
 
 MAX_NEW_TOKENS = 180
-TEMPERATURE = 0.0
-DO_SAMPLE = False
+TEMPERATURE = 0.0
+DO_SAMPLE = False
 
-print("Loading model on CPU...")
-
-# 4-bit config (works on CPU but slower)
+print("Loading quantized base model on CPU (GPU only during inference)...")
 bnb_config = BitsAndBytesConfig(
     load_in_4bit=True,
     bnb_4bit_quant_type="nf4",
     bnb_4bit_compute_dtype=torch.bfloat16
 )
 
-# Load base model on CPU
 model = AutoModelForCausalLM.from_pretrained(
     BASE_MODEL,
     quantization_config=bnb_config,
-    device_map="cpu",
+    device_map="cpu",  # ← Force CPU load at startup
     trust_remote_code=True
 )
 
-print("Loading LoRA...")
+print("Loading & merging LoRA...")
 model = PeftModel.from_pretrained(model, LORA_PATH)
-
-# Merge LoRA for simpler inference
-model = model.merge_and_unload()
+model = model.merge_and_unload()  # Merge once for speed
 
 tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
 model.eval()
 
-# ─────────────────────────────────────────────
+# ────────────────────────────────────────────────────────────────
+@spaces.GPU(duration=60)  # Requests GPU slice only here
 def generate_sql(prompt: str):
     messages = [{"role": "user", "content": prompt}]
-
-    # Tokenize (CPU)
+
+    # Tokenize on CPU
     inputs = tokenizer.apply_chat_template(
         messages,
         tokenize=True,
         add_generation_prompt=True,
         return_tensors="pt"
     )
-
-    input_length = inputs.shape[-1]  # length of prompt tokens
-
+
+    # Move to GPU only now (GPU is allocated)
+    inputs = inputs.to("cuda")
+
     with torch.inference_mode():
         outputs = model.generate(
             input_ids=inputs,
@@ -63,31 +61,34 @@ def generate_sql(prompt: str):
             pad_token_id=tokenizer.eos_token_id,
         )
 
-    # 🔑 Remove the prompt tokens from the output
-    generated_tokens = outputs[0][input_length:]
-
-    response = tokenizer.decode(generated_tokens, skip_special_tokens=True).strip()
+    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
+
+    # Clean up output
+    if "<|assistant|>" in response:
+        response = response.split("<|assistant|>", 1)[-1].strip()
+    if "<|end|>" in response:
+        response = response.split("<|end|>")[0].strip()
 
     return response
 
-# ─────────────────────────────────────────────
+# ────────────────────────────────────────────────────────────────
 demo = gr.Interface(
     fn=generate_sql,
     inputs=gr.Textbox(
-        label="Ask SQL question",
+        label="Ask an SQL question",
         placeholder="Delete duplicate rows from users table based on email",
         lines=3
     ),
     outputs=gr.Textbox(label="Generated SQL"),
-    title="SQL Chatbot (CPU Mode)",
-    description="Phi-3-mini 4bit + LoRA (CPU only, slower inference)",
+    title="SQL Chatbot (ZeroGPU)",
+    description="Phi-3-mini 4bit + LoRA - GPU only during generation",
     examples=[
         ["Find duplicate emails in users table"],
         ["Top 5 highest paid employees"],
         ["Count orders per customer last month"]
     ],
-    cache_examples=False
+    cache_examples=False  # ← This is critical! Prevents startup crash
 )
 
 if __name__ == "__main__":
-    demo.launch()
+    demo.launch()
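
For reference, a minimal sketch of the ZeroGPU pattern the updated app.py relies on: load everything on CPU at startup and request a GPU slice only inside the function decorated with @spaces.GPU. The model id, prompt handling, and generation settings below are placeholders rather than this Space's actual configuration, and the sketch skips 4-bit quantization so the weights can simply be moved with .to("cuda").

# zero_gpu_sketch.py - illustrative only; assumes a Hugging Face ZeroGPU Space
# with the `spaces` package available.
import torch
import spaces
from transformers import AutoTokenizer, AutoModelForCausalLM

MODEL_ID = "microsoft/Phi-3-mini-4k-instruct"  # placeholder model id

# Startup runs without a GPU on ZeroGPU Spaces, so everything loads on CPU here.
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype=torch.bfloat16)
model.eval()

@spaces.GPU(duration=60)  # a GPU is attached only while this function runs
def generate(prompt: str) -> str:
    # Move weights and inputs onto the GPU that was just allocated
    # (re-done on each call for simplicity).
    model.to("cuda")
    inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
    with torch.inference_mode():
        out = model.generate(**inputs, max_new_tokens=64)
    return tokenizer.decode(out[0], skip_special_tokens=True)

Wrapping such a function in gr.Interface with cache_examples=False, as the diff does, keeps Gradio from calling the GPU-decorated function to pre-compute example outputs while the Space is still starting up.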