saadkhi committed
Commit 8b67be0 · verified · 1 Parent(s): 0fad5f5

Update app.py

Files changed (1)
  app.py +105 -58
app.py CHANGED
@@ -1,4 +1,4 @@
- # app.py - ZeroGPU safe: no caching + CPU load + GPU only in inference

  import torch
  import gradio as gr
@@ -7,6 +7,9 @@ from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
  from peft import PeftModel

  # ────────────────────────────────────────────────────────────────
  BASE_MODEL = "unsloth/Phi-3-mini-4k-instruct-bnb-4bit"
  LORA_PATH = "saadkhi/SQL_Chat_finetuned_model"

@@ -14,81 +17,125 @@ MAX_NEW_TOKENS = 180
  TEMPERATURE = 0.0
  DO_SAMPLE = False

- print("Loading quantized base model on CPU (GPU only during inference)...")
- bnb_config = BitsAndBytesConfig(
-     load_in_4bit=True,
-     bnb_4bit_quant_type="nf4",
-     bnb_4bit_compute_dtype=torch.bfloat16
- )

- model = AutoModelForCausalLM.from_pretrained(
-     BASE_MODEL,
-     quantization_config=bnb_config,
-     device_map="cpu",  # ← Force CPU load at startup
-     trust_remote_code=True
- )

- print("Loading & merging LoRA...")
- model = PeftModel.from_pretrained(model, LORA_PATH)
- model = model.merge_and_unload()  # Merge once for speed

- tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
- model.eval()

  # ────────────────────────────────────────────────────────────────
- @spaces.GPU(duration=60)  # Requests GPU slice only here
  def generate_sql(prompt: str):
-     messages = [{"role": "user", "content": prompt}]
-
-     # Tokenize on CPU
-     inputs = tokenizer.apply_chat_template(
-         messages,
-         tokenize=True,
-         add_generation_prompt=True,
-         return_tensors="pt"
-     )
-
-     # Move to GPU only now (GPU is allocated)
-     inputs = inputs.to("cuda")
-
-     with torch.inference_mode():
-         outputs = model.generate(
-             input_ids=inputs,
-             max_new_tokens=MAX_NEW_TOKENS,
-             temperature=TEMPERATURE,
-             do_sample=DO_SAMPLE,
-             use_cache=True,
-             pad_token_id=tokenizer.eos_token_id,
          )

-     response = tokenizer.decode(outputs[0], skip_special_tokens=True)
-
-     # Clean up output
-     if "<|assistant|>" in response:
-         response = response.split("<|assistant|>", 1)[-1].strip()
-     if "<|end|>" in response:
-         response = response.split("<|end|>")[0].strip()

-     return response

  # ────────────────────────────────────────────────────────────────
  demo = gr.Interface(
      fn=generate_sql,
      inputs=gr.Textbox(
-         label="Ask an SQL question",
-         placeholder="Delete duplicate rows from users table based on email",
-         lines=3
      ),
-     outputs=gr.Textbox(label="Generated SQL"),
-     title="SQL Chatbot (ZeroGPU)",
-     description="Phi-3-mini 4bit + LoRA - GPU only during generation",
      examples=[
          ["Find duplicate emails in users table"],
-         ["Top 5 highest paid employees"],
-         ["Count orders per customer last month"]
      ],
-     cache_examples=False  # ← This is critical! Prevents startup crash
  )

  if __name__ == "__main__":
-     demo.launch()
+ # app.py

  import torch
  import gradio as gr

  from peft import PeftModel

  # ────────────────────────────────────────────────────────────────
+ # Configuration
+ # ────────────────────────────────────────────────────────────────
+
  BASE_MODEL = "unsloth/Phi-3-mini-4k-instruct-bnb-4bit"
  LORA_PATH = "saadkhi/SQL_Chat_finetuned_model"

  TEMPERATURE = 0.0
  DO_SAMPLE = False

+ # ────────────────────────────────────────────────────────────────
+ # Load model safely on CPU first
+ # ────────────────────────────────────────────────────────────────

+ print("Loading base model on CPU...")
+ try:
+     bnb_config = BitsAndBytesConfig(
+         load_in_4bit=True,
+         bnb_4bit_quant_type="nf4",
+         bnb_4bit_compute_dtype=torch.bfloat16
+     )
+
+     model = AutoModelForCausalLM.from_pretrained(
+         BASE_MODEL,
+         quantization_config=bnb_config,
+         device_map="cpu",  # Critical for ZeroGPU + CPU spaces
+         trust_remote_code=True,
+         low_cpu_mem_usage=True
+     )
+
+     print("Loading and merging LoRA adapters...")
+     model = PeftModel.from_pretrained(model, LORA_PATH)
+     model = model.merge_and_unload()  # Merge once → faster inference

+     tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
+     model.eval()

+     print("Model successfully loaded on CPU")
+ except Exception as e:
+     print(f"Model loading failed: {str(e)}")
+     raise

  # ────────────────────────────────────────────────────────────────
+ # Inference function – GPU only here
+ # ────────────────────────────────────────────────────────────────
+
+ @spaces.GPU(duration=60)  # 60 seconds is usually enough
  def generate_sql(prompt: str):
+     try:
+         messages = [{"role": "user", "content": prompt.strip()}]
+
+         # Tokenize on CPU
+         inputs = tokenizer.apply_chat_template(
+             messages,
+             tokenize=True,
+             add_generation_prompt=True,
+             return_tensors="pt"
          )
+
+         # Move to GPU only inside decorated function
+         if torch.cuda.is_available():
+             inputs = inputs.to("cuda")
+
+         with torch.inference_mode():
+             outputs = model.generate(
+                 input_ids=inputs,
+                 max_new_tokens=MAX_NEW_TOKENS,
+                 temperature=TEMPERATURE,
+                 do_sample=DO_SAMPLE,
+                 use_cache=True,
+                 pad_token_id=tokenizer.eos_token_id,
+             )

+         response = tokenizer.decode(outputs[0], skip_special_tokens=True)
+
+         # Clean output
+         if "<|assistant|>" in response:
+             response = response.split("<|assistant|>", 1)[-1].strip()
+         if "<|end|>" in response:
+             response = response.split("<|end|>")[0].strip()
+         if "<|user|>" in response:
+             response = response.split("<|user|>")[0].strip()

+         return response.strip() or "No valid response generated."

+     except Exception as e:
+         return f"Error during generation: {str(e)}"
+
+ # ────────────────────────────────────────────────────────────────
+ # Gradio Interface
  # ────────────────────────────────────────────────────────────────
+
  demo = gr.Interface(
      fn=generate_sql,
      inputs=gr.Textbox(
+         label="Your SQL-related question",
+         placeholder="e.g. Find duplicate emails in users table",
+         lines=3,
+         max_lines=6
+     ),
+     outputs=gr.Textbox(
+         label="Generated SQL / Answer",
+         lines=6
+     ),
+     title="SQL Chatbot – Phi-3-mini fine-tuned",
+     description=(
+         "Ask questions about SQL queries.\n\n"
+         "ZeroGPU version – first response may take 10–60 seconds (cold start)."
      ),
      examples=[
          ["Find duplicate emails in users table"],
+         ["Top 5 highest paid employees from employees table"],
+         ["Count total orders per customer in last 30 days"],
+         ["Delete duplicate rows based on email column"]
      ],
+     cache_examples=False,  # Very important for ZeroGPU
+     allow_flagging="never"
  )

  if __name__ == "__main__":
+     print("Starting Gradio server...")
+     try:
+         demo.launch(
+             server_name="0.0.0.0",
+             server_port=7860,
+             debug=False,
+             quiet=False,
+             show_error=True
+         )
+     except Exception as e:
+         print(f"Launch failed: {str(e)}")
+         raise
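
A minimal sketch of how the updated Space could be called programmatically with gradio_client once deployed. The Space id below is a placeholder (the commit only names the model repos), and the "/predict" endpoint is assumed because the app exposes a single gr.Interface, which registers that endpoint by default.

from gradio_client import Client

# Hypothetical Space id; replace with the actual "user/space-name" of the deployed app
client = Client("saadkhi/SQL_Chatbot")

# gr.Interface exposes its function under /predict by default
result = client.predict("Find duplicate emails in users table", api_name="/predict")
print(result)

On a ZeroGPU Space, the first call after a cold start may take noticeably longer while the model loads on CPU and a GPU slice is allocated for generation.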