Spaces:

saadkhi
/

SQL_chatbot_API

Sleeping

App Files Community

saadkhi committed on Feb 2

Commit

4bc3e8b

verified ·

1 Parent(s): 1b8e088

Update app.py

Browse files

Files changed (1) hide show

app.py +60 -59

app.py CHANGED Viewed

@@ -1,4 +1,5 @@
-# app.py - Fully CPU-safe version for free Hugging Face Spaces
 import torch
 import gradio as gr
@@ -6,19 +7,21 @@ from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
 from peft import PeftModel
 # ────────────────────────────────────────────────────────────────
-# Configuration
 # ────────────────────────────────────────────────────────────────
 BASE_MODEL = "unsloth/Phi-3-mini-4k-instruct-bnb-4bit"
-LORA_PATH = "saadkhi/SQL_Chat_finetuned_model"
 MAX_NEW_TOKENS = 180
-TEMPERATURE = 0.0
-DO_SAMPLE = False
 # ────────────────────────────────────────────────────────────────
-# Load model safely on CPU
 # ────────────────────────────────────────────────────────────────
-print("Loading base model on CPU...")
 try:
     bnb_config = BitsAndBytesConfig(
         load_in_4bit=True,
@@ -28,30 +31,32 @@ try:
     model = AutoModelForCausalLM.from_pretrained(
         BASE_MODEL,
-        quantization_config=bnb_config,
-        device_map="cpu",
-        trust_remote_code=True,
-        low_cpu_mem_usage=True
     )
-    print("Loading and merging LoRA adapters...")
     model = PeftModel.from_pretrained(model, LORA_PATH)
     model = model.merge_and_unload()
     tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
     model.eval()
-    print("Model successfully loaded on CPU")
 except Exception as e:
     print(f"Model loading failed: {str(e)}")
     raise
 # ────────────────────────────────────────────────────────────────
-# Inference function (CPU only – no @spaces.GPU)
 # ────────────────────────────────────────────────────────────────
-def generate_sql(prompt: str):
     try:
-        messages = [{"role": "user", "content": prompt.strip()}]
         inputs = tokenizer.apply_chat_template(
             messages,
@@ -60,69 +65,65 @@ def generate_sql(prompt: str):
             return_tensors="pt"
         )
-        # No .to("cuda") – stay on CPU
         with torch.inference_mode():
             outputs = model.generate(
-                input_ids=inputs,
-                max_new_tokens=MAX_NEW_TOKENS,
-                temperature=TEMPERATURE,
-                do_sample=DO_SAMPLE,
-                use_cache=True,
-                pad_token_id=tokenizer.eos_token_id,
             )
         response = tokenizer.decode(outputs[0], skip_special_tokens=True)
-        # Clean output
-        if "<|assistant|>" in response:
-            response = response.split("<|assistant|>", 1)[-1].strip()
-        if "<|end|>" in response:
-            response = response.split("<|end|>")[0].strip()
-        if "<|user|>" in response:
-            response = response.split("<|user|>")[0].strip()
-        return response.strip() or "No valid response generated."
     except Exception as e:
-        return f"Error during generation: {str(e)}"
 # ────────────────────────────────────────────────────────────────
-# Gradio Interface
 # ────────────────────────────────────────────────────────────────
 demo = gr.Interface(
-    fn=generate_sql,
-    inputs=gr.Textbox(
-        label="Your SQL-related question",
-        placeholder="e.g. Find duplicate emails in users table",
-        lines=3,
-        max_lines=6
     ),
-    outputs=gr.Textbox(
-        label="Generated SQL / Answer",
-        lines=6
     ),
-    title="SQL Chatbot – Phi-3-mini fine-tuned (CPU)",
-    description=(
-        "Free CPU version – first response may take 60–180+ seconds.\n"
-        "Subsequent responses will be faster (model stays in memory)."
     ),
-    examples=[
         ["Find duplicate emails in users table"],
-        ["Top 5 highest paid employees from employees table"],
-        ["Count total orders per customer in last 30 days"],
-        ["Delete duplicate rows based on email column"]
     ],
-    cache_examples=False,          # Very important on CPU
 )
 if __name__ == "__main__":
-    print("Starting Gradio server...")
     demo.launch(
-        server_name="0.0.0.0",
-        # NO server_port here — Gradio will pick the first free one (7860, 7861, ...)
-        debug=False,
-        quiet=False,
-        show_error=True,
-        prevent_thread_lock=True
     )

+# app.py
+# Minimal & stable version for free CPU Hugging Face Space – Phi-3-mini + LoRA
 import torch
 import gradio as gr
 from peft import PeftModel
 # ────────────────────────────────────────────────────────────────
+# Config
 # ────────────────────────────────────────────────────────────────
 BASE_MODEL = "unsloth/Phi-3-mini-4k-instruct-bnb-4bit"
+LORA_PATH  = "saadkhi/SQL_Chat_finetuned_model"
 MAX_NEW_TOKENS = 180
+TEMPERATURE    = 0.0
+DO_SAMPLE      = False
 # ────────────────────────────────────────────────────────────────
+# Load model & tokenizer
 # ────────────────────────────────────────────────────────────────
+print("Loading base model (CPU)...")
 try:
     bnb_config = BitsAndBytesConfig(
         load_in_4bit=True,
     model = AutoModelForCausalLM.from_pretrained(
         BASE_MODEL,
+        quantization_config = bnb_config,
+        device_map          = "cpu",
+        trust_remote_code   = True,
+        low_cpu_mem_usage   = True
     )
+    print("Loading LoRA...")
     model = PeftModel.from_pretrained(model, LORA_PATH)
+    print("Merging LoRA weights...")
     model = model.merge_and_unload()
     tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
     model.eval()
+    print("Model & tokenizer loaded successfully")
 except Exception as e:
     print(f"Model loading failed: {str(e)}")
     raise
 # ────────────────────────────────────────────────────────────────
+# Inference function
 # ────────────────────────────────────────────────────────────────
+def generate_sql(question: str):
     try:
+        messages = [{"role": "user", "content": question.strip()}]
         inputs = tokenizer.apply_chat_template(
             messages,
             return_tensors="pt"
         )
         with torch.inference_mode():
             outputs = model.generate(
+                input_ids       = inputs,
+                max_new_tokens  = MAX_NEW_TOKENS,
+                temperature     = TEMPERATURE,
+                do_sample       = DO_SAMPLE,
+                use_cache       = True,
+                pad_token_id    = tokenizer.eos_token_id,
             )
         response = tokenizer.decode(outputs[0], skip_special_tokens=True)
+        # Clean typical Phi-3 output markers
+        for marker in ["<|assistant|>", "<|end|>", "<|user|>"]:
+            if marker in response:
+                response = response.split(marker, 1)[-1].strip()
+        return response.strip() or "(empty response)"
     except Exception as e:
+        return f"Generation error: {str(e)}"
 # ────────────────────────────────────────────────────────────────
+# Gradio UI
 # ────────────────────────────────────────────────────────────────
 demo = gr.Interface(
+    fn              = generate_sql,
+    inputs          = gr.Textbox(
+        label       = "SQL question",
+        placeholder = "Find duplicate emails in users table",
+        lines       = 3,
+        max_lines   = 6
     ),
+    outputs         = gr.Textbox(
+        label       = "Generated SQL",
+        lines       = 8
     ),
+    title           = "SQL Chat – Phi-3-mini fine-tuned (CPU)",
+    description     = (
+        "Free CPU version – first answer usually takes 60–180+ seconds.\n"
+        "Later answers are faster (model stays in memory)."
     ),
+    examples        = [
         ["Find duplicate emails in users table"],
+        ["Top 5 highest paid employees"],
+        ["Count orders per customer last month"],
+        ["Delete duplicate rows based on email"]
     ],
+    cache_examples  = False,
 )
 if __name__ == "__main__":
+    print("Launching interface...")
     demo.launch(
+        server_name       = "0.0.0.0",
+        # NO fixed server_port → let Gradio pick free port automatically
+        debug             = False,
+        quiet             = False,
+        show_error        = True,
+        prevent_thread_lock = True
     )