saadkhi committed
Commit 22df2c5 · verified · 1 Parent(s): bf06a99

Update app.py

Files changed (1)
  1. app.py +17 -31
app.py CHANGED
@@ -1,26 +1,23 @@
-# app.py
+# app.py - Fully CPU-safe version for free Hugging Face Spaces
 
 import torch
 import gradio as gr
-import spaces
 from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
 from peft import PeftModel
 
 # ────────────────────────────────────────────────────────────────
 # Configuration
 # ────────────────────────────────────────────────────────────────
-
 BASE_MODEL = "unsloth/Phi-3-mini-4k-instruct-bnb-4bit"
-LORA_PATH = "saadkhi/SQL_Chat_finetuned_model"
+LORA_PATH = "saadkhi/SQL_Chat_finetuned_model"
 
 MAX_NEW_TOKENS = 180
-TEMPERATURE = 0.0
-DO_SAMPLE = False
+TEMPERATURE = 0.0
+DO_SAMPLE = False
 
 # ────────────────────────────────────────────────────────────────
-# Load model safely on CPU first
+# Load model safely on CPU
 # ────────────────────────────────────────────────────────────────
-
 print("Loading base model on CPU...")
 try:
     bnb_config = BitsAndBytesConfig(
@@ -32,14 +29,14 @@ try:
     model = AutoModelForCausalLM.from_pretrained(
         BASE_MODEL,
         quantization_config=bnb_config,
-        device_map="cpu", # Critical for ZeroGPU + CPU spaces
+        device_map="cpu",
         trust_remote_code=True,
         low_cpu_mem_usage=True
     )
 
     print("Loading and merging LoRA adapters...")
     model = PeftModel.from_pretrained(model, LORA_PATH)
-    model = model.merge_and_unload() # Merge once → faster inference
+    model = model.merge_and_unload()
 
     tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
     model.eval()
@@ -50,26 +47,20 @@ except Exception as e:
     raise
 
 # ────────────────────────────────────────────────────────────────
-# Inference function – GPU only here
+# Inference function (CPU only – no @spaces.GPU)
 # ────────────────────────────────────────────────────────────────
-
-@spaces.GPU(duration=60) # 60 seconds is usually enough
 def generate_sql(prompt: str):
     try:
         messages = [{"role": "user", "content": prompt.strip()}]
-
-        # Tokenize on CPU
+
         inputs = tokenizer.apply_chat_template(
             messages,
             tokenize=True,
            add_generation_prompt=True,
             return_tensors="pt"
         )
-
-        # Move to GPU only inside decorated function
-        if torch.cuda.is_available():
-            inputs = inputs.to("cuda")
-
+
+        # No .to("cuda") – stay on CPU
         with torch.inference_mode():
             outputs = model.generate(
                 input_ids=inputs,
@@ -81,7 +72,7 @@ def generate_sql(prompt: str):
             )
 
         response = tokenizer.decode(outputs[0], skip_special_tokens=True)
-
+
         # Clean output
         if "<|assistant|>" in response:
             response = response.split("<|assistant|>", 1)[-1].strip()
@@ -98,7 +89,6 @@ def generate_sql(prompt: str):
 # ────────────────────────────────────────────────────────────────
 # Gradio Interface
 # ────────────────────────────────────────────────────────────────
-
 demo = gr.Interface(
     fn=generate_sql,
     inputs=gr.Textbox(
@@ -111,10 +101,10 @@ demo = gr.Interface(
         label="Generated SQL / Answer",
         lines=6
     ),
-    title="SQL Chatbot – Phi-3-mini fine-tuned",
+    title="SQL Chatbot – Phi-3-mini fine-tuned (CPU)",
     description=(
-        "Ask questions about SQL queries.\n\n"
-        "Free CPU version – responses may take 30–120 seconds or more."
+        "Free CPU version – first response may take 60–180+ seconds.\n"
+        "Subsequent responses will be faster (model stays in memory)."
     ),
     examples=[
         ["Find duplicate emails in users table"],
@@ -122,20 +112,16 @@ demo = gr.Interface(
         ["Count total orders per customer in last 30 days"],
         ["Delete duplicate rows based on email column"]
     ],
-    cache_examples=False, # keep this
-    # allow_flagging="never" ← REMOVE THIS LINE COMPLETELY
+    cache_examples=False, # Very important on CPU
 )
 
 if __name__ == "__main__":
     print("Starting Gradio server...")
-    import time
-    time.sleep(15) # Give extra time for model/Gradio to settle
-
     demo.launch(
         server_name="0.0.0.0",
         server_port=7860,
         debug=False,
         quiet=False,
         show_error=True,
-        prevent_thread_lock=True # Helps in containers
+        prevent_thread_lock=True
    )
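
A minimal local smoke test is sketched below; it is hypothetical and not part of the commit. It assumes the new file is saved as app.py with its dependencies installed and the model weights reachable. Importing the module performs the CPU model load at import time, while demo.launch() stays skipped because it is guarded by if __name__ == "__main__":.

# smoke_test.py - hypothetical helper script, not included in this commit.
# Importing app triggers the module-level CPU model load in app.py;
# demo.launch() does not run because app.__name__ != "__main__".
from app import generate_sql

if __name__ == "__main__":
    # Reuses the first Gradio example prompt from app.py.
    print(generate_sql("Find duplicate emails in users table"))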