saadkhi committed
Commit e62bece · verified · 1 Parent(s): 4bc3e8b

Update app.py

Files changed (1): app.py (+86 -90)
app.py CHANGED
@@ -1,129 +1,125 @@
  # app.py
- # Minimal & stable version for free CPU Hugging Face Space – Phi-3-mini + LoRA

  import torch
  import gradio as gr
- from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
  from peft import PeftModel

- # ────────────────────────────────────────────────────────────────
  # Config
- # ────────────────────────────────────────────────────────────────
-
- BASE_MODEL = "unsloth/Phi-3-mini-4k-instruct-bnb-4bit"
  LORA_PATH = "saadkhi/SQL_Chat_finetuned_model"

  MAX_NEW_TOKENS = 180
  TEMPERATURE = 0.0
  DO_SAMPLE = False

- # ────────────────────────────────────────────────────────────────
- # Load model & tokenizer
- # ────────────────────────────────────────────────────────────────

- print("Loading base model (CPU)...")
- try:
-     bnb_config = BitsAndBytesConfig(
-         load_in_4bit=True,
-         bnb_4bit_quant_type="nf4",
-         bnb_4bit_compute_dtype=torch.bfloat16
-     )

-     model = AutoModelForCausalLM.from_pretrained(
-         BASE_MODEL,
-         quantization_config = bnb_config,
-         device_map = "cpu",
-         trust_remote_code = True,
-         low_cpu_mem_usage = True
-     )

-     print("Loading LoRA...")
-     model = PeftModel.from_pretrained(model, LORA_PATH)
-     print("Merging LoRA weights...")
-     model = model.merge_and_unload()

-     tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
-     model.eval()

-     print("Model & tokenizer loaded successfully")
- except Exception as e:
-     print(f"Model loading failed: {str(e)}")
-     raise

- # ────────────────────────────────────────────────────────────────
- # Inference function
- # ────────────────────────────────────────────────────────────────

- def generate_sql(question: str):
-     try:
-         messages = [{"role": "user", "content": question.strip()}]

-         inputs = tokenizer.apply_chat_template(
-             messages,
-             tokenize=True,
-             add_generation_prompt=True,
-             return_tensors="pt"
          )

-         with torch.inference_mode():
-             outputs = model.generate(
-                 input_ids = inputs,
-                 max_new_tokens = MAX_NEW_TOKENS,
-                 temperature = TEMPERATURE,
-                 do_sample = DO_SAMPLE,
-                 use_cache = True,
-                 pad_token_id = tokenizer.eos_token_id,
-             )
-
-         response = tokenizer.decode(outputs[0], skip_special_tokens=True)
-
-         # Clean typical Phi-3 output markers
-         for marker in ["<|assistant|>", "<|end|>", "<|user|>"]:
-             if marker in response:
-                 response = response.split(marker, 1)[-1].strip()

-         return response.strip() or "(empty response)"

-     except Exception as e:
-         return f"Generation error: {str(e)}"

- # ────────────────────────────────────────────────────────────────
  # Gradio UI
- # ────────────────────────────────────────────────────────────────
-
  demo = gr.Interface(
-     fn = generate_sql,
-     inputs = gr.Textbox(
-         label = "SQL question",
-         placeholder = "Find duplicate emails in users table",
-         lines = 3,
-         max_lines = 6
      ),
-     outputs = gr.Textbox(
-         label = "Generated SQL",
-         lines = 8
      ),
-     title = "SQL Chat – Phi-3-mini fine-tuned (CPU)",
-     description = (
-         "Free CPU version – first answer usually takes 60–180+ seconds.\n"
-         "Later answers are faster (model stays in memory)."
      ),
-     examples = [
          ["Find duplicate emails in users table"],
          ["Top 5 highest paid employees"],
          ["Count orders per customer last month"],
-         ["Delete duplicate rows based on email"]
      ],
-     cache_examples = False,
  )

  if __name__ == "__main__":
-     print("Launching interface...")
      demo.launch(
-         server_name = "0.0.0.0",
-         # NO fixed server_port → let Gradio pick free port automatically
-         debug = False,
-         quiet = False,
-         show_error = True,
-         prevent_thread_lock = True
-     )
 
  # app.py
+ # Stable CPU-only Hugging Face Space
+ # Phi-3-mini + LoRA (NO bitsandbytes, NO SSR issues)
+
+ import warnings
+ warnings.filterwarnings("ignore", category=FutureWarning)

  import torch
  import gradio as gr
+ from transformers import AutoTokenizer, AutoModelForCausalLM
  from peft import PeftModel

+ # ─────────────────────────────────────────────
  # Config
+ # ─────────────────────────────────────────────
+ BASE_MODEL = "unsloth/Phi-3-mini-4k-instruct"
  LORA_PATH = "saadkhi/SQL_Chat_finetuned_model"

  MAX_NEW_TOKENS = 180
  TEMPERATURE = 0.0
  DO_SAMPLE = False

+ # ─────────────────────────────────────────────
+ # Load model & tokenizer (CPU SAFE)
+ # ─────────────────────────────────────────────
+ print("Loading base model on CPU...")
+
+ model = AutoModelForCausalLM.from_pretrained(
+     BASE_MODEL,
+     device_map="cpu",
+     torch_dtype=torch.float32,
+     trust_remote_code=True,
+     low_cpu_mem_usage=True,
+ )

+ print("Loading LoRA adapter...")
+ model = PeftModel.from_pretrained(model, LORA_PATH)

+ print("Merging LoRA weights...")
+ model = model.merge_and_unload()

+ tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)

+ model.eval()
+ print("Model & tokenizer loaded successfully")

+ # ─────────────────────────────────────────────
+ # Inference
+ # ─────────────────────────────────────────────
+ def generate_sql(question: str) -> str:
+     if not question or not question.strip():
+         return "Please enter a SQL-related question."

+     messages = [
+         {"role": "user", "content": question.strip()}
+     ]

+     input_ids = tokenizer.apply_chat_template(
+         messages,
+         tokenize=True,
+         add_generation_prompt=True,
+         return_tensors="pt",
+     )

+     with torch.inference_mode():
+         output_ids = model.generate(
+             input_ids=input_ids,
+             max_new_tokens=MAX_NEW_TOKENS,
+             temperature=TEMPERATURE,
+             do_sample=DO_SAMPLE,
+             pad_token_id=tokenizer.eos_token_id,
+             use_cache=True,
          )

+     response = tokenizer.decode(
+         output_ids[0],
+         skip_special_tokens=True
+     )

+     # Clean Phi-3 chat artifacts
+     for token in ["<|assistant|>", "<|user|>", "<|end|>"]:
+         if token in response:
+             response = response.split(token)[-1]

+     return response.strip() or "(empty response)"

+ # ─────────────────────────────────────────────
  # Gradio UI
+ # ─────────────────────────────────────────────
  demo = gr.Interface(
+     fn=generate_sql,
+     inputs=gr.Textbox(
+         label="SQL Question",
+         placeholder="Find duplicate emails in users table",
+         lines=3,
      ),
+     outputs=gr.Textbox(
+         label="Generated SQL",
+         lines=8,
      ),
+     title="SQL Chat – Phi-3-mini (CPU)",
+     description=(
+         "CPU-only Hugging Face Space.\n"
+         "First response may take 60–180 seconds. "
+         "Subsequent requests are faster."
      ),
+     examples=[
          ["Find duplicate emails in users table"],
          ["Top 5 highest paid employees"],
          ["Count orders per customer last month"],
+         ["Delete duplicate rows based on email"],
      ],
+     cache_examples=False,
  )

+ # ─────────────────────────────────────────────
+ # Launch
+ # ─────────────────────────────────────────────
  if __name__ == "__main__":
+     print("Launching Gradio interface...")
      demo.launch(
+         server_name="0.0.0.0",
+         ssr_mode=False,  # important: avoids asyncio FD bug
+         show_error=True,
+     )