saadkhi committed
Commit e95c2d3 · verified · 1 Parent(s): 7f3026b

Update app.py

Files changed (1)
  1. app.py +48 -53
app.py CHANGED
@@ -1,3 +1,5 @@
+# CPU SAFE HuggingFace Space (2026 stable)
+
 import warnings
 warnings.filterwarnings("ignore")
 
@@ -6,54 +8,47 @@ import gradio as gr
 from transformers import AutoTokenizer, AutoModelForCausalLM
 from peft import PeftModel
 
-# ─────────────────────────────
+# reduce CPU overload on free tier
+torch.set_num_threads(1)
+
+# ─────────────────────────
+# Config
+# ─────────────────────────
 BASE_MODEL = "unsloth/Phi-3-mini-4k-instruct"
 LORA_PATH = "saadkhi/SQL_Chat_finetuned_model"
 
 MAX_NEW_TOKENS = 180
 
-model = None
-tokenizer = None
-
-# ─────────────────────────────
-# Lazy load (VERY IMPORTANT)
-# ─────────────────────────────
-def load_model():
-    global model, tokenizer
-
-    if model is not None:
-        return
-
-    print("🔄 Loading model (first request only)...")
-
-    base = AutoModelForCausalLM.from_pretrained(
-        BASE_MODEL,
-        device_map="cpu",
-        torch_dtype=torch.float16,  # lighter
-        low_cpu_mem_usage=True,
-        trust_remote_code=True,
-    )
-
-    base = PeftModel.from_pretrained(base, LORA_PATH)
-
-    print("Merging LoRA...")
-    model_loaded = base.merge_and_unload()
+print("Loading model...")
+
+# ─────────────────────────
+# Load base model
+# ─────────────────────────
+model = AutoModelForCausalLM.from_pretrained(
+    BASE_MODEL,
+    device_map="cpu",
+    torch_dtype=torch.float32,
+    trust_remote_code=True,
+    low_cpu_mem_usage=True,
+)
 
-    tokenizer_loaded = AutoTokenizer.from_pretrained(BASE_MODEL)
+print("Loading LoRA...")
+model = PeftModel.from_pretrained(model, LORA_PATH)
 
-    model_loaded.eval()
+print("Merging LoRA...")
+model = model.merge_and_unload()
 
-    model = model_loaded
-    tokenizer = tokenizer_loaded
+tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
 
-    print("✅ Model ready")
+model.eval()
+print("Model ready")
 
-# ─────────────────────────────
+# ─────────────────────────
+# Inference
+# ─────────────────────────
 def generate_sql(question):
-    if not question.strip():
-        return "Enter a question"
-
-    load_model()
+    if not question:
+        return "Enter a SQL question."
 
     messages = [{"role": "user", "content": question}]
 
@@ -64,33 +59,33 @@ def generate_sql(question):
         return_tensors="pt",
     )
 
-    with torch.inference_mode():
-        output_ids = model.generate(
-            input_ids=input_ids,
+    with torch.no_grad():
+        output = model.generate(
+            input_ids,
             max_new_tokens=MAX_NEW_TOKENS,
-            temperature=0.0,
+            temperature=0,
             do_sample=False,
             pad_token_id=tokenizer.eos_token_id,
         )
 
-    response = tokenizer.decode(output_ids[0], skip_special_tokens=True)
+    text = tokenizer.decode(output[0], skip_special_tokens=True)
 
+    # clean artifacts
     for t in ["<|assistant|>", "<|user|>", "<|end|>"]:
-        if t in response:
-            response = response.split(t)[-1]
+        text = text.replace(t, "")
 
-    return response.strip()
+    return text.strip()
 
-# ─────────────────────────────
+# ─────────────────────────
+# UI
+# ─────────────────────────
 demo = gr.Interface(
     fn=generate_sql,
     inputs=gr.Textbox(lines=3, label="SQL Question"),
-    outputs=gr.Textbox(lines=8, label="SQL"),
-    title="SQL Chat Phi-3 CPU",
-    description="First request loads model (60-120s)",
+    outputs=gr.Textbox(lines=8, label="Generated SQL"),
+    title="SQL Chat – Phi-3 mini",
+    description="Free CPU Space. First response may take ~90s",
+    cache_examples=False,
 )
 
-demo.queue(concurrency_count=1, max_size=5)
-
-if __name__ == "__main__":
-    demo.launch(server_name="0.0.0.0", show_error=True)
+demo.launch()
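
For a quick sanity check of the new eager-loading path outside the Gradio UI, the sketch below replays the same generation steps. It assumes model, tokenizer, and MAX_NEW_TOKENS are the objects defined in the updated app.py, and that the prompt is built with tokenizer.apply_chat_template; only the trailing return_tensors="pt" of that call is visible in the unchanged context lines, so the exact prompt-building code is an assumption.

# Hypothetical smoke test; run after the model/LoRA merge from app.py has completed.
question = "List all customers who placed an order in 2023."
messages = [{"role": "user", "content": question}]

# Assumed prompt construction (the diff only shows its closing arguments).
input_ids = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    return_tensors="pt",
)

with torch.no_grad():
    output = model.generate(
        input_ids,
        max_new_tokens=MAX_NEW_TOKENS,
        do_sample=False,                      # greedy decoding, as in the Space
        pad_token_id=tokenizer.eos_token_id,
    )

text = tokenizer.decode(output[0], skip_special_tokens=True)
for t in ["<|assistant|>", "<|user|>", "<|end|>"]:
    text = text.replace(t, "")                # strip Phi-3 chat markers
print(text.strip())

With do_sample=False the temperature=0 argument in the diff is ignored by generate (recent transformers versions warn about it), so the sketch omits it.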
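The removed demo.queue(concurrency_count=1, max_size=5) call relies on a keyword that Gradio 4.x no longer accepts; concurrency_count was replaced by default_concurrency_limit. If bounded, single-worker queuing is still wanted on the free CPU tier, a sketch of the equivalent call under the newer API (an assumption about the Space's Gradio version, not part of this commit):

# Sketch only: single-worker queue with the Gradio 4.x keyword (not in this commit).
demo.queue(max_size=5, default_concurrency_limit=1)
demo.launch()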