saadkhi committed
Commit e95c2d3 · verified · 1 Parent(s): 7f3026b

Update app.py

Files changed (1)
  1. app.py +48 -53
app.py CHANGED
@@ -1,3 +1,5 @@
+# CPU SAFE HuggingFace Space (2026 stable)
+
 import warnings
 warnings.filterwarnings("ignore")
 
@@ -6,54 +8,47 @@ import gradio as gr
 from transformers import AutoTokenizer, AutoModelForCausalLM
 from peft import PeftModel
 
-# ─────────────────────────────
+# reduce CPU overload on free tier
+torch.set_num_threads(1)
+
+# ─────────────────────────
+# Config
+# ─────────────────────────
 BASE_MODEL = "unsloth/Phi-3-mini-4k-instruct"
 LORA_PATH = "saadkhi/SQL_Chat_finetuned_model"
 
 MAX_NEW_TOKENS = 180
 
-model = None
-tokenizer = None
-
-# ─────────────────────────────
-# Lazy load (VERY IMPORTANT)
-# ─────────────────────────────
-def load_model():
-    global model, tokenizer
-
-    if model is not None:
-        return
-
-    print("🔄 Loading model (first request only)...")
-
-    base = AutoModelForCausalLM.from_pretrained(
-        BASE_MODEL,
-        device_map="cpu",
-        torch_dtype=torch.float16,  # lighter
-        low_cpu_mem_usage=True,
-        trust_remote_code=True,
-    )
-
-    base = PeftModel.from_pretrained(base, LORA_PATH)
-
-    print("Merging LoRA...")
-    model_loaded = base.merge_and_unload()
+print("Loading model...")
+
+# ─────────────────────────
+# Load base model
+# ─────────────────────────
+model = AutoModelForCausalLM.from_pretrained(
+    BASE_MODEL,
+    device_map="cpu",
+    torch_dtype=torch.float32,
+    trust_remote_code=True,
+    low_cpu_mem_usage=True,
+)
 
-    tokenizer_loaded = AutoTokenizer.from_pretrained(BASE_MODEL)
+print("Loading LoRA...")
+model = PeftModel.from_pretrained(model, LORA_PATH)
 
-    model_loaded.eval()
+print("Merging LoRA...")
+model = model.merge_and_unload()
 
-    model = model_loaded
-    tokenizer = tokenizer_loaded
+tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
 
-    print("✅ Model ready")
+model.eval()
+print("Model ready")
 
-# ─────────────────────────────
+# ─────────────────────────
+# Inference
+# ─────────────────────────
 def generate_sql(question):
-    if not question.strip():
-        return "Enter a question"
-
-    load_model()
+    if not question:
+        return "Enter a SQL question."
 
     messages = [{"role": "user", "content": question}]
 
@@ -64,33 +59,33 @@ def generate_sql(question):
         return_tensors="pt",
     )
 
-    with torch.inference_mode():
-        output_ids = model.generate(
-            input_ids=input_ids,
+    with torch.no_grad():
+        output = model.generate(
+            input_ids,
             max_new_tokens=MAX_NEW_TOKENS,
-            temperature=0.0,
+            temperature=0,
             do_sample=False,
             pad_token_id=tokenizer.eos_token_id,
         )
 
-    response = tokenizer.decode(output_ids[0], skip_special_tokens=True)
+    text = tokenizer.decode(output[0], skip_special_tokens=True)
 
+    # clean artifacts
     for t in ["<|assistant|>", "<|user|>", "<|end|>"]:
-        if t in response:
-            response = response.split(t)[-1]
+        text = text.replace(t, "")
 
-    return response.strip()
+    return text.strip()
 
-# ─────────────────────────────
+# ─────────────────────────
+# UI
+# ─────────────────────────
 demo = gr.Interface(
     fn=generate_sql,
     inputs=gr.Textbox(lines=3, label="SQL Question"),
-    outputs=gr.Textbox(lines=8, label="SQL"),
-    title="SQL Chat Phi-3 CPU",
-    description="First request loads model (60-120s)",
+    outputs=gr.Textbox(lines=8, label="Generated SQL"),
+    title="SQL Chat – Phi-3 mini",
+    description="Free CPU Space. First response may take ~90s",
+    cache_examples=False,
 )
 
-demo.queue(concurrency_count=1, max_size=5)
-
-if __name__ == "__main__":
-    demo.launch(server_name="0.0.0.0", show_error=True)
+demo.launch()
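
For a quick sanity check of the new eager-loading path outside the Gradio UI, the sketch below replays the same generation steps. It assumes model, tokenizer, and MAX_NEW_TOKENS are the objects defined in the updated app.py, and that the prompt is built with tokenizer.apply_chat_template; only the trailing return_tensors="pt" of that call is visible in the unchanged context lines, so the exact prompt-building code is an assumption.

# Hypothetical smoke test; run after the model/LoRA merge from app.py has completed.
question = "List all customers who placed an order in 2023."
messages = [{"role": "user", "content": question}]

# Assumed prompt construction (the diff only shows its closing arguments).
input_ids = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    return_tensors="pt",
)

with torch.no_grad():
    output = model.generate(
        input_ids,
        max_new_tokens=MAX_NEW_TOKENS,
        do_sample=False,                      # greedy decoding, as in the Space
        pad_token_id=tokenizer.eos_token_id,
    )

text = tokenizer.decode(output[0], skip_special_tokens=True)
for t in ["<|assistant|>", "<|user|>", "<|end|>"]:
    text = text.replace(t, "")                # strip Phi-3 chat markers
print(text.strip())

With do_sample=False the temperature=0 argument in the diff is ignored by generate (recent transformers versions warn about it), so the sketch omits it.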
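The removed demo.queue(concurrency_count=1, max_size=5) call relies on a keyword that Gradio 4.x no longer accepts; concurrency_count was replaced by default_concurrency_limit. If bounded, single-worker queuing is still wanted on the free CPU tier, a sketch of the equivalent call under the newer API (an assumption about the Space's Gradio version, not part of this commit):

# Sketch only: single-worker queue with the Gradio 4.x keyword (not in this commit).
demo.queue(max_size=5, default_concurrency_limit=1)
demo.launch()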