saadkhi committed
Commit e62bece · verified · 1 Parent(s): 4bc3e8b

Update app.py

Files changed (1): app.py (+86 -90)
app.py CHANGED
@@ -1,129 +1,125 @@
  # app.py
- # Minimal & stable version for free CPU Hugging Face Space – Phi-3-mini + LoRA

  import torch
  import gradio as gr
- from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
  from peft import PeftModel

- # ────────────────────────────────────────────────────────────────
  # Config
- # ────────────────────────────────────────────────────────────────
-
- BASE_MODEL = "unsloth/Phi-3-mini-4k-instruct-bnb-4bit"
  LORA_PATH = "saadkhi/SQL_Chat_finetuned_model"

  MAX_NEW_TOKENS = 180
  TEMPERATURE = 0.0
  DO_SAMPLE = False

- # ────────────────────────────────────────────────────────────────
- # Load model & tokenizer
- # ────────────────────────────────────────────────────────────────

- print("Loading base model (CPU)...")
- try:
-     bnb_config = BitsAndBytesConfig(
-         load_in_4bit=True,
-         bnb_4bit_quant_type="nf4",
-         bnb_4bit_compute_dtype=torch.bfloat16
-     )

-     model = AutoModelForCausalLM.from_pretrained(
-         BASE_MODEL,
-         quantization_config = bnb_config,
-         device_map = "cpu",
-         trust_remote_code = True,
-         low_cpu_mem_usage = True
-     )

-     print("Loading LoRA...")
-     model = PeftModel.from_pretrained(model, LORA_PATH)
-     print("Merging LoRA weights...")
-     model = model.merge_and_unload()

-     tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
-     model.eval()

-     print("Model & tokenizer loaded successfully")
- except Exception as e:
-     print(f"Model loading failed: {str(e)}")
-     raise

- # ────────────────────────────────────────────────────────────────
- # Inference function
- # ────────────────────────────────────────────────────────────────

- def generate_sql(question: str):
-     try:
-         messages = [{"role": "user", "content": question.strip()}]

-         inputs = tokenizer.apply_chat_template(
-             messages,
-             tokenize=True,
-             add_generation_prompt=True,
-             return_tensors="pt"
          )

-         with torch.inference_mode():
-             outputs = model.generate(
-                 input_ids = inputs,
-                 max_new_tokens = MAX_NEW_TOKENS,
-                 temperature = TEMPERATURE,
-                 do_sample = DO_SAMPLE,
-                 use_cache = True,
-                 pad_token_id = tokenizer.eos_token_id,
-             )
-
-         response = tokenizer.decode(outputs[0], skip_special_tokens=True)
-
-         # Clean typical Phi-3 output markers
-         for marker in ["<|assistant|>", "<|end|>", "<|user|>"]:
-             if marker in response:
-                 response = response.split(marker, 1)[-1].strip()

-         return response.strip() or "(empty response)"

-     except Exception as e:
-         return f"Generation error: {str(e)}"

- # ────────────────────────────────────────────────────────────────
  # Gradio UI
- # ────────────────────────────────────────────────────────────────
-
  demo = gr.Interface(
-     fn = generate_sql,
-     inputs = gr.Textbox(
-         label = "SQL question",
-         placeholder = "Find duplicate emails in users table",
-         lines = 3,
-         max_lines = 6
      ),
-     outputs = gr.Textbox(
-         label = "Generated SQL",
-         lines = 8
      ),
-     title = "SQL Chat – Phi-3-mini fine-tuned (CPU)",
-     description = (
-         "Free CPU version – first answer usually takes 60–180+ seconds.\n"
-         "Later answers are faster (model stays in memory)."
      ),
-     examples = [
          ["Find duplicate emails in users table"],
          ["Top 5 highest paid employees"],
          ["Count orders per customer last month"],
-         ["Delete duplicate rows based on email"]
      ],
-     cache_examples = False,
  )

  if __name__ == "__main__":
-     print("Launching interface...")
      demo.launch(
-         server_name = "0.0.0.0",
-         # NO fixed server_port → let Gradio pick free port automatically
-         debug = False,
-         quiet = False,
-         show_error = True,
-         prevent_thread_lock = True
-     )
 
  # app.py
+ # Stable CPU-only Hugging Face Space
+ # Phi-3-mini + LoRA (NO bitsandbytes, NO SSR issues)
+
+ import warnings
+ warnings.filterwarnings("ignore", category=FutureWarning)

  import torch
  import gradio as gr
+ from transformers import AutoTokenizer, AutoModelForCausalLM
  from peft import PeftModel

+ # ─────────────────────────────────────────────
  # Config
+ # ─────────────────────────────────────────────
+ BASE_MODEL = "unsloth/Phi-3-mini-4k-instruct"
  LORA_PATH = "saadkhi/SQL_Chat_finetuned_model"

  MAX_NEW_TOKENS = 180
  TEMPERATURE = 0.0
  DO_SAMPLE = False

+ # ─────────────────────────────────────────────
+ # Load model & tokenizer (CPU SAFE)
+ # ─────────────────────────────────────────────
+ print("Loading base model on CPU...")
+
+ model = AutoModelForCausalLM.from_pretrained(
+     BASE_MODEL,
+     device_map="cpu",
+     torch_dtype=torch.float32,
+     trust_remote_code=True,
+     low_cpu_mem_usage=True,
+ )

+ print("Loading LoRA adapter...")
+ model = PeftModel.from_pretrained(model, LORA_PATH)

+ print("Merging LoRA weights...")
+ model = model.merge_and_unload()

+ tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)

+ model.eval()
+ print("Model & tokenizer loaded successfully")

+ # ─────────────────────────────────────────────
+ # Inference
+ # ─────────────────────────────────────────────
+ def generate_sql(question: str) -> str:
+     if not question or not question.strip():
+         return "Please enter a SQL-related question."

+     messages = [
+         {"role": "user", "content": question.strip()}
+     ]

+     input_ids = tokenizer.apply_chat_template(
+         messages,
+         tokenize=True,
+         add_generation_prompt=True,
+         return_tensors="pt",
+     )

+     with torch.inference_mode():
+         output_ids = model.generate(
+             input_ids=input_ids,
+             max_new_tokens=MAX_NEW_TOKENS,
+             temperature=TEMPERATURE,
+             do_sample=DO_SAMPLE,
+             pad_token_id=tokenizer.eos_token_id,
+             use_cache=True,
          )

+     response = tokenizer.decode(
+         output_ids[0],
+         skip_special_tokens=True
+     )

+     # Clean Phi-3 chat artifacts
+     for token in ["<|assistant|>", "<|user|>", "<|end|>"]:
+         if token in response:
+             response = response.split(token)[-1]

+     return response.strip() or "(empty response)"

+ # ─────────────────────────────────────────────
  # Gradio UI
+ # ─────────────────────────────────────────────
  demo = gr.Interface(
+     fn=generate_sql,
+     inputs=gr.Textbox(
+         label="SQL Question",
+         placeholder="Find duplicate emails in users table",
+         lines=3,
      ),
+     outputs=gr.Textbox(
+         label="Generated SQL",
+         lines=8,
      ),
+     title="SQL Chat – Phi-3-mini (CPU)",
+     description=(
+         "CPU-only Hugging Face Space.\n"
+         "First response may take 60–180 seconds. "
+         "Subsequent requests are faster."
      ),
+     examples=[
          ["Find duplicate emails in users table"],
          ["Top 5 highest paid employees"],
          ["Count orders per customer last month"],
+         ["Delete duplicate rows based on email"],
      ],
+     cache_examples=False,
  )

+ # ─────────────────────────────────────────────
+ # Launch
+ # ─────────────────────────────────────────────
  if __name__ == "__main__":
+     print("Launching Gradio interface...")
      demo.launch(
+         server_name="0.0.0.0",
+         ssr_mode=False,  # important: avoids asyncio FD bug
+         show_error=True,
+     )