Znilsson committed on
Commit
de8eaee
·
verified ·
1 Parent(s): 834726c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +27 -71
app.py CHANGED
@@ -1,97 +1,53 @@
1
  import os
2
  import torch
3
  import gradio as gr
4
- from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
5
  from peft import PeftModel
6
 
7
- BASE = "microsoft/phi-3-mini-4k-instruct"
8
- ADAPTER = "Znilsson/survivalai-phi3-lora"
9
- TOKEN = os.environ.get("HF_TOKEN")
10
 
11
- print("Loading base model (this may take 2-4 minutes on first run)...")
12
-
13
- # 4-bit quantization config (huge memory saver)
14
- quant_config = BitsAndBytesConfig(
15
- load_in_4bit=True,
16
- bnb_4bit_quant_type="nf4",
17
- bnb_4bit_compute_dtype=torch.float16,
18
- bnb_4bit_use_double_quant=True,
19
- )
20
 
 
21
  model = AutoModelForCausalLM.from_pretrained(
22
  BASE,
23
- quantization_config=quant_config,
24
- device_map="cpu", # Spaces is CPU-only
25
  trust_remote_code=True,
26
- torch_dtype=torch.float16, # Avoid deprecation warning
27
- attn_implementation="eager", # Bypass flash-attn / window_size issues
28
  low_cpu_mem_usage=True,
29
  )
30
 
31
- print("Attaching LoRA adapter (SurvivalAI fine-tune)...")
32
- model = PeftModel.from_pretrained(
33
- model,
34
- ADAPTER,
35
- token=TOKEN,
36
- is_trainable=False
37
- )
38
-
39
- # Do NOT merge_and_unload() on CPU in Spaces — it spikes memory too much
40
- # model = model.merge_and_unload() # Comment this out for now
41
-
42
  model.eval()
43
- tokenizer = AutoTokenizer.from_pretrained(BASE, trust_remote_code=True)
44
-
45
- print("SurvivalAI is ready! (Running in 4-bit on CPU)")
46
-
47
- def respond(message, history):
48
- # Build full conversation for proper context
49
- messages = []
50
- for user_msg, assistant_msg in history or []:
51
- messages.append({"role": "user", "content": user_msg})
52
- if assistant_msg:
53
- messages.append({"role": "assistant", "content": assistant_msg})
54
- messages.append({"role": "user", "content": message})
55
 
56
- # Apply Phi-3 chat template
57
- inputs = tokenizer.apply_chat_template(
58
- messages,
59
- tokenize=True,
60
- add_generation_prompt=True,
61
- return_tensors="pt"
62
- ).to(model.device)
63
 
 
64
  with torch.no_grad():
65
- outputs = model.generate(
66
- inputs,
67
- max_new_tokens=512, # Increased a bit for better survival answers
68
- do_sample=True,
69
  temperature=0.7,
70
  top_p=0.9,
71
- repetition_penalty=1.1,
72
  pad_token_id=tokenizer.eos_token_id,
73
  )
 
 
74
 
75
- # Decode only the new tokens
76
- response = tokenizer.decode(
77
- outputs[0][inputs.shape[1]:],
78
- skip_special_tokens=True
79
- )
80
- return response.strip()
81
-
82
- # Gradio interface
83
  demo = gr.ChatInterface(
84
- fn=respond,
85
- title="🌲 SurvivalAI — Phi-3 LoRA (Survival / Preparedness Expert)",
86
- description="Fine-tuned on survival knowledge from Survivor Library, Army manuals, FEMA, Grokipedia, etc. "
87
- "Running quantized on CPU — responses may take 15–60 seconds. Offline-capable foundation for our handheld version.",
88
- examples=[
89
- "How do I purify water from a stream with nothing but a pot?",
90
- "My friend is hypothermic. What are the immediate steps?",
91
- "List three edible wild plants in temperate forests and how to identify them safely.",
92
- "How do I build a basic debris shelter in a forest?",
93
- ],
94
- theme=gr.themes.Soft(),
95
  )
96
 
97
  if __name__ == "__main__":
 
1
  import os
2
  import torch
3
  import gradio as gr
4
+ from transformers import AutoModelForCausalLM, AutoTokenizer
5
  from peft import PeftModel
6
 
7
+ BASE = "microsoft/phi-3-mini-4k-instruct"
8
+ ADAPTER = "Znilsson/survivalai-phi3-lora" # <-- replace if your adapter repo ID differs
9
+ TOKEN = os.environ.get("HF_TOKEN")
10
 
11
+ print("Loading tokenizer...")
12
+ tokenizer = AutoTokenizer.from_pretrained(BASE, trust_remote_code=True)
 
 
 
 
 
 
 
13
 
14
+ print("Loading base model (fp16)...")
15
  model = AutoModelForCausalLM.from_pretrained(
16
  BASE,
17
+ dtype=torch.float16,
18
+ device_map="auto",
19
  trust_remote_code=True,
 
 
20
  low_cpu_mem_usage=True,
21
  )
22
 
23
+ print("Attaching + merging LoRA adapter...")
24
+ model = PeftModel.from_pretrained(model, ADAPTER, token=TOKEN)
25
+ model = model.merge_and_unload()
 
 
 
 
 
 
 
 
26
  model.eval()
 
 
 
 
 
 
 
 
 
 
 
 
27
 
28
+ def chat(message, history):
29
+ prompt = ""
30
+ for user, assistant in history:
31
+ prompt += f"<|user|>\n{user}<|end|>\n<|assistant|>\n{assistant}<|end|>\n"
32
+ prompt += f"<|user|>\n{message}<|end|>\n<|assistant|>\n"
 
 
33
 
34
+ inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
35
  with torch.no_grad():
36
+ out = model.generate(
37
+ **inputs,
38
+ max_new_tokens=400,
 
39
  temperature=0.7,
40
  top_p=0.9,
41
+ do_sample=True,
42
  pad_token_id=tokenizer.eos_token_id,
43
  )
44
+ resp = tokenizer.decode(out[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True)
45
+ return resp.strip()
46
 
 
 
 
 
 
 
 
 
47
  demo = gr.ChatInterface(
48
+ fn=chat,
49
+ title="SurvivalAI",
50
+ description="Fine-tuned Phi-3-mini on survival & emergency preparedness corpus.",
 
 
 
 
 
 
 
 
51
  )
52
 
53
  if __name__ == "__main__":