GilbertAkham committed (verified)
Commit 9617a8d · Parent(s): d7d127d

Upload app.py

Files changed (1)
  1. app.py +37 -0
app.py ADDED
@@ -0,0 +1,37 @@
+ import torch
+ import gradio as gr
+ from transformers import AutoTokenizer, AutoModelForCausalLM
+ from peft import PeftModel
+
+ BASE = "openlm-research/open_llama_3b"              # base model the LoRA was trained on
+ LORA = "GilbertAkham/openlm-llama-lora-codetrans"   # LoRA adapter on the HF Hub
+
+ tokenizer = AutoTokenizer.from_pretrained(BASE)
+ model = AutoModelForCausalLM.from_pretrained(
+     BASE,
+     load_in_8bit=True,   # 8-bit weights fit on a T4 / low-VRAM GPU
+     device_map="auto",
+ )
+ model = PeftModel.from_pretrained(model, LORA)   # attach the LoRA adapter
+ model.eval()
+
+ def chat_fn(prompt, max_new_tokens=256):
+     inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
+     with torch.no_grad():
+         out = model.generate(
+             **inputs,
+             max_new_tokens=max_new_tokens,
+             do_sample=True,
+             temperature=0.3,
+             top_p=0.9,
+         )
+     return tokenizer.decode(out[0], skip_special_tokens=True)
+
+ demo = gr.Interface(
+     fn=chat_fn,
+     inputs=[gr.Textbox(lines=6, label="Prompt"), gr.Slider(16, 1024, value=256, label="Max new tokens")],
+     outputs="text",
+     title="openlm-llama-LoRA codetrans",
+ )
+
+ demo.launch(share=True)   # share=True exposes a temporary public link