| import gradio as gr |
| from unsloth import FastLanguageModel |
| from transformers import TextStreamer |
| import torch |
|
|
| |
| def load_model(model_name, max_seq_length, dtype, load_in_4bit, token=None): |
| model, tokenizer = FastLanguageModel.from_pretrained( |
| model_name=model_name, |
| max_seq_length=max_seq_length, |
| dtype=dtype, |
| load_in_4bit=load_in_4bit, |
| token=token |
| ) |
| FastLanguageModel.for_inference(model) |
| return model, tokenizer |
|
|
| |
| model_name = "unsloth/Phi-3-mini-4k-instruct" |
| token = None |
|
|
| model, tokenizer = load_model(model_name, max_seq_length=2048, dtype=None, load_in_4bit=True, token=token) |
|
|
| def generate_response(instruction, input_text, max_new_tokens): |
| alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request. |
| |
| ### Instruction: |
| {} |
| |
| ### Input: |
| {} |
| |
| ### Response: |
| {}""" |
|
|
| inputs = tokenizer( |
| [ |
| alpaca_prompt.format( |
| instruction, |
| input_text, |
| "" |
| ) |
| ], return_tensors="pt").to("cpu") |
|
|
| text_streamer = TextStreamer(tokenizer) |
| output = model.generate(**inputs, streamer=text_streamer, max_new_tokens=max_new_tokens) |
|
|
| response = tokenizer.decode(output[0], skip_special_tokens=True) |
| return response |
|
|
| |
| iface = gr.Interface( |
| fn=generate_response, |
| inputs=[ |
| gr.Textbox(lines=2, label="Instruction", placeholder="Continue the Fibonacci sequence."), |
| gr.Textbox(lines=2, label="Input", placeholder="1, 1, 2, 3, 5, 8"), |
| gr.Slider(1, 2048, value=128, step=1, label="Max New Tokens") |
| ], |
| outputs=gr.Textbox(label="Response", lines=10), |
| title="Language Model Chat UI" |
| ) |
|
|
| iface.launch() |
|
|