---
license: llama2
---
Quick notes on what I did to get to this point.

First, export the model for AWS Neuron and save the compiled artifacts along with the tokenizer:
```
from optimum.neuron import NeuronModelForCausalLM
from transformers import AutoTokenizer

model_id = "TencentARC/LLaMA-Pro-8B"

# Input shapes and core count are fixed at compile time:
# 2 Neuron cores, fp16 autocast, sequence length 2048, batch size 2.
compiler_args = {"num_cores": 2, "auto_cast_type": "fp16"}
input_shapes = {"sequence_length": 2048, "batch_size": 2}

# export=True runs the (slow) Neuron compilation pass.
llm = NeuronModelForCausalLM.from_pretrained(model_id, export=True, **input_shapes, **compiler_args)

# Save the compiled model and its tokenizer side by side so they can be
# reloaded or uploaded together.
save_directory = "Tencent_neuron"
llm.save_pretrained(save_directory)

tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.save_pretrained(save_directory)
```
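Reloading the compiled model later should not require another compilation pass. A minimal sketch (not part of the original session):

```
from optimum.neuron import NeuronModelForCausalLM

# The save directory includes the Neuron config, so from_pretrained
# reuses the precompiled artifacts instead of exporting again.
llm = NeuronModelForCausalLM.from_pretrained("Tencent_neuron")
```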
To sanity-check the compiled model, load it with the Neuron `pipeline` API and run a prompt:
```
from optimum.neuron import pipeline

# Load the pipeline from the local save directory
# (a Hub repository id works the same way).
save_directory = "Tencent_neuron"
pipe = pipeline("text-generation", save_directory)

# Use the tokenizer's chat template to format each message - see
# https://huggingface.co/docs/transformers/main/en/chat_templating
messages = [
    {"role": "user", "content": "What is 2+2?"},
]
prompt = pipe.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

# Run generation. The compiled sequence_length (2048) caps the total of
# prompt tokens plus newly generated tokens.
outputs = pipe(prompt, max_new_tokens=2048, do_sample=True, temperature=0.7, top_k=50, top_p=0.95)
print(outputs[0]["generated_text"])
```
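Since the model was compiled with `batch_size=2`, it should be possible to generate for two prompts in one pass. A hedged sketch (not from the original notes; it assumes the Neuron pipeline accepts a list of prompts the way the upstream `transformers` pipeline does):

```
# Hypothetical batched call: the two prompts fill both compiled batch slots.
prompts = [
    "What is 2+2?",
    "What is the capital of France?",
]
outputs = pipe(prompts, max_new_tokens=128, do_sample=True, temperature=0.7)
for result in outputs:
    # With a list input, the pipeline returns one list of generations per prompt.
    print(result[0]["generated_text"])
```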
Finally, log in and push the compiled model to the Hub:

```
from huggingface_hub import HfApi, login

# Authenticate first; upload_folder pushes the whole save directory.
login()
api = HfApi()

save_directory = "Tencent_neuron"

api.upload_folder(
    folder_path=save_directory,
    repo_id="jburtoft/TencentARC-LLaMA-Pro-8B-Neuron",
    repo_type="model",
    # The compiled artifacts are large, so split the upload into
    # multiple commits.
    multi_commits=True,
    multi_commits_verbose=True,
)
```
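For completeness, a sketch of how the uploaded model could be pulled back down and used (hypothetical usage, assuming an instance with a Neuron setup matching the compilation above):

```
from optimum.neuron import pipeline

# Loading directly from the Hub repo id downloads the precompiled
# artifacts; no recompilation should be needed on a matching setup.
pipe = pipeline("text-generation", "jburtoft/TencentARC-LLaMA-Pro-8B-Neuron")
print(pipe("What is 2+2?", max_new_tokens=64)[0]["generated_text"])
```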