from __future__ import annotations
import sys
from typing import Any
from .agent_prompt import SYSTEM_PROMPT
def format_prompt(diff: str, available_files: list[str] | None = None) -> str:
    """Format a diff into the Llama-3 chat prompt expected by the model.

    Args:
        diff: The input diff to embed in the user turn.
        available_files: Optional list of file names to report in the
            environment info section; ``None`` or an empty list renders
            as the literal string "None".

    Returns:
        The full prompt string wrapped in Llama-3 special tokens
        (``<|begin_of_text|>``, header/eot markers), ending with an open
        assistant turn so the model generates the reply next.
    """
    # Truthiness check deliberately covers both None and an empty list.
    files_str = ", ".join(available_files) if available_files else "None"
    user_prompt = f"""### Input Diff
{diff}
### Environment Info
- Available Files: {files_str}
- Current Step: 0/5
Please provide your next action in XML format:"""
    return (
        f"<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n"
        f"{SYSTEM_PROMPT}<|eot_id|><|start_header_id|>user<|end_header_id|>\n\n"
        f"{user_prompt}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"
    )
def load_model(model_path: str, is_lora: bool = False, base_model: str | None = None) -> tuple[Any, Any]:
    """Load an LLM and its tokenizer for inference.

    Args:
        model_path: Path (or hub id) of the model to load; when ``is_lora``
            is True this is the LoRA adapter path instead.
        is_lora: Whether ``model_path`` points at a LoRA adapter that must
            be applied on top of ``base_model``.
        base_model: Base model name/path; required when ``is_lora`` is True.

    Returns:
        A ``(model, tokenizer)`` tuple ready for generation.

    Raises:
        ValueError: If ``is_lora`` is True but ``base_model`` is missing.
        SystemExit: If a required inference dependency is not installed.
    """
    try:
        import torch
    except ImportError:
        # Dependency errors go to stderr so they don't pollute stdout.
        print("Error: PyTorch is not installed. Please install inference dependencies using: pip install '.[scan]'", file=sys.stderr)
        sys.exit(1)
    if is_lora:
        if not base_model:
            raise ValueError("base_model is required if is_lora=True")
        try:
            from unsloth import FastLanguageModel
            from peft import PeftModel
        except ImportError:
            print("Error: Unsloth/PEFT not installed. Required for LoRA models.", file=sys.stderr)
            sys.exit(1)
        # Load the 4-bit base, then apply the adapter weights on top.
        model, tokenizer = FastLanguageModel.from_pretrained(
            model_name=base_model,
            max_seq_length=2048,
            load_in_4bit=True,
        )
        model = PeftModel.from_pretrained(model, model_path)
        # Switch Unsloth into its optimized inference mode.
        FastLanguageModel.for_inference(model)
    else:
        try:
            from transformers import AutoModelForCausalLM, AutoTokenizer
        except ImportError:
            print("Error: Transformers is not installed. Please install inference dependencies using: pip install '.[scan]'", file=sys.stderr)
            sys.exit(1)
        # Only shard across devices / use fp16 when a GPU is present.
        device_map = "auto" if torch.cuda.is_available() else None
        model = AutoModelForCausalLM.from_pretrained(
            model_path,
            torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
            device_map=device_map
        )
        tokenizer = AutoTokenizer.from_pretrained(model_path)
    return model, tokenizer
def generate(model: Any, tokenizer: Any, prompt: str, max_new_tokens: int = 256) -> str:
    """Run greedy (deterministic) generation and return only the new text.

    Tokenizes ``prompt``, generates up to ``max_new_tokens`` tokens with
    sampling disabled, and decodes just the tokens produced after the
    prompt, skipping special tokens.
    """
    import torch

    target_device = "cuda" if torch.cuda.is_available() else "cpu"
    encoded = tokenizer(prompt, return_tensors="pt").to(target_device)
    prompt_len = encoded.input_ids.shape[1]
    with torch.no_grad():
        generated = model.generate(
            **encoded,
            max_new_tokens=max_new_tokens,
            do_sample=False,
        )
    # Slice off the prompt tokens so only the completion is decoded.
    completion_ids = generated[0][prompt_len:]
    return tokenizer.decode(completion_ids, skip_special_tokens=True)