Stack-2-9-finetuned / runners /run_cache.py
walidsobhie-code
chore: Rename MCP server to Stack2.9
c7f1596
#!/usr/bin/env python3
"""
Stack 2.9 - Convert & Load (No progress bar)
"""
import os
# These environment switches are assigned before torch/transformers are
# imported so they are already in place when those libraries load.
# NOTE(review): presumably they silence the Hugging Face Hub progress bars
# and the tokenizers parallelism warning -- confirm against library docs.
os.environ['HF_HUB_DISABLE_PROGRESS_BARS'] = '1'
os.environ['TOKENIZERS_PARALLELISM'] = 'false'
import torch
from pathlib import Path
import json
import sys
# Directory the script reads tokenizer.json, model.safetensors and
# config.json from. Hard-coded absolute path for one specific machine.
model_path = Path("/Users/walidsobhi/stack-2.9-final-model")
# File the script saves the loaded state dict to (torch.save) and reloads
# on later runs (torch.load). Also a machine-specific absolute path.
cache_path = Path("/Users/walidsobhi/stack-2.9/weights_cache.pt")
# Tokenizer setup: build a fast tokenizer straight from the serialized
# tokenizer.json in the model directory, then register the end-of-text
# marker as both the padding and the end-of-sequence token.
print("Loading...", flush=True)
from transformers import PreTrainedTokenizerFast

tokenizer_json = model_path / "tokenizer.json"
tokenizer = PreTrainedTokenizerFast(tokenizer_file=str(tokenizer_json))

# Same assignment order as before: pad token first, then eos token.
for special_name in ("pad_token", "eos_token"):
    setattr(tokenizer, special_name, "<|endoftext|>")

print("Tokenizer ready", flush=True)
# Load the model weights as a CPU state dict, preferring the cached copy.
#
# First run:  read model.safetensors with the safetensors loader and save
#             the resulting state dict to cache_path with torch.save.
# Later runs: load the cached state dict from cache_path directly.
weights_file = model_path / "model.safetensors"
if cache_path.exists():
    print("Loading cached weights...", flush=True)
    # NOTE(review): torch.load unpickles the file. That is acceptable only
    # because the cache is written by this script itself; consider passing
    # weights_only=True on PyTorch versions that support it.
    state_dict = torch.load(cache_path, map_location='cpu')
else:
    print("Converting weights (one-time)...", flush=True)
    from safetensors.torch import load_file

    # A .safetensors file is not a pickle/zip checkpoint, so torch.load
    # cannot parse it; it has to be read with the safetensors loader.
    # Loading straight from the source path also avoids holding a second
    # full copy of the file in memory and writing it to a fixed /tmp path.
    state_dict = load_file(str(weights_file), device="cpu")

    # Cache for next time. Save to a sibling temp file and rename it into
    # place so an interrupted save never leaves a truncated cache that the
    # exists() check above would accept on the next run.
    cache_path.parent.mkdir(parents=True, exist_ok=True)
    partial_cache = cache_path.with_name(cache_path.name + ".tmp")
    torch.save(state_dict, partial_cache)
    partial_cache.replace(cache_path)
print("Weights ready", flush=True)
# Build the Qwen2 model from config.json and load the weights into it.
with open(model_path / "config.json", encoding="utf-8") as f:
    config_dict = json.load(f)

from transformers import Qwen2ForCausalLM, Qwen2Config

# Construct the config through the library's own loader instead of
# setattr-ing raw JSON values onto a default instance, so any handling the
# config constructor performs on these values is applied.
config = Qwen2Config.from_dict(config_dict)

print("Building model...", flush=True)
model = Qwen2ForCausalLM(config)

# strict=False is kept so a checkpoint that omits some tensors still loads,
# but the mismatch is reported instead of being silently ignored: any
# parameter missing from the checkpoint keeps its random initial value.
load_result = model.load_state_dict(state_dict, strict=False)
# NOTE(review): when input/output embeddings are tied, checkpoints commonly
# omit lm_head.weight; it is not reported here on that assumption -- confirm.
embeddings_tied = getattr(config, "tie_word_embeddings", False)
missing_keys = [
    key for key in load_result.missing_keys
    if not (embeddings_tied and key == "lm_head.weight")
]
if missing_keys:
    print(
        f"Warning: {len(missing_keys)} model weights not found in checkpoint "
        f"(first few: {missing_keys[:3]})",
        flush=True,
    )
if load_result.unexpected_keys:
    print(
        f"Warning: {len(load_result.unexpected_keys)} checkpoint tensors not "
        f"used by the model (first few: {load_result.unexpected_keys[:3]})",
        flush=True,
    )

# NOTE(review): the model is cast to float16 even when it stays on the CPU;
# half-precision CPU inference may be slow or unsupported on some PyTorch
# builds -- confirm this is intended for machines without CUDA.
model = model.to(torch.float16)
if torch.cuda.is_available():
    model.to("cuda")
# A freshly constructed nn.Module starts in training mode; switch to
# inference mode before generating.
model.eval()
print("Ready!\n", flush=True)
# Interactive chat loop: read one line from the user, generate up to 80 new
# tokens from a fixed prompt template, and print the model's reply.
# Ends on 'quit' / 'exit' / 'q', Ctrl-C, or end of input.
print("=" * 40)
print("Stack 2.9 Ready! (Type 'quit' to exit)")
print("=" * 40)
while True:
    try:
        user_input = input("\nYou: ").strip()
    except (KeyboardInterrupt, EOFError):
        # EOFError (Ctrl-D or a closed/piped stdin) has to end the session.
        # If it fell through to the generic handler below, every following
        # input() would raise again and the loop would never terminate.
        break
    if not user_input:
        continue
    if user_input.lower() in ('quit', 'exit', 'q'):
        break
    try:
        prompt = f"You are Stack 2.9.\n\nUser: {user_input}\nAssistant:"
        # The generic fast tokenizer can emit token_type_ids, which this
        # model's generate() does not take; request only what is needed.
        inputs = tokenizer(prompt, return_tensors='pt', return_token_type_ids=False)
        if torch.cuda.is_available():
            inputs = {name: tensor.cuda() for name, tensor in inputs.items()}
        # NOTE(review): temperature only takes effect when do_sample=True;
        # as written this is presumably greedy decoding. Left unchanged to
        # keep the output deterministic -- confirm the intended behaviour.
        outputs = model.generate(**inputs, max_new_tokens=80, temperature=0.4, pad_token_id=tokenizer.eos_token_id)
        # The returned sequence starts with the prompt tokens. Decode only
        # the newly generated tail instead of splitting the text on
        # "Assistant:", which breaks when the user's message or the model's
        # reply contains that marker.
        prompt_length = inputs["input_ids"].shape[1]
        response = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True).strip()
        print(f"AI: {response}")
    except KeyboardInterrupt:
        break
    except Exception as e:
        # Best-effort session: report the failure and keep the chat alive.
        print(f"Error: {e}")
print("\nDone!")