"""
Stack 2.9 - Convert & Load (No progress bar)

Console chat script: loads a tokenizer and model weights from a local
directory, builds a Qwen2 causal language model and answers prompts read
from standard input.
"""
import os

# These switches are set before the Hugging Face libraries are imported
# further down, so they are already in the environment when those libraries
# load: hub progress bars off, tokenizer parallelism off.
os.environ.update({
    'HF_HUB_DISABLE_PROGRESS_BARS': '1',
    'TOKENIZERS_PARALLELISM': 'false',
})

import torch
from pathlib import Path
import json
import sys

# Checkpoint directory (tokenizer.json, config.json, model.safetensors) and
# the torch-format weights cache used on later runs.
model_path = Path("/Users/walidsobhi/stack-2.9-final-model")
cache_path = Path("/Users/walidsobhi/stack-2.9/weights_cache.pt")

print("Loading...", flush=True)

# Imported here rather than at the top of the file so that the "Loading..."
# message is printed before the import runs.
from transformers import PreTrainedTokenizerFast

END_OF_TEXT = "<|endoftext|>"

# Build the tokenizer directly from the serialized tokenizer.json file.
tokenizer_file = model_path / "tokenizer.json"
tokenizer = PreTrainedTokenizerFast(tokenizer_file=str(tokenizer_file))

# The same marker serves as both the padding and the end-of-sequence token
# (pad_token is assigned first, then eos_token).
tokenizer.pad_token = tokenizer.eos_token = END_OF_TEXT

print("Tokenizer ready", flush=True)

# Obtain the model weights as `state_dict` (all tensors on CPU).
#
# If the torch-format cache file exists it is loaded directly; otherwise the
# safetensors checkpoint in `model_path` is parsed once and the result is
# written to `cache_path` for subsequent runs.
if cache_path.exists():
    print("Loading cached weights...", flush=True)
    # weights_only=True limits unpickling to tensors and plain containers,
    # so a tampered cache file cannot execute arbitrary code on load.
    # NOTE(review): requires a torch release that supports `weights_only`.
    state_dict = torch.load(cache_path, map_location='cpu', weights_only=True)
else:
    print("Converting weights (one-time)...", flush=True)

    from safetensors.torch import load_file

    # A .safetensors file is not a torch pickle archive, so torch.load()
    # cannot read it (the previous code copied the raw bytes to
    # /tmp/weights.pt and called torch.load on the copy). load_file() parses
    # the format directly, without a temporary copy of the checkpoint.
    state_dict = load_file(str(model_path / "model.safetensors"), device="cpu")

    # Persist in torch format so later runs take the cached branch above.
    torch.save(state_dict, cache_path)

print("Weights ready", flush=True)

# Build the Qwen2 model from config.json and load the prepared weights.
with open(model_path / "config.json", encoding="utf-8") as f:
    config_dict = json.load(f)

from transformers import Qwen2ForCausalLM, Qwen2Config

# Construct the config through the library's own constructor path instead of
# setattr() on a default instance, so the values from config.json are
# processed by the config class rather than bolted on afterwards.
config = Qwen2Config.from_dict(config_dict)

print("Building model...", flush=True)
model = Qwen2ForCausalLM(config)

# strict=False tolerates key mismatches, but a mismatch means some weights
# keep their random initialisation; report it instead of hiding it.
# NOTE(review): with tied embeddings `lm_head.weight` may be listed as
# missing even though it shares the embedding tensor - confirm for this
# checkpoint.
load_result = model.load_state_dict(state_dict, strict=False)
if load_result.missing_keys:
    print(
        f"Warning: {len(load_result.missing_keys)} weight(s) missing from "
        f"checkpoint: {load_result.missing_keys[:5]}",
        file=sys.stderr,
        flush=True,
    )
if load_result.unexpected_keys:
    print(
        f"Warning: {len(load_result.unexpected_keys)} unexpected weight(s) "
        f"in checkpoint: {load_result.unexpected_keys[:5]}",
        file=sys.stderr,
        flush=True,
    )

model = model.to(torch.float16)
# A freshly constructed nn.Module starts in training mode; switch to
# inference mode before generating.
model.eval()

if torch.cuda.is_available():
    model.to("cuda")

print("Ready!\n", flush=True)

# Interactive chat loop: read a line, generate a reply, print it.
# The loop ends on 'quit' / 'exit' / 'q', on Ctrl-C, or when stdin is closed.
print("=" * 40)
print("Stack 2.9 Ready! (Type 'quit' to exit)")
print("=" * 40)

while True:
    try:
        user_input = input("\nYou: ").strip()
        if not user_input:
            continue
        if user_input.lower() in ('quit', 'exit', 'q'):
            break

        prompt = f"You are Stack 2.9.\n\nUser: {user_input}\nAssistant:"

        # NOTE(review): a bare PreTrainedTokenizerFast may also emit
        # `token_type_ids`, which generate() may reject as an unused model
        # argument - confirm on the installed transformers version.
        inputs = tokenizer(prompt, return_tensors='pt', return_token_type_ids=False)
        # Follow the model's device rather than re-querying CUDA, so inputs
        # always land wherever the model actually lives.
        inputs = {k: v.to(model.device) for k, v in inputs.items()}
        prompt_len = inputs['input_ids'].shape[1]

        outputs = model.generate(
            **inputs,
            max_new_tokens=80,
            # temperature only takes effect when sampling is enabled.
            do_sample=True,
            temperature=0.4,
            pad_token_id=tokenizer.eos_token_id,
        )

        # Decode only the newly generated tokens. Splitting the full decoded
        # text on "Assistant:" breaks when the user or the model emits that
        # string.
        new_tokens = outputs[0][prompt_len:]
        response = tokenizer.decode(new_tokens, skip_special_tokens=True).strip()

        print(f"AI: {response}")

    except (KeyboardInterrupt, EOFError):
        # EOFError (closed stdin / Ctrl-D) must end the loop; if it fell into
        # the generic handler below, input() would fail again immediately
        # and the loop would never terminate.
        break
    except Exception as e:
        # Best-effort REPL: report the failure and keep the session alive.
        print(f"Error: {e}")

print("\nDone!")