#!/usr/bin/env python3 """ Benchmark script for comparing context window performance across different lengths. This script compares: 1. 32K context (original claim) 2. 64K context (mid-range) 3. 128K context (full potential) For each context length, it tests: - Memory consumption (VRAM and RAM) - Throughput (tokens/second during generation) - Latency (time to first token) - Quality (ability to process and generate coherent output) - Task completion on sample coding tasks Output: JSON results + summary report """ import os import sys import json import time import argparse import statistics from pathlib import Path from typing import Dict, List, Any # Required packages: vllm, transformers, psutil, torch def get_memory_info(): """Get memory statistics.""" import torch import psutil process = psutil.Process(os.getpid()) ram_mb = process.memory_info().rss / 1024 / 1024 if torch.cuda.is_available(): gpu_mem_allocated = torch.cuda.memory_allocated() / 1024 / 1024 gpu_mem_reserved = torch.cuda.memory_reserved() / 1024 / 1024 return { "ram_mb": round(ram_mb, 1), "gpu_allocated_mb": round(gpu_mem_allocated, 1), "gpu_reserved_mb": round(gpu_mem_reserved, 1), "gpu_used": True } else: return { "ram_mb": round(ram_mb, 1), "gpu_used": False } def preprocess_prompt(prompt: str, tokenizer, target_tokens: int, mode: str = "repeat") -> List[int]: """Preprocess a prompt to reach target token length.""" tokens = tokenizer.encode(prompt) if len(tokens) >= target_tokens: return tokens[:target_tokens] needed = target_tokens - len(tokens) if mode == "repeat": # Repeat a filler pattern filler = " This is additional context to fill the window. " * 100 filler_tokens = tokenizer.encode(filler) repeats = (needed // len(filler_tokens)) + 1 tokens.extend(filler_tokens * repeats) elif mode == "noise": # Use random-like content (code snippets) noise = """ // Dummy code for context expansion function placeholder() { const x = 1; const y = 2; return x + y; } class DummyClass { constructor() {} method() {} } """.repeat(needed // 50 + 1) noise_tokens = tokenizer.encode(noise) tokens.extend(noise_tokens) return tokens[:target_tokens] def load_model(model_name: str, max_model_len: int, block_size: int): """Load vLLM model with specified configuration.""" from vllm import LLM print(f"Loading model with max_model_len={max_model_len}, block_size={block_size}") model = LLM( model=model_name, max_model_len=max_model_len, block_size=block_size, gpu_memory_utilization=0.9, trust_remote_code=True, tensor_parallel_size=1, # For benchmarking, disable speculative decoding for consistent results enable_chunked_prefill=False ) return model def run_generation(model, tokenizer, prompt_tokens: List[int], max_new_tokens: int = 200) -> Dict[str, Any]: """Run generation and collect metrics.""" from vllm import SamplingParams sampling_params = SamplingParams( temperature=0.7, top_p=0.95, max_tokens=max_new_tokens, min_p=0.05 ) # Prefill phase timing torch = sys.modules.get('torch') if torch and torch.cuda.is_available(): torch.cuda.synchronize() start_time = time.time() outputs = model.generate( prompt_token_ids=prompt_tokens, sampling_params=sampling_params, use_tqdm=False ) end_time = time.time() if torch and torch.cuda.is_available(): torch.cuda.synchronize() elapsed = end_time - start_time output_token_ids = outputs[0].outputs[0].token_ids output_text = outputs[0].outputs[0].text # Count tokens in output output_length = len(output_token_ids) # Calculate prefill latency (estimated) prefill_latency = elapsed * 0.3 # Rough estimate decode_latency = elapsed - prefill_latency # Tokens per second total_tokens = output_length tokens_per_second = total_tokens / elapsed if elapsed > 0 else 0 return { "elapsed_seconds": round(elapsed, 4), "output_tokens": output_length, "output_text": output_text[:200], "tokens_per_second": round(tokens_per_second, 2), "prefill_latency_est": round(prefill_latency, 4), "decode_latency_est": round(decode_latency, 4) } def test_task(model, tokenizer, context_length: int, task_name: str, prompt: str, max_response: int = 200) -> Dict[str, Any]: """Run a single benchmark task.""" print(f"\n Task: {task_name}") sys.stdout.flush() mem_before = get_memory_info() prompt_tokens = preprocess_prompt(prompt, tokenizer, context_length) actual_context_len = len(prompt_tokens) start_time = time.time() try: result = run_generation(model, tokenizer, prompt_tokens, max_response) elapsed = time.time() - start_time mem_after = get_memory_info() # Calculate memory delta mem_delta = {} if mem_after.get("gpu_used"): mem_delta["gpu_allocated_delta_mb"] = round( mem_after["gpu_allocated_mb"] - mem_before["gpu_allocated_mb"], 1 ) mem_delta["ram_delta_mb"] = round( mem_after["ram_mb"] - mem_before["ram_mb"], 1 ) return { "task": task_name, "context_length_target": context_length, "context_length_actual": actual_context_len, "success": True, **result, **mem_delta } except Exception as e: elapsed = time.time() - start_time print(f" ā Failed: {e}") return { "task": task_name, "context_length_target": context_length, "success": False, "error": str(e), "elapsed_seconds": round(elapsed, 4) } def main(): parser = argparse.ArgumentParser(description="Benchmark context lengths: 32K, 64K, 128K") parser.add_argument("--model", type=str, default="Qwen/Qwen2.5-Coder-32B", help="Model name") parser.add_argument("--output-dir", type=str, default="benchmarks/results", help="Directory to save results") parser.add_argument("--context-lengths", type=int, nargs='+', default=[32768, 65536, 131072], help="Context lengths to test") parser.add_argument("--tasks-per-length", type=int, default=5, help="Number of tasks per context length") args = parser.parse_args() print("="*70) print("CONTEXT LENGTH BENCHMARK") print("="*70) print(f"Model: {args.model}") print(f"Context lengths: {args.context_lengths}") print(f"Tasks per length: {args.tasks_per_length}") # Sample tasks for benchmarking tasks = [ { "name": "Code Completion", "prompt": """import React from 'react'; function Component({ children }) { return (