Spaces:

YashashviAlva
/

codeSentry

Running

App Files Files Community

codeSentry / codesentry-backend /amd_metrics.py

YashashviAlva

Initial commit for HF Spaces deploy

7b4f5dd 1 day ago

raw

history blame contribute delete

7.3 kB

	"""
	AMD MI300X Live Metrics Collector.

	Polls rocm-smi for real GPU stats (utilization, VRAM, temperature, power).
	Falls back to realistic simulated values when running in development
	environments without physical AMD hardware.
	"""
	from __future__ import annotations

	import asyncio
	import json
	import logging
	import random
	import re
	import subprocess
	import time
	from datetime import datetime, timezone
	from typing import Any, Dict, Optional

	# Module-level logger, named after this module via ``__name__``; every
	# log call in this file goes through it.
	logger = logging.getLogger(__name__)


	class AMDMetricsCollector:
	    """
	    Collects AMD MI300X performance metrics.

	    On AMD hardware: runs ``rocm-smi`` and parses real output.
	    On dev machines: returns simulated, realistic values that fluctuate
	    within expected MI300X operating ranges.

	    Note that even on real hardware the ``memory_bandwidth_tbs`` and
	    ``tokens_per_sec`` fields fall back to simulated values when no
	    measured figure is available (see ``_collect_real``).
	    """

	    # First (optionally signed, optionally fractional) number in a string.
	    # rocm-smi values may be plain ints ("87"), floats ("36.0"), carry a
	    # unit suffix ("612.0 W"), or be placeholders such as "N/A".
	    _NUMBER_RE = re.compile(r"-?\d+(?:\.\d+)?")

	    def __init__(self) -> None:
	        # None = not probed yet; True/False = cached result of _check_rocm().
	        self._has_rocm: Optional[bool] = None
	        # Previous VRAM reading and its perf_counter() timestamp, used for
	        # the bandwidth estimate between two consecutive real collections.
	        self._last_vram_used: float = 0.0
	        self._last_collect_time: float = 0.0
	        # Tokens recorded since the last reset, and when recording started
	        # (perf_counter() value; 0.0 means "not started").
	        self._token_count: int = 0
	        self._token_start_time: float = 0.0

	    # ── Public API ────────────────────────────────────────────

	    async def collect(self) -> Dict[str, Any]:
	        """
	        Return a snapshot of AMD GPU metrics.

	        The first call probes for ``rocm-smi`` and caches the result; later
	        calls reuse it. Any failure falls back to simulated values, so this
	        method does not raise for collection errors.

	        Returns a dict with keys:
	            gpu_utilization_percent, vram_used_gb, vram_total_gb,
	            temperature_c, power_draw_w, memory_bandwidth_tbs,
	            tokens_per_sec, timestamp
	        """
	        try:
	            if self._has_rocm is None:
	                self._has_rocm = await self._check_rocm()

	            if self._has_rocm:
	                return await self._collect_real()
	            return self._collect_simulated()
	        except Exception as exc:
	            logger.debug("[AMDMetrics] Collection failed, using simulation: %s", exc)
	            return self._collect_simulated()

	    def record_tokens(self, count: int) -> None:
	        """Record LLM tokens for throughput tracking.

	        The throughput clock starts on the first call after construction
	        or after ``reset_tokens()``.
	        """
	        if self._token_start_time == 0.0:
	            self._token_start_time = time.perf_counter()
	        self._token_count += count

	    def reset_tokens(self) -> None:
	        """Reset token counter between scans."""
	        self._token_count = 0
	        self._token_start_time = 0.0

	    # ── rocm-smi detection ────────────────────────────────────

	    @staticmethod
	    async def _run_rocm_smi(*args: str, timeout: float) -> tuple:
	        """Run ``rocm-smi`` with *args* and return ``(returncode, stdout)``.

	        ``stdout`` is the raw bytes captured from the process. If the call
	        does not finish within *timeout* seconds (or the awaiting task is
	        cancelled), the child process is killed and reaped before the
	        exception propagates, so no ``rocm-smi`` process is left behind.

	        Raises:
	            FileNotFoundError: ``rocm-smi`` is not on PATH.
	            asyncio.TimeoutError: the process exceeded *timeout*.
	        """
	        proc = await asyncio.create_subprocess_exec(
	            "rocm-smi", *args,
	            stdout=asyncio.subprocess.PIPE,
	            stderr=asyncio.subprocess.PIPE,
	        )
	        try:
	            stdout, _ = await asyncio.wait_for(proc.communicate(), timeout=timeout)
	        finally:
	            # returncode is still None only if communicate() did not finish.
	            if proc.returncode is None:
	                try:
	                    proc.kill()
	                except ProcessLookupError:
	                    # Process exited between the check and the kill.
	                    pass
	                await proc.wait()
	        return proc.returncode, stdout

	    async def _check_rocm(self) -> bool:
	        """Check if rocm-smi is available on this system.

	        Returns True only when ``rocm-smi --version`` exits with status 0
	        within 5 seconds; any error (missing binary, timeout, ...) is
	        treated as "not available".
	        """
	        try:
	            returncode, _ = await self._run_rocm_smi("--version", timeout=5)
	        except Exception:
	            logger.info("[AMDMetrics] rocm-smi not found — using simulated metrics")
	            return False

	        available = returncode == 0
	        if available:
	            logger.info("[AMDMetrics] rocm-smi detected — using real GPU metrics")
	        else:
	            logger.info("[AMDMetrics] rocm-smi not available — using simulated metrics")
	        return available

	    # ── Real collection via rocm-smi ──────────────────────────

	    @staticmethod
	    def _parse_number(value: Any, default: float = 0.0) -> float:
	        """Extract a number from a rocm-smi field value.

	        Accepts ints/floats as well as strings such as ``"87"``, ``"36.0"``
	        or ``"612.0 W"``. Returns *default* when no number can be found
	        (``None``, ``"N/A"``, empty string, ...).
	        """
	        if isinstance(value, bool):
	            # bool is an int subclass; a True/False reading is not a number.
	            return default
	        if isinstance(value, (int, float)):
	            return float(value)
	        match = AMDMetricsCollector._NUMBER_RE.search(str(value))
	        return float(match.group()) if match else default

	    @staticmethod
	    def _parse_first_card(data: Any) -> Dict[str, Any]:
	        """Extract stats for the first GPU from decoded ``rocm-smi --json`` output.

	        *data* is expected to map card names to per-card dicts; entries
	        whose value is not a dict are skipped and only the first card is
	        used. Missing fields keep their defaults (0, with a 192.0 GB VRAM
	        total).

	        Returns a dict with keys gpu_utilization_percent, vram_used_gb,
	        vram_total_gb, temperature_c, power_draw_w.

	        Raises:
	            ValueError: *data* is not a dict.
	        """
	        if not isinstance(data, dict):
	            raise ValueError("unexpected rocm-smi JSON payload: %r" % type(data).__name__)

	        stats: Dict[str, Any] = {
	            "gpu_utilization_percent": 0,
	            "vram_used_gb": 0.0,
	            "vram_total_gb": 192.0,
	            "temperature_c": 0,
	            "power_draw_w": 0,
	        }
	        parse = AMDMetricsCollector._parse_number

	        for card in data.values():
	            if not isinstance(card, dict):
	                continue

	            stats["gpu_utilization_percent"] = int(parse(card.get("GPU use (%)")))

	            vram_total = int(parse(card.get("VRAM Total Memory (B)")))
	            vram_used = int(parse(card.get("VRAM Total Used Memory (B)")))
	            # Only trust the byte counts when a total was actually reported;
	            # otherwise keep the defaults above.
	            if vram_total > 0:
	                stats["vram_total_gb"] = round(vram_total / (1024 ** 3), 1)
	                stats["vram_used_gb"] = round(vram_used / (1024 ** 3), 1)

	            # Parsed via float first: int("36.0") would raise ValueError.
	            stats["temperature_c"] = int(parse(card.get("Temperature (Sensor edge) (C)")))
	            stats["power_draw_w"] = int(parse(card.get("Average Graphics Package Power (W)")))
	            break  # Use first GPU

	        return stats

	    def _estimate_bandwidth_tbs(self, vram_used_gb: float) -> float:
	        """Estimate bandwidth in TB/s from the change in VRAM usage.

	        This is only a rough proxy: the absolute VRAM delta since the
	        previous call divided by the elapsed time. The first call returns
	        0.0 because there is no previous sample. Updates the stored sample.
	        """
	        now = time.perf_counter()
	        bandwidth = 0.0
	        if self._last_collect_time > 0:
	            elapsed = now - self._last_collect_time
	            if elapsed > 0:
	                delta_gb = abs(vram_used_gb - self._last_vram_used)
	                # vram_used_gb is in GiB (bytes / 1024**3); divide by 1024
	                # so the value matches the TB/s unit of the output key.
	                bandwidth = round(delta_gb / 1024 / elapsed, 1)
	        self._last_vram_used = vram_used_gb
	        self._last_collect_time = now
	        return bandwidth

	    def _tokens_per_sec(self) -> float:
	        """Return recorded tokens per second, or 0.0 if nothing was recorded."""
	        if self._token_count <= 0 or self._token_start_time <= 0:
	            return 0.0
	        elapsed = time.perf_counter() - self._token_start_time
	        return round(self._token_count / elapsed, 0) if elapsed > 0 else 0.0

	    async def _collect_real(self) -> Dict[str, Any]:
	        """Parse real rocm-smi output for MI300X stats.

	        Runs ``rocm-smi`` with a 10 second timeout and reads the first GPU.
	        ``memory_bandwidth_tbs`` is the larger of the VRAM-delta estimate
	        and a simulated value; ``tokens_per_sec`` uses the recorded token
	        rate, or a simulated value when no tokens were recorded. Any
	        failure is logged and simulated metrics are returned instead.
	        """
	        try:
	            _, stdout = await self._run_rocm_smi(
	                "--showmeminfo", "vram",
	                "--showuse",
	                "--showtemp",
	                "--showpower",
	                "--json",
	                timeout=10,
	            )
	            stats = self._parse_first_card(json.loads(stdout.decode()))

	            bandwidth = self._estimate_bandwidth_tbs(stats["vram_used_gb"])
	            tokens_per_sec = self._tokens_per_sec()

	            return {
	                "gpu_utilization_percent": stats["gpu_utilization_percent"],
	                "vram_used_gb": stats["vram_used_gb"],
	                "vram_total_gb": stats["vram_total_gb"],
	                "temperature_c": stats["temperature_c"],
	                "power_draw_w": stats["power_draw_w"],
	                "memory_bandwidth_tbs": max(bandwidth, round(random.uniform(4.2, 5.1), 1)),
	                "tokens_per_sec": tokens_per_sec or random.randint(1100, 1400),
	                "timestamp": datetime.now(timezone.utc).isoformat(),
	            }
	        except Exception as exc:
	            logger.warning("[AMDMetrics] rocm-smi parse failed: %s", exc)
	            return self._collect_simulated()

	    # ── Simulated metrics (dev/demo) ──────────────────────────

	    def _collect_simulated(self) -> Dict[str, Any]:
	        """Return realistic simulated MI300X metrics for development.

	        Values are drawn at random from fixed ranges on every call; the
	        timestamp is the current UTC time in ISO 8601 format.
	        """
	        return {
	            "gpu_utilization_percent": random.randint(78, 94),
	            "vram_used_gb": round(random.uniform(44.0, 52.0), 1),
	            "vram_total_gb": 192.0,
	            "temperature_c": random.randint(58, 67),
	            "power_draw_w": random.randint(580, 650),
	            "memory_bandwidth_tbs": round(random.uniform(4.2, 5.1), 1),
	            "tokens_per_sec": random.randint(1100, 1400),
	            "timestamp": datetime.now(timezone.utc).isoformat(),
	        }