"""
AMD ROCm Migration Advisor — CUDA → ROCm/HIP compatibility scanner.

Scans code for CUDA-specific patterns and provides actionable migration
guidance for AMD MI300X hardware.  Produces an AMD Compatibility Score
and a per-file migration guide.
"""
from __future__ import annotations

import logging
import re
from typing import Any, Dict, List, Optional, Tuple

from tools.code_parser import FileEntry, get_snippet

logger = logging.getLogger(__name__)


# ──────────────────────────────────────────────────
# Migration pattern definitions (10 categories)
# ──────────────────────────────────────────────────

# Each entry describes one CUDA-specific construct to look for:
#   id        - stable identifier reported with every finding
#   pattern   - compiled regex searched against the full file text
#   title / description / rocm_fix - human-readable guidance copied into findings
#   severity  - key into _SEVERITY_WEIGHT ("critical" | "high" | "medium" | "low")
# None of the expressions use ^ or $, so re.MULTILINE does not change what
# they match; the flag is kept for uniformity.
MIGRATION_PATTERNS: List[Dict[str, Any]] = [
    {
        "id": "AMD_M01",
        "pattern": re.compile(
            r"torch\.cuda\.is_available\s*\(\)", re.MULTILINE
        ),
        "title": "CUDA Device Check",
        "description": (
            "torch.cuda.is_available() works on ROCm but torch.version.hip "
            "is more explicit for AMD hardware detection."
        ),
        "rocm_fix": (
            "Use `torch.cuda.is_available()` (ROCm compatible) "
            "or check `hasattr(torch.version, 'hip')` for explicit AMD detection."
        ),
        "severity": "low",
    },
    {
        "id": "AMD_M02",
        # Any text matched by the 2nd or 3rd alternative also contains a match
        # for the 1st; they are kept so existing match spans stay unchanged.
        "pattern": re.compile(
            r"""(?:nvidia[\-_]smi|nvidia_smi|["']nvidia-smi["'])""",
            re.MULTILINE,
        ),
        "title": "NVIDIA-Specific CLI Tool",
        "description": "nvidia-smi is NVIDIA-only and will fail on AMD hardware.",
        "rocm_fix": (
            "Replace nvidia-smi with rocm-smi. "
            "Example: subprocess.run(['rocm-smi', '--showmeminfo', 'vram'])"
        ),
        "severity": "critical",
    },
    {
        "id": "AMD_M03",
        "pattern": re.compile(
            r"CUDA_VISIBLE_DEVICES", re.MULTILINE
        ),
        "title": "CUDA Device Selection Environment Variable",
        # ROCm documents CUDA_VISIBLE_DEVICES as a CUDA-compatibility variable
        # with the same effect as HIP_VISIBLE_DEVICES, so this is a portability
        # hint rather than a breakage; severity matches the other
        # "works on ROCm but..." entries.
        "description": (
            "CUDA_VISIBLE_DEVICES is honored on AMD/ROCm only as a "
            "CUDA-compatibility alias; HIP_VISIBLE_DEVICES is the native control."
        ),
        "rocm_fix": (
            "Prefer HIP_VISIBLE_DEVICES=0 (or ROCR_VISIBLE_DEVICES) "
            "for AMD GPU selection."
        ),
        "severity": "low",
    },
    {
        "id": "AMD_M04",
        "pattern": re.compile(
            r"torch\.cuda\.amp\.(?:autocast|GradScaler)", re.MULTILINE
        ),
        "title": "Legacy CUDA AMP API",
        "description": "Old torch.cuda.amp API has limited ROCm support.",
        "rocm_fix": (
            "Upgrade to torch.amp.autocast('cuda') and torch.amp.GradScaler('cuda') "
            "which are ROCm-native and match MI300X bfloat16 support."
        ),
        "severity": "high",
    },
    {
        "id": "AMD_M05",
        "pattern": re.compile(
            r"\.half\s*\(\)|torch\.float16|dtype\s*=\s*torch\.float16",
            re.MULTILINE,
        ),
        "title": "FP16 Precision (Suboptimal on MI300X)",
        "description": (
            "FP16 works on AMD but bfloat16 is natively supported on MI300X "
            "with no accuracy loss and better numerical stability."
        ),
        "rocm_fix": (
            "Replace .half() with .bfloat16() and torch.float16 with torch.bfloat16. "
            "MI300X executes bfloat16 at the same speed with higher stability."
        ),
        "severity": "medium",
    },
    {
        "id": "AMD_M06",
        # NOTE(review): PyTorch's ROCm builds reportedly route the
        # torch.backends.cudnn flags to MIOpen, and allow_tf32 is a TF32 matmul
        # switch rather than a cudnn.benchmark equivalent - verify this guidance.
        "pattern": re.compile(
            r"torch\.backends\.cudnn\.(?:benchmark|enabled|deterministic)",
            re.MULTILINE,
        ),
        "title": "cuDNN Backend Configuration",
        "description": (
            "torch.backends.cudnn settings are NVIDIA-specific. "
            "AMD uses MIOpen as its deep learning backend."
        ),
        "rocm_fix": (
            "Remove cudnn-specific flags. ROCm/MIOpen auto-configures. "
            "Use torch.backends.cuda.matmul.allow_tf32 for equivalent behavior."
        ),
        "severity": "medium",
    },
    {
        "id": "AMD_M07",
        "pattern": re.compile(
            r"(?:import\s+flash_attn|from\s+flash_attn)", re.MULTILINE
        ),
        "title": "Flash Attention — CUDA Build",
        "description": "Default flash-attn pip package is compiled for CUDA only.",
        "rocm_fix": (
            "Build flash-attn from source with ROCm flag: "
            "MAX_JOBS=4 pip install flash-attn --no-build-isolation "
            "Or use torch.nn.functional.scaled_dot_product_attention() "
            "which has native ROCm support."
        ),
        "severity": "high",
    },
    {
        "id": "AMD_M08",
        # Covers memory_allocated / memory_reserved and their max_* variants.
        "pattern": re.compile(
            r"torch\.cuda\.(?:max_)?memory_(?:allocated|reserved)\s*\(",
            re.MULTILINE,
        ),
        "title": "CUDA Memory Profiling API",
        "description": (
            "torch.cuda.memory_allocated() works on ROCm but "
            "rocm-smi gives more accurate MI300X HBM3 readings."
        ),
        "rocm_fix": (
            "Continue using torch.cuda.memory_allocated() (ROCm compatible) "
            "but add rocm-smi polling for accurate HBM3 bandwidth metrics."
        ),
        "severity": "low",
    },
    {
        "id": "AMD_M09",
        # Also matches indexed device strings such as "cuda:0".
        "pattern": re.compile(
            r"""device\s*=\s*['"]cuda(?::\d+)?['"]""", re.MULTILINE
        ),
        "title": "Hardcoded CUDA Device String",
        "description": (
            "Hardcoded 'cuda' string works on ROCm but poor practice "
            "for hardware-agnostic code."
        ),
        "rocm_fix": (
            "Replace with: device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') "
            "This works identically on AMD ROCm."
        ),
        "severity": "low",
    },
    {
        "id": "AMD_M10",
        # NOTE(review): newer bitsandbytes releases advertise ROCm support -
        # confirm whether "critical" / "does not support" is still accurate.
        "pattern": re.compile(
            r"load_in_8bit\s*=\s*True|load_in_4bit\s*=\s*True|BitsAndBytesConfig",
            re.MULTILINE,
        ),
        "title": "BitsAndBytes Quantization (CUDA Only)",
        "description": "bitsandbytes library does not support AMD ROCm.",
        "rocm_fix": (
            "Use AutoAWQ or llama.cpp with ROCm backend for quantization. "
            "For vLLM on MI300X: use --quantization awq or --dtype bfloat16 "
            "with FP8 quantization which is natively supported."
        ),
        "severity": "critical",
    },
]

# Penalty points per finding, keyed by severity; the scanner subtracts the
# sum of these from 100 to obtain the compatibility score.
_SEVERITY_WEIGHT: Dict[str, int] = {
    "critical": 20,
    "high": 10,
    "medium": 3,
    "low": 1,
}


# ──────────────────────────────────────────────────
# Migration Finding data class
# ──────────────────────────────────────────────────

class MigrationFinding:
    """One CUDA → ROCm migration hit: the pattern that fired, where, and the fix."""

    # Field names, in the order they are serialized by ``to_dict``.
    __slots__ = (
        "id",
        "title",
        "description",
        "rocm_fix",
        "severity",
        "file",
        "line",
        "code_snippet",
    )

    def __init__(
        self,
        id: str,
        title: str,
        description: str,
        rocm_fix: str,
        severity: str,
        file: str,
        line: int,
        code_snippet: str,
    ) -> None:
        """Store every argument on the slot of the same name."""
        (
            self.id,
            self.title,
            self.description,
            self.rocm_fix,
            self.severity,
            self.file,
            self.line,
            self.code_snippet,
        ) = (
            id,
            title,
            description,
            rocm_fix,
            severity,
            file,
            line,
            code_snippet,
        )

    def to_dict(self) -> Dict[str, Any]:
        """Return the finding as a plain dict keyed by field name."""
        return {
            field: getattr(self, field)
            for field in MigrationFinding.__slots__
        }


# ──────────────────────────────────────────────────
# Main advisor class
# ──────────────────────────────────────────────────

class AMDMigrationAdvisor:
    """
    Scans source files for CUDA-specific patterns and produces
    an AMD Compatibility Score with migration guidance.

    The score starts at 100 and loses the ``_SEVERITY_WEIGHT`` of every
    finding, clamped to the 0-100 range.
    """

    def __init__(self) -> None:
        # References the module-level table directly (no copy is made).
        self.patterns = MIGRATION_PATTERNS

    async def scan(self, files: List[FileEntry]) -> Dict[str, Any]:
        """
        Scan all files for CUDA-specific patterns.

        The body contains no ``await``; once awaited, the whole scan runs
        synchronously to completion.

        Parameters
        ----------
        files : list of (filename, content) tuples

        Returns
        -------
        dict with keys:
            findings, compatibility_score, compatibility_label,
            total_cuda_patterns_found, summary
        """
        all_findings = self._collect_findings(files)

        # ── Compute AMD Compatibility Score ─────────────────────
        # Unknown severities cost 1 point, same as "low".
        penalty = sum(
            _SEVERITY_WEIGHT.get(finding.severity, 1) for finding in all_findings
        )
        score = max(0, min(100, 100 - penalty))
        label = self._label_for_score(score)

        logger.info(
            "[AMDMigration] Scanned %d files — %d CUDA patterns found — score %d%% (%s)",
            len(files), len(all_findings), score, label,
        )

        if all_findings:
            summary = (
                f"Found {len(all_findings)} CUDA-specific pattern(s). "
                "After applying fixes, this codebase will be fully "
                "optimized for AMD MI300X."
            )
        else:
            summary = "No CUDA-specific patterns detected — codebase is ROCm-ready."

        return {
            "findings": [finding.to_dict() for finding in all_findings],
            "compatibility_score": score,
            "compatibility_label": label,
            "total_cuda_patterns_found": len(all_findings),
            "summary": summary,
        }

    def _collect_findings(self, files: List[FileEntry]) -> List[MigrationFinding]:
        """Run every pattern over every file; at most one finding per (pattern, file, line)."""
        all_findings: List[MigrationFinding] = []
        seen: set = set()  # deduplicate by (pattern_id, file, line)

        for file_path, code in files:
            for pat_def in self.patterns:
                try:
                    # finditer yields matches left to right, so the line number
                    # can be advanced incrementally instead of re-counting the
                    # newlines of the whole prefix for every match.
                    line_number = 1
                    counted_upto = 0
                    for match in pat_def["pattern"].finditer(code):
                        start = match.start()
                        line_number += code.count("\n", counted_upto, start)
                        counted_upto = start

                        key = (pat_def["id"], file_path, line_number)
                        if key in seen:
                            continue
                        seen.add(key)

                        snippet = get_snippet(code, line_number, context=2)

                        all_findings.append(
                            MigrationFinding(
                                id=pat_def["id"],
                                title=pat_def["title"],
                                description=pat_def["description"],
                                rocm_fix=pat_def["rocm_fix"],
                                severity=pat_def["severity"],
                                file=file_path,
                                line=line_number,
                                code_snippet=snippet,
                            )
                        )
                except Exception as exc:
                    # Best-effort: one failing pattern/file must not abort the
                    # scan. Logged at WARNING because a skipped pattern means
                    # the resulting score may be overstated.
                    logger.warning(
                        "[AMDMigration] Pattern %s failed on %s: %s",
                        pat_def["id"], file_path, exc,
                    )

        return all_findings

    @staticmethod
    def _label_for_score(score: int) -> str:
        """Map a 0-100 compatibility score to its human-readable label."""
        if score >= 90:
            return "Fully ROCm Ready"
        if score >= 70:
            return "Mostly Compatible"
        if score >= 50:
            return "Needs Migration Work"
        return "CUDA-Specific Codebase"