# File size: 3,117 Bytes
# af83196
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
"""
Reference implementation for the float16 vector-addition Triton kernel.
C = A + B, where A and B are the N x N float16 CUDA tensors built by
generate_input().
"""

import math
try:
    import torch
except ImportError:
    torch = None  # Modal-only mode — functions below won't be called locally

# ---------------------------------------------------------------------------
# Reward parameters
# ---------------------------------------------------------------------------

# Nothing in the visible part of this module reads these three values; they
# are presumably consumed by an external scoring harness.
# NOTE(review): confirm how the correctness and speed terms are combined.
CORRECTNESS_WEIGHT = 0.3  # presumably the weight of the correctness term
SPEED_WEIGHT = 1.0  # presumably the weight of the speed term
SPEED_MAX_REWARD = 10.0  # presumably an upper bound on the speed term -- verify

# ---------------------------------------------------------------------------
# Test / benchmark cases
# ---------------------------------------------------------------------------

# Each case is a dict whose keys ("N", "seed") match the parameter names of
# generate_input(), so a case can be expanded as generate_input(**case).
# NOTE(review): the consumers of these lists are outside this file; the
# names suggest correctness cases vs. timing cases -- confirm with the caller.
TEST_CASES = [
    dict(N=size, seed=seed)
    for size, seed in ((256, 42), (512, 123), (1024, 456), (2048, 789))
]

BENCHMARK_CASES = [
    dict(N=size, seed=seed)
    for size, seed in ((1024, 1001), (2048, 1002), (4096, 1003), (8192, 1004))
]

# ---------------------------------------------------------------------------
# Reference kernel
# ---------------------------------------------------------------------------


def ref_kernel(data):
    """Return the sum of the two operands packed in ``data``.

    Args:
        data: A two-item iterable ``(a, b)``. An iterable of any other
            length makes the unpacking raise ``ValueError``.

    Returns:
        ``a + b``, evaluated with the operands' own ``+`` operator.
    """
    lhs, rhs = data
    total = lhs + rhs
    return total


def generate_input(N, seed):
    """Build the seeded pair of operands for one test or benchmark case.

    Args:
        N: Side length; each returned tensor has shape ``(N, N)``.
        seed: Seed for a dedicated CUDA ``torch.Generator``.

    Returns:
        A tuple ``(a, b)`` of float16 CUDA tensors filled by ``torch.randn``.
    """
    # manual_seed() returns the generator itself, so it can be chained.
    rng = torch.Generator(device="cuda").manual_seed(seed)
    randn_opts = dict(device="cuda", dtype=torch.float16, generator=rng)
    # Both draws share one generator, so the order (a first, then b) is part
    # of what makes the output reproducible for a given seed.
    a = torch.randn(N, N, **randn_opts)
    b = torch.randn(N, N, **randn_opts)
    return a, b


def check_implementation(data, output, rtol=1e-3, atol=1e-3):
    """Compare ``output`` against ``ref_kernel(data)``.

    The checks run in order and the first failure decides the result:
    shape equality, then ``output.dtype`` being float16, then
    ``torch.allclose`` with the given tolerances.

    Args:
        data: Input forwarded unchanged to ``ref_kernel``.
        output: Candidate result tensor to validate.
        rtol: Relative tolerance passed to ``torch.allclose``.
        atol: Absolute tolerance passed to ``torch.allclose``.

    Returns:
        A ``(passed, message)`` tuple. ``message`` is ``"Match"`` on success,
        otherwise a description of the first failed check.
    """
    expected = ref_kernel(data)

    if output.shape != expected.shape:
        verdict = (
            False,
            f"Shape mismatch: expected {expected.shape}, got {output.shape}",
        )
    elif output.dtype != torch.float16:
        verdict = (
            False,
            f"Dtype mismatch: expected float16, got {output.dtype}",
        )
    elif torch.allclose(output, expected, rtol=rtol, atol=atol):
        verdict = (True, "Match")
    else:
        # Take the difference in float32 to report the largest deviation.
        abs_err = (output.float() - expected.float()).abs()
        worst = abs_err.max().item()
        verdict = (False, f"Output mismatch: max_diff={worst:.6f}")
    return verdict


# ---------------------------------------------------------------------------
# Self-contained reference code for Modal execution
# ---------------------------------------------------------------------------

# Python source held as a string; nothing in this file executes it. It holds
# `import torch` followed by definitions of ref_kernel, generate_input and
# check_implementation that duplicate the logic of the module-level functions
# above, so the snippet is self-contained. Keep the two copies in sync when
# editing either one.
MODAL_REFERENCE_CODE = '''
import torch

def ref_kernel(data):
    a, b = data
    return a + b

def generate_input(N, seed):
    gen = torch.Generator(device="cuda")
    gen.manual_seed(seed)
    a = torch.randn(N, N, device="cuda", dtype=torch.float16, generator=gen)
    b = torch.randn(N, N, device="cuda", dtype=torch.float16, generator=gen)
    return (a, b)

def check_implementation(data, output, rtol=1e-3, atol=1e-3):
    ref_out = ref_kernel(data)
    if output.shape != ref_out.shape:
        return False, f"Shape mismatch: expected {ref_out.shape}, got {output.shape}"
    if output.dtype != torch.float16:
        return False, f"Dtype mismatch: expected float16, got {output.dtype}"
    if torch.allclose(output, ref_out, rtol=rtol, atol=atol):
        return True, "Match"
    diff = torch.abs(output.float() - ref_out.float())
    return False, f"Output mismatch: max_diff={diff.max().item():.6f}"
'''