File size: 3,610 Bytes
b266c31
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
"""Modal executor — skeleton for v0.

This file is a STUB. The full Modal integration requires the `modal`
client library installed (`pip install modal`) and a configured Modal
account (`~/.modal.toml`). The user's environment has both, but the
test suite must run without them, so we keep this file import-safe.

The real implementation is deferred to the v0 polish wave; until then,
the `ModalExecutor` class docstring below is the contract.
"""
from __future__ import annotations

from typing import Any, Callable, Mapping

from composer_replication.diloco.serverless.executor import (
    ReplicaHandle,
    ServerlessExecutor,
)


class ModalExecutor(ServerlessExecutor):
    """Run replicas as Modal Functions in parallel.

    Reference implementation pattern (per ADR-005):

        @app.function(gpu="A100-40GB", timeout=3600)
        def run_replica(rank: int, rendezvous_uri: str, **kwargs):
            os.environ["REPLICA_RANK"] = str(rank)
            from composer_replication.diloco.serverless import (
                MockManager, ObjectStoreAllReduce,
            )
            store = ObjectStoreAllReduce(rendezvous_uri,
                                         rank=rank, world_size=N)
            manager = MockManager(store)
            # ... run the trainer with this manager ...

    Then `launch_replicas` does:
        calls = [run_replica.spawn(rank=i, ...) for i in range(N)]
        return [ReplicaHandle(rank=i, backend_name="modal",
                              metadata={"call_id": calls[i].object_id})
                for i in range(N)]

    Pricing reference (2026-05-26): A100-40GB ≈ $1.95/hr, H100 ≈ $5.50/hr.
    Cold start ≈ 30s. Inter-job networking via cluster mode (opt-in,
    not used by default).

    Status: SKELETON. Real implementation pending v0 polish wave.
    Constructing this class always raises (see `__init__`), and every
    executor method raises `NotImplementedError`.
    """

    backend_name = "modal"
    supports_inter_replica_network = False  # default; cluster mode = True

    def __init__(self, *, app_name: str = "composer-replication-diloco") -> None:
        """Verify the modal client is importable, then refuse to construct.

        Args:
            app_name: Name stored on the instance as ``self.app_name``
                before the skeleton raises.

        Raises:
            RuntimeError: If ``import modal`` fails. The original
                ``ImportError`` is chained as ``__cause__``.
            NotImplementedError: Always, when ``modal`` is importable,
                because this class is still a skeleton.
        """
        try:
            # Imported here rather than at module level so that importing
            # this module never requires the modal client to be installed.
            import modal  # noqa: F401
        except ImportError as e:
            # `from e` marks the ImportError as the direct cause instead of
            # an incidental error raised while handling another one.
            raise RuntimeError(
                "ModalExecutor requires the modal client. Install with "
                "`pip install modal` and configure with `modal token new`. "
                "Got: " + repr(e)
            ) from e
        self.app_name = app_name
        # Real implementation: build a `modal.App` and register `run_replica`
        # here so that subsequent `launch_replicas` can `.spawn()` it.
        raise NotImplementedError(
            "ModalExecutor is a v0 skeleton; full implementation pending. "
            "Use LocalProcessExecutor for testing."
        )

    # `__init__` always raises, so the class never instantiates successfully
    # in the skeleton. The methods below each raise NotImplementedError and
    # exist only to sketch the intended signatures for documentation.

    def launch_replicas(
        self,
        n_replicas: int,
        entrypoint: str | Callable[..., Any],
        entrypoint_args: Mapping[str, Any],
        *,
        gpu: str | None = "A100-40GB",
        timeout: int = 3600,
    ) -> list[ReplicaHandle]:
        """Signature sketch for launching replicas; not implemented.

        Args:
            n_replicas: Number of replicas to launch.
            entrypoint: Either a string or a callable identifying what
                each replica runs.
            entrypoint_args: Mapping of arguments for the entrypoint.
            gpu: GPU type string, or ``None``; defaults to ``"A100-40GB"``.
            timeout: Integer timeout; defaults to ``3600``.

        Raises:
            NotImplementedError: Always.
        """
        raise NotImplementedError

    def poll(self, handle: ReplicaHandle) -> str:
        """Signature sketch for polling one replica handle; not implemented.

        Raises:
            NotImplementedError: Always.
        """
        raise NotImplementedError

    def stream_logs(self, handle: ReplicaHandle, *, n_lines: int = 200) -> str:
        """Signature sketch for fetching a replica's logs; not implemented.

        Args:
            handle: The replica handle to read logs for.
            n_lines: Line-count parameter; defaults to ``200``.

        Raises:
            NotImplementedError: Always.
        """
        raise NotImplementedError

    def cancel(self, handle: ReplicaHandle) -> None:
        """Signature sketch for cancelling one replica; not implemented.

        Raises:
            NotImplementedError: Always.
        """
        raise NotImplementedError

    def collect(
        self,
        handles: list[ReplicaHandle],
        *,
        timeout: int | None = None,
    ) -> list[dict[str, Any]]:
        """Signature sketch for collecting replica results; not implemented.

        Args:
            handles: The replica handles to collect from.
            timeout: Optional integer timeout; ``None`` by default.

        Raises:
            NotImplementedError: Always.
        """
        raise NotImplementedError


# Public API of this module: only the executor class is exported.
__all__ = ["ModalExecutor"]