From 2dc0a6ae8844b5c6a385b0e448a3244b8e010f89 Mon Sep 17 00:00:00 2001
From: fwyc0573 <50061432+fwyc0573@users.noreply.github.com>
Date: Sun, 14 Jun 2026 01:11:55 +0800
Subject: [PATCH 1/3] Add pd-disaggregation transfer core

Constraint: PR preparation is local-only for worktrees/Frontier and scoped to frontier/ plus tests/unit core transfer paths; no push, release, or PR publication.
Rejected: Blindly applying the full pre-release-v0.2 patch queue | mixed commits contain examples, old pd-only names, and unrelated release hardening outside this commit boundary.
Confidence: high
Scope-risk: moderate
Directive: Keep follow-up example scripts and final naming cleanup in separate commits so review can isolate core simulator behavior from docs/examples wording.
Tested: PYTHONPATH=/local/ycfeng/frontier/worktrees/Frontier conda run -n frontier python -m pytest tests/unit/test_pd_transfer_entities.py tests/unit/test_pd_transfer_predictors.py tests/unit/test_pd_transfer_types_and_configs.py tests/unit/test_kv_transfer_completion_contract.py tests/unit/test_prefix_cache_cluster_validation.py tests/unit/test_request_generator_decode_bound_count.py -q -> 25 passed in 1.23s; changed_python_files=40 py_compile PASS; git diff --check PASS; staged_unexpected_files=0.
Not-tested: Full final preparation gate is reserved for the completed three-commit branch.
--- frontier/config/config.py | 46 +- frontier/config/kv_cache_transfer_config.py | 5 +- frontier/config/m2n_transfer_config.py | 5 +- frontier/entities/__init__.py | 4 + frontier/entities/kv_cache_transfer_info.py | 60 +++ frontier/entities/m2n_transfer_info.py | 93 ++++ frontier/events/__init__.py | 4 + frontier/events/cluster_batch_end_event.py | 163 ++++++- .../events/kv_cache_transfer_end_event.py | 120 +++++ .../events/kv_cache_transfer_start_event.py | 83 ++++ frontier/kv_cache_transfer/__init__.py | 9 + .../analytical_kv_cache_transfer_predictor.py | 93 ++++ .../base_kv_cache_transfer_predictor.py | 66 +++ .../kv_cache_transfer_predictor_registry.py | 38 ++ frontier/m2n_transfer/__init__.py | 9 + .../analytical_m2n_transfer_predictor.py | 105 +++++ .../base_m2n_transfer_predictor.py | 85 ++++ .../m2n_transfer_predictor_registry.py | 34 ++ frontier/main.py | 59 ++- frontier/metrics/metrics_store.py | 74 ++- frontier/metrics/op_trace_utils.py | 9 +- .../base_request_generator.py | 3 + .../base_cluster_scheduler.py | 222 ++++++++- .../round_robin_cluster_scheduler.py | 26 +- .../sticky_round_robin_cluster_scheduler.py | 13 +- .../global_scheduler/base_global_scheduler.py | 10 +- .../base_replica_scheduler.py | 9 +- .../sarathi_replica_scheduler.py | 18 + .../vllm_v1_engine_replica_scheduler.py | 19 +- frontier/simulator.py | 22 +- frontier/types/__init__.py | 4 + frontier/types/kv_cache_transfer_type.py | 7 + frontier/types/m2n_transfer_type.py | 7 + .../test_kv_transfer_completion_contract.py | 226 ++++++++++ tests/unit/test_op_trace_utils.py | 421 ++++++++++++++++++ tests/unit/test_pd_transfer_entities.py | 173 +++++++ tests/unit/test_pd_transfer_predictors.py | 157 +++++++ .../test_pd_transfer_types_and_configs.py | 82 ++++ .../test_prefix_cache_cluster_validation.py | 201 +++++++++ ...st_request_generator_decode_bound_count.py | 33 ++ 40 files changed, 2749 insertions(+), 68 deletions(-) create mode 100644 frontier/entities/kv_cache_transfer_info.py create 
mode 100644 frontier/entities/m2n_transfer_info.py create mode 100644 frontier/events/kv_cache_transfer_end_event.py create mode 100644 frontier/events/kv_cache_transfer_start_event.py create mode 100644 frontier/kv_cache_transfer/__init__.py create mode 100644 frontier/kv_cache_transfer/analytical_kv_cache_transfer_predictor.py create mode 100644 frontier/kv_cache_transfer/base_kv_cache_transfer_predictor.py create mode 100644 frontier/kv_cache_transfer/kv_cache_transfer_predictor_registry.py create mode 100644 frontier/m2n_transfer/__init__.py create mode 100644 frontier/m2n_transfer/analytical_m2n_transfer_predictor.py create mode 100644 frontier/m2n_transfer/base_m2n_transfer_predictor.py create mode 100644 frontier/m2n_transfer/m2n_transfer_predictor_registry.py create mode 100644 frontier/types/kv_cache_transfer_type.py create mode 100644 frontier/types/m2n_transfer_type.py create mode 100644 tests/unit/test_kv_transfer_completion_contract.py create mode 100644 tests/unit/test_op_trace_utils.py create mode 100644 tests/unit/test_pd_transfer_entities.py create mode 100644 tests/unit/test_pd_transfer_predictors.py create mode 100644 tests/unit/test_pd_transfer_types_and_configs.py create mode 100644 tests/unit/test_prefix_cache_cluster_validation.py create mode 100644 tests/unit/test_request_generator_decode_bound_count.py diff --git a/frontier/config/config.py b/frontier/config/config.py index 615da4c..4b518f3 100644 --- a/frontier/config/config.py +++ b/frontier/config/config.py @@ -71,6 +71,12 @@ "It will be available in an upcoming version. Please use the co-located architecture for current usage and testing." ) +PD_DISAGGREGATION_PARALLEL_CLUSTER_RELEASE_ERROR = ( + "Error: pd-disaggregation public release support requires " + "--no-enable_parallel_clusters. Parallel cluster processing for " + "pd-disaggregation is not included in this release." 
+) + AICONFIGURATOR_BACKEND_RELEASE_ERROR = ( "Error: The aiconfigurator communication backend is not included in this release. " "Please use collective_sim, astra_sim_analytical, analytical, or vidur for current usage and testing." @@ -289,6 +295,13 @@ class BaseRequestGeneratorConfig(BasePolyConfig): default=42, metadata={"help": "Seed for the random number generator."}, ) + num_decode_bound_requests: Optional[int] = field( + default=None, + metadata={ + "help": "Number of generated requests that require decode-cluster work. " + "Derived by request generation and used by offline pd-disaggregation scheduling." + }, + ) @dataclass @@ -4558,14 +4571,28 @@ def _validate_mode_consistency(self): ) def _validate_open_source_release_disaggregation_fields_guard(self) -> None: - if self._has_disaggregation_params_set(): + for field_def in self.__dataclass_fields__.values(): + if not ( + field_def.name.startswith(("decode_attn_", "decode_ffn_")) + or field_def.name in DISAGGREGATED_CLUSTER_FIELD_NAMES + ): + continue + if self._field_is_set_to_non_default(field_def): + raise ValueError(DISAGGREGATED_ARCHITECTURE_RELEASE_ERROR) + + if any( + cluster_type in {ClusterType.DECODE_ATTN, ClusterType.DECODE_FFN} + for cluster_type in self.periodic_scheduling_clusters + ): + raise ValueError(DISAGGREGATED_ARCHITECTURE_RELEASE_ERROR) + + if self.allow_experiment_multi_decode_ffn_replicas: raise ValueError(DISAGGREGATED_ARCHITECTURE_RELEASE_ERROR) def _validate_open_source_release_cluster_type_guard(self) -> None: - if ( - self.cluster_type is not None - and self.cluster_type != ClusterType.MONOLITHIC - ): + if self.cluster_type in {ClusterType.DECODE_ATTN, ClusterType.DECODE_FFN}: + raise ValueError(DISAGGREGATED_ARCHITECTURE_RELEASE_ERROR) + if self.cluster_type is not None and self._has_disaggregation_params_set(): raise ValueError(DISAGGREGATED_ARCHITECTURE_RELEASE_ERROR) def _validate_open_source_release_cc_backend_guard(self) -> None: @@ -5967,10 +5994,16 @@ def 
__post_init__(self): self.write_config_to_file() def _validate_open_source_release_architecture_guard(self) -> None: - if self.sys_arch in {"pd-disaggregation", "pd-af-disaggregation"}: + if self.sys_arch == "pd-af-disaggregation": raise ValueError(DISAGGREGATED_ARCHITECTURE_RELEASE_ERROR) if getattr(self, "use_cuda_graph", False): raise ValueError(DISAGGREGATED_ARCHITECTURE_RELEASE_ERROR) + + if self.sys_arch == "pd-disaggregation": + if self.enable_parallel_clusters: + raise ValueError(PD_DISAGGREGATION_PARALLEL_CLUSTER_RELEASE_ERROR) + return + default_kv_cache_transfer_config = AnalyticalKVCacheTransferConfig() if ( getattr( @@ -5981,6 +6014,7 @@ def _validate_open_source_release_architecture_guard(self) -> None: != default_kv_cache_transfer_config ): raise ValueError(DISAGGREGATED_ARCHITECTURE_RELEASE_ERROR) + default_m2n_transfer_config = AnalyticalM2NTransferConfig() if ( getattr(self, "m2n_transfer_config", default_m2n_transfer_config) diff --git a/frontier/config/kv_cache_transfer_config.py b/frontier/config/kv_cache_transfer_config.py index d5620ff..14ef67f 100644 --- a/frontier/config/kv_cache_transfer_config.py +++ b/frontier/config/kv_cache_transfer_config.py @@ -2,6 +2,7 @@ from typing import Optional from frontier.config.base_poly_config import BasePolyConfig +from frontier.types import KVCacheTransferType @dataclass @@ -76,8 +77,8 @@ class AnalyticalKVCacheTransferConfig(BaseKVCacheTransferConfig): ) @classmethod - def get_type(cls) -> str: - return "analytical" + def get_type(cls) -> KVCacheTransferType: + return KVCacheTransferType.ANALYTICAL @classmethod def get_name(cls) -> str: diff --git a/frontier/config/m2n_transfer_config.py b/frontier/config/m2n_transfer_config.py index 196c5ca..1047a02 100644 --- a/frontier/config/m2n_transfer_config.py +++ b/frontier/config/m2n_transfer_config.py @@ -2,6 +2,7 @@ from typing import Optional from frontier.config.base_poly_config import BasePolyConfig +from frontier.types import M2NTransferType @dataclass 
@@ -74,8 +75,8 @@ class AnalyticalM2NTransferConfig(BaseM2NTransferConfig): ) @classmethod - def get_type(cls) -> str: - return "analytical" + def get_type(cls) -> M2NTransferType: + return M2NTransferType.ANALYTICAL @classmethod def get_name(cls) -> str: diff --git a/frontier/entities/__init__.py b/frontier/entities/__init__.py index 624652f..7e2d852 100644 --- a/frontier/entities/__init__.py +++ b/frontier/entities/__init__.py @@ -2,6 +2,8 @@ from frontier.entities.batch_stage import BatchStage from frontier.entities.cluster import Cluster from frontier.entities.execution_time import ExecutionTime +from frontier.entities.kv_cache_transfer_info import KVCacheTransferInfo +from frontier.entities.m2n_transfer_info import M2NTransferInfo from frontier.entities.replica import Replica from frontier.entities.request import Request, RequestRoundPlan @@ -15,4 +17,6 @@ Cluster, BatchStage, ExecutionTime, + KVCacheTransferInfo, + M2NTransferInfo, ] diff --git a/frontier/entities/kv_cache_transfer_info.py b/frontier/entities/kv_cache_transfer_info.py new file mode 100644 index 0000000..53f6318 --- /dev/null +++ b/frontier/entities/kv_cache_transfer_info.py @@ -0,0 +1,60 @@ +from dataclasses import dataclass +from typing import TYPE_CHECKING, Optional + +from frontier.types import ClusterType + +if TYPE_CHECKING: + from frontier.entities import Batch + + +@dataclass +class KVCacheTransferInfo: + """Information about a KV cache transfer operation.""" + + batch: "Batch" + source_cluster_type: ClusterType + target_cluster_type: ClusterType + source_replica_id: int + source_dp_id: int + kv_cache_size_bytes: int + transfer_time_ms: float + transfer_start_time: float + transfer_end_time: Optional[float] = None + enable_compression: bool = False + compression_ratio: float = 1.0 + enable_latency_hiding: bool = False + transfer_protocol: str = "rdma" + transfer_requests: bool = False + + def __post_init__(self) -> None: + if self.transfer_end_time is None: + self.transfer_end_time = 
self.transfer_start_time + (self.transfer_time_ms * 1e-3) + + @property + def is_completed(self) -> bool: + return self.transfer_end_time is not None + + @property + def effective_data_size_bytes(self) -> int: + if self.enable_compression: + return int(self.kv_cache_size_bytes / self.compression_ratio) + return self.kv_cache_size_bytes + + def to_dict(self) -> dict: + return { + "batch_id": self.batch.id, + "batch_global_id": self.batch.global_id, + "source_cluster_type": self.source_cluster_type.name, + "target_cluster_type": self.target_cluster_type.name, + "source_replica_id": self.source_replica_id, + "kv_cache_size_bytes": self.kv_cache_size_bytes, + "effective_data_size_bytes": self.effective_data_size_bytes, + "transfer_time_ms": self.transfer_time_ms, + "transfer_start_time": self.transfer_start_time, + "transfer_end_time": self.transfer_end_time, + "enable_compression": self.enable_compression, + "compression_ratio": self.compression_ratio, + "enable_latency_hiding": self.enable_latency_hiding, + "transfer_protocol": self.transfer_protocol, + "transfer_requests": self.transfer_requests, + } diff --git a/frontier/entities/m2n_transfer_info.py b/frontier/entities/m2n_transfer_info.py new file mode 100644 index 0000000..049847a --- /dev/null +++ b/frontier/entities/m2n_transfer_info.py @@ -0,0 +1,93 @@ +from dataclasses import dataclass +from typing import TYPE_CHECKING, Optional + +from frontier.types import ClusterType + +if TYPE_CHECKING: + from frontier.entities import Batch + + +@dataclass +class M2NTransferInfo: + """Information about a Memory-to-Memory transfer operation.""" + + batch: "Batch" + source_cluster_type: ClusterType + target_cluster_type: ClusterType + source_replica_id: int + source_dp_id: int + activation_size_bytes: int + transfer_time_ms: float + transfer_start_time: float + transfer_end_time: Optional[float] = None + enable_p2p_optimization: bool = True + p2p_protocol: str = "nvlink" + enable_compression: bool = False + 
compression_ratio: float = 1.0 + enable_latency_hiding: bool = False + layer_id: Optional[int] = None + afd_stage_idx: Optional[int] = None + pipeline_stage: Optional[str] = None + target_ffn_replica_id: Optional[int] = None + + def __post_init__(self) -> None: + if self.transfer_end_time is None: + self.transfer_end_time = self.transfer_start_time + (self.transfer_time_ms * 1e-3) + + valid_transfers = [ + (ClusterType.DECODE_ATTN, ClusterType.DECODE_FFN), + (ClusterType.DECODE_FFN, ClusterType.DECODE_ATTN), + ] + if (self.source_cluster_type, self.target_cluster_type) not in valid_transfers: + raise ValueError( + f"Invalid M2N transfer: {self.source_cluster_type.name} -> {self.target_cluster_type.name}. " + "M2N transfers only support DECODE_ATTN <-> DECODE_FFN communication." + ) + + if self.pipeline_stage is None: + if self.source_cluster_type == ClusterType.DECODE_ATTN: + self.pipeline_stage = "attn_to_ffn" + else: + self.pipeline_stage = "ffn_to_attn" + + @property + def is_completed(self) -> bool: + return self.transfer_end_time is not None + + @property + def effective_data_size_bytes(self) -> int: + if self.enable_compression: + return int(self.activation_size_bytes / self.compression_ratio) + return self.activation_size_bytes + + @property + def is_attn_to_ffn(self) -> bool: + return self.source_cluster_type == ClusterType.DECODE_ATTN + + @property + def is_ffn_to_attn(self) -> bool: + return self.source_cluster_type == ClusterType.DECODE_FFN + + def to_dict(self) -> dict: + return { + "batch_id": self.batch.id, + "batch_global_id": self.batch.global_id, + "source_cluster_type": self.source_cluster_type.name, + "target_cluster_type": self.target_cluster_type.name, + "source_replica_id": self.source_replica_id, + "source_dp_id": self.source_dp_id, + "activation_size_bytes": self.activation_size_bytes, + "effective_data_size_bytes": self.effective_data_size_bytes, + "transfer_time_ms": self.transfer_time_ms, + "transfer_start_time": self.transfer_start_time, 
+ "transfer_end_time": self.transfer_end_time, + "enable_p2p_optimization": self.enable_p2p_optimization, + "p2p_protocol": self.p2p_protocol, + "enable_compression": self.enable_compression, + "compression_ratio": self.compression_ratio, + "enable_latency_hiding": self.enable_latency_hiding, + "layer_id": self.layer_id, + "afd_stage_idx": self.afd_stage_idx, + "pipeline_stage": self.pipeline_stage, + "target_ffn_replica_id": self.target_ffn_replica_id, + } diff --git a/frontier/events/__init__.py b/frontier/events/__init__.py index 12e29a9..469e168 100644 --- a/frontier/events/__init__.py +++ b/frontier/events/__init__.py @@ -13,6 +13,8 @@ from frontier.events.periodic_schedule_event import PeriodicScheduleEvent from frontier.events.cluster_batch_end_event import ClusterBatchEndEvent from frontier.events.global_batch_end_event import GlobalBatchEndEvent +from frontier.events.kv_cache_transfer_end_event import KVCacheTransferEndEvent +from frontier.events.kv_cache_transfer_start_event import KVCacheTransferStartEvent from frontier.events.thinking_round_requeue_event import ThinkingRoundRequeueEvent @@ -32,5 +34,7 @@ "PeriodicScheduleEvent", "ClusterBatchEndEvent", "GlobalBatchEndEvent", + "KVCacheTransferEndEvent", + "KVCacheTransferStartEvent", "ThinkingRoundRequeueEvent", ] diff --git a/frontier/events/cluster_batch_end_event.py b/frontier/events/cluster_batch_end_event.py index a963b39..ce2bf79 100644 --- a/frontier/events/cluster_batch_end_event.py +++ b/frontier/events/cluster_batch_end_event.py @@ -6,15 +6,14 @@ from frontier.metrics import MetricsStore from frontier.entities import Batch from frontier.logger import get_cluster_logger -from frontier.config.config import DISAGGREGATED_ARCHITECTURE_RELEASE_ERROR class ClusterBatchEndEvent(BaseEvent): """ Cluster-internal batch stage completion event. - This release supports only the MONOLITHIC co-location path. Disaggregated - cluster types fail fast before any cluster-local completion logic runs. 
+ PREFILL completes local batch work and emits KV cache transfers to the decode + cluster. MONOLITHIC keeps the existing co-location completion path. """ def __init__( @@ -58,11 +57,11 @@ def __init__( def handle_event( self, scheduler: BaseGlobalScheduler, metrics_store: MetricsStore ) -> List[BaseEvent]: + from frontier.events.kv_cache_transfer_start_event import ( + KVCacheTransferStartEvent, + ) from frontier.events.replica_schedule_event import ReplicaScheduleEvent - if self._cluster_type != ClusterType.MONOLITHIC: - raise ValueError(DISAGGREGATED_ARCHITECTURE_RELEASE_ERROR) - cluster_scheduler = scheduler.get_cluster_scheduler(self._cluster_type) replica_scheduler = cluster_scheduler.get_dp_replica_scheduler( self._replica_id, self._dp_id @@ -81,16 +80,148 @@ def handle_event( ) return [] - # Always record cluster-internal stage completion hooks - try: - # Entities-level hook (lightweight; can be a no-op) - if hasattr(self._batch, "on_cluster_stage_end"): - self._batch.on_cluster_stage_end(self.time, self._cluster_type) - # Replica-scheduler-level hook (lightweight; can be a no-op) - if hasattr(replica_scheduler, "on_cluster_stage_end"): - replica_scheduler.on_cluster_stage_end(self._batch) - except Exception as e: - logger.info(f"[CLUSTER-END][WARN] on_cluster_stage_end hooks error: {e}") + # Always record cluster-internal stage completion hooks. 
+ if hasattr(self._batch, "on_cluster_stage_end"): + self._batch.on_cluster_stage_end(self.time, self._cluster_type) + if hasattr(replica_scheduler, "on_cluster_stage_end"): + replica_scheduler.on_cluster_stage_end(self._batch) + + if self._cluster_type == ClusterType.PREFILL: + self._batch.on_batch_end( + self.time, + self._cluster_type, + ) + replica_scheduler.on_batch_end(self._batch) + + memory_usage_percent = replica_scheduler.memory_usage_percent + metrics_store.on_batch_end( + self.time, + self._batch, + self._replica_id, + memory_usage_percent, + self._cluster_type, + self._dp_id, + ) + + kv_pred = cluster_scheduler._kv_cache_transfer_predictor + if kv_pred is None: + raise ValueError( + "KV cache transfer predictor not found in ClusterScheduler" + ) + + replica_config = cluster_scheduler._config.replica_config + target_cluster = cluster_scheduler._get_decode_target_cluster() + + for request in self._batch.requests: + if request.is_prefill_complete and request.num_decode_tokens > 0: + kv_cache_size_bytes, transfer_time_ms = ( + kv_pred.get_transfer_info_for_request( + source_cluster_type=self._cluster_type, + target_cluster_type=target_cluster, + request=request, + replica_config=replica_config, + ) + ) + + from frontier.entities.batch import Batch as SingleBatch + + single_request_batch = SingleBatch( + replica_id=self._replica_id, + requests=[request], + num_tokens=[request.num_prefill_tokens], + is_moe=replica_config.model_config.is_moe, + ) + next_events.append( + KVCacheTransferStartEvent( + self.time, + source_replica_id=self._replica_id, + source_dp_id=self._dp_id, + target_cluster_type=target_cluster, + batch=single_request_batch, + kv_cache_size_bytes=kv_cache_size_bytes, + transfer_time_ms=transfer_time_ms, + source_cluster_type=self._cluster_type, + ) + ) + + next_events.append( + ReplicaScheduleEvent( + self.time, self._replica_id, self._cluster_type, self._dp_id + ) + ) + return next_events + + if self._cluster_type == ClusterType.DECODE: + if 
self._batch.is_idle: + logger.info( + f"[DECODE-END][IDLE] batch_id={self._batch.id} is idle batch, skipping normal end logic" + ) + next_events.append( + ReplicaScheduleEvent( + self.time, self._replica_id, self._cluster_type, self._dp_id + ) + ) + return next_events + + replica = cluster_scheduler._cluster.replicas[self._replica_id] + is_moe = replica.is_moe + + if not is_moe: + from frontier.events.global_batch_end_event import GlobalBatchEndEvent + + next_events.append( + GlobalBatchEndEvent( + self.time, + self._replica_id, + self._dp_id, + self._batch, + self._cluster_type, + batch_schedule_epoch=self._batch_schedule_epoch, + request_execution_signatures=self._request_execution_signatures, + request_mutation_signatures=self._request_mutation_signatures, + thinking_round_start_times=self._thinking_round_start_times, + ) + ) + return next_events + + model_config = cluster_scheduler._config.replica_config.model_config + total_layers = model_config.num_layers + current_layer_id = self._get_current_layer_id_from_batch(self._batch) + is_final_layer = current_layer_id >= total_layers - 1 + + if is_final_layer: + from frontier.events.global_batch_end_event import GlobalBatchEndEvent + + next_events.append( + GlobalBatchEndEvent( + self.time, + self._replica_id, + self._dp_id, + self._batch, + self._cluster_type, + batch_schedule_epoch=self._batch_schedule_epoch, + request_execution_signatures=self._request_execution_signatures, + request_mutation_signatures=self._request_mutation_signatures, + thinking_round_start_times=self._thinking_round_start_times, + ) + ) + else: + memory_usage_percent = replica_scheduler.memory_usage_percent + metrics_store.on_batch_end( + self.time, + self._batch, + self._replica_id, + memory_usage_percent, + self._cluster_type, + self._dp_id, + ) + next_events.append( + ReplicaScheduleEvent( + self.time, self._replica_id, self._cluster_type, self._dp_id + ) + ) + + return next_events # MONOLITHIC cluster: Complete batch processing # In 
co-location mode, MONOLITHIC processes everything: prefill + all decode tokens diff --git a/frontier/events/kv_cache_transfer_end_event.py b/frontier/events/kv_cache_transfer_end_event.py new file mode 100644 index 0000000..de79890 --- /dev/null +++ b/frontier/events/kv_cache_transfer_end_event.py @@ -0,0 +1,120 @@ +from typing import TYPE_CHECKING, List + +from frontier.events.base_event import BaseEvent +from frontier.types import ClusterType, EventType + +if TYPE_CHECKING: + from frontier.entities import KVCacheTransferInfo + from frontier.metrics import MetricsStore + from frontier.scheduler import BaseGlobalScheduler + + +class KVCacheTransferEndEvent(BaseEvent): + """Event emitted when a KV cache transfer completes.""" + + def __init__( + self, + time: float, + transfer_info: "KVCacheTransferInfo", + ) -> None: + super().__init__(time, EventType.KV_CACHE_TRANSFER_END) + self._transfer_info = transfer_info + self._transfer_info.transfer_end_time = time + + def handle_event( + self, + scheduler: "BaseGlobalScheduler", + metrics_store: "MetricsStore", + ) -> List[BaseEvent]: + transfer_duration_s = self.time - self._transfer_info.transfer_start_time + transfer_duration_ms = transfer_duration_s * 1e3 + metrics_store.on_kv_cache_transfer_end( + self.time, + transfer_duration_ms, + self._transfer_info.kv_cache_size_bytes, + self._transfer_info.target_cluster_type, + self._transfer_info, + ) + + batch = self._transfer_info.batch + for request in batch.requests: + request.on_kv_cache_transfer_complete(self.time, transfer_duration_s) + + target_cluster_scheduler = scheduler.get_cluster_scheduler( + self._transfer_info.target_cluster_type + ) + arrival_events = target_cluster_scheduler.on_kv_cache_arrival( + self.time, + self._transfer_info.batch, + self._transfer_info, + ) + + if self._transfer_info.source_cluster_type == ClusterType.PREFILL: + from frontier.events.replica_schedule_event import ReplicaScheduleEvent + + source_cluster_scheduler = 
scheduler.get_cluster_scheduler( + self._transfer_info.source_cluster_type + ) + source_replica_scheduler = source_cluster_scheduler.get_dp_replica_scheduler( + self._transfer_info.source_replica_id, + self._transfer_info.source_dp_id, + ) + source_replica_scheduler.complete_kv_transfer_for_requests(batch.requests) + + memory_usage_percent = source_replica_scheduler.memory_usage_percent + metrics_store.on_replica_schedule( + self.time, + self._transfer_info.source_replica_id, + memory_usage_percent, + self._transfer_info.source_cluster_type, + dp_id=self._transfer_info.source_dp_id, + ) + + pending_requests = getattr( + source_replica_scheduler, + "num_pending_requests", + None, + ) + num_running_batches = getattr( + source_replica_scheduler, + "num_running_batches", + None, + ) + if ( + pending_requests is not None + and num_running_batches is not None + and pending_requests > 0 + and num_running_batches == 0 + ): + source_cluster_logical_time = scheduler.get_cluster_logical_time( + self._transfer_info.source_cluster_type + ) + source_reschedule_time = max(self.time, source_cluster_logical_time) + arrival_events.append( + ReplicaScheduleEvent( + source_reschedule_time, + self._transfer_info.source_replica_id, + self._transfer_info.source_cluster_type, + self._transfer_info.source_dp_id, + ) + ) + + return arrival_events + + def get_target_cluster(self) -> ClusterType: + return self._transfer_info.target_cluster_type + + def to_dict(self) -> dict: + return { + "time": self.time, + "event_type": self.event_type.name, + "batch_id": self._transfer_info.batch.id, + "batch_global_id": self._transfer_info.batch.global_id, + "source_cluster_type": self._transfer_info.source_cluster_type.name, + "target_cluster_type": self._transfer_info.target_cluster_type.name, + "source_replica_id": self._transfer_info.source_replica_id, + "kv_cache_size_bytes": self._transfer_info.kv_cache_size_bytes, + "transfer_time_ms": self._transfer_info.transfer_time_ms, + "transfer_start_time": 
self._transfer_info.transfer_start_time, + "transfer_end_time": self._transfer_info.transfer_end_time, + } diff --git a/frontier/events/kv_cache_transfer_start_event.py b/frontier/events/kv_cache_transfer_start_event.py new file mode 100644 index 0000000..28812cc --- /dev/null +++ b/frontier/events/kv_cache_transfer_start_event.py @@ -0,0 +1,83 @@ +from typing import TYPE_CHECKING, List + +from frontier.events.base_event import BaseEvent +from frontier.types import ClusterType, EventType + +if TYPE_CHECKING: + from frontier.entities import Batch + from frontier.metrics import MetricsStore + from frontier.scheduler import BaseGlobalScheduler + + +class KVCacheTransferStartEvent(BaseEvent): + """Event emitted when a KV cache transfer starts.""" + + def __init__( + self, + time: float, + source_replica_id: int, + source_dp_id: int, + target_cluster_type: ClusterType, + batch: "Batch", + kv_cache_size_bytes: int, + transfer_time_ms: float, + source_cluster_type: ClusterType = ClusterType.PREFILL, + ) -> None: + super().__init__(time, EventType.KV_CACHE_TRANSFER_START) + self._source_replica_id = source_replica_id + self._source_dp_id = source_dp_id + self._source_cluster_type = source_cluster_type + self._target_cluster_type = target_cluster_type + self._batch = batch + self._kv_cache_size_bytes = kv_cache_size_bytes + self._transfer_time_ms = transfer_time_ms + + def handle_event( + self, + scheduler: "BaseGlobalScheduler", + metrics_store: "MetricsStore", + ) -> List[BaseEvent]: + from frontier.entities.kv_cache_transfer_info import KVCacheTransferInfo + from frontier.events.kv_cache_transfer_end_event import KVCacheTransferEndEvent + + transfer_info = KVCacheTransferInfo( + batch=self._batch, + source_cluster_type=self._source_cluster_type, + target_cluster_type=self._target_cluster_type, + source_replica_id=self._source_replica_id, + source_dp_id=self._source_dp_id, + kv_cache_size_bytes=self._kv_cache_size_bytes, + transfer_time_ms=self._transfer_time_ms, + 
transfer_start_time=self.time, + ) + + metrics_store.on_kv_cache_transfer_start( + self.time, + self._source_replica_id, + self._source_dp_id, + self._target_cluster_type, + self._kv_cache_size_bytes, + transfer_info, + ) + + for request in self._batch.requests: + request.on_kv_cache_transfer_start(self.time) + + transfer_end_time = self.time + self._transfer_time_ms * 1e-3 + return [KVCacheTransferEndEvent(transfer_end_time, transfer_info)] + + def get_target_cluster(self) -> ClusterType: + return self._target_cluster_type + + def to_dict(self) -> dict: + return { + "time": self.time, + "event_type": self.event_type.name, + "source_replica_id": self._source_replica_id, + "source_cluster_type": self._source_cluster_type.name, + "target_cluster_type": self._target_cluster_type.name, + "batch_id": self._batch.id, + "batch_global_id": self._batch.global_id, + "kv_cache_size_bytes": self._kv_cache_size_bytes, + "transfer_time_ms": self._transfer_time_ms, + } diff --git a/frontier/kv_cache_transfer/__init__.py b/frontier/kv_cache_transfer/__init__.py new file mode 100644 index 0000000..1a387a7 --- /dev/null +++ b/frontier/kv_cache_transfer/__init__.py @@ -0,0 +1,9 @@ +from .analytical_kv_cache_transfer_predictor import AnalyticalKVCacheTransferPredictor +from .base_kv_cache_transfer_predictor import BaseKVCacheTransferPredictor +from .kv_cache_transfer_predictor_registry import KVCacheTransferPredictorRegistry + +__all__ = [ + "BaseKVCacheTransferPredictor", + "AnalyticalKVCacheTransferPredictor", + "KVCacheTransferPredictorRegistry", +] diff --git a/frontier/kv_cache_transfer/analytical_kv_cache_transfer_predictor.py b/frontier/kv_cache_transfer/analytical_kv_cache_transfer_predictor.py new file mode 100644 index 0000000..d374ca9 --- /dev/null +++ b/frontier/kv_cache_transfer/analytical_kv_cache_transfer_predictor.py @@ -0,0 +1,93 @@ +from typing import TYPE_CHECKING + +from frontier.config import get_quantization_manager +from 
frontier.kv_cache_transfer.base_kv_cache_transfer_predictor import BaseKVCacheTransferPredictor +from frontier.logger import init_logger +from frontier.types import ClusterType + +if TYPE_CHECKING: + from frontier.config import ReplicaConfig + from frontier.config.config import AnalyticalKVCacheTransferConfig + from frontier.entities import Batch, Request + + +class AnalyticalKVCacheTransferPredictor(BaseKVCacheTransferPredictor): + """Analytical KV cache transfer predictor using bandwidth and latency.""" + + def __init__(self, config: "AnalyticalKVCacheTransferConfig") -> None: + super().__init__(config) + self._config: "AnalyticalKVCacheTransferConfig" = config + self._logger = init_logger(__name__) + + def get_transfer_time( + self, + source_cluster_type: ClusterType, + target_cluster_type: ClusterType, + batch: "Batch", + kv_cache_size_bytes: int, + ) -> float: + effective_size_bytes = kv_cache_size_bytes + if self._config.enable_compression: + effective_size_bytes = kv_cache_size_bytes / self._config.compression_ratio + + bandwidth_bytes_per_ms = (self._config.network_bandwidth_gbps * 1e9) / (8 * 1000) + return self._config.network_latency_ms + (effective_size_bytes / bandwidth_bytes_per_ms) + + def get_kv_cache_size(self, batch: "Batch", replica_config: "ReplicaConfig") -> int: + total_tokens = sum(req.num_prefill_tokens for req in batch.requests) + return self._calculate_kv_cache_size_for_tokens(total_tokens, replica_config) + + def get_kv_cache_size_for_request( + self, request: "Request", replica_config: "ReplicaConfig" + ) -> int: + return self._calculate_kv_cache_size_for_tokens( + request.num_prefill_tokens, replica_config + ) + + def _calculate_kv_cache_size_for_tokens( + self, num_tokens: int, replica_config: "ReplicaConfig" + ) -> int: + model_config = replica_config.model_config + num_layers = ( + self._config.override_num_layers + if self._config.override_num_layers is not None + else model_config.num_layers + ) + num_heads = ( + 
self._config.override_num_heads + if self._config.override_num_heads is not None + else model_config.num_kv_heads + ) + head_dim = ( + self._config.override_head_dim + if self._config.override_head_dim is not None + else model_config.get_head_dim() + ) + dtype_size = self._get_kv_cache_dtype_size_bytes() + return int(num_tokens * num_layers * num_heads * head_dim * 2 * dtype_size) + + def _get_kv_cache_dtype_size_bytes(self) -> float: + quant_manager = get_quantization_manager() + has_explicit_quant = quant_manager.has_explicit_precision("kv_cache_transfer") + quant_precision = quant_manager.get_precision("kv_cache_transfer") + quant_dtype_size = quant_precision.bytes_per_element + + if has_explicit_quant: + if ( + self._config.kv_cache_dtype_size_bytes is not None + and self._config.kv_cache_dtype_size_bytes != quant_dtype_size + ): + raise ValueError( + "kv_cache_dtype_size_bytes is deprecated and conflicts with quantization " + f"config for kv_cache_transfer (config={self._config.kv_cache_dtype_size_bytes}, " + f"quantization={quant_dtype_size})." 
+ ) + return quant_dtype_size + + if self._config.kv_cache_dtype_size_bytes is not None: + return self._config.kv_cache_dtype_size_bytes + + return quant_dtype_size + + def supports_latency_hiding(self) -> bool: + return self._config.enable_latency_hiding diff --git a/frontier/kv_cache_transfer/base_kv_cache_transfer_predictor.py b/frontier/kv_cache_transfer/base_kv_cache_transfer_predictor.py new file mode 100644 index 0000000..1f40efd --- /dev/null +++ b/frontier/kv_cache_transfer/base_kv_cache_transfer_predictor.py @@ -0,0 +1,66 @@ +from abc import ABC, abstractmethod +from typing import TYPE_CHECKING + +from frontier.types import ClusterType + +if TYPE_CHECKING: + from frontier.config import ReplicaConfig + from frontier.config.kv_cache_transfer_config import BaseKVCacheTransferConfig + from frontier.entities import Batch, Request + + +class BaseKVCacheTransferPredictor(ABC): + """Abstract base class for KV cache transfer predictors.""" + + def __init__(self, config: "BaseKVCacheTransferConfig") -> None: + self._config = config + + @abstractmethod + def get_transfer_time( + self, + source_cluster_type: ClusterType, + target_cluster_type: ClusterType, + batch: "Batch", + kv_cache_size_bytes: int, + ) -> float: + pass + + @abstractmethod + def get_kv_cache_size(self, batch: "Batch", replica_config: "ReplicaConfig") -> int: + pass + + @abstractmethod + def get_kv_cache_size_for_request( + self, request: "Request", replica_config: "ReplicaConfig" + ) -> int: + pass + + @abstractmethod + def supports_latency_hiding(self) -> bool: + pass + + def get_transfer_info( + self, + source_cluster_type: ClusterType, + target_cluster_type: ClusterType, + batch: "Batch", + replica_config: "ReplicaConfig", + ) -> tuple[int, float]: + kv_cache_size = self.get_kv_cache_size(batch, replica_config) + transfer_time = self.get_transfer_time( + source_cluster_type, target_cluster_type, batch, kv_cache_size + ) + return kv_cache_size, transfer_time + + def get_transfer_info_for_request( 
+ self, + source_cluster_type: ClusterType, + target_cluster_type: ClusterType, + request: "Request", + replica_config: "ReplicaConfig", + ) -> tuple[int, float]: + kv_cache_size = self.get_kv_cache_size_for_request(request, replica_config) + transfer_time = self.get_transfer_time( + source_cluster_type, target_cluster_type, None, kv_cache_size + ) + return kv_cache_size, transfer_time diff --git a/frontier/kv_cache_transfer/kv_cache_transfer_predictor_registry.py b/frontier/kv_cache_transfer/kv_cache_transfer_predictor_registry.py new file mode 100644 index 0000000..a334307 --- /dev/null +++ b/frontier/kv_cache_transfer/kv_cache_transfer_predictor_registry.py @@ -0,0 +1,38 @@ +from typing import TYPE_CHECKING + +from frontier.kv_cache_transfer.analytical_kv_cache_transfer_predictor import ( + AnalyticalKVCacheTransferPredictor, +) +from frontier.kv_cache_transfer.base_kv_cache_transfer_predictor import ( + BaseKVCacheTransferPredictor, +) +from frontier.types import KVCacheTransferType +from frontier.utils.base_registry import BaseRegistry + +if TYPE_CHECKING: + from frontier.config.kv_cache_transfer_config import BaseKVCacheTransferConfig + + +class KVCacheTransferPredictorRegistry(BaseRegistry): + """Registry for KV cache transfer predictors.""" + + @classmethod + def get_key_from_str(cls, key_str: str) -> KVCacheTransferType: + return KVCacheTransferType.from_str(key_str) + + @classmethod + def get( + cls, + predictor_type: KVCacheTransferType, + config: "BaseKVCacheTransferConfig", + ) -> BaseKVCacheTransferPredictor: + if predictor_type not in cls._registry: + raise ValueError( + f"KV cache transfer predictor type {predictor_type} is not registered" + ) + return cls._registry[predictor_type](config) + + +KVCacheTransferPredictorRegistry.register( + KVCacheTransferType.ANALYTICAL, AnalyticalKVCacheTransferPredictor +) diff --git a/frontier/m2n_transfer/__init__.py b/frontier/m2n_transfer/__init__.py new file mode 100644 index 0000000..66ef443 --- /dev/null +++ 
b/frontier/m2n_transfer/__init__.py @@ -0,0 +1,9 @@ +from .analytical_m2n_transfer_predictor import AnalyticalM2NTransferPredictor +from .base_m2n_transfer_predictor import BaseM2NTransferPredictor +from .m2n_transfer_predictor_registry import M2NTransferPredictorRegistry + +__all__ = [ + "BaseM2NTransferPredictor", + "AnalyticalM2NTransferPredictor", + "M2NTransferPredictorRegistry", +] diff --git a/frontier/m2n_transfer/analytical_m2n_transfer_predictor.py b/frontier/m2n_transfer/analytical_m2n_transfer_predictor.py new file mode 100644 index 0000000..37b7616 --- /dev/null +++ b/frontier/m2n_transfer/analytical_m2n_transfer_predictor.py @@ -0,0 +1,105 @@ +from typing import TYPE_CHECKING + +from frontier.config import get_quantization_manager +from frontier.logger import init_logger +from frontier.m2n_transfer.base_m2n_transfer_predictor import BaseM2NTransferPredictor +from frontier.types import ClusterType + +if TYPE_CHECKING: + from frontier.config import ReplicaConfig + from frontier.config.m2n_transfer_config import AnalyticalM2NTransferConfig + from frontier.entities import Batch, Request + + +class AnalyticalM2NTransferPredictor(BaseM2NTransferPredictor): + """Analytical M2N transfer predictor using bandwidth and latency.""" + + def __init__(self, config: "AnalyticalM2NTransferConfig") -> None: + super().__init__(config) + self._config: "AnalyticalM2NTransferConfig" = config + self._logger = init_logger(__name__) + + def get_transfer_time( + self, + source_cluster_type: ClusterType, + target_cluster_type: ClusterType, + batch: "Batch", + activation_size_bytes: int, + ) -> float: + valid_transfers = [ + (ClusterType.DECODE_ATTN, ClusterType.DECODE_FFN), + (ClusterType.DECODE_FFN, ClusterType.DECODE_ATTN), + ] + if (source_cluster_type, target_cluster_type) not in valid_transfers: + raise ValueError( + f"Invalid M2N transfer: {source_cluster_type.name} -> {target_cluster_type.name}. " + "M2N transfers only support DECODE_ATTN <-> DECODE_FFN communication." 
+ ) + + effective_size_bytes = activation_size_bytes + if self._config.enable_compression: + effective_size_bytes = int(activation_size_bytes / self._config.compression_ratio) + + bandwidth_bytes_per_ms = self._config.memory_bandwidth_gbps * 125_000 + transfer_time_ms = self._config.network_latency_ms + ( + effective_size_bytes / bandwidth_bytes_per_ms + ) + if self._config.enable_p2p_optimization: + transfer_time_ms = transfer_time_ms / 1.2 + return transfer_time_ms + + def get_activation_size( + self, + batch: "Batch", + replica_config: "ReplicaConfig", + source_cluster_type: ClusterType, + ) -> int: + hidden_size = self._config.override_hidden_size or replica_config.model_config.embedding_dim + total_tokens = batch.get_effective_total_tokens_for_transfer(source_cluster_type) + dtype_size = self._get_activation_dtype_size_bytes(source_cluster_type) + + if source_cluster_type in {ClusterType.DECODE_ATTN, ClusterType.DECODE_FFN}: + return int(total_tokens * hidden_size * dtype_size) + raise ValueError( + f"Invalid source cluster type for M2N transfer: {source_cluster_type.name}" + ) + + def get_activation_size_for_request( + self, + request: "Request", + replica_config: "ReplicaConfig", + source_cluster_type: ClusterType, + ) -> int: + hidden_size = self._config.override_hidden_size or replica_config.model_config.embedding_dim + dtype_size = self._get_activation_dtype_size_bytes(source_cluster_type) + + if source_cluster_type in {ClusterType.DECODE_ATTN, ClusterType.DECODE_FFN}: + return int(1 * hidden_size * dtype_size) + raise ValueError( + f"Invalid source cluster type for M2N transfer: {source_cluster_type.name}" + ) + + def _get_activation_dtype_size_bytes(self, source_cluster_type: ClusterType) -> float: + quant_manager = get_quantization_manager() + has_explicit_quant = quant_manager.has_explicit_precision( + "m2n_transfer", source_cluster_type + ) + quant_precision = quant_manager.get_precision("m2n_transfer", source_cluster_type) + quant_dtype_size = 
quant_precision.bytes_per_element + + if has_explicit_quant: + if ( + self._config.activation_dtype_size_bytes is not None + and self._config.activation_dtype_size_bytes != quant_dtype_size + ): + raise ValueError( + "activation_dtype_size_bytes is deprecated and conflicts with quantization " + f"config for m2n_transfer (config={self._config.activation_dtype_size_bytes}, " + f"quantization={quant_dtype_size})." + ) + return quant_dtype_size + + if self._config.activation_dtype_size_bytes is not None: + return self._config.activation_dtype_size_bytes + + return quant_dtype_size diff --git a/frontier/m2n_transfer/base_m2n_transfer_predictor.py b/frontier/m2n_transfer/base_m2n_transfer_predictor.py new file mode 100644 index 0000000..feca6b6 --- /dev/null +++ b/frontier/m2n_transfer/base_m2n_transfer_predictor.py @@ -0,0 +1,85 @@ +from abc import ABC, abstractmethod +from typing import TYPE_CHECKING + +from frontier.types import ClusterType + +if TYPE_CHECKING: + from frontier.config import ReplicaConfig + from frontier.config.m2n_transfer_config import BaseM2NTransferConfig + from frontier.entities import Batch, Request + + +class BaseM2NTransferPredictor(ABC): + """Abstract base class for Memory-to-Memory transfer predictors.""" + + def __init__(self, config: "BaseM2NTransferConfig") -> None: + self._config = config + + @abstractmethod + def get_transfer_time( + self, + source_cluster_type: ClusterType, + target_cluster_type: ClusterType, + batch: "Batch", + activation_size_bytes: int, + ) -> float: + pass + + @abstractmethod + def get_activation_size( + self, + batch: "Batch", + replica_config: "ReplicaConfig", + source_cluster_type: ClusterType, + ) -> int: + pass + + @abstractmethod + def get_activation_size_for_request( + self, + request: "Request", + replica_config: "ReplicaConfig", + source_cluster_type: ClusterType, + ) -> int: + pass + + def get_transfer_info( + self, + source_cluster_type: ClusterType, + target_cluster_type: ClusterType, + batch: "Batch", + 
replica_config: "ReplicaConfig", + ) -> tuple[int, float]: + activation_size = self.get_activation_size( + batch, replica_config, source_cluster_type + ) + transfer_time = self.get_transfer_time( + source_cluster_type, target_cluster_type, batch, activation_size + ) + return activation_size, transfer_time + + def get_transfer_info_for_request( + self, + source_cluster_type: ClusterType, + target_cluster_type: ClusterType, + request: "Request", + replica_config: "ReplicaConfig", + ) -> tuple[int, float]: + activation_size = self.get_activation_size_for_request( + request, replica_config, source_cluster_type + ) + from frontier.entities import Batch + + single_request_batch = Batch( + replica_id=0, + requests=[request], + num_tokens=[1], + is_moe=replica_config.model_config.is_moe, + ) + transfer_time = self.get_transfer_time( + source_cluster_type, + target_cluster_type, + single_request_batch, + activation_size, + ) + return activation_size, transfer_time diff --git a/frontier/m2n_transfer/m2n_transfer_predictor_registry.py b/frontier/m2n_transfer/m2n_transfer_predictor_registry.py new file mode 100644 index 0000000..39a72ed --- /dev/null +++ b/frontier/m2n_transfer/m2n_transfer_predictor_registry.py @@ -0,0 +1,34 @@ +from typing import TYPE_CHECKING + +from frontier.m2n_transfer.analytical_m2n_transfer_predictor import ( + AnalyticalM2NTransferPredictor, +) +from frontier.m2n_transfer.base_m2n_transfer_predictor import BaseM2NTransferPredictor +from frontier.types import M2NTransferType +from frontier.utils.base_registry import BaseRegistry + +if TYPE_CHECKING: + from frontier.config.m2n_transfer_config import BaseM2NTransferConfig + + +class M2NTransferPredictorRegistry(BaseRegistry): + """Registry for M2N transfer predictors.""" + + @classmethod + def get_key_from_str(cls, key_str: str) -> M2NTransferType: + return M2NTransferType.from_str(key_str) + + @classmethod + def get( + cls, + predictor_type: M2NTransferType, + config: "BaseM2NTransferConfig", + ) -> 
BaseM2NTransferPredictor: + if predictor_type not in cls._registry: + raise ValueError(f"M2N transfer predictor type {predictor_type} is not registered") + return cls._registry[predictor_type](config) + + +M2NTransferPredictorRegistry.register( + M2NTransferType.ANALYTICAL, AnalyticalM2NTransferPredictor +) diff --git a/frontier/main.py b/frontier/main.py index 7a595bc..176e8dd 100644 --- a/frontier/main.py +++ b/frontier/main.py @@ -4,6 +4,7 @@ from frontier.config import ( AICONFIGURATOR_BACKEND_RELEASE_ERROR, DISAGGREGATED_ARCHITECTURE_RELEASE_ERROR, + PD_DISAGGREGATION_PARALLEL_CLUSTER_RELEASE_ERROR, SimulationConfig, ) from frontier.errors import FrontierMemoryOOMError @@ -11,22 +12,24 @@ from frontier.utils.random import set_seeds -_DISAGGREGATED_ARCHITECTURES = {"pd-disaggregation", "pd-af-disaggregation"} -_DISAGGREGATED_CLUSTER_OPTION_PREFIXES = ( +_UNSUPPORTED_DISAGGREGATED_ARCHITECTURES = {"pd-af-disaggregation"} +_PD_CLUSTER_OPTION_PREFIXES = ( "--cluster_config_prefill_", "--cluster_config_decode_", +) +_UNSUPPORTED_DISAGGREGATED_CLUSTER_OPTION_PREFIXES = ( "--cluster_config_decode_attn_", "--cluster_config_decode_ffn_", ) -_DISAGGREGATED_CLUSTER_OPTIONS = frozenset( - { - "--cluster_config_af_pipeline_num_micro_batch", - } -) _DISAGGREGATED_TRANSFER_OPTION_MARKERS = ( "kv_cache_transfer_config", "m2n_transfer_config", ) +_UNSUPPORTED_DISAGGREGATED_CLUSTER_OPTIONS = frozenset( + { + "--cluster_config_af_pipeline_num_micro_batch", + } +) _AICONFIGURATOR_BACKEND_CONFIG_OPTION_PREFIXES = ( "--aiconfigurator_cc_backend_config_", ) @@ -62,9 +65,29 @@ def _normalize_cli_option(option: str) -> str: def _has_disaggregated_cluster_option(argv: list[str]) -> bool: for arg in argv: option = _normalize_cli_option(arg.split("=", maxsplit=1)[0]) - if option in _DISAGGREGATED_CLUSTER_OPTIONS: + if option in _UNSUPPORTED_DISAGGREGATED_CLUSTER_OPTIONS: + return True + if option.startswith(_PD_CLUSTER_OPTION_PREFIXES): + return True + if 
option.startswith(_UNSUPPORTED_DISAGGREGATED_CLUSTER_OPTION_PREFIXES): + return True + return False + + +def _has_unsupported_disaggregated_cluster_option(argv: list[str]) -> bool: + for arg in argv: + option = _normalize_cli_option(arg.split("=", maxsplit=1)[0]) + if option in _UNSUPPORTED_DISAGGREGATED_CLUSTER_OPTIONS: return True - if option.startswith(_DISAGGREGATED_CLUSTER_OPTION_PREFIXES): + if option.startswith(_UNSUPPORTED_DISAGGREGATED_CLUSTER_OPTION_PREFIXES): + return True + return False + + +def _has_pd_cluster_option(argv: list[str]) -> bool: + for arg in argv: + option = _normalize_cli_option(arg.split("=", maxsplit=1)[0]) + if option.startswith(_PD_CLUSTER_OPTION_PREFIXES): return True return False @@ -112,13 +135,22 @@ def _has_truthy_cli_bool(argv: list[str], option: str) -> bool: def _exit_if_disaggregated_architecture_requested(argv: list[str]) -> None: sys_arch = _get_cli_option_value(argv, "--sys_arch") - has_disaggregated_cluster_args = _has_disaggregated_cluster_option(argv) + has_unsupported_disaggregated_cluster_args = ( + _has_unsupported_disaggregated_cluster_option(argv) + ) + has_pd_cluster_args_without_pd_arch = ( + _has_pd_cluster_option(argv) and sys_arch != "pd-disaggregation" + ) has_disaggregated_transfer_args = _has_disaggregated_transfer_option(argv) + has_disaggregated_transfer_args_without_pd_arch = ( + has_disaggregated_transfer_args and sys_arch != "pd-disaggregation" + ) has_pd_af_cuda_graph_arg = _has_truthy_cli_bool(argv, "--use_cuda_graph") if ( - sys_arch in _DISAGGREGATED_ARCHITECTURES - or has_disaggregated_cluster_args - or has_disaggregated_transfer_args + sys_arch in _UNSUPPORTED_DISAGGREGATED_ARCHITECTURES + or has_unsupported_disaggregated_cluster_args + or has_pd_cluster_args_without_pd_arch + or has_disaggregated_transfer_args_without_pd_arch or has_pd_af_cuda_graph_arg ): print(DISAGGREGATED_ARCHITECTURE_RELEASE_ERROR, file=sys.stderr) @@ -156,6 +188,7 @@ def main() -> None: if str(exc) in { 
AICONFIGURATOR_BACKEND_RELEASE_ERROR, DISAGGREGATED_ARCHITECTURE_RELEASE_ERROR, + PD_DISAGGREGATION_PARALLEL_CLUSTER_RELEASE_ERROR, }: print(str(exc), file=sys.stderr) raise SystemExit(1) from exc diff --git a/frontier/metrics/metrics_store.py b/frontier/metrics/metrics_store.py index bbf5c3e..422e2c2 100644 --- a/frontier/metrics/metrics_store.py +++ b/frontier/metrics/metrics_store.py @@ -31,6 +31,7 @@ from frontier.metrics.trace_store import TraceStore from frontier.metrics.op_trace_utils import ( OpTraceContext, + build_kv_cache_transfer_meta, build_parallel_context, compute_op_trace_meta, ) @@ -4181,7 +4182,60 @@ def on_kv_cache_transfer_start( kv_cache_size_bytes: int, transfer_info: Any, ) -> None: - raise ValueError(DISAGGREGATED_ARCHITECTURE_RELEASE_ERROR) + if self._trace_store and self._config.enable_op_level_tracing: + from frontier.metrics.trace_store import TraceEvent + + batch_id = transfer_info.batch.id if transfer_info.batch else None + request_ids = ( + [str(req.id) for req in transfer_info.batch.requests] + if transfer_info.batch and transfer_info.batch.requests + else [] + ) + + cluster_config = self._cluster_configs.get(transfer_info.source_cluster_type) + if cluster_config is None: + raise ValueError( + f"Cluster config not found for {transfer_info.source_cluster_type}" + ) + transfer_meta = build_kv_cache_transfer_meta( + transfer_info.batch, + cluster_config.replica_config, + transfer_info.source_cluster_type, + kv_cache_size_bytes, + ) + total_tokens = transfer_meta["total_tokens"] + trace_context = OpTraceContext( + cluster_type=transfer_info.source_cluster_type, + model_config=cluster_config.replica_config.model_config, + replica_config=cluster_config.replica_config, + total_tokens=total_tokens, + effective_tokens_compute=total_tokens, + effective_tokens_transfer=total_tokens, + effective_tokens_rounded=(total_tokens + 7) // 8 * 8, + tokens_are_post_routing=False, + ) + transfer_meta["parallel_context"] = 
build_parallel_context(trace_context) + transfer_meta["model_name"] = cluster_config.replica_config.model_name + transfer_meta["request_ids"] = request_ids + transfer_meta["source_dp_id"] = source_dp_id + + event = TraceEvent( + type="TRANSFER", + name="kv_cache_transfer", + ts_start=time, + duration_ms=transfer_info.transfer_time_ms, + cluster=transfer_info.source_cluster_type.name, + replica_id=source_replica_id, + batch_id=batch_id, + target_cluster=target_cluster_type.name, + meta=transfer_meta, + ) + self._trace_store.log_event(event) + + if not self._config.write_metrics: + return + + self._kv_cache_transfer_metrics["transfer_count"] += 1 def on_kv_cache_transfer_end( self, @@ -4191,7 +4245,23 @@ def on_kv_cache_transfer_end( target_cluster_type: ClusterType, transfer_info: Any, ) -> None: - raise ValueError(DISAGGREGATED_ARCHITECTURE_RELEASE_ERROR) + if not self._config.write_metrics: + return + + self._kv_cache_transfer_metrics["total_transfer_time"] += duration + self._kv_cache_transfer_metrics["total_data_transferred"] += size_bytes + + request_info = "" + if transfer_info.batch and transfer_info.batch.requests: + request_ids = [str(req.id) for req in transfer_info.batch.requests] + request_info = f"_req_{'_'.join(request_ids)}" + + transfer_id = ( + f"transfer_{self._kv_cache_transfer_metrics['transfer_count']}" + f"{request_info}" + ) + self._kv_cache_transfer_metrics["transfer_times"].put(transfer_id, duration) + self._kv_cache_transfer_metrics["transfer_sizes"].put(transfer_id, size_bytes) def on_m2n_transfer_start( self, diff --git a/frontier/metrics/op_trace_utils.py b/frontier/metrics/op_trace_utils.py index b5a1c7f..de198b5 100644 --- a/frontier/metrics/op_trace_utils.py +++ b/frontier/metrics/op_trace_utils.py @@ -400,7 +400,8 @@ def build_kv_cache_transfer_meta( ) -> Dict[str, Any]: model_config = replica_config.model_config num_layers = model_config.num_layers - num_heads = model_config.num_q_heads + num_q_heads = model_config.num_q_heads + 
num_kv_heads = model_config.num_kv_heads # Use model_config.get_head_dim() to prioritize explicit head_dim from JSON config head_dim = model_config.get_head_dim() @@ -408,7 +409,7 @@ def build_kv_cache_transfer_meta( precision = _precision_for_op("kv_cache_transfer", cluster_type) dtype_bytes = precision.bytes_per_element tensor_shape = { - "kv": [total_tokens, num_layers, num_heads, head_dim, 2], + "kv": [total_tokens, num_layers, num_kv_heads, head_dim, 2], } element_count = _elements_from_shape(tensor_shape["kv"]) tensor_size_bytes = { @@ -424,7 +425,9 @@ def build_kv_cache_transfer_meta( "transfer_size_bytes": transfer_size_bytes, "total_tokens": total_tokens, "num_layers": num_layers, - "num_heads": num_heads, + "num_heads": num_kv_heads, + "num_q_heads": num_q_heads, + "num_kv_heads": num_kv_heads, "head_dim": head_dim, } diff --git a/frontier/request_generator/base_request_generator.py b/frontier/request_generator/base_request_generator.py index 73ed91b..8893a69 100644 --- a/frontier/request_generator/base_request_generator.py +++ b/frontier/request_generator/base_request_generator.py @@ -151,4 +151,7 @@ def generate_requests(self) -> List[Request]: def generate(self) -> List[Request]: requests = self.generate_requests() + self.config.num_decode_bound_requests = sum( + 1 for request in requests if request.num_decode_tokens > 0 + ) return requests diff --git a/frontier/scheduler/cluster_scheduler/base_cluster_scheduler.py b/frontier/scheduler/cluster_scheduler/base_cluster_scheduler.py index ca9accc..247dc1f 100644 --- a/frontier/scheduler/cluster_scheduler/base_cluster_scheduler.py +++ b/frontier/scheduler/cluster_scheduler/base_cluster_scheduler.py @@ -1,21 +1,32 @@ from abc import ABC, abstractmethod from collections import defaultdict, deque +import csv import math +from pathlib import Path from typing import Any, Dict, List, Tuple, Optional, TYPE_CHECKING -from frontier.config import ClusterConfig, MetricsConfig, BaseRequestGeneratorConfig +from 
frontier.config import ClusterConfig, BaseRequestGeneratorConfig from frontier.entities import Batch, EPBatchGroup, ExecutionTime, Replica, Request, Cluster from frontier.config.config import DISAGGREGATED_ARCHITECTURE_RELEASE_ERROR # Phase 2.5: Removed deprecated MoECollectiveScheduleEvent import from frontier.execution_time_predictor import ( BaseExecutionTimePredictor, - ExecutionTimePredictorRegistry, ) from frontier.scheduler.replica_scheduler.replica_scheduler_registry import ( ReplicaSchedulerRegistry, ) -from frontier.types import ClusterType, ClusterSchedulerType, ReplicaSchedulerType +from frontier.types import ( + ClusterType, + ClusterSchedulerType, + ReplicaSchedulerType, + RequestGeneratorType, +) + +if TYPE_CHECKING: + from frontier.kv_cache_transfer import BaseKVCacheTransferPredictor + from frontier.m2n_transfer import BaseM2NTransferPredictor + class BaseClusterScheduler(ABC): def _validate_prefix_cache_cluster_config(self, replica_scheduler_config) -> None: @@ -51,6 +62,51 @@ def _validate_prefix_cache_cluster_config(self, replica_scheduler_config) -> Non f"Got {cluster_scheduler_type}." ) + request_generator_config = getattr(self, "_request_generator_config", None) + if request_generator_config is None: + return + + request_generator_type = request_generator_config.get_type() + if request_generator_type != RequestGeneratorType.TRACE_REPLAY: + raise ValueError( + "Prefix caching requires a trace request source with session_id " + "and block_hash_ids metadata before scheduling. " + f"Got {request_generator_type}." + ) + + trace_file = Path(request_generator_config.trace_file) + if not trace_file.exists(): + raise ValueError( + "Prefix caching trace request source requires an existing trace file " + f"with session_id and block_hash_ids columns. Got {trace_file}." 
+ ) + + with trace_file.open("r", encoding="utf-8", newline="") as file: + reader = csv.DictReader(file) + header = reader.fieldnames + required_columns = {"session_id", "block_hash_ids"} + missing_columns = sorted(required_columns - set(header or [])) + if missing_columns: + raise ValueError( + "Prefix caching trace request source requires session_id and " + "block_hash_ids columns before scheduling. " + f"Missing columns: {missing_columns}." + ) + + for row_number, row in enumerate(reader, start=2): + missing_values = sorted( + column + for column in required_columns + if row.get(column) is None or not row[column].strip() + ) + if missing_values: + raise ValueError( + "Prefix caching trace request source requires non-empty " + "session_id and block_hash_ids values before scheduling. " + f"Trace file: {trace_file}; row {row_number}; " + f"missing values: {missing_values}." + ) + def _get_cluster_specific_replica_scheduler_config(self, config: ClusterConfig, cluster_type: ClusterType): """ Get cluster-specific replica scheduler configuration. 
@@ -160,15 +216,17 @@ def __init__( cluster: Cluster, request_generator_config: BaseRequestGeneratorConfig, predictor: BaseExecutionTimePredictor = None, + kv_cache_transfer_predictor: Optional["BaseKVCacheTransferPredictor"] = None, + m2n_transfer_predictor: Optional["BaseM2NTransferPredictor"] = None, available_clusters: Optional[set] = None, ): self._config = config self._cluster = cluster self._cluster_type = cluster.cluster_type - if self._cluster_type != ClusterType.MONOLITHIC: - raise ValueError(DISAGGREGATED_ARCHITECTURE_RELEASE_ERROR) self._num_replicas = len(self._cluster.replicas) self._predictor = predictor + self._kv_cache_transfer_predictor = kv_cache_transfer_predictor + self._m2n_transfer_predictor = m2n_transfer_predictor self._replica_dp_size = self._config.replica_config.data_parallel_size self._available_clusters = available_clusters or set() self._request_generator_config = request_generator_config @@ -2891,8 +2949,18 @@ def on_kv_cache_arrival( batch: Batch, transfer_info, ) -> List: - """Disaggregated KV cache arrivals are not included in this release.""" - raise ValueError(DISAGGREGATED_ARCHITECTURE_RELEASE_ERROR) + """Handle KV cache arrival at a decode-side cluster.""" + from frontier.logger import get_cluster_logger + + logger = get_cluster_logger(__name__, self._cluster_type.name) + + if self._cluster_type == ClusterType.DECODE_ATTN: + return self._handle_decode_attn_arrival(time, batch, transfer_info, logger) + if self._cluster_type == ClusterType.DECODE: + return self._handle_decode_arrival(time, batch, transfer_info, logger) + raise ValueError( + f"Unexpected cluster type for KV cache arrival: {self._cluster_type}" + ) def _handle_decode_attn_arrival( self, @@ -2901,8 +2969,62 @@ def _handle_decode_attn_arrival( transfer_info, logger, ) -> List: - """Disaggregated decode-attn arrivals are not included in this release.""" - raise ValueError(DISAGGREGATED_ARCHITECTURE_RELEASE_ERROR) + """Handle KV cache arrival at a decode-attention 
cluster.""" + request_ids = [req.id for req in batch.requests] + logger.info( + "Decode-attn cluster received KV cache at %.3fs: requests %s, " + "batch_id=%s, transfer_size=%s bytes, source_cluster=%s", + time, + request_ids, + batch.id, + transfer_info.kv_cache_size_bytes, + transfer_info.source_cluster_type.name, + ) + + queue_was_empty = len(self._request_queue) == 0 + for request in batch.requests: + request.on_arrival(time, self._cluster_type) + self.add_request(request) + logger.info( + "Request %s added to decode-attn cluster queue, prefill_tokens=%s, " + "decode_tokens=%s, num_processed_tokens=%s, total_tokens=%s, " + "is_prefill_complete=%s, current_decode_token_index=%s, " + "completed_layer_count=%s.", + request.id, + request.num_prefill_tokens, + request.num_decode_tokens, + request.num_processed_tokens, + request.total_tokens, + request.is_prefill_complete, + request.current_decode_token_index, + request.completed_layer_count, + ) + + if self._is_periodic_scheduling_enabled: + logger.info( + "Requests cached for periodic scheduling (interval=%sms), current queue size: %s", + self._periodic_scheduling_interval_ms, + len(self._request_queue), + ) + return [] + + from frontier.config.global_vars import get_simulation_mode + from frontier.events.cluster_schedule_event import ClusterScheduleEvent + + simulation_mode = get_simulation_mode() + if not queue_was_empty: + logger.info( + "Decode-attn queue already has pending requests; skip redundant schedule trigger in %s mode", + simulation_mode, + ) + return [] + + logger.info( + "KV-cache arrival triggers immediate decode-attn scheduling in %s mode; queue size=%d", + simulation_mode, + len(self._request_queue), + ) + return [ClusterScheduleEvent(time, self._cluster_type)] def _handle_decode_arrival( self, @@ -2911,8 +3033,86 @@ def _handle_decode_arrival( transfer_info, logger, ) -> List: - """Disaggregated decode arrivals are not included in this release.""" - raise 
ValueError(DISAGGREGATED_ARCHITECTURE_RELEASE_ERROR) + """Handle KV cache arrival at a unified decode cluster.""" + request_ids = [req.id for req in batch.requests] + logger.info( + "Decode cluster received KV cache at %.3fs: requests %s, " + "batch_id=%s, transfer_size=%s bytes, source_cluster=%s", + time, + request_ids, + batch.id, + transfer_info.kv_cache_size_bytes, + transfer_info.source_cluster_type.name, + ) + + for request in batch.requests: + request.on_arrival(time, self._cluster_type) + self.add_request(request) + logger.info( + "Request %s added to decode cluster queue, prefill_tokens=%s, " + "decode_tokens=%s, num_processed_tokens=%s, total_tokens=%s, " + "is_prefill_complete=%s, current_decode_token_index=%s, " + "completed_layer_count=%s.", + request.id, + request.num_prefill_tokens, + request.num_decode_tokens, + request.num_processed_tokens, + request.total_tokens, + request.is_prefill_complete, + request.current_decode_token_index, + request.completed_layer_count, + ) + + if self._is_periodic_scheduling_enabled: + logger.info( + "Requests cached for periodic scheduling (interval=%sms), current queue size: %s", + self._periodic_scheduling_interval_ms, + len(self._request_queue), + ) + return [] + + from frontier.config.global_vars import get_simulation_mode + from frontier.events.cluster_schedule_event import ClusterScheduleEvent + + simulation_mode = get_simulation_mode() + if simulation_mode == "offline": + expected_num_requests = getattr( + self._request_generator_config, "num_decode_bound_requests", None + ) + if expected_num_requests is None: + raise ValueError( + "Offline DECODE scheduling requires " + "request_generator_config.num_decode_bound_requests to be set " + "by request generation." 
+ ) + + current_num_requests = len(self._request_queue) + if current_num_requests > expected_num_requests: + raise ValueError( + "Offline DECODE received more decode-bound requests than " + f"expected: current={current_num_requests}, " + f"expected={expected_num_requests}" + ) + if current_num_requests < expected_num_requests: + logger.info( + "Offline mode: buffering decode-bound requests (%s/%s), " + "deferring scheduling until all decode-bound requests arrive", + current_num_requests, + expected_num_requests, + ) + return [] + logger.info( + "Offline mode: all %s decode-bound requests arrived, " + "triggering batch scheduling", + expected_num_requests, + ) + return [ClusterScheduleEvent(time, self._cluster_type)] + + logger.info( + "Online mode: triggering immediate cluster scheduling for %s requests", + len(batch.requests), + ) + return [ClusterScheduleEvent(time, self._cluster_type)] def on_m2n_arrival( diff --git a/frontier/scheduler/cluster_scheduler/round_robin_cluster_scheduler.py b/frontier/scheduler/cluster_scheduler/round_robin_cluster_scheduler.py index d98a62a..7398c05 100644 --- a/frontier/scheduler/cluster_scheduler/round_robin_cluster_scheduler.py +++ b/frontier/scheduler/cluster_scheduler/round_robin_cluster_scheduler.py @@ -3,7 +3,6 @@ from frontier.scheduler.cluster_scheduler.base_cluster_scheduler import ( BaseClusterScheduler, ) -from frontier.config.config import DISAGGREGATED_ARCHITECTURE_RELEASE_ERROR from frontier.types import ClusterType @@ -38,13 +37,30 @@ def __init__(self, *args, **kwargs): def schedule(self) -> List[Tuple[int, int, Request]]: """ - Schedule requests with the release-supported monolithic round-robin strategy. + Schedule requests using cluster-type-aware round-robin strategy. 
+ + - PREFILL cluster: Batch processing mode (offline-style) + - DECODE cluster (PD mode): Priority-based scheduling with batch backfilling + - DECODE_ATTN cluster (PD+AF mode): Optional initial request allocation with threshold, then A↔F priority dynamic routing + - DECODE_FFN cluster (PD+AF mode): Batch processing mode with M2N immediate processing + - Other clusters: Default batch processing mode """ self.sort_requests() - if self._cluster_type != ClusterType.MONOLITHIC: - raise ValueError(DISAGGREGATED_ARCHITECTURE_RELEASE_ERROR) - return self._schedule_batch_mode() + if self._cluster_type == ClusterType.DECODE: + return self._schedule_decode_with_priority() + elif self._cluster_type == ClusterType.DECODE_ATTN: + initial_mapping = self._try_initial_request_allocation() + if initial_mapping is not None: + if initial_mapping and self._request_queue: + initial_mapping.extend(self._schedule_dynamic()) + return initial_mapping + return self._schedule_dynamic_with_af_priority() + elif self._cluster_type == ClusterType.DECODE_FFN: + affected = self.schedule_ffn_with_m2n_immediate() + return [(replica_id, ep_id, None) for (replica_id, ep_id) in affected] + else: + return self._schedule_batch_mode() def _try_initial_request_allocation(self) -> Optional[List[Tuple[int, int, Request]]]: """ diff --git a/frontier/scheduler/cluster_scheduler/sticky_round_robin_cluster_scheduler.py b/frontier/scheduler/cluster_scheduler/sticky_round_robin_cluster_scheduler.py index 8a72510..48040f5 100644 --- a/frontier/scheduler/cluster_scheduler/sticky_round_robin_cluster_scheduler.py +++ b/frontier/scheduler/cluster_scheduler/sticky_round_robin_cluster_scheduler.py @@ -6,7 +6,6 @@ from frontier.scheduler.cluster_scheduler.round_robin_cluster_scheduler import ( RoundRobinClusterScheduler, ) -from frontier.config.config import DISAGGREGATED_ARCHITECTURE_RELEASE_ERROR from frontier.types import ClusterType @@ -47,6 +46,14 @@ def _schedule_batch_mode(self) -> List[Tuple[int, int, Request]]: 
def schedule(self) -> List[Tuple[int, int, Request]]: self.sort_requests() cluster_type = getattr(self, "_cluster_type", None) - if cluster_type != ClusterType.MONOLITHIC: - raise ValueError(DISAGGREGATED_ARCHITECTURE_RELEASE_ERROR) + if cluster_type == ClusterType.DECODE: + return self._schedule_decode_with_priority() + if cluster_type == ClusterType.DECODE_ATTN: + initial_mapping = self._try_initial_request_allocation() + if initial_mapping is not None: + return initial_mapping + return self._schedule_dynamic_with_af_priority() + if cluster_type == ClusterType.DECODE_FFN: + affected = self.schedule_ffn_with_m2n_immediate() + return [(replica_id, ep_id, None) for (replica_id, ep_id) in affected] return self._schedule_batch_mode() diff --git a/frontier/scheduler/global_scheduler/base_global_scheduler.py b/frontier/scheduler/global_scheduler/base_global_scheduler.py index 6e5c53e..99fdda6 100644 --- a/frontier/scheduler/global_scheduler/base_global_scheduler.py +++ b/frontier/scheduler/global_scheduler/base_global_scheduler.py @@ -1,5 +1,5 @@ from abc import ABC, abstractmethod -from typing import Dict, List, Tuple, TYPE_CHECKING +from typing import Dict, List, Tuple, Optional, TYPE_CHECKING import threading import queue from collections import defaultdict @@ -12,6 +12,8 @@ from frontier.logger import init_logger if TYPE_CHECKING: + from frontier.kv_cache_transfer import BaseKVCacheTransferPredictor + from frontier.m2n_transfer import BaseM2NTransferPredictor from frontier.events import BaseEvent logger = init_logger(__name__) @@ -23,11 +25,15 @@ def __init__( clusters: Dict[ClusterType, Cluster], request_generator_config: BaseRequestGeneratorConfig, predictors: Dict[ClusterType, BaseExecutionTimePredictor] = None, + kv_cache_transfer_predictor: Optional["BaseKVCacheTransferPredictor"] = None, + m2n_transfer_predictor: Optional["BaseM2NTransferPredictor"] = None, enable_parallel_mode: bool = False, max_inter_cluster_queue_size: int = 1000, ): self._clusters = 
clusters self._cluster_schedulers = {} + self._kv_cache_transfer_predictor = kv_cache_transfer_predictor + self._m2n_transfer_predictor = m2n_transfer_predictor self._enable_parallel_mode = enable_parallel_mode assert predictors is not None, "Predictors are required for scheduler initialization" @@ -42,6 +48,8 @@ def __init__( cluster=cluster, request_generator_config=request_generator_config, predictor=predictor, + kv_cache_transfer_predictor=kv_cache_transfer_predictor, + m2n_transfer_predictor=m2n_transfer_predictor, available_clusters=set(clusters.keys()), # Pass available cluster types ) self._request_queue = [] # List[Tuple[Request, ClusterType]] diff --git a/frontier/scheduler/replica_scheduler/base_replica_scheduler.py b/frontier/scheduler/replica_scheduler/base_replica_scheduler.py index f304ecd..0883876 100644 --- a/frontier/scheduler/replica_scheduler/base_replica_scheduler.py +++ b/frontier/scheduler/replica_scheduler/base_replica_scheduler.py @@ -1,5 +1,5 @@ from abc import ABC, abstractmethod -from typing import Any, Dict, List +from typing import Any, Dict, List, Sequence from frontier.config import ( BaseReplicaSchedulerConfig, @@ -768,6 +768,13 @@ def free(self, *request_ids: List[int]) -> None: def free_batch(self, batch: Batch) -> None: self.free(*batch.request_ids) + def complete_kv_transfer_for_requests( + self, requests: Sequence[Request] + ) -> None: + raise NotImplementedError( + f"{self.__class__.__name__} does not support KV transfer completion." 
+ ) + @abstractmethod def on_batch_end(self, batch: Batch) -> None: pass diff --git a/frontier/scheduler/replica_scheduler/sarathi_replica_scheduler.py b/frontier/scheduler/replica_scheduler/sarathi_replica_scheduler.py index 46b98aa..4ab52b8 100644 --- a/frontier/scheduler/replica_scheduler/sarathi_replica_scheduler.py +++ b/frontier/scheduler/replica_scheduler/sarathi_replica_scheduler.py @@ -1,4 +1,5 @@ from math import ceil +from typing import Sequence from frontier.entities.batch import Batch, Request from frontier.scheduler.replica_scheduler.base_replica_scheduler import ( @@ -78,6 +79,23 @@ def on_batch_end(self, batch: Batch) -> None: def _free_request_resources(self, request: Request) -> None: self.free(request.id) + def complete_kv_transfer_for_requests( + self, requests: Sequence[Request] + ) -> None: + for request in requests: + if request.id not in self._pending_kv_transfer_requests: + raise ValueError( + "KV transfer completion for request without pending transfer state: " + f"request_id={request.id}, " + f"source_cluster={self._cluster_type.name}, " + f"source_replica={self._replica_id}, " + f"source_dp={self._dp_id}" + ) + + if request.id in self._allocation_map: + self._free_request_resources(request) + self._pending_kv_transfer_requests.discard(request.id) + def _get_request_next_num_tokens( self, request: Request, batch_contains_prefill: bool, num_batch_tokens: int ) -> int: diff --git a/frontier/scheduler/replica_scheduler/vllm_v1_engine_replica_scheduler.py b/frontier/scheduler/replica_scheduler/vllm_v1_engine_replica_scheduler.py index 477e327..b9a2a79 100644 --- a/frontier/scheduler/replica_scheduler/vllm_v1_engine_replica_scheduler.py +++ b/frontier/scheduler/replica_scheduler/vllm_v1_engine_replica_scheduler.py @@ -22,7 +22,7 @@ from math import ceil import os import time -from typing import Any, Dict, List, Optional, Tuple +from typing import Any, Dict, List, Optional, Sequence, Tuple from frontier.config import global_vars from 
frontier.entities.batch import ( @@ -1123,6 +1123,23 @@ def _find_request_by_id(self, request_id: int) -> Optional[Request]: return request return None + def complete_kv_transfer_for_requests( + self, requests: Sequence[Request] + ) -> None: + for request in requests: + if request.id not in self._pending_kv_transfer_requests: + raise ValueError( + "KV transfer completion for request without pending transfer state: " + f"request_id={request.id}, " + f"source_cluster={self._cluster_type.name}, " + f"source_replica={self._replica_id}, " + f"source_dp={self._dp_id}" + ) + + if request.id in self._allocation_map: + self._free_request_resources(request) + self._pending_kv_transfer_requests.discard(request.id) + def _free_request_resources(self, request: Request) -> None: self._get_monolithic_pp_mtp_near_full_prefill_request_ids().discard( request.id diff --git a/frontier/simulator.py b/frontier/simulator.py index 3a60dee..b1e14fd 100644 --- a/frontier/simulator.py +++ b/frontier/simulator.py @@ -41,8 +41,6 @@ class Simulator: def __init__(self, config: SimulationConfig) -> None: self._config: SimulationConfig = config - if self._config.is_disaggregated_mode(): - raise ValueError(DISAGGREGATED_ARCHITECTURE_RELEASE_ERROR) self._time = 0 self._terminate = False @@ -201,6 +199,24 @@ def __init__(self, config: SimulationConfig) -> None: ) ) + kv_cache_transfer_predictor = None + if self._config.is_disaggregated_mode(): + from frontier.kv_cache_transfer import KVCacheTransferPredictorRegistry + + kv_cache_transfer_predictor = KVCacheTransferPredictorRegistry.get( + self._config.kv_cache_transfer_config.get_type(), + config=self._config.kv_cache_transfer_config, + ) + + m2n_transfer_predictor = None + if self._config.is_disaggregated_mode(): + from frontier.m2n_transfer import M2NTransferPredictorRegistry + + m2n_transfer_predictor = M2NTransferPredictorRegistry.get( + self._config.m2n_transfer_config.get_type(), + config=self._config.m2n_transfer_config, + ) + # In 
disaggregated mode, global scheduler gets all clusters with their predictors. # In monolithic mode, it gets a dict with one cluster and predictor. # Enable parallel mode in GlobalScheduler if configured @@ -214,6 +230,8 @@ def __init__(self, config: SimulationConfig) -> None: self._clusters, self._config.request_generator_config, predictors=self._predictors, + kv_cache_transfer_predictor=kv_cache_transfer_predictor, + m2n_transfer_predictor=m2n_transfer_predictor, enable_parallel_mode=enable_parallel, max_inter_cluster_queue_size=self._config.max_inter_cluster_queue_size, ) diff --git a/frontier/types/__init__.py b/frontier/types/__init__.py index 7d605e4..9438b33 100644 --- a/frontier/types/__init__.py +++ b/frontier/types/__init__.py @@ -6,6 +6,8 @@ from .event_type import EventType from .execution_time_predictor_type import ExecutionTimePredictorType from .cluster_scheduler_type import ClusterSchedulerType +from .kv_cache_transfer_type import KVCacheTransferType +from .m2n_transfer_type import M2NTransferType from .measurement_type import MeasurementType from .node_sku_type import NodeSKUType from .norm_type import NormType @@ -19,6 +21,8 @@ ExecutionTimePredictorType, ClusterSchedulerType, CCBackendType, + KVCacheTransferType, + M2NTransferType, MeasurementType, RequestGeneratorType, RequestLengthGeneratorType, diff --git a/frontier/types/kv_cache_transfer_type.py b/frontier/types/kv_cache_transfer_type.py new file mode 100644 index 0000000..5a232cc --- /dev/null +++ b/frontier/types/kv_cache_transfer_type.py @@ -0,0 +1,7 @@ +from frontier.types.base_int_enum import BaseIntEnum + + +class KVCacheTransferType(BaseIntEnum): + ANALYTICAL = 1 + EMPIRICAL = 2 + HYBRID = 3 diff --git a/frontier/types/m2n_transfer_type.py b/frontier/types/m2n_transfer_type.py new file mode 100644 index 0000000..857582f --- /dev/null +++ b/frontier/types/m2n_transfer_type.py @@ -0,0 +1,7 @@ +from frontier.types.base_int_enum import BaseIntEnum + + +class M2NTransferType(BaseIntEnum): + 
ANALYTICAL = 1 + EMPIRICAL = 2 + HYBRID = 3 diff --git a/tests/unit/test_kv_transfer_completion_contract.py b/tests/unit/test_kv_transfer_completion_contract.py new file mode 100644 index 0000000..978eac0 --- /dev/null +++ b/tests/unit/test_kv_transfer_completion_contract.py @@ -0,0 +1,226 @@ +#!/usr/bin/env python3 +from __future__ import annotations + +from types import SimpleNamespace + +import pytest + +from frontier.events.cluster_batch_end_event import ClusterBatchEndEvent +from frontier.events.kv_cache_transfer_end_event import KVCacheTransferEndEvent +from frontier.scheduler.replica_scheduler.base_replica_scheduler import ( + BaseReplicaScheduler, +) +from frontier.scheduler.replica_scheduler.sarathi_replica_scheduler import ( + SarathiReplicaScheduler, +) +from frontier.scheduler.replica_scheduler.vllm_v1_engine_replica_scheduler import ( + VLLMv1EngineReplicaScheduler, +) +from frontier.types import ClusterType + + +class _Request: + def __init__(self, request_id: int) -> None: + self.id = request_id + self.transfer_events: list[tuple[float, float]] = [] + + def on_kv_cache_transfer_complete( + self, time: float, transfer_duration_s: float + ) -> None: + self.transfer_events.append((time, transfer_duration_s)) + + +class _ConcreteBaseScheduler(BaseReplicaScheduler): + def on_batch_end(self, batch) -> None: + raise NotImplementedError + + def _get_next_batch(self): + raise NotImplementedError + + +class _ContractSourceScheduler: + memory_usage_percent = 12.5 + num_pending_requests = 0 + num_running_batches = 0 + + def __init__(self) -> None: + self.completed_request_ids: list[int] = [] + + def complete_kv_transfer_for_requests(self, requests) -> None: + self.completed_request_ids.extend(request.id for request in requests) + + +class _SourceClusterScheduler: + def __init__(self, replica_scheduler) -> None: + self.replica_scheduler = replica_scheduler + + def get_dp_replica_scheduler(self, replica_id: int, dp_id: int): + assert replica_id == 3 + assert dp_id 
== 1 + return self.replica_scheduler + + +class _TargetClusterScheduler: + def __init__(self) -> None: + self.arrival_batches = [] + + def on_kv_cache_arrival(self, time, batch, transfer_info): + self.arrival_batches.append((time, batch, transfer_info)) + return ["arrival-event"] + + +class _GlobalScheduler: + def __init__(self, source_cluster, target_cluster) -> None: + self.source_cluster = source_cluster + self.target_cluster = target_cluster + + def get_cluster_scheduler(self, cluster_type: ClusterType): + if cluster_type == ClusterType.PREFILL: + return self.source_cluster + if cluster_type == ClusterType.DECODE: + return self.target_cluster + raise AssertionError(f"Unexpected cluster_type={cluster_type}") + + +class _MetricsStore: + def __init__(self) -> None: + self.kv_transfer_end_calls = [] + self.replica_schedule_calls = [] + + def on_kv_cache_transfer_end(self, *args, **kwargs) -> None: + self.kv_transfer_end_calls.append((args, kwargs)) + + def on_replica_schedule(self, *args, **kwargs) -> None: + self.replica_schedule_calls.append((args, kwargs)) + + +class _HookFailingBatch: + id = 909 + schedule_epoch = 0 + request_execution_signatures = [] + request_mutation_signatures = [] + thinking_round_start_times = [] + + def on_cluster_stage_end(self, time, cluster_type) -> None: + raise RuntimeError("stage hook failed") + + +class _HookReplicaScheduler: + def on_cluster_stage_end(self, batch) -> None: + raise AssertionError("batch hook should fail before replica hook") + + +class _HookClusterScheduler: + def get_dp_replica_scheduler(self, replica_id: int, dp_id: int): + assert replica_id == 0 + assert dp_id == 0 + return _HookReplicaScheduler() + + +class _HookGlobalScheduler: + def get_cluster_scheduler(self, cluster_type: ClusterType): + assert cluster_type == ClusterType.DECODE_ATTN + return _HookClusterScheduler() + + +def test_cluster_batch_end_fails_fast_when_stage_hook_fails() -> None: + event = ClusterBatchEndEvent( + time=1.0, + replica_id=0, + 
batch=_HookFailingBatch(), + cluster_type=ClusterType.DECODE_ATTN, + dp_id=0, + ) + + with pytest.raises(RuntimeError, match="stage hook failed"): + event.handle_event(_HookGlobalScheduler(), _MetricsStore()) + + +def test_kv_transfer_end_uses_public_scheduler_completion_contract() -> None: + request = _Request(101) + batch = SimpleNamespace( + id=7, + global_id=17, + requests=[request], + request_ids=[request.id], + ) + transfer_info = SimpleNamespace( + transfer_start_time=1.0, + transfer_end_time=None, + kv_cache_size_bytes=4096, + target_cluster_type=ClusterType.DECODE, + source_cluster_type=ClusterType.PREFILL, + source_replica_id=3, + source_dp_id=1, + transfer_time_ms=2.0, + batch=batch, + ) + source_replica_scheduler = _ContractSourceScheduler() + source_cluster_scheduler = _SourceClusterScheduler(source_replica_scheduler) + target_cluster_scheduler = _TargetClusterScheduler() + scheduler = _GlobalScheduler(source_cluster_scheduler, target_cluster_scheduler) + metrics_store = _MetricsStore() + + event = KVCacheTransferEndEvent(1.25, transfer_info) + arrival_events = event.handle_event(scheduler, metrics_store) + + assert arrival_events == ["arrival-event"] + assert source_replica_scheduler.completed_request_ids == [101] + assert request.transfer_events == [(1.25, 0.25)] + assert len(metrics_store.kv_transfer_end_calls) == 1 + assert len(metrics_store.replica_schedule_calls) == 1 + + +def test_base_scheduler_rejects_unsupported_kv_transfer_completion() -> None: + scheduler = object.__new__(_ConcreteBaseScheduler) + + with pytest.raises(NotImplementedError, match="KV transfer completion"): + scheduler.complete_kv_transfer_for_requests([]) + + +@pytest.mark.parametrize( + "scheduler_cls", + [VLLMv1EngineReplicaScheduler, SarathiReplicaScheduler], +) +def test_replica_scheduler_completion_releases_allocation_and_pending_state( + scheduler_cls, +) -> None: + request = _Request(202) + scheduler = object.__new__(scheduler_cls) + 
scheduler._pending_kv_transfer_requests = {request.id} + scheduler._allocation_map = {request.id: 2} + scheduler._cluster_type = ClusterType.PREFILL + scheduler._replica_id = 3 + scheduler._dp_id = 1 + freed_request_ids = [] + + def _free_request_resources(freed_request) -> None: + freed_request_ids.append(freed_request.id) + scheduler._allocation_map.pop(freed_request.id, None) + + scheduler._free_request_resources = _free_request_resources + + scheduler.complete_kv_transfer_for_requests([request]) + + assert freed_request_ids == [request.id] + assert scheduler._allocation_map == {} + assert scheduler._pending_kv_transfer_requests == set() + + +@pytest.mark.parametrize( + "scheduler_cls", + [VLLMv1EngineReplicaScheduler, SarathiReplicaScheduler], +) +def test_replica_scheduler_completion_requires_pending_transfer_state( + scheduler_cls, +) -> None: + request = _Request(303) + scheduler = object.__new__(scheduler_cls) + scheduler._pending_kv_transfer_requests = set() + scheduler._allocation_map = {request.id: 1} + scheduler._cluster_type = ClusterType.PREFILL + scheduler._replica_id = 3 + scheduler._dp_id = 1 + + with pytest.raises(ValueError, match="without pending transfer state"): + scheduler.complete_kv_transfer_for_requests([request]) diff --git a/tests/unit/test_op_trace_utils.py b/tests/unit/test_op_trace_utils.py new file mode 100644 index 0000000..5e6ecd7 --- /dev/null +++ b/tests/unit/test_op_trace_utils.py @@ -0,0 +1,421 @@ +""" +Unit tests for op trace shape/precision helpers. 
+ +Tests cover: +- Compute op shape/size metadata +- Communication op size metadata +- Transfer shape/size metadata +""" + +from unittest.mock import MagicMock + +import pytest + + +def _reset_quantization() -> None: + from frontier.config import get_quantization_manager + + get_quantization_manager().load_config(None) + + +def _build_context(is_moe: bool = False, tokens_are_post_routing: bool = False): + from frontier.metrics.op_trace_utils import OpTraceContext + from frontier.types import ClusterType + + _reset_quantization() + + model_config = MagicMock() + model_config.embedding_dim = 8 + model_config.num_q_heads = 4 + model_config.num_kv_heads = 2 + model_config.mlp_hidden_dim = 16 + model_config.num_experts = 4 if is_moe else 0 + model_config.num_experts_per_tok = 2 if is_moe else 0 + model_config.is_moe = is_moe + # Mock get_head_dim() to return computed value (embedding_dim // num_q_heads = 8 // 4 = 2) + model_config.get_head_dim = MagicMock(return_value=2) + + replica_config = MagicMock() + replica_config.attn_tensor_parallel_size = 2 + replica_config.attn_data_parallel_size = 1 + replica_config.moe_tensor_parallel_size = 2 + replica_config.moe_expert_parallel_size = 2 + replica_config.num_pipeline_stages = 1 + replica_config.router_topk = 2 if is_moe else 1 + + return OpTraceContext( + cluster_type=ClusterType.PREFILL, + model_config=model_config, + replica_config=replica_config, + total_tokens=8, + effective_tokens_compute=8, + effective_tokens_transfer=8, + effective_tokens_rounded=8, + tokens_are_post_routing=tokens_are_post_routing, + ) + + +def test_compute_attn_pre_proj_shapes(): + from frontier.metrics.op_trace_utils import compute_op_trace_meta + + _reset_quantization() + ctx = _build_context() + meta = compute_op_trace_meta("attn_pre_proj", "COMPUTE", ctx) + + assert meta["tensor_shape"]["input"] == [8, 8] + assert meta["tensor_shape"]["output"] == [8, 8] + assert meta["tensor_size_bytes"]["input"] == 128 + assert 
meta["tensor_size_bytes"]["output"] == 128 + assert meta["dtype"] == "FP16" + + +def test_attention_kv_replication_shapes_for_tp8(): + from frontier.metrics.op_trace_utils import OpTraceContext, compute_op_trace_meta + from frontier.types import ClusterType + + _reset_quantization() + + model_config = MagicMock() + model_config.embedding_dim = 4096 + model_config.num_q_heads = 32 + model_config.num_kv_heads = 4 + model_config.mlp_hidden_dim = 11008 + model_config.num_experts = 0 + model_config.num_experts_per_tok = 0 + model_config.is_moe = False + model_config.get_head_dim = MagicMock(return_value=128) + + replica_config = MagicMock() + replica_config.attn_tensor_parallel_size = 8 + replica_config.attn_data_parallel_size = 1 + replica_config.moe_tensor_parallel_size = 1 + replica_config.moe_expert_parallel_size = 1 + replica_config.num_pipeline_stages = 1 + replica_config.router_topk = 1 + + ctx = OpTraceContext( + cluster_type=ClusterType.MONOLITHIC, + model_config=model_config, + replica_config=replica_config, + total_tokens=16, + effective_tokens_compute=16, + effective_tokens_transfer=16, + effective_tokens_rounded=16, + tokens_are_post_routing=False, + ) + + pre_proj_meta = compute_op_trace_meta("attn_pre_proj", "COMPUTE", ctx) + assert pre_proj_meta["tensor_shape"]["output"] == [16, 768] + + kv_save_meta = compute_op_trace_meta("attn_kv_cache_save", "COMPUTE", ctx) + assert kv_save_meta["tensor_shape"]["k"] == [16, 1, 128] + assert kv_save_meta["tensor_shape"]["v"] == [16, 1, 128] + + +def test_attention_kv_replication_requires_divisible_tp_ratio(): + from frontier.metrics.op_trace_utils import OpTraceContext, compute_op_trace_meta + from frontier.types import ClusterType + + _reset_quantization() + + model_config = MagicMock() + model_config.embedding_dim = 4096 + model_config.num_q_heads = 32 + model_config.num_kv_heads = 3 + model_config.mlp_hidden_dim = 11008 + model_config.num_experts = 0 + model_config.num_experts_per_tok = 0 + model_config.is_moe = 
False + model_config.get_head_dim = MagicMock(return_value=128) + + replica_config = MagicMock() + replica_config.attn_tensor_parallel_size = 8 + replica_config.attn_data_parallel_size = 1 + replica_config.moe_tensor_parallel_size = 1 + replica_config.moe_expert_parallel_size = 1 + replica_config.num_pipeline_stages = 1 + replica_config.router_topk = 1 + + ctx = OpTraceContext( + cluster_type=ClusterType.MONOLITHIC, + model_config=model_config, + replica_config=replica_config, + total_tokens=16, + effective_tokens_compute=16, + effective_tokens_transfer=16, + effective_tokens_rounded=16, + tokens_are_post_routing=False, + ) + + with pytest.raises(ValueError, match="replication requires attn_tp"): + compute_op_trace_meta("attn_prefill", "COMPUTE", ctx) + + +def test_comm_allreduce_sizes(): + from frontier.metrics.op_trace_utils import compute_op_trace_meta + + _reset_quantization() + ctx = _build_context() + meta = compute_op_trace_meta("attn_tensor_parallel_allreduce", "COMM", ctx) + + assert meta["tensor_shape"]["data"] == [8, 8] + assert meta["element_count"] == 64 + assert meta["base_size_bytes"] == 128 + assert meta["data_size_bytes"] == 128 + assert meta["dtype"] == "FP16" + + +def test_moe_grouped_gemm_shapes(): + from frontier.metrics.op_trace_utils import compute_op_trace_meta + + _reset_quantization() + ctx = _build_context(is_moe=True) + meta = compute_op_trace_meta("moe_grouped_gemm", "COMPUTE", ctx) + + assert meta["tensor_shape"]["input"] == [16, 8] + assert meta["tensor_shape"]["output"] == [16, 8] + assert meta["tensor_size_bytes"]["input"] == 256 + assert meta["tensor_size_bytes"]["output"] == 256 + + +def test_moe_grouped_gemm_post_routing_shapes(): + from frontier.metrics.op_trace_utils import compute_op_trace_meta + + _reset_quantization() + ctx = _build_context(is_moe=True, tokens_are_post_routing=True) + meta = compute_op_trace_meta("moe_grouped_gemm", "COMPUTE", ctx) + + assert meta["tensor_shape"]["input"] == [8, 8] + assert 
meta["tensor_shape"]["output"] == [8, 8] + assert meta["tensor_size_bytes"]["input"] == 128 + assert meta["tensor_size_bytes"]["output"] == 128 + + +def test_moe_ep_comm_post_routing_shape(): + from frontier.metrics.op_trace_utils import compute_op_trace_meta + + _reset_quantization() + ctx = _build_context(is_moe=True, tokens_are_post_routing=True) + meta = compute_op_trace_meta("expert_parallel_alltoall_dispatch", "COMM", ctx) + + assert meta["tensor_shape"]["data"] == [4, 2, 8] + assert meta["element_count"] == 64 + assert meta["base_size_bytes"] == 128 + + +def test_moe_ep_alltoall_post_routing_shape(): + from frontier.metrics.op_trace_utils import compute_op_trace_meta + + _reset_quantization() + ctx = _build_context(is_moe=True, tokens_are_post_routing=True) + meta = compute_op_trace_meta("expert_parallel_alltoall", "COMM", ctx) + + assert meta["tensor_shape"]["data"] == [4, 2, 8] + assert meta["element_count"] == 64 + assert meta["base_size_bytes"] == 128 + + +def test_moe_shuffling_non_divisible_tokens(): + from frontier.metrics.op_trace_utils import OpTraceContext, compute_op_trace_meta + from frontier.types import ClusterType + + _reset_quantization() + + model_config = MagicMock() + model_config.embedding_dim = 8 + model_config.num_q_heads = 4 + model_config.num_kv_heads = 2 + model_config.mlp_hidden_dim = 16 + model_config.num_experts = 4 + model_config.num_experts_per_tok = 3 + model_config.is_moe = True + model_config.get_head_dim = MagicMock(return_value=2) + + replica_config = MagicMock() + replica_config.attn_tensor_parallel_size = 1 + replica_config.attn_data_parallel_size = 1 + replica_config.moe_tensor_parallel_size = 1 + replica_config.moe_expert_parallel_size = 2 + replica_config.num_pipeline_stages = 1 + replica_config.router_topk = 3 + + ctx = OpTraceContext( + cluster_type=ClusterType.DECODE_FFN, + model_config=model_config, + replica_config=replica_config, + total_tokens=10, + effective_tokens_compute=10, + effective_tokens_transfer=10, + 
effective_tokens_rounded=10, + tokens_are_post_routing=True, + ) + + # In Frontier tracing, moe_shuffling is modeled as a compute op (not a comm op). + meta = compute_op_trace_meta("moe_shuffling", "COMPUTE", ctx) + + assert meta["tensor_shape"]["input"] == [4, 3, 8] + assert meta["tensor_shape"]["output"] == [4, 3, 8] + assert meta["tensor_size_bytes"]["input"] == 192 + assert meta["tensor_size_bytes"]["output"] == 192 + + +def test_share_expert_trace_shapes(): + from frontier.metrics.op_trace_utils import OpTraceContext, compute_op_trace_meta + from frontier.types import ClusterType + + _reset_quantization() + + model_config = MagicMock() + model_config.embedding_dim = 16 + model_config.num_q_heads = 4 + model_config.num_kv_heads = 2 + model_config.mlp_hidden_dim = 32 + model_config.share_expert_dim = 12 + model_config.num_experts = 8 + model_config.num_experts_per_tok = 2 + model_config.is_moe = True + model_config.get_head_dim = MagicMock(return_value=4) + + replica_config = MagicMock() + replica_config.attn_tensor_parallel_size = 1 + replica_config.attn_data_parallel_size = 1 + replica_config.moe_tensor_parallel_size = 2 + replica_config.moe_expert_parallel_size = 2 + replica_config.num_pipeline_stages = 1 + replica_config.router_topk = 2 + + ctx = OpTraceContext( + cluster_type=ClusterType.DECODE_FFN, + model_config=model_config, + replica_config=replica_config, + total_tokens=8, + effective_tokens_compute=8, + effective_tokens_transfer=8, + effective_tokens_rounded=8, + tokens_are_post_routing=False, + ) + + meta_up = compute_op_trace_meta("share_expert_up_proj", "COMPUTE", ctx) + assert meta_up["tensor_shape"]["output"] == [8, 6] + + meta_act = compute_op_trace_meta("share_expert_act", "COMPUTE", ctx) + assert meta_act["tensor_shape"]["input"] == [8, 6] + + meta_down = compute_op_trace_meta("share_expert_down_proj", "COMPUTE", ctx) + assert meta_down["tensor_shape"]["input"] == [8, 6] + assert meta_down["tensor_shape"]["output"] == [8, 16] + + +def 
test_share_expert_requires_dim(): + from frontier.metrics.op_trace_utils import OpTraceContext, compute_op_trace_meta + from frontier.types import ClusterType + + _reset_quantization() + + model_config = MagicMock() + model_config.embedding_dim = 16 + model_config.num_q_heads = 4 + model_config.num_kv_heads = 2 + model_config.mlp_hidden_dim = 32 + model_config.share_expert_dim = None + model_config.num_experts = 8 + model_config.num_experts_per_tok = 2 + model_config.is_moe = True + model_config.get_head_dim = MagicMock(return_value=4) + + replica_config = MagicMock() + replica_config.attn_tensor_parallel_size = 1 + replica_config.attn_data_parallel_size = 1 + replica_config.moe_tensor_parallel_size = 2 + replica_config.moe_expert_parallel_size = 2 + replica_config.num_pipeline_stages = 1 + replica_config.router_topk = 2 + + ctx = OpTraceContext( + cluster_type=ClusterType.DECODE_FFN, + model_config=model_config, + replica_config=replica_config, + total_tokens=8, + effective_tokens_compute=8, + effective_tokens_transfer=8, + effective_tokens_rounded=8, + tokens_are_post_routing=False, + ) + + with pytest.raises(ValueError, match="share_expert_dim must be set"): + compute_op_trace_meta("share_expert_up_proj", "COMPUTE", ctx) + + +def test_kv_cache_transfer_meta(): + from frontier.metrics.op_trace_utils import build_kv_cache_transfer_meta + from frontier.types import ClusterType + + _reset_quantization() + req_a = MagicMock() + req_a.num_prefill_tokens = 2 + req_b = MagicMock() + req_b.num_prefill_tokens = 2 + + batch = MagicMock() + batch.requests = [req_a, req_b] + + replica_config = MagicMock() + model_config = MagicMock() + model_config.num_layers = 2 + model_config.num_q_heads = 4 + model_config.num_kv_heads = 2 + model_config.embedding_dim = 8 + model_config.is_moe = False + # Mock get_head_dim() to return computed value (embedding_dim // num_q_heads = 8 // 4 = 2) + model_config.get_head_dim = MagicMock(return_value=2) + replica_config.model_config = 
model_config + + meta = build_kv_cache_transfer_meta( + batch, replica_config, ClusterType.PREFILL, transfer_size_bytes=512 + ) + + assert meta["total_tokens"] == 4 + assert meta["tensor_shape"]["kv"] == [4, 2, 2, 2, 2] + assert meta["tensor_size_bytes"]["kv"] == 128 + assert meta["num_heads"] == 2 + assert meta["num_q_heads"] == 4 + assert meta["num_kv_heads"] == 2 + assert meta["dtype"] == "FP16" + assert meta["transfer_size_bytes"] == 512 + + +def test_m2n_transfer_meta(): + from frontier.metrics.op_trace_utils import build_m2n_transfer_meta + from frontier.types import ClusterType + + _reset_quantization() + batch = MagicMock() + batch.get_effective_total_tokens_for_transfer = MagicMock(return_value=4) + + replica_config = MagicMock() + model_config = MagicMock() + model_config.embedding_dim = 8 + model_config.is_moe = False + replica_config.model_config = model_config + + meta = build_m2n_transfer_meta( + batch, replica_config, ClusterType.DECODE_ATTN, activation_size_bytes=128 + ) + + assert meta["total_tokens"] == 4 + assert meta["tensor_shape"]["activation"] == [4, 8] + assert meta["dtype"] == "FP16" + assert meta["activation_size_bytes"] == 128 + + + +def test_moe_ep_allreduce_shape(): + from frontier.metrics.op_trace_utils import compute_op_trace_meta + + _reset_quantization() + ctx = _build_context(is_moe=True, tokens_are_post_routing=False) + meta = compute_op_trace_meta("expert_parallel_allreduce", "COMM", ctx) + + assert meta["tensor_shape"]["data"] == [8, 8] + assert meta["element_count"] == 64 + assert meta["base_size_bytes"] == 128 diff --git a/tests/unit/test_pd_transfer_entities.py b/tests/unit/test_pd_transfer_entities.py new file mode 100644 index 0000000..b4f6dfa --- /dev/null +++ b/tests/unit/test_pd_transfer_entities.py @@ -0,0 +1,173 @@ +"""pd-disaggregation transfer entity contract tests.""" + +from types import SimpleNamespace + +import pytest + +from frontier.types import ClusterType + + +def 
test_kv_cache_transfer_info_computes_end_time_effective_bytes_and_dict() -> None: + from frontier.entities import KVCacheTransferInfo + + batch = SimpleNamespace(id=11, global_id=101) + info = KVCacheTransferInfo( + batch=batch, + source_cluster_type=ClusterType.PREFILL, + target_cluster_type=ClusterType.DECODE, + source_replica_id=3, + source_dp_id=2, + kv_cache_size_bytes=4096, + transfer_time_ms=2.5, + transfer_start_time=7.0, + enable_compression=True, + compression_ratio=2.0, + enable_latency_hiding=True, + transfer_protocol="rdma", + transfer_requests=True, + ) + + assert info.is_completed is True + assert info.transfer_end_time == pytest.approx(7.0025) + assert info.effective_data_size_bytes == 2048 + + data = info.to_dict() + assert data["batch_id"] == 11 + assert data["batch_global_id"] == 101 + assert data["source_cluster_type"] == "PREFILL" + assert data["target_cluster_type"] == "DECODE" + assert data["source_replica_id"] == 3 + assert data["kv_cache_size_bytes"] == 4096 + assert data["effective_data_size_bytes"] == 2048 + assert data["transfer_time_ms"] == pytest.approx(2.5) + assert data["transfer_start_time"] == pytest.approx(7.0) + assert data["transfer_end_time"] == pytest.approx(7.0025) + assert data["enable_compression"] is True + assert data["compression_ratio"] == pytest.approx(2.0) + assert data["enable_latency_hiding"] is True + assert data["transfer_protocol"] == "rdma" + assert data["transfer_requests"] is True + + +def test_m2n_transfer_info_validates_direction_and_sets_pipeline_stage() -> None: + from frontier.entities import M2NTransferInfo + + batch = SimpleNamespace(id=12, global_id=102) + info = M2NTransferInfo( + batch=batch, + source_cluster_type=ClusterType.DECODE_ATTN, + target_cluster_type=ClusterType.DECODE_FFN, + source_replica_id=4, + source_dp_id=1, + activation_size_bytes=8192, + transfer_time_ms=1.2, + transfer_start_time=9.0, + enable_compression=True, + compression_ratio=4.0, + layer_id=5, + afd_stage_idx=6, + 
target_ffn_replica_id=7, + ) + + assert info.is_completed is True + assert info.transfer_end_time == pytest.approx(9.0012) + assert info.effective_data_size_bytes == 2048 + assert info.pipeline_stage == "attn_to_ffn" + assert info.is_attn_to_ffn is True + assert info.is_ffn_to_attn is False + + data = info.to_dict() + assert data["batch_id"] == 12 + assert data["batch_global_id"] == 102 + assert data["source_cluster_type"] == "DECODE_ATTN" + assert data["target_cluster_type"] == "DECODE_FFN" + assert data["source_replica_id"] == 4 + assert data["source_dp_id"] == 1 + assert data["activation_size_bytes"] == 8192 + assert data["effective_data_size_bytes"] == 2048 + assert data["transfer_time_ms"] == pytest.approx(1.2) + assert data["transfer_start_time"] == pytest.approx(9.0) + assert data["transfer_end_time"] == pytest.approx(9.0012) + assert data["enable_p2p_optimization"] is True + assert data["p2p_protocol"] == "nvlink" + assert data["enable_compression"] is True + assert data["compression_ratio"] == pytest.approx(4.0) + assert data["enable_latency_hiding"] is False + assert data["layer_id"] == 5 + assert data["afd_stage_idx"] == 6 + assert data["pipeline_stage"] == "attn_to_ffn" + assert data["target_ffn_replica_id"] == 7 + + +def test_m2n_transfer_info_rejects_non_m2n_cluster_pairs() -> None: + from frontier.entities import M2NTransferInfo + + batch = SimpleNamespace(id=13, global_id=103) + + with pytest.raises(ValueError, match="DECODE_ATTN <-> DECODE_FFN"): + M2NTransferInfo( + batch=batch, + source_cluster_type=ClusterType.PREFILL, + target_cluster_type=ClusterType.DECODE, + source_replica_id=0, + source_dp_id=0, + activation_size_bytes=1, + transfer_time_ms=0.1, + transfer_start_time=0.0, + ) + + +def test_kv_cache_transfer_start_event_targets_decode_cluster_for_routing() -> None: + from frontier.cluster_simulator import ClusterSimulator + from frontier.events.kv_cache_transfer_start_event import KVCacheTransferStartEvent + + batch = SimpleNamespace(id=14, 
global_id=104, requests=[]) + event = KVCacheTransferStartEvent( + time=1.0, + source_replica_id=0, + source_dp_id=0, + source_cluster_type=ClusterType.PREFILL, + target_cluster_type=ClusterType.DECODE, + batch=batch, + kv_cache_size_bytes=1024, + transfer_time_ms=0.5, + ) + simulator = object.__new__(ClusterSimulator) + simulator._cluster_type = ClusterType.PREFILL + + assert event.get_target_cluster() is ClusterType.DECODE + assert simulator._determine_target_cluster(event) is ClusterType.DECODE + + +def test_transfer_info_without_compression_preserves_original_size() -> None: + from frontier.entities import KVCacheTransferInfo, M2NTransferInfo + + batch = SimpleNamespace(id=21, global_id=201) + kv_info = KVCacheTransferInfo( + batch=batch, + source_cluster_type=ClusterType.PREFILL, + target_cluster_type=ClusterType.DECODE, + source_replica_id=0, + source_dp_id=0, + kv_cache_size_bytes=0, + transfer_time_ms=0.0, + transfer_start_time=3.0, + ) + assert kv_info.effective_data_size_bytes == 0 + assert kv_info.transfer_end_time == pytest.approx(3.0) + + m2n_info = M2NTransferInfo( + batch=batch, + source_cluster_type=ClusterType.DECODE_FFN, + target_cluster_type=ClusterType.DECODE_ATTN, + source_replica_id=0, + source_dp_id=0, + activation_size_bytes=128, + transfer_time_ms=0.0, + transfer_start_time=3.5, + ) + assert m2n_info.effective_data_size_bytes == 128 + assert m2n_info.transfer_end_time == pytest.approx(3.5) + assert m2n_info.pipeline_stage == "ffn_to_attn" + assert m2n_info.is_attn_to_ffn is False + assert m2n_info.is_ffn_to_attn is True diff --git a/tests/unit/test_pd_transfer_predictors.py b/tests/unit/test_pd_transfer_predictors.py new file mode 100644 index 0000000..4265ff2 --- /dev/null +++ b/tests/unit/test_pd_transfer_predictors.py @@ -0,0 +1,157 @@ +"""pd-disaggregation transfer predictor contract tests.""" + +from types import SimpleNamespace + +import pytest + +from frontier.config.kv_cache_transfer_config import AnalyticalKVCacheTransferConfig 
+from frontier.config.m2n_transfer_config import AnalyticalM2NTransferConfig +from frontier.types import ClusterType + + +class _ModelConfig: + num_layers = 2 + num_kv_heads = 4 + embedding_dim = 64 + is_moe = False + + def get_head_dim(self) -> int: + return 8 + + +class _ReplicaConfig: + model_config = _ModelConfig() + + +class _Batch: + def __init__(self) -> None: + self.requests = [ + SimpleNamespace(num_prefill_tokens=3), + SimpleNamespace(num_prefill_tokens=5), + ] + + def get_effective_total_tokens_for_transfer(self, _source_cluster_type: ClusterType) -> int: + return 6 + + +def test_kv_cache_analytical_predictor_computes_size_time_and_registry_lookup() -> None: + from frontier.kv_cache_transfer import ( + AnalyticalKVCacheTransferPredictor, + KVCacheTransferPredictorRegistry, + ) + from frontier.types import KVCacheTransferType + + config = AnalyticalKVCacheTransferConfig( + network_bandwidth_gbps=80.0, + network_latency_ms=0.25, + kv_cache_dtype_size_bytes=2, + enable_compression=True, + compression_ratio=2.0, + ) + predictor = AnalyticalKVCacheTransferPredictor(config) + batch = _Batch() + replica_config = _ReplicaConfig() + + kv_size = predictor.get_kv_cache_size(batch, replica_config) + expected_size = 8 * 2 * 4 * 8 * 2 * 2 + assert kv_size == expected_size + + request_size = predictor.get_kv_cache_size_for_request(batch.requests[0], replica_config) + assert request_size == 3 * 2 * 4 * 8 * 2 * 2 + + transfer_ms = predictor.get_transfer_time( + ClusterType.PREFILL, + ClusterType.DECODE, + batch, + kv_size, + ) + expected_effective_bytes = expected_size / 2.0 + expected_bandwidth_bytes_per_ms = (80.0 * 1e9) / (8 * 1000) + expected_transfer_ms = 0.25 + expected_effective_bytes / expected_bandwidth_bytes_per_ms + assert transfer_ms == pytest.approx(expected_transfer_ms) + + assert predictor.supports_latency_hiding() is False + assert KVCacheTransferPredictorRegistry.get_key_from_str("analytical") is KVCacheTransferType.ANALYTICAL + assert isinstance( + 
KVCacheTransferPredictorRegistry.get(KVCacheTransferType.ANALYTICAL, config), + AnalyticalKVCacheTransferPredictor, + ) + + +def test_m2n_analytical_predictor_computes_size_time_and_registry_lookup() -> None: + from frontier.m2n_transfer import AnalyticalM2NTransferPredictor, M2NTransferPredictorRegistry + from frontier.types import M2NTransferType + + config = AnalyticalM2NTransferConfig( + memory_bandwidth_gbps=160.0, + network_latency_ms=0.12, + activation_dtype_size_bytes=2, + enable_compression=True, + compression_ratio=2.0, + enable_p2p_optimization=True, + ) + predictor = AnalyticalM2NTransferPredictor(config) + batch = _Batch() + replica_config = _ReplicaConfig() + + activation_size = predictor.get_activation_size(batch, replica_config, ClusterType.DECODE_ATTN) + expected_size = 6 * 64 * 2 + assert activation_size == expected_size + + request_size = predictor.get_activation_size_for_request( + SimpleNamespace(), replica_config, ClusterType.DECODE_FFN + ) + assert request_size == 1 * 64 * 2 + + transfer_ms = predictor.get_transfer_time( + ClusterType.DECODE_ATTN, + ClusterType.DECODE_FFN, + batch, + activation_size, + ) + expected_effective_bytes = int(expected_size / 2.0) + expected_bandwidth_bytes_per_ms = 160.0 * 125_000 + expected_transfer_ms = (0.12 + expected_effective_bytes / expected_bandwidth_bytes_per_ms) / 1.2 + assert transfer_ms == pytest.approx(expected_transfer_ms) + + assert M2NTransferPredictorRegistry.get_key_from_str("analytical") is M2NTransferType.ANALYTICAL + assert isinstance( + M2NTransferPredictorRegistry.get(M2NTransferType.ANALYTICAL, config), + AnalyticalM2NTransferPredictor, + ) + + +def test_m2n_analytical_predictor_rejects_invalid_transfer_pairs() -> None: + from frontier.m2n_transfer import AnalyticalM2NTransferPredictor + + predictor = AnalyticalM2NTransferPredictor(AnalyticalM2NTransferConfig()) + + with pytest.raises(ValueError, match="DECODE_ATTN <-> DECODE_FFN"): + predictor.get_transfer_time( + ClusterType.PREFILL, + 
ClusterType.DECODE, + _Batch(), + activation_size_bytes=64, + ) + + +def test_transfer_predictors_handle_zero_token_and_invalid_source_boundaries() -> None: + from frontier.kv_cache_transfer import AnalyticalKVCacheTransferPredictor + from frontier.m2n_transfer import AnalyticalM2NTransferPredictor + + kv_predictor = AnalyticalKVCacheTransferPredictor( + AnalyticalKVCacheTransferConfig(network_bandwidth_gbps=100.0, network_latency_ms=0.1) + ) + replica_config = _ReplicaConfig() + zero_request = SimpleNamespace(num_prefill_tokens=0) + assert kv_predictor.get_kv_cache_size_for_request(zero_request, replica_config) == 0 + assert kv_predictor.get_transfer_time( + ClusterType.PREFILL, + ClusterType.DECODE, + None, + kv_cache_size_bytes=0, + ) == pytest.approx(0.1) + + m2n_predictor = AnalyticalM2NTransferPredictor(AnalyticalM2NTransferConfig()) + with pytest.raises(ValueError, match="Invalid source cluster type"): + m2n_predictor.get_activation_size(_Batch(), replica_config, ClusterType.PREFILL) diff --git a/tests/unit/test_pd_transfer_types_and_configs.py b/tests/unit/test_pd_transfer_types_and_configs.py new file mode 100644 index 0000000..f5f875d --- /dev/null +++ b/tests/unit/test_pd_transfer_types_and_configs.py @@ -0,0 +1,82 @@ +"""pd-disaggregation transfer enum and config contract tests.""" + +import os +import subprocess +import sys +from pathlib import Path + +import pytest + + +REPO_ROOT = Path(__file__).resolve().parents[2] + + +def test_pd_transfer_types_are_exported_and_parse_analytical() -> None: + from frontier.types import KVCacheTransferType, M2NTransferType + + assert KVCacheTransferType.ANALYTICAL.value == 1 + assert str(KVCacheTransferType.ANALYTICAL) == "analytical" + assert KVCacheTransferType.from_str("analytical") is KVCacheTransferType.ANALYTICAL + + assert M2NTransferType.ANALYTICAL.value == 1 + assert str(M2NTransferType.ANALYTICAL) == "analytical" + assert M2NTransferType.from_str("analytical") is M2NTransferType.ANALYTICAL + + +def 
test_analytical_transfer_configs_return_enum_types() -> None: + from frontier.config.kv_cache_transfer_config import AnalyticalKVCacheTransferConfig + from frontier.config.m2n_transfer_config import AnalyticalM2NTransferConfig + from frontier.types import KVCacheTransferType, M2NTransferType + + assert AnalyticalKVCacheTransferConfig.get_type() is KVCacheTransferType.ANALYTICAL + assert AnalyticalKVCacheTransferConfig.get_name() == "analytical" + + assert AnalyticalM2NTransferConfig.get_type() is M2NTransferType.ANALYTICAL + assert AnalyticalM2NTransferConfig.get_name() == "analytical" + + +def test_pd_disaggregation_release_guard_rejects_parallel_cluster_default() -> None: + from frontier.config.config import SimulationConfig + + config = object.__new__(SimulationConfig) + config.sys_arch = "pd-disaggregation" + config.use_cuda_graph = False + config.enable_parallel_clusters = True + + with pytest.raises(ValueError, match="--no-enable_parallel_clusters"): + config._validate_open_source_release_architecture_guard() + + +def test_pd_disaggregation_release_guard_allows_explicit_sequential_mode() -> None: + from frontier.config.config import SimulationConfig + + config = object.__new__(SimulationConfig) + config.sys_arch = "pd-disaggregation" + config.use_cuda_graph = False + config.enable_parallel_clusters = False + + config._validate_open_source_release_architecture_guard() + + +def test_pd_disaggregation_cli_release_guard_exits_without_traceback() -> None: + env = os.environ.copy() + env.update( + { + "PYTHONPATH": str(REPO_ROOT), + "PYTHONDONTWRITEBYTECODE": "1", + } + ) + + result = subprocess.run( + [sys.executable, "-m", "frontier.main", "--sys_arch", "pd-disaggregation"], + cwd=REPO_ROOT, + env=env, + text=True, + capture_output=True, + check=False, + timeout=30, + ) + + assert result.returncode == 1 + assert "--no-enable_parallel_clusters" in result.stderr + assert "Traceback" not in result.stderr diff --git 
a/tests/unit/test_prefix_cache_cluster_validation.py b/tests/unit/test_prefix_cache_cluster_validation.py new file mode 100644 index 0000000..72a039b --- /dev/null +++ b/tests/unit/test_prefix_cache_cluster_validation.py @@ -0,0 +1,201 @@ +#!/usr/bin/env python3 +from __future__ import annotations + +from pathlib import Path +from types import SimpleNamespace + +import pytest + +from frontier.config.config import ( + FixedRequestLengthGeneratorConfig, + TraceRequestLengthGeneratorConfig, + RoundRobinClusterSchedulerConfig, + StickyRoundRobinClusterSchedulerConfig, + SyntheticRequestGeneratorConfig, + TraceRequestGeneratorConfig, + VllmV1SchedulerConfig, +) +from frontier.scheduler.cluster_scheduler.base_cluster_scheduler import ( + BaseClusterScheduler, +) +from frontier.types import ClusterType + + +class _DummyClusterScheduler(BaseClusterScheduler): + def schedule(self): + raise NotImplementedError + + +def _build_scheduler( + *, + num_replicas: int, + cluster_type: ClusterType, + cluster_scheduler_config, + replica_scheduler_config, + request_generator_config=None, +): + scheduler = object.__new__(_DummyClusterScheduler) + scheduler._cluster_type = cluster_type + scheduler._num_replicas = num_replicas + scheduler._config = SimpleNamespace( + cluster_scheduler_config=cluster_scheduler_config, + ) + scheduler._request_generator_config = request_generator_config + return scheduler + + +def test_prefix_cache_requires_sticky_scheduler_for_multi_replica_clusters() -> None: + scheduler = _build_scheduler( + num_replicas=2, + cluster_type=ClusterType.MONOLITHIC, + cluster_scheduler_config=RoundRobinClusterSchedulerConfig(), + replica_scheduler_config=VllmV1SchedulerConfig(enable_prefix_caching=True), + ) + + with pytest.raises(ValueError, match="sticky"): + scheduler._validate_prefix_cache_cluster_config( + VllmV1SchedulerConfig(enable_prefix_caching=True) + ) + + +def test_prefix_cache_allows_sticky_scheduler_for_multi_replica_clusters() -> None: + scheduler = 
_build_scheduler( + num_replicas=2, + cluster_type=ClusterType.PREFILL, + cluster_scheduler_config=StickyRoundRobinClusterSchedulerConfig(), + replica_scheduler_config=VllmV1SchedulerConfig(enable_prefix_caching=True), + ) + + scheduler._validate_prefix_cache_cluster_config( + VllmV1SchedulerConfig(enable_prefix_caching=True) + ) + + +def test_prefix_cache_rejects_plain_synthetic_request_source_before_scheduling() -> None: + scheduler = _build_scheduler( + num_replicas=1, + cluster_type=ClusterType.PREFILL, + cluster_scheduler_config=StickyRoundRobinClusterSchedulerConfig(), + replica_scheduler_config=VllmV1SchedulerConfig(enable_prefix_caching=True), + request_generator_config=SyntheticRequestGeneratorConfig( + length_generator_config=FixedRequestLengthGeneratorConfig( + prefill_tokens=32, + decode_tokens=8, + ) + ), + ) + + with pytest.raises(ValueError, match="session_id.*block_hash_ids"): + scheduler._validate_prefix_cache_cluster_config( + VllmV1SchedulerConfig(enable_prefix_caching=True) + ) + + +def test_prefix_cache_rejects_synthetic_trace_length_source_before_scheduling( + tmp_path: Path, +) -> None: + trace_file = tmp_path / "synthetic_length_trace.csv" + trace_file.write_text( + "num_prefill_tokens,num_decode_tokens,session_id,block_hash_ids\n" + "32,8,7,11|22\n", + encoding="utf-8", + ) + scheduler = _build_scheduler( + num_replicas=1, + cluster_type=ClusterType.PREFILL, + cluster_scheduler_config=StickyRoundRobinClusterSchedulerConfig(), + replica_scheduler_config=VllmV1SchedulerConfig(enable_prefix_caching=True), + request_generator_config=SyntheticRequestGeneratorConfig( + length_generator_config=TraceRequestLengthGeneratorConfig( + trace_file=str(trace_file), + ) + ), + ) + + with pytest.raises(ValueError, match="trace request source"): + scheduler._validate_prefix_cache_cluster_config( + VllmV1SchedulerConfig(enable_prefix_caching=True) + ) + + +def test_prefix_cache_allows_trace_request_source_with_prefix_metadata( + tmp_path: Path, +) -> None: + 
trace_file = tmp_path / "prefix_trace.csv" + trace_file.write_text( + "arrived_at,num_prefill_tokens,num_decode_tokens,session_id,block_hash_ids\n" + "0.0,32,8,7,11|22\n", + encoding="utf-8", + ) + scheduler = _build_scheduler( + num_replicas=1, + cluster_type=ClusterType.PREFILL, + cluster_scheduler_config=StickyRoundRobinClusterSchedulerConfig(), + replica_scheduler_config=VllmV1SchedulerConfig(enable_prefix_caching=True), + request_generator_config=TraceRequestGeneratorConfig( + trace_file=str(trace_file) + ), + ) + + scheduler._validate_prefix_cache_cluster_config( + VllmV1SchedulerConfig(enable_prefix_caching=True) + ) + + +def test_prefix_cache_rejects_trace_request_source_without_block_hash_ids( + tmp_path: Path, +) -> None: + trace_file = tmp_path / "missing_prefix_trace.csv" + trace_file.write_text( + "arrived_at,num_prefill_tokens,num_decode_tokens,session_id\n" + "0.0,32,8,7\n", + encoding="utf-8", + ) + scheduler = _build_scheduler( + num_replicas=1, + cluster_type=ClusterType.PREFILL, + cluster_scheduler_config=StickyRoundRobinClusterSchedulerConfig(), + replica_scheduler_config=VllmV1SchedulerConfig(enable_prefix_caching=True), + request_generator_config=TraceRequestGeneratorConfig( + trace_file=str(trace_file) + ), + ) + + with pytest.raises(ValueError, match="block_hash_ids"): + scheduler._validate_prefix_cache_cluster_config( + VllmV1SchedulerConfig(enable_prefix_caching=True) + ) + + +@pytest.mark.parametrize( + ("row", "missing_column"), + [ + ("0.0,32,8,,11|22\n", "session_id"), + ("0.0,32,8,7,\n", "block_hash_ids"), + ], +) +def test_prefix_cache_rejects_trace_request_source_with_empty_metadata_values( + tmp_path: Path, + row: str, + missing_column: str, +) -> None: + trace_file = tmp_path / "empty_prefix_metadata_trace.csv" + trace_file.write_text( + "arrived_at,num_prefill_tokens,num_decode_tokens,session_id,block_hash_ids\n" + + row, + encoding="utf-8", + ) + scheduler = _build_scheduler( + num_replicas=1, + cluster_type=ClusterType.PREFILL, 
+ cluster_scheduler_config=StickyRoundRobinClusterSchedulerConfig(), + replica_scheduler_config=VllmV1SchedulerConfig(enable_prefix_caching=True), + request_generator_config=TraceRequestGeneratorConfig( + trace_file=str(trace_file) + ), + ) + + with pytest.raises(ValueError, match=fr"row 2.*{missing_column}"): + scheduler._validate_prefix_cache_cluster_config( + VllmV1SchedulerConfig(enable_prefix_caching=True) + ) diff --git a/tests/unit/test_request_generator_decode_bound_count.py b/tests/unit/test_request_generator_decode_bound_count.py new file mode 100644 index 0000000..0c79645 --- /dev/null +++ b/tests/unit/test_request_generator_decode_bound_count.py @@ -0,0 +1,33 @@ +#!/usr/bin/env python3 +"""Regression tests for request-generator decode-bound workload metadata.""" + +from __future__ import annotations + +from frontier.config.config import BaseRequestGeneratorConfig +from frontier.request_generator.base_request_generator import BaseRequestGenerator + + +class _MixedDecodeRequestGenerator(BaseRequestGenerator): + def generate_requests(self): + return [ + self._build_request( + arrived_at=0.0, + num_prefill_tokens=8, + num_decode_tokens=0, + ), + self._build_request( + arrived_at=1.0, + num_prefill_tokens=8, + num_decode_tokens=3, + ), + ] + + +def test_generate_records_decode_bound_request_count() -> None: + config = BaseRequestGeneratorConfig() + generator = _MixedDecodeRequestGenerator(config) + + requests = generator.generate() + + assert len(requests) == 2 + assert config.num_decode_bound_requests == 1 From f41ec8ba136e47ae0f14b95f637ea00ae37b575a Mon Sep 17 00:00:00 2001 From: fwyc0573 <50061432+fwyc0573@users.noreply.github.com> Date: Sun, 14 Jun 2026 01:21:08 +0800 Subject: [PATCH 2/3] Add pd-disaggregation examples and smoke coverage Constraint: Commit is limited to examples/architecture/pdd, examples indexes, transfer config boundary checks, and PDD smoke/cross-validation tests for the local worktrees/Frontier PR branch. 
Rejected: Applying the full examples patch | target main already contains co-location offline/online layout, and broad co-location rewrites are outside the pd-disaggregation PR boundary. Confidence: high Scope-risk: moderate Directive: Keep generated outputs, analysis/performance harnesses, root docs, and profiling docs out of this PR unless a future scope expansion explicitly includes them. Tested: PYTHONPATH=/local/ycfeng/frontier/worktrees/Frontier conda run -n frontier python -m pytest tests/unit/test_examples_pdd_scripts.py tests/unit/test_pdd_scripts_cross_validate.py tests/e2e/test_pd_disaggregation_example_script_smoke.py tests/e2e/test_pdd_example_scripts_smoke.py -q -> 22 passed in 16.70s; changed_python_files=8 py_compile PASS; pdd_shell_scripts=12 bash -n PASS; git diff --check PASS; staged_unexpected_files=0. Not-tested: Full final preparation gate is reserved for the completed three-commit branch. --- examples/README.md | 94 +++- examples/architecture/README.md | 75 ++- .../architecture/pdd/dense_model_basic.sh | 16 + .../pdd/offline/dense_model_basic.sh | 217 ++++++++ .../pdd/offline/moe_model_basic.sh | 229 +++++++++ .../pdd/offline/moe_prefix_caching.sh | 241 +++++++++ .../architecture/pdd/offline/moe_spec_dec.sh | 282 ++++++++++ .../pdd/offline/thinking_mode_basic.sh | 225 ++++++++ .../pdd/online/dense_model_basic_online.sh | 217 ++++++++ .../pdd/online/moe_model_basic_online.sh | 229 +++++++++ .../pdd/online/moe_prefix_caching_online.sh | 241 +++++++++ .../pdd/online/moe_spec_dec_online.sh | 282 ++++++++++ .../pdd/online/thinking_mode_basic_online.sh | 225 ++++++++ examples/architecture/pdd/run_all.sh | 55 ++ frontier/config/kv_cache_transfer_config.py | 29 +- frontier/config/m2n_transfer_config.py | 28 +- tests/e2e/pdd_scripts_cross_validate.py | 484 ++++++++++++++++++ ..._pd_disaggregation_example_script_smoke.py | 75 +++ tests/e2e/test_pdd_example_scripts_smoke.py | 88 ++++ tests/unit/test_examples_pdd_scripts.py | 139 +++++ 
.../test_pd_transfer_types_and_configs.py | 51 +- tests/unit/test_pdd_scripts_cross_validate.py | 118 +++++ 22 files changed, 3607 insertions(+), 33 deletions(-) create mode 100755 examples/architecture/pdd/dense_model_basic.sh create mode 100755 examples/architecture/pdd/offline/dense_model_basic.sh create mode 100755 examples/architecture/pdd/offline/moe_model_basic.sh create mode 100755 examples/architecture/pdd/offline/moe_prefix_caching.sh create mode 100755 examples/architecture/pdd/offline/moe_spec_dec.sh create mode 100755 examples/architecture/pdd/offline/thinking_mode_basic.sh create mode 100755 examples/architecture/pdd/online/dense_model_basic_online.sh create mode 100755 examples/architecture/pdd/online/moe_model_basic_online.sh create mode 100755 examples/architecture/pdd/online/moe_prefix_caching_online.sh create mode 100755 examples/architecture/pdd/online/moe_spec_dec_online.sh create mode 100755 examples/architecture/pdd/online/thinking_mode_basic_online.sh create mode 100755 examples/architecture/pdd/run_all.sh create mode 100644 tests/e2e/pdd_scripts_cross_validate.py create mode 100644 tests/e2e/test_pd_disaggregation_example_script_smoke.py create mode 100644 tests/e2e/test_pdd_example_scripts_smoke.py create mode 100644 tests/unit/test_examples_pdd_scripts.py create mode 100644 tests/unit/test_pdd_scripts_cross_validate.py diff --git a/examples/README.md b/examples/README.md index 09ce407..57e7699 100644 --- a/examples/README.md +++ b/examples/README.md @@ -2,6 +2,7 @@ | Date | Summary of Changes | |------------|--------------------| +| 2026-06-14 | Added PDD pd-disaggregation examples, script index, and release-scope guidance for local PR preparation. | | 2026-06-08 | Clarified that dummy analytical co-location smoke runs validate runtime plumbing, not profiling fidelity. | | 2026-06-08 | Split co-location examples into `offline/` and `online/`, added suite runner and cross-validation guidance. 
| | 2026-06-07 | Added optimized co-location advanced MoE recipes, top-level profiling examples, and corrected metrics behavior for Thinking Mode. | @@ -16,36 +17,43 @@ This directory contains runnable examples for the release-supported Frontier sim ## Release Scope -`pre-release-v0.1` supports only the `co-location` architecture. Historical `pd-disaggregation` and `pd-af-disaggregation` examples are intentionally not included in this branch. If those architectures are requested through CLI/config, Frontier exits with the release error documented in the top-level `README.md`. +`pre-release-v0.2` foregrounds **PDD / `pd-disaggregation`** examples: prefill runs in the `PREFILL` cluster, decode runs in the unified `DECODE` cluster, and KV cache is transferred between them. The public PDD example path uses the sequential simulator mode through `--no-enable_parallel_clusters`. -## Quick Start +The `pd-af-disaggregation` architecture and split `DECODE_ATTN` / `DECODE_FFN` release surface remain intentionally outside this examples release scope. Co-location examples are still kept as baseline comparison recipes and historical v0.1-compatible references. -The co-location examples are split by simulation mode: +## Quick Start -- `examples/architecture/co-location/offline/`: offline batch simulations. Existing offline examples were moved here unchanged in scenario intent. -- `examples/architecture/co-location/online/`: online serving simulations that mirror the offline scenarios while preserving generated request arrivals. -- `examples/architecture/co-location/run_all.sh`: one-click suite runner for all 10 co-location cases. +The PDD dense example uses dummy execution-time prediction and the analytical communication backend, so it does not require profiling data or the collective-sim binary for the first smoke run. ```bash export PYTHONPATH=$PWD export WANDB_DISABLED=true export VIDUR_DISABLE_WANDB=1 -# Run all five offline cases and all five online cases. 
-bash examples/architecture/co-location/run_all.sh +bash examples/architecture/pdd/offline/dense_model_basic.sh +``` -# Run one case directly. -bash examples/architecture/co-location/offline/dense_model_basic.sh -bash examples/architecture/co-location/online/dense_model_basic_online.sh +For the complete PDD architecture suite, run: -# Thinking Mode examples are available in both modes. -bash examples/architecture/co-location/offline/thinking_mode_basic.sh -bash examples/architecture/co-location/online/thinking_mode_basic_online.sh +```bash +bash examples/architecture/pdd/run_all.sh ``` -All co-location examples default to `--cc_backend_config_type analytical` so the suite is one-click runnable on a fresh checkout without building the collective-sim binary. To exercise the topology-aware backend, set `CC_BACKEND=collective_sim` and build `frontier/cc_backend/backends/collective-sim/sim/datacenter/htsim_ndp` first. +Co-location baseline and advanced recipes remain available for comparison. 
The current co-location layout is split into offline and online entrypoints: -Profiling commands can be validated without launching GPU kernels by using `--dry-run`: +```bash +bash examples/architecture/co-location/run_all.sh +bash examples/architecture/co-location/offline/dense_model_basic.sh +bash examples/architecture/co-location/offline/moe_model_basic.sh +bash examples/architecture/co-location/offline/thinking_mode_basic.sh +bash examples/architecture/co-location/offline/moe_spec_dec.sh +bash examples/architecture/co-location/offline/moe_prefix_caching.sh +bash examples/architecture/co-location/online/dense_model_basic_online.sh +bash examples/architecture/co-location/online/moe_model_basic_online.sh +bash examples/architecture/co-location/online/thinking_mode_basic_online.sh +bash examples/architecture/co-location/online/moe_spec_dec_online.sh +bash examples/architecture/co-location/online/moe_prefix_caching_online.sh +``` ```bash bash examples/profiling/profile_linear_op.sh --dry-run @@ -62,6 +70,21 @@ examples/ │ └── prefix_cache_shared_session_trace.csv ├── architecture/ │ ├── README.md +│ ├── pdd/ +│ │ ├── run_all.sh +│ │ ├── dense_model_basic.sh +│ │ ├── offline/ +│ │ │ ├── dense_model_basic.sh +│ │ │ ├── moe_model_basic.sh +│ │ │ ├── thinking_mode_basic.sh +│ │ │ ├── moe_spec_dec.sh +│ │ │ └── moe_prefix_caching.sh +│ │ └── online/ +│ │ ├── dense_model_basic_online.sh +│ │ ├── moe_model_basic_online.sh +│ │ ├── thinking_mode_basic_online.sh +│ │ ├── moe_spec_dec_online.sh +│ │ └── moe_prefix_caching_online.sh │ └── co-location/ │ ├── run_all.sh │ ├── offline/ @@ -88,9 +111,19 @@ examples/ ## Architecture Mode +### PDD / pd-disaggregation + +Separate prefill and decode clusters model prefill/decode disaggregation without splitting decode attention and decode FFN into separate public release clusters. + +- `--sys_arch pd-disaggregation` +- Uses `PREFILL` and unified `DECODE` clusters. 
+- Supports Dense, MoE, Thinking Mode, Speculative Decoding / MTP, and Prefix Caching examples in offline and online modes. +- Uses `--no-enable_parallel_clusters` because the pre-release-v0.2 public PDD path is the sequential simulator path; parallel cluster processing is still guarded. +- Keeps `pd-af-disaggregation` and global `--use_cuda_graph` outside the v0.2 examples release surface. + ### Co-location -Single monolithic cluster handles all prefill and decode work. +Single monolithic cluster handles all prefill and decode work. These examples are retained as baseline comparison recipes. - `--sys_arch co-location` - Supports dense and MoE model configs. @@ -99,13 +132,22 @@ Single monolithic cluster handles all prefill and decode work. ## Key Configuration Options +### PDD Cluster Layout + +- `--cluster_config_prefill_cluster_num_replicas`: Number of `PREFILL` cluster replicas. +- `--cluster_config_decode_cluster_num_replicas`: Number of unified `DECODE` cluster replicas. +- `--cluster_config_prefill_replica_config_*`: `PREFILL` replica parallelism and device fields. +- `--cluster_config_decode_replica_config_*`: `DECODE` replica parallelism and device fields. +- `--analytical_kv_cache_transfer_config_network_bandwidth_gbps`: Analytical KV transfer bandwidth. +- `--analytical_kv_cache_transfer_config_network_latency_ms`: Analytical KV transfer latency. + ### Parallelism -- `--replica_config_attn_tensor_parallel_size`: Attention tensor parallelism. -- `--replica_config_moe_tensor_parallel_size`: MoE tensor parallelism. -- `--replica_config_moe_expert_parallel_size`: Expert parallelism. -- `--replica_config_num_pipeline_stages`: Pipeline parallelism. -- `--cluster_config_num_replicas`: Number of monolithic cluster replicas. +- `--replica_config_attn_tensor_parallel_size`: Attention tensor parallelism for co-location examples. +- `--replica_config_moe_tensor_parallel_size`: MoE tensor parallelism for co-location examples. 
+- `--replica_config_moe_expert_parallel_size`: Expert parallelism for co-location examples. +- `--replica_config_num_pipeline_stages`: Pipeline parallelism for co-location examples. +- `--cluster_config_num_replicas`: Number of monolithic cluster replicas for co-location examples. ### Request Generation @@ -129,11 +171,11 @@ Single monolithic cluster handles all prefill and decode work. ## Running Examples -The checked-in co-location simulation examples use dummy mode (`--random_forrest_execution_time_predictor_config_enable_dummy_mode`) for quick testing without profiling data. Dummy mode skips ML predictor training and profiling metadata loading, so missing profiling CSVs do not affect smoke-test correctness. +The checked-in PDD examples use dummy mode (`--random_forrest_execution_time_predictor_config_enable_dummy_mode`), analytical communication cost modeling, and `--no-enable_parallel_clusters` for quick testing without profiling data. The expected minimal dense smoke behavior is one completed request, one KV cache transfer, and no release-guard crash. Metrics are written under `outputs/examples/pdd` by default. -These examples validate CLI/runtime plumbing and metrics artifact generation, not profiling fidelity. Use non-dummy profiling data before drawing hardware accuracy conclusions. +PDD Thinking Mode can produce multiple prefill-to-decode handoffs for one user request. The default small smoke configuration completes one request and records two KV transfers. -Offline cases write under `outputs/examples/co-location/offline//offline_batch//` by default. Online cases write under `outputs/examples/co-location/online//online_serving//` by default. The mode-specific `offline_batch` / `online_serving` path segment is added by Frontier's metrics taxonomy. +Co-location examples also use dummy mode for quick testing without profiling data. These examples validate CLI/runtime plumbing and metrics artifact generation, not profiling fidelity. 
Use non-dummy profiling data before drawing hardware accuracy conclusions. Baseline co-location scripts default to `decode_cuda_graph_mode=full_decode_only` and Chunked Prefill. The Speculative Decoding / MTP recipes use `decode_cuda_graph_mode=none` because speculative decoding currently conflicts with decode CUDA Graph modeling. The Prefix Caching recipes replay `examples/fixtures/prefix_cache_shared_session_trace.csv` to exercise cache-hit behavior. @@ -154,7 +196,7 @@ When comparing offline and online pairs, validate the following for each scenari ## Thinking Mode Example -The Thinking Mode scripts use: +The PDD and co-location Thinking Mode scripts use: - `--enable_thinking_mode` - `--thinking_depth 2` diff --git a/examples/architecture/README.md b/examples/architecture/README.md index 3ee7ff8..b780fe3 100644 --- a/examples/architecture/README.md +++ b/examples/architecture/README.md @@ -1,10 +1,18 @@ # Architecture Examples -This directory contains one-click architecture entrypoints for Frontier's release-supported runtime layout. +## Modification History + +| Date | Summary of Changes | +|------------|--------------------| +| 2026-06-14 | Added PDD pd-disaggregation script list, configuration contract, and validation criteria for local PR preparation. | + +This directory contains one-click architecture entrypoints for Frontier's release-supported runtime layouts. ## Release Scope -`pre-release-v0.1` supports only `co-location`. Disaggregated architecture examples are intentionally absent from this branch because the runtime guard rejects `pd-disaggregation` and `pd-af-disaggregation`. +`pre-release-v0.2` foregrounds **PDD / `pd-disaggregation`** examples. Prefill runs in the `PREFILL` cluster, decode runs in the unified `DECODE` cluster, and KV cache is transferred between them. The public PDD example path uses the sequential simulator mode through `--no-enable_parallel_clusters`. 
+ +`co-location` examples remain available as baseline comparison recipes and v0.1-compatible architecture references. `pd-af-disaggregation` and split `DECODE_ATTN` / `DECODE_FFN` public examples remain outside this release scope. ## Scripts @@ -21,6 +29,38 @@ This directory contains one-click architecture entrypoints for Frontier's releas | `co-location/online/thinking_mode_basic_online.sh` | Online Thinking Mode v1 co-location | Mirrors Thinking Mode offline settings with `--simulation_mode online` | | `co-location/online/moe_spec_dec_online.sh` | Online MoE Speculative Decoding / MTP | Mirrors Speculative Decoding offline settings with `--simulation_mode online` | | `co-location/online/moe_prefix_caching_online.sh` | Online MoE Prefix Caching | Replays the same prefix-cache fixture with `--simulation_mode online` | +| `pdd/run_all.sh` | Full PDD suite | Runs all five offline PDD cases and all five online PDD cases; pass extra Frontier CLI flags after `--` | +| `pdd/offline/dense_model_basic.sh` | Offline dense PDD baseline | Sequential `pd-disaggregation`, analytical backend, dummy execution time, Chunked Prefill, CSV/JSON metrics | +| `pdd/offline/moe_model_basic.sh` | Offline MoE PDD baseline | Sequential `pd-disaggregation`, reference-runnable shared-domain MoE topology, Chunked Prefill, CSV/JSON metrics | +| `pdd/offline/thinking_mode_basic.sh` | Offline Thinking Mode v1 PDD | Thinking Mode with two KV transfer handoffs for the one-request smoke configuration | +| `pdd/offline/moe_spec_dec.sh` | Offline MoE PDD Speculative Decoding / MTP | Speculative Decoding enabled; Prefix Caching intentionally disabled; `DECODE_CUDA_GRAPH_MODE=none` | +| `pdd/offline/moe_prefix_caching.sh` | Offline MoE PDD Prefix Caching | Sticky scheduler with `examples/fixtures/prefix_cache_shared_session_trace.csv` | +| `pdd/online/dense_model_basic_online.sh` | Online dense PDD baseline | Mirrors dense offline settings with `--simulation_mode online` | +| 
`pdd/online/moe_model_basic_online.sh` | Online MoE PDD baseline | Mirrors MoE offline settings with `--simulation_mode online` | +| `pdd/online/thinking_mode_basic_online.sh` | Online Thinking Mode v1 PDD | Mirrors Thinking Mode offline settings with `--simulation_mode online` | +| `pdd/online/moe_spec_dec_online.sh` | Online MoE PDD Speculative Decoding / MTP | Mirrors Speculative Decoding offline settings with `--simulation_mode online` | +| `pdd/online/moe_prefix_caching_online.sh` | Online MoE PDD Prefix Caching | Replays the same prefix-cache fixture with `--simulation_mode online` | + +## PDD Configuration Contract + +All PDD scripts use these release-supported defaults unless overridden from the shell: + +- `--sys_arch pd-disaggregation` +- `--no-enable_parallel_clusters` +- explicit `PREFILL` and unified `DECODE` cluster settings +- `--cc_backend_config_type analytical` +- dummy execution-time prediction enabled by default +- CSV/JSON metrics enabled by default through `--metrics_config_write_metrics` and `--metrics_config_store_request_metrics` +- plots, Chrome trace, and JSON event trace disabled for lightweight one-click artifacts + +MoE PDD scripts also enforce the shared-domain invariant before launching Frontier: + +```text +PREFILL_ATTN_TP * PREFILL_ATTN_DP == PREFILL_MOE_TP * PREFILL_MOE_EP +DECODE_ATTN_TP * DECODE_ATTN_DP == DECODE_MOE_TP * DECODE_MOE_EP +``` + +This fail-fast check prevents known non-runnable MoE topology combinations from entering the simulator. ## Thinking Mode v1 @@ -34,10 +74,29 @@ The Thinking Mode examples use: - `--cc_backend_config_type analytical` so the one-click smoke run works on a minimal single-replica layout - CSV/JSON metrics enabled by default, with plots, Chrome trace, and JSON event trace disabled for lightweight artifacts +Under PDD, one user request can produce multiple prefill-to-decode KV handoffs. The default Thinking Mode smoke case completes one request and records two KV transfers. 
+ ## Recommended Start Order ```bash -# Full suite. +# Full PDD suite for pre-release-v0.2. +bash examples/architecture/pdd/run_all.sh + +# PDD offline cases. +bash examples/architecture/pdd/offline/dense_model_basic.sh +bash examples/architecture/pdd/offline/moe_model_basic.sh +bash examples/architecture/pdd/offline/thinking_mode_basic.sh +bash examples/architecture/pdd/offline/moe_spec_dec.sh +bash examples/architecture/pdd/offline/moe_prefix_caching.sh + +# PDD online cases. +bash examples/architecture/pdd/online/dense_model_basic_online.sh +bash examples/architecture/pdd/online/moe_model_basic_online.sh +bash examples/architecture/pdd/online/thinking_mode_basic_online.sh +bash examples/architecture/pdd/online/moe_spec_dec_online.sh +bash examples/architecture/pdd/online/moe_prefix_caching_online.sh + +# Full co-location comparison suite. bash examples/architecture/co-location/run_all.sh # Offline cases. @@ -55,7 +114,7 @@ bash examples/architecture/co-location/online/moe_spec_dec_online.sh bash examples/architecture/co-location/online/moe_prefix_caching_online.sh ``` -Use the baseline scripts first, then use the Speculative Decoding / MTP and Prefix Caching recipes as advanced cases. +Use the dense baseline scripts first, then use the Thinking Mode, Speculative Decoding / MTP, and Prefix Caching recipes as advanced cases. ## Cross-validation Criteria @@ -66,3 +125,11 @@ For each offline/online pair: 3. Record expected request count, actual request rows, completed request rows, total input tokens, total output tokens, mean TTFT, mean latency, and request throughput when present. 4. Confirm offline outputs include the `offline_batch` taxonomy segment and online outputs include `online_serving`. 5. Treat latency differences as expected when online mode preserves request arrival times; investigate only if counts, token totals, output files, or finite numeric metrics diverge unexpectedly. + +For every PDD script, the release gate should additionally record: + +1. 
The script exits with code `0`. +2. `request_metrics.csv` and `system_metrics.json` exist in the metrics output directory. +3. Request row count, `total_requests`, and `completed_requests` match the expected case size. +4. KV transfer count, total KV bytes, and KV transfer time are present and positive. +5. Request-level `ttft`, `tpot`, `request_e2e_time`, and `transfer_kv_cache` are finite and positive. diff --git a/examples/architecture/pdd/dense_model_basic.sh b/examples/architecture/pdd/dense_model_basic.sh new file mode 100755 index 0000000..3ac4e64 --- /dev/null +++ b/examples/architecture/pdd/dense_model_basic.sh @@ -0,0 +1,16 @@ +#!/bin/bash +# ============================================================================= +# Compatibility entrypoint for the PDD dense offline example +# ============================================================================= +# The canonical pre-release-v0.2 PDD example layout is: +# examples/architecture/pdd/offline/dense_model_basic.sh +# examples/architecture/pdd/online/dense_model_basic_online.sh +# +# Keep this top-level script as a backward-compatible alias for users who ran +# the early PDD dense smoke entrypoint before the offline/online split. 
+# ============================================================================= + +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" +exec bash "$SCRIPT_DIR/offline/dense_model_basic.sh" "$@" diff --git a/examples/architecture/pdd/offline/dense_model_basic.sh b/examples/architecture/pdd/offline/dense_model_basic.sh new file mode 100755 index 0000000..0492e7c --- /dev/null +++ b/examples/architecture/pdd/offline/dense_model_basic.sh @@ -0,0 +1,217 @@ +#!/bin/bash +# ============================================================================= +# PDD / pd-disaggregation Offline Mode - Dense Model Example +# ============================================================================= +# This script mirrors the co-location example surface while using the +# pre-release-v0.2 PDD / pd-disaggregation architecture: prefill runs in the PREFILL cluster, +# decode runs in the DECODE cluster, and KV cache is transferred between them. +# +# This script demonstrates the release-supported sequential pd-disaggregation path. +# Decode CUDA Graph modeling and Chunked Prefill can be toggled with +# DECODE_CUDA_GRAPH_MODE and ENABLE_CHUNKED_PREFILL. +## Override any uppercase variable from the shell, and append extra Frontier CLI +# flags after "--" if you need to customize the run. +# ============================================================================= + +set -euo pipefail + +REPO_ROOT="$(cd "$(dirname "$0")/../../../.." 
&& pwd)" +export PYTHONPATH="$REPO_ROOT${PYTHONPATH:+:$PYTHONPATH}" +export WANDB_DISABLED=true +export VIDUR_DISABLE_WANDB=1 +PYTHON_BIN="${PYTHON_BIN:-python3}" + +MODEL_NAME="${MODEL_NAME:-meta-llama/Llama-2-7b-hf}" +SYS_ARCH="${SYS_ARCH:-pd-disaggregation}" +PREFILL_REPLICAS="${PREFILL_REPLICAS:-1}" +DECODE_REPLICAS="${DECODE_REPLICAS:-1}" +PREFILL_ATTN_TP="${PREFILL_ATTN_TP:-1}" +PREFILL_ATTN_DP="${PREFILL_ATTN_DP:-1}" +PREFILL_MOE_TP="${PREFILL_MOE_TP:-1}" +PREFILL_MOE_EP="${PREFILL_MOE_EP:-1}" +PREFILL_PP="${PREFILL_PP:-1}" +PREFILL_DEVICE="${PREFILL_DEVICE:-a800}" +PREFILL_MEMORY_MARGIN_FRACTION="${PREFILL_MEMORY_MARGIN_FRACTION:-0.2}" +DECODE_ATTN_TP="${DECODE_ATTN_TP:-1}" +DECODE_ATTN_DP="${DECODE_ATTN_DP:-1}" +DECODE_MOE_TP="${DECODE_MOE_TP:-1}" +DECODE_MOE_EP="${DECODE_MOE_EP:-1}" +DECODE_PP="${DECODE_PP:-1}" +DECODE_DEVICE="${DECODE_DEVICE:-a800}" +DECODE_MEMORY_MARGIN_FRACTION="${DECODE_MEMORY_MARGIN_FRACTION:-0.2}" +TOTAL_EXPERTS="${TOTAL_EXPERTS:-1}" +ROUTER_TOPK="${ROUTER_TOPK:-1}" +MOE_ROUTING_MODE="${MOE_ROUTING_MODE:-simulation}" +MOE_ROUTING_SEED="${MOE_ROUTING_SEED:-42}" +REPLICA_SCHEDULER="${REPLICA_SCHEDULER:-vllm_v1}" +NUM_REQUESTS="${NUM_REQUESTS:-8}" +PREFILL_TOKENS="${PREFILL_TOKENS:-512}" +DECODE_TOKENS="${DECODE_TOKENS:-64}" +QPS="${QPS:-1.0}" +ENABLE_DUMMY_MODE="${ENABLE_DUMMY_MODE:-true}" +DUMMY_EXEC_TIME_MS="${DUMMY_EXEC_TIME_MS:-1.0}" +DECODE_CUDA_GRAPH_MODE="${DECODE_CUDA_GRAPH_MODE:-none}" +ENABLE_CHUNKED_PREFILL="${ENABLE_CHUNKED_PREFILL:-true}" +MAX_TOKENS_IN_BATCH="${MAX_TOKENS_IN_BATCH:-1024}" +LONG_PREFILL_TOKEN_THRESHOLD="${LONG_PREFILL_TOKEN_THRESHOLD:-64}" +KV_TRANSFER_BANDWIDTH_GBPS="${KV_TRANSFER_BANDWIDTH_GBPS:-200.0}" +KV_TRANSFER_LATENCY_MS="${KV_TRANSFER_LATENCY_MS:-0.5}" +METRICS_OUTPUT_DIR="${METRICS_OUTPUT_DIR:-$REPO_ROOT/outputs/examples/pdd/offline}" +RUN_ID="${RUN_ID:-dense_model_basic}" + +require_bool() { + local name="$1" + local value="$2" + if [ "$value" != "true" ] && [ "$value" != "false" ]; then + echo
"ERROR: $name must be true or false; got $value" >&2 + exit 2 + fi +} + +require_non_negative_integer() { + local name="$1" + local value="$2" + if [[ ! "$value" =~ ^[0-9]+$ ]]; then + echo "ERROR: $name must be a non-negative integer; got $value" >&2 + exit 2 + fi +} + +require_positive_integer() { # Print an error and exit 2 unless the value is a positive integer, like the sibling validators. + local name="$1" + local value="$2" + [[ "$value" =~ ^[1-9][0-9]*$ ]] || { echo "ERROR: $name must be a positive integer; got $value" >&2; exit 2; } +} + +require_bool "ENABLE_DUMMY_MODE" "$ENABLE_DUMMY_MODE" +require_bool "ENABLE_CHUNKED_PREFILL" "$ENABLE_CHUNKED_PREFILL" + +if [ "$SYS_ARCH" != "pd-disaggregation" ]; then + echo "ERROR: this example only supports SYS_ARCH=pd-disaggregation; got SYS_ARCH=$SYS_ARCH" >&2 + exit 2 +fi + +if [ "$DECODE_CUDA_GRAPH_MODE" = "none" ]; then + echo "INFO: Decode CUDA Graph modeling is disabled by DECODE_CUDA_GRAPH_MODE=none." +elif [ "$DECODE_CUDA_GRAPH_MODE" != "full_decode_only" ] && [ "$DECODE_CUDA_GRAPH_MODE" != "piecewise" ]; then + echo "ERROR: DECODE_CUDA_GRAPH_MODE must be none, full_decode_only, or piecewise; got $DECODE_CUDA_GRAPH_MODE" >&2 + exit 2 +fi + +if [ "$ENABLE_CHUNKED_PREFILL" = "false" ] && [ "$LONG_PREFILL_TOKEN_THRESHOLD" != "0" ]; then + echo "ERROR: LONG_PREFILL_TOKEN_THRESHOLD must be 0 when ENABLE_CHUNKED_PREFILL=false" >&2 + exit 2 +fi + +if !
command -v "$PYTHON_BIN" >/dev/null 2>&1; then + echo "ERROR: PYTHON_BIN is not executable or not on PATH: $PYTHON_BIN" >&2 + exit 2 +fi + +CMD=( + "$PYTHON_BIN" -m frontier.main + --simulation_mode offline + --sys_arch "$SYS_ARCH" + --no-enable_parallel_clusters + --cluster_config_prefill_cluster_num_replicas "$PREFILL_REPLICAS" + --cluster_config_decode_cluster_num_replicas "$DECODE_REPLICAS" + --cluster_config_prefill_replica_config_num_pipeline_stages "$PREFILL_PP" + --cluster_config_prefill_replica_config_attn_tensor_parallel_size "$PREFILL_ATTN_TP" + --cluster_config_prefill_replica_config_attn_data_parallel_size "$PREFILL_ATTN_DP" + --cluster_config_prefill_replica_config_moe_tensor_parallel_size "$PREFILL_MOE_TP" + --cluster_config_prefill_replica_config_moe_expert_parallel_size "$PREFILL_MOE_EP" + --cluster_config_prefill_replica_config_total_expert_num "$TOTAL_EXPERTS" + --cluster_config_prefill_replica_config_router_topk "$ROUTER_TOPK" + --cluster_config_prefill_replica_config_device "$PREFILL_DEVICE" + --cluster_config_prefill_replica_config_memory_margin_fraction "$PREFILL_MEMORY_MARGIN_FRACTION" + --cluster_config_decode_replica_config_num_pipeline_stages "$DECODE_PP" + --cluster_config_decode_replica_config_attn_tensor_parallel_size "$DECODE_ATTN_TP" + --cluster_config_decode_replica_config_attn_data_parallel_size "$DECODE_ATTN_DP" + --cluster_config_decode_replica_config_moe_tensor_parallel_size "$DECODE_MOE_TP" + --cluster_config_decode_replica_config_moe_expert_parallel_size "$DECODE_MOE_EP" + --cluster_config_decode_replica_config_total_expert_num "$TOTAL_EXPERTS" + --cluster_config_decode_replica_config_router_topk "$ROUTER_TOPK" + --cluster_config_decode_replica_config_device "$DECODE_DEVICE" + --cluster_config_decode_replica_config_memory_margin_fraction "$DECODE_MEMORY_MARGIN_FRACTION" + --cc_backend_config_type analytical + --replica_config_model_name "$MODEL_NAME" + --replica_config_moe_routing_mode "$MOE_ROUTING_MODE" + 
--replica_config_moe_routing_seed "$MOE_ROUTING_SEED" + --replica_scheduler_config_type "$REPLICA_SCHEDULER" + --decode_cuda_graph_mode "$DECODE_CUDA_GRAPH_MODE" + --vllm_v1_scheduler_config_max_tokens_in_batch "$MAX_TOKENS_IN_BATCH" + --vllm_v1_scheduler_config_long_prefill_token_threshold "$LONG_PREFILL_TOKEN_THRESHOLD" + --vllm_v1_scheduler_config_block_size "${BLOCK_SIZE:-16}" + --vllm_v1_scheduler_config_num_blocks "${NUM_BLOCKS:-128}" + --request_generator_config_type synthetic + --synthetic_request_generator_config_num_requests "$NUM_REQUESTS" + --length_generator_config_type fixed + --fixed_request_length_generator_config_prefill_tokens "$PREFILL_TOKENS" + --fixed_request_length_generator_config_decode_tokens "$DECODE_TOKENS" + --interval_generator_config_type poisson + --poisson_request_interval_generator_config_qps "$QPS" + --analytical_kv_cache_transfer_config_network_bandwidth_gbps "$KV_TRANSFER_BANDWIDTH_GBPS" + --analytical_kv_cache_transfer_config_network_latency_ms "$KV_TRANSFER_LATENCY_MS" + --metrics_config_output_dir "$METRICS_OUTPUT_DIR" + --metrics_config_run_id "$RUN_ID" + --metrics_config_write_metrics + --metrics_config_store_request_metrics + --metrics_config_store_batch_metrics + --metrics_config_store_token_completion_metrics + --metrics_config_store_utilization_metrics + --no-metrics_config_store_plots + --no-metrics_config_enable_chrome_trace + --no-metrics_config_write_json_trace + +) + +if [ "$ENABLE_CHUNKED_PREFILL" = "true" ]; then + CMD+=(--vllm_v1_scheduler_config_enable_chunked_prefill) +else + CMD+=(--no-vllm_v1_scheduler_config_enable_chunked_prefill) +fi + +if [ "$ENABLE_DUMMY_MODE" = "true" ]; then + CMD+=( + --random_forrest_execution_time_predictor_config_enable_dummy_mode + --random_forrest_execution_time_predictor_config_dummy_execution_time_ms "$DUMMY_EXEC_TIME_MS" + ) +fi + +if [ "$#" -gt 0 ]; then + if [ "$1" = "--" ]; then + shift + fi + CMD+=("$@") +fi + +cat <&2 + exit "$exit_code" +fi diff --git 
a/examples/architecture/pdd/offline/moe_model_basic.sh b/examples/architecture/pdd/offline/moe_model_basic.sh new file mode 100755 index 0000000..e4e9dce --- /dev/null +++ b/examples/architecture/pdd/offline/moe_model_basic.sh @@ -0,0 +1,229 @@ +#!/bin/bash +# ============================================================================= +# PDD / pd-disaggregation Offline Mode - MoE Model Example +# ============================================================================= +# This script mirrors the co-location example surface while using the +# pre-release-v0.2 PDD / pd-disaggregation architecture: prefill runs in the PREFILL cluster, +# decode runs in the DECODE cluster, and KV cache is transferred between them. +# +# This script demonstrates the release-supported sequential pd-disaggregation path. +# Decode CUDA Graph modeling and Chunked Prefill can be toggled with +# DECODE_CUDA_GRAPH_MODE and ENABLE_CHUNKED_PREFILL. +## Override any uppercase variable from the shell, and append extra Frontier CLI +# flags after "--" if you need to customize the run. +# ============================================================================= + +set -euo pipefail + +REPO_ROOT="$(cd "$(dirname "$0")/../../../.." 
&& pwd)" +export PYTHONPATH="$REPO_ROOT${PYTHONPATH:+:$PYTHONPATH}" +export WANDB_DISABLED=true +export VIDUR_DISABLE_WANDB=1 +PYTHON_BIN="${PYTHON_BIN:-python3}" + +MODEL_NAME="${MODEL_NAME:-Phi-tiny-MoE-instruct}" +SYS_ARCH="${SYS_ARCH:-pd-disaggregation}" +PREFILL_REPLICAS="${PREFILL_REPLICAS:-1}" +DECODE_REPLICAS="${DECODE_REPLICAS:-1}" +PREFILL_ATTN_TP="${PREFILL_ATTN_TP:-2}" +PREFILL_ATTN_DP="${PREFILL_ATTN_DP:-1}" +PREFILL_MOE_TP="${PREFILL_MOE_TP:-1}" +PREFILL_MOE_EP="${PREFILL_MOE_EP:-2}" +PREFILL_PP="${PREFILL_PP:-1}" +PREFILL_DEVICE="${PREFILL_DEVICE:-a800}" +PREFILL_MEMORY_MARGIN_FRACTION="${PREFILL_MEMORY_MARGIN_FRACTION:-0.2}" +DECODE_ATTN_TP="${DECODE_ATTN_TP:-2}" +DECODE_ATTN_DP="${DECODE_ATTN_DP:-1}" +DECODE_MOE_TP="${DECODE_MOE_TP:-1}" +DECODE_MOE_EP="${DECODE_MOE_EP:-2}" +DECODE_PP="${DECODE_PP:-1}" +DECODE_DEVICE="${DECODE_DEVICE:-a800}" +DECODE_MEMORY_MARGIN_FRACTION="${DECODE_MEMORY_MARGIN_FRACTION:-0.2}" +TOTAL_EXPERTS="${TOTAL_EXPERTS:-8}" +ROUTER_TOPK="${ROUTER_TOPK:-2}" +MOE_ROUTING_MODE="${MOE_ROUTING_MODE:-simulation}" +MOE_ROUTING_SEED="${MOE_ROUTING_SEED:-42}" +REPLICA_SCHEDULER="${REPLICA_SCHEDULER:-vllm_v1}" +NUM_REQUESTS="${NUM_REQUESTS:-8}" +PREFILL_TOKENS="${PREFILL_TOKENS:-256}" +DECODE_TOKENS="${DECODE_TOKENS:-32}" +QPS="${QPS:-1.0}" +ENABLE_DUMMY_MODE="${ENABLE_DUMMY_MODE:-true}" +DUMMY_EXEC_TIME_MS="${DUMMY_EXEC_TIME_MS:-1.0}" +DECODE_CUDA_GRAPH_MODE="${DECODE_CUDA_GRAPH_MODE:-none}" +ENABLE_CHUNKED_PREFILL="${ENABLE_CHUNKED_PREFILL:-true}" +MAX_TOKENS_IN_BATCH="${MAX_TOKENS_IN_BATCH:-1024}" +LONG_PREFILL_TOKEN_THRESHOLD="${LONG_PREFILL_TOKEN_THRESHOLD:-64}" +KV_TRANSFER_BANDWIDTH_GBPS="${KV_TRANSFER_BANDWIDTH_GBPS:-200.0}" +KV_TRANSFER_LATENCY_MS="${KV_TRANSFER_LATENCY_MS:-0.5}" +METRICS_OUTPUT_DIR="${METRICS_OUTPUT_DIR:-$REPO_ROOT/outputs/examples/pdd/offline}" +RUN_ID="${RUN_ID:-moe_model_basic}" + +require_bool() { + local name="$1" + local value="$2" + if [ "$value" != "true" ] && [ "$value" != "false" ]; then + echo
"ERROR: $name must be true or false; got $value" >&2 + exit 2 + fi +} + +require_non_negative_integer() { + local name="$1" + local value="$2" + if [[ ! "$value" =~ ^[0-9]+$ ]]; then + echo "ERROR: $name must be a non-negative integer; got $value" >&2 + exit 2 + fi +} + +require_positive_integer() { # Print an error and exit 2 unless the value is a positive integer, like the sibling validators. + local name="$1" + local value="$2" + [[ "$value" =~ ^[1-9][0-9]*$ ]] || { echo "ERROR: $name must be a positive integer; got $value" >&2; exit 2; } +} + +require_bool "ENABLE_DUMMY_MODE" "$ENABLE_DUMMY_MODE" +require_bool "ENABLE_CHUNKED_PREFILL" "$ENABLE_CHUNKED_PREFILL" + +if [ "$SYS_ARCH" != "pd-disaggregation" ]; then + echo "ERROR: this example only supports SYS_ARCH=pd-disaggregation; got SYS_ARCH=$SYS_ARCH" >&2 + exit 2 +fi + +if [ "$DECODE_CUDA_GRAPH_MODE" = "none" ]; then + echo "INFO: Decode CUDA Graph modeling is disabled by DECODE_CUDA_GRAPH_MODE=none." +elif [ "$DECODE_CUDA_GRAPH_MODE" != "full_decode_only" ] && [ "$DECODE_CUDA_GRAPH_MODE" != "piecewise" ]; then + echo "ERROR: DECODE_CUDA_GRAPH_MODE must be none, full_decode_only, or piecewise; got $DECODE_CUDA_GRAPH_MODE" >&2 + exit 2 +fi + +if [ "$ENABLE_CHUNKED_PREFILL" = "false" ] && [ "$LONG_PREFILL_TOKEN_THRESHOLD" != "0" ]; then + echo "ERROR: LONG_PREFILL_TOKEN_THRESHOLD must be 0 when ENABLE_CHUNKED_PREFILL=false" >&2 + exit 2 +fi + +if ! (( PREFILL_ATTN_TP * PREFILL_ATTN_DP == PREFILL_MOE_TP * PREFILL_MOE_EP )); then # negated == so an arithmetic error (non-integer operand) also fails fast + echo "ERROR: shared-domain prefill MoE requires PREFILL_ATTN_TP * PREFILL_ATTN_DP == PREFILL_MOE_TP * PREFILL_MOE_EP" >&2 + echo " got PREFILL_ATTN_TP=$PREFILL_ATTN_TP, PREFILL_ATTN_DP=$PREFILL_ATTN_DP, PREFILL_MOE_TP=$PREFILL_MOE_TP, PREFILL_MOE_EP=$PREFILL_MOE_EP" >&2 + exit 2 +fi + +if ! (( DECODE_ATTN_TP * DECODE_ATTN_DP == DECODE_MOE_TP * DECODE_MOE_EP )); then # negated == so an arithmetic error (non-integer operand) also fails fast + echo "ERROR: shared-domain decode MoE requires DECODE_ATTN_TP * DECODE_ATTN_DP == DECODE_MOE_TP * DECODE_MOE_EP" >&2 + echo " got DECODE_ATTN_TP=$DECODE_ATTN_TP, DECODE_ATTN_DP=$DECODE_ATTN_DP, DECODE_MOE_TP=$DECODE_MOE_TP, DECODE_MOE_EP=$DECODE_MOE_EP" >&2 + exit 2 +fi + +if !
command -v "$PYTHON_BIN" >/dev/null 2>&1; then + echo "ERROR: PYTHON_BIN is not executable or not on PATH: $PYTHON_BIN" >&2 + exit 2 +fi + +CMD=( + "$PYTHON_BIN" -m frontier.main + --simulation_mode offline + --sys_arch "$SYS_ARCH" + --no-enable_parallel_clusters + --cluster_config_prefill_cluster_num_replicas "$PREFILL_REPLICAS" + --cluster_config_decode_cluster_num_replicas "$DECODE_REPLICAS" + --cluster_config_prefill_replica_config_num_pipeline_stages "$PREFILL_PP" + --cluster_config_prefill_replica_config_attn_tensor_parallel_size "$PREFILL_ATTN_TP" + --cluster_config_prefill_replica_config_attn_data_parallel_size "$PREFILL_ATTN_DP" + --cluster_config_prefill_replica_config_moe_tensor_parallel_size "$PREFILL_MOE_TP" + --cluster_config_prefill_replica_config_moe_expert_parallel_size "$PREFILL_MOE_EP" + --cluster_config_prefill_replica_config_total_expert_num "$TOTAL_EXPERTS" + --cluster_config_prefill_replica_config_router_topk "$ROUTER_TOPK" + --cluster_config_prefill_replica_config_device "$PREFILL_DEVICE" + --cluster_config_prefill_replica_config_memory_margin_fraction "$PREFILL_MEMORY_MARGIN_FRACTION" + --cluster_config_decode_replica_config_num_pipeline_stages "$DECODE_PP" + --cluster_config_decode_replica_config_attn_tensor_parallel_size "$DECODE_ATTN_TP" + --cluster_config_decode_replica_config_attn_data_parallel_size "$DECODE_ATTN_DP" + --cluster_config_decode_replica_config_moe_tensor_parallel_size "$DECODE_MOE_TP" + --cluster_config_decode_replica_config_moe_expert_parallel_size "$DECODE_MOE_EP" + --cluster_config_decode_replica_config_total_expert_num "$TOTAL_EXPERTS" + --cluster_config_decode_replica_config_router_topk "$ROUTER_TOPK" + --cluster_config_decode_replica_config_device "$DECODE_DEVICE" + --cluster_config_decode_replica_config_memory_margin_fraction "$DECODE_MEMORY_MARGIN_FRACTION" + --cc_backend_config_type analytical + --replica_config_model_name "$MODEL_NAME" + --replica_config_moe_routing_mode "$MOE_ROUTING_MODE" + 
--replica_config_moe_routing_seed "$MOE_ROUTING_SEED" + --replica_scheduler_config_type "$REPLICA_SCHEDULER" + --decode_cuda_graph_mode "$DECODE_CUDA_GRAPH_MODE" + --vllm_v1_scheduler_config_max_tokens_in_batch "$MAX_TOKENS_IN_BATCH" + --vllm_v1_scheduler_config_long_prefill_token_threshold "$LONG_PREFILL_TOKEN_THRESHOLD" + --vllm_v1_scheduler_config_block_size "${BLOCK_SIZE:-16}" + --vllm_v1_scheduler_config_num_blocks "${NUM_BLOCKS:-128}" + --request_generator_config_type synthetic + --synthetic_request_generator_config_num_requests "$NUM_REQUESTS" + --length_generator_config_type fixed + --fixed_request_length_generator_config_prefill_tokens "$PREFILL_TOKENS" + --fixed_request_length_generator_config_decode_tokens "$DECODE_TOKENS" + --interval_generator_config_type poisson + --poisson_request_interval_generator_config_qps "$QPS" + --analytical_kv_cache_transfer_config_network_bandwidth_gbps "$KV_TRANSFER_BANDWIDTH_GBPS" + --analytical_kv_cache_transfer_config_network_latency_ms "$KV_TRANSFER_LATENCY_MS" + --metrics_config_output_dir "$METRICS_OUTPUT_DIR" + --metrics_config_run_id "$RUN_ID" + --metrics_config_write_metrics + --metrics_config_store_request_metrics + --metrics_config_store_batch_metrics + --metrics_config_store_token_completion_metrics + --metrics_config_store_utilization_metrics + --no-metrics_config_store_plots + --no-metrics_config_enable_chrome_trace + --no-metrics_config_write_json_trace + +) + +if [ "$ENABLE_CHUNKED_PREFILL" = "true" ]; then + CMD+=(--vllm_v1_scheduler_config_enable_chunked_prefill) +else + CMD+=(--no-vllm_v1_scheduler_config_enable_chunked_prefill) +fi + +if [ "$ENABLE_DUMMY_MODE" = "true" ]; then + CMD+=( + --random_forrest_execution_time_predictor_config_enable_dummy_mode + --random_forrest_execution_time_predictor_config_dummy_execution_time_ms "$DUMMY_EXEC_TIME_MS" + ) +fi + +if [ "$#" -gt 0 ]; then + if [ "$1" = "--" ]; then + shift + fi + CMD+=("$@") +fi + +cat <&2 + exit "$exit_code" +fi diff --git 
a/examples/architecture/pdd/offline/moe_prefix_caching.sh b/examples/architecture/pdd/offline/moe_prefix_caching.sh new file mode 100755 index 0000000..ba62994 --- /dev/null +++ b/examples/architecture/pdd/offline/moe_prefix_caching.sh @@ -0,0 +1,241 @@ +#!/bin/bash +# ============================================================================= +# PDD / pd-disaggregation Offline Mode - MoE Prefix Caching Recipe +# ============================================================================= +# This script mirrors the co-location example surface while using the +# pre-release-v0.2 PDD / pd-disaggregation architecture: prefill runs in the PREFILL cluster, +# decode runs in the DECODE cluster, and KV cache is transferred between them. +# +# This recipe enables vLLM V1 Prefix Caching with a public shared-session trace +# fixture. Prefix Caching and Speculative Decoding are intentionally separate; +# this script does not enable speculative decoding. +## Override any uppercase variable from the shell, and append extra Frontier CLI +# flags after "--" if you need to customize the run. +# ============================================================================= + +set -euo pipefail + +REPO_ROOT="$(cd "$(dirname "$0")/../../../.." 
&& pwd)" +export PYTHONPATH="$REPO_ROOT${PYTHONPATH:+:$PYTHONPATH}" +export WANDB_DISABLED=true +export VIDUR_DISABLE_WANDB=1 +PYTHON_BIN="${PYTHON_BIN:-python3}" + +MODEL_NAME="${MODEL_NAME:-Phi-tiny-MoE-instruct}" +SYS_ARCH="${SYS_ARCH:-pd-disaggregation}" +PREFILL_REPLICAS="${PREFILL_REPLICAS:-2}" +DECODE_REPLICAS="${DECODE_REPLICAS:-2}" +PREFILL_ATTN_TP="${PREFILL_ATTN_TP:-2}" +PREFILL_ATTN_DP="${PREFILL_ATTN_DP:-1}" +PREFILL_MOE_TP="${PREFILL_MOE_TP:-1}" +PREFILL_MOE_EP="${PREFILL_MOE_EP:-2}" +PREFILL_PP="${PREFILL_PP:-1}" +PREFILL_DEVICE="${PREFILL_DEVICE:-a800}" +PREFILL_MEMORY_MARGIN_FRACTION="${PREFILL_MEMORY_MARGIN_FRACTION:-0.2}" +DECODE_ATTN_TP="${DECODE_ATTN_TP:-2}" +DECODE_ATTN_DP="${DECODE_ATTN_DP:-1}" +DECODE_MOE_TP="${DECODE_MOE_TP:-1}" +DECODE_MOE_EP="${DECODE_MOE_EP:-2}" +DECODE_PP="${DECODE_PP:-1}" +DECODE_DEVICE="${DECODE_DEVICE:-a800}" +DECODE_MEMORY_MARGIN_FRACTION="${DECODE_MEMORY_MARGIN_FRACTION:-0.2}" +TOTAL_EXPERTS="${TOTAL_EXPERTS:-8}" +ROUTER_TOPK="${ROUTER_TOPK:-2}" +MOE_ROUTING_MODE="${MOE_ROUTING_MODE:-simulation}" +MOE_ROUTING_SEED="${MOE_ROUTING_SEED:-42}" +REPLICA_SCHEDULER="${REPLICA_SCHEDULER:-vllm_v1}" +NUM_REQUESTS="${NUM_REQUESTS:-2}" +PREFILL_TOKENS="${PREFILL_TOKENS:-32}" +DECODE_TOKENS="${DECODE_TOKENS:-8}" +QPS="${QPS:-1.0}" +ENABLE_DUMMY_MODE="${ENABLE_DUMMY_MODE:-true}" +DUMMY_EXEC_TIME_MS="${DUMMY_EXEC_TIME_MS:-1.0}" +DECODE_CUDA_GRAPH_MODE="${DECODE_CUDA_GRAPH_MODE:-none}" +ENABLE_CHUNKED_PREFILL="${ENABLE_CHUNKED_PREFILL:-true}" +MAX_TOKENS_IN_BATCH="${MAX_TOKENS_IN_BATCH:-1024}" +LONG_PREFILL_TOKEN_THRESHOLD="${LONG_PREFILL_TOKEN_THRESHOLD:-64}" +KV_TRANSFER_BANDWIDTH_GBPS="${KV_TRANSFER_BANDWIDTH_GBPS:-200.0}" +KV_TRANSFER_LATENCY_MS="${KV_TRANSFER_LATENCY_MS:-0.5}" +TRACE_FILE="${TRACE_FILE:-$REPO_ROOT/examples/fixtures/prefix_cache_shared_session_trace.csv}" +MAX_TOKENS="${MAX_TOKENS:-128}" +EXPECTED_TRACE_REQUESTS="${EXPECTED_TRACE_REQUESTS:-2}" +BLOCK_SIZE="${BLOCK_SIZE:-16}" +NUM_BLOCKS="${NUM_BLOCKS:-128}" 
+METRICS_OUTPUT_DIR="${METRICS_OUTPUT_DIR:-$REPO_ROOT/outputs/examples/pdd/offline}" +RUN_ID="${RUN_ID:-moe_prefix_caching}" + +require_bool() { + local name="$1" + local value="$2" + if [ "$value" != "true" ] && [ "$value" != "false" ]; then + echo "ERROR: $name must be true or false; got $value" >&2 + exit 2 + fi +} + +require_non_negative_integer() { + local name="$1" + local value="$2" + if [[ ! "$value" =~ ^[0-9]+$ ]]; then + echo "ERROR: $name must be a non-negative integer; got $value" >&2 + exit 2 + fi +} + +require_positive_integer() { + local name="$1" + local value="$2" + [[ "$value" =~ ^[1-9][0-9]*$ ]] +} + +require_bool "ENABLE_DUMMY_MODE" "$ENABLE_DUMMY_MODE" +require_bool "ENABLE_CHUNKED_PREFILL" "$ENABLE_CHUNKED_PREFILL" + +if [ "$SYS_ARCH" != "pd-disaggregation" ]; then + echo "ERROR: this example only supports SYS_ARCH=pd-disaggregation; got SYS_ARCH=$SYS_ARCH" >&2 + exit 2 +fi + +if [ "$DECODE_CUDA_GRAPH_MODE" = "none" ]; then + echo "INFO: Decode CUDA Graph modeling is disabled by DECODE_CUDA_GRAPH_MODE=none." 
+elif [ "$DECODE_CUDA_GRAPH_MODE" != "full_decode_only" ] && [ "$DECODE_CUDA_GRAPH_MODE" != "piecewise" ]; then + echo "ERROR: DECODE_CUDA_GRAPH_MODE must be none, full_decode_only, or piecewise; got $DECODE_CUDA_GRAPH_MODE" >&2 + exit 2 +fi + +if [ "$ENABLE_CHUNKED_PREFILL" = "false" ] && [ "$LONG_PREFILL_TOKEN_THRESHOLD" != "0" ]; then + echo "ERROR: LONG_PREFILL_TOKEN_THRESHOLD must be 0 when ENABLE_CHUNKED_PREFILL=false" >&2 + exit 2 +fi + +if (( PREFILL_ATTN_TP * PREFILL_ATTN_DP != PREFILL_MOE_TP * PREFILL_MOE_EP )); then + echo "ERROR: shared-domain prefill MoE requires PREFILL_ATTN_TP * PREFILL_ATTN_DP == PREFILL_MOE_TP * PREFILL_MOE_EP" >&2 + echo " got PREFILL_ATTN_TP=$PREFILL_ATTN_TP, PREFILL_ATTN_DP=$PREFILL_ATTN_DP, PREFILL_MOE_TP=$PREFILL_MOE_TP, PREFILL_MOE_EP=$PREFILL_MOE_EP" >&2 + exit 2 +fi + +if (( DECODE_ATTN_TP * DECODE_ATTN_DP != DECODE_MOE_TP * DECODE_MOE_EP )); then + echo "ERROR: shared-domain decode MoE requires DECODE_ATTN_TP * DECODE_ATTN_DP == DECODE_MOE_TP * DECODE_MOE_EP" >&2 + echo " got DECODE_ATTN_TP=$DECODE_ATTN_TP, DECODE_ATTN_DP=$DECODE_ATTN_DP, DECODE_MOE_TP=$DECODE_MOE_TP, DECODE_MOE_EP=$DECODE_MOE_EP" >&2 + exit 2 +fi + +if [ ! -f "$TRACE_FILE" ]; then + echo "ERROR: TRACE_FILE does not exist: $TRACE_FILE" >&2 + exit 2 +fi + +if ! 
command -v "$PYTHON_BIN" >/dev/null 2>&1; then + echo "ERROR: PYTHON_BIN is not executable or not on PATH: $PYTHON_BIN" >&2 + exit 2 +fi + +CMD=( + "$PYTHON_BIN" -m frontier.main + --simulation_mode offline + --sys_arch "$SYS_ARCH" + --no-enable_parallel_clusters + --cluster_config_prefill_cluster_num_replicas "$PREFILL_REPLICAS" + --cluster_config_decode_cluster_num_replicas "$DECODE_REPLICAS" + --cluster_scheduler_config_type sticky_round_robin + --cluster_config_prefill_replica_config_num_pipeline_stages "$PREFILL_PP" + --cluster_config_prefill_replica_config_attn_tensor_parallel_size "$PREFILL_ATTN_TP" + --cluster_config_prefill_replica_config_attn_data_parallel_size "$PREFILL_ATTN_DP" + --cluster_config_prefill_replica_config_moe_tensor_parallel_size "$PREFILL_MOE_TP" + --cluster_config_prefill_replica_config_moe_expert_parallel_size "$PREFILL_MOE_EP" + --cluster_config_prefill_replica_config_total_expert_num "$TOTAL_EXPERTS" + --cluster_config_prefill_replica_config_router_topk "$ROUTER_TOPK" + --cluster_config_prefill_replica_config_device "$PREFILL_DEVICE" + --cluster_config_prefill_replica_config_memory_margin_fraction "$PREFILL_MEMORY_MARGIN_FRACTION" + --cluster_config_decode_replica_config_num_pipeline_stages "$DECODE_PP" + --cluster_config_decode_replica_config_attn_tensor_parallel_size "$DECODE_ATTN_TP" + --cluster_config_decode_replica_config_attn_data_parallel_size "$DECODE_ATTN_DP" + --cluster_config_decode_replica_config_moe_tensor_parallel_size "$DECODE_MOE_TP" + --cluster_config_decode_replica_config_moe_expert_parallel_size "$DECODE_MOE_EP" + --cluster_config_decode_replica_config_total_expert_num "$TOTAL_EXPERTS" + --cluster_config_decode_replica_config_router_topk "$ROUTER_TOPK" + --cluster_config_decode_replica_config_device "$DECODE_DEVICE" + --cluster_config_decode_replica_config_memory_margin_fraction "$DECODE_MEMORY_MARGIN_FRACTION" + --cc_backend_config_type analytical + --replica_config_model_name "$MODEL_NAME" + 
--replica_config_moe_routing_mode "$MOE_ROUTING_MODE" + --replica_config_moe_routing_seed "$MOE_ROUTING_SEED" + --replica_scheduler_config_type "$REPLICA_SCHEDULER" + --decode_cuda_graph_mode "$DECODE_CUDA_GRAPH_MODE" + --vllm_v1_scheduler_config_max_tokens_in_batch "$MAX_TOKENS_IN_BATCH" + --vllm_v1_scheduler_config_long_prefill_token_threshold "$LONG_PREFILL_TOKEN_THRESHOLD" + --vllm_v1_scheduler_config_block_size "${BLOCK_SIZE:-16}" + --vllm_v1_scheduler_config_num_blocks "${NUM_BLOCKS:-128}" + --request_generator_config_type trace_replay + --trace_request_generator_config_trace_file "$TRACE_FILE" + --trace_request_generator_config_max_tokens "$MAX_TOKENS" + --analytical_kv_cache_transfer_config_network_bandwidth_gbps "$KV_TRANSFER_BANDWIDTH_GBPS" + --analytical_kv_cache_transfer_config_network_latency_ms "$KV_TRANSFER_LATENCY_MS" + --metrics_config_output_dir "$METRICS_OUTPUT_DIR" + --metrics_config_run_id "$RUN_ID" + --metrics_config_write_metrics + --metrics_config_store_request_metrics + --metrics_config_store_batch_metrics + --metrics_config_store_token_completion_metrics + --metrics_config_store_utilization_metrics + --no-metrics_config_store_plots + --no-metrics_config_enable_chrome_trace + --no-metrics_config_write_json_trace + +) + +if [ "$ENABLE_CHUNKED_PREFILL" = "true" ]; then + CMD+=(--vllm_v1_scheduler_config_enable_chunked_prefill) +else + CMD+=(--no-vllm_v1_scheduler_config_enable_chunked_prefill) +fi + +CMD+=(--vllm_v1_scheduler_config_enable_prefix_caching) + +if [ "$ENABLE_DUMMY_MODE" = "true" ]; then + CMD+=( + --random_forrest_execution_time_predictor_config_enable_dummy_mode + --random_forrest_execution_time_predictor_config_dummy_execution_time_ms "$DUMMY_EXEC_TIME_MS" + ) +fi + +if [ "$#" -gt 0 ]; then + if [ "$1" = "--" ]; then + shift + fi + CMD+=("$@") +fi + +cat <&2 + exit "$exit_code" +fi diff --git a/examples/architecture/pdd/offline/moe_spec_dec.sh b/examples/architecture/pdd/offline/moe_spec_dec.sh new file mode 100755 index 
0000000..72f9fe6 --- /dev/null +++ b/examples/architecture/pdd/offline/moe_spec_dec.sh @@ -0,0 +1,282 @@ +#!/bin/bash +# ============================================================================= +# PDD / pd-disaggregation Offline Mode - MoE Speculative Decoding / MTP Recipe +# ============================================================================= +# This script mirrors the co-location example surface while using the +# pre-release-v0.2 PDD / pd-disaggregation architecture: prefill runs in the PREFILL cluster, +# decode runs in the DECODE cluster, and KV cache is transferred between them. +# +# Speculative decoding and Prefix Caching have separate runtime contracts. +# This recipe enables speculative decoding and intentionally leaves Prefix +# Caching disabled. It also defaults DECODE_CUDA_GRAPH_MODE to "none" because +# production speculative decoding requires eager decode scheduling. +# +# For MTP-style methods, set SPEC_METHOD to an MTP method and keep +# MTP_N_PREDICT / MTP_NUM_LAYERS positive. +## Override any uppercase variable from the shell, and append extra Frontier CLI +# flags after "--" if you need to customize the run. +# ============================================================================= + +set -euo pipefail + +REPO_ROOT="$(cd "$(dirname "$0")/../../../.." 
&& pwd)" +export PYTHONPATH="$REPO_ROOT${PYTHONPATH:+:$PYTHONPATH}" +export WANDB_DISABLED=true +export VIDUR_DISABLE_WANDB=1 +PYTHON_BIN="${PYTHON_BIN:-python3}" + +MODEL_NAME="${MODEL_NAME:-Phi-tiny-MoE-instruct}" +SYS_ARCH="${SYS_ARCH:-pd-disaggregation}" +PREFILL_REPLICAS="${PREFILL_REPLICAS:-1}" +DECODE_REPLICAS="${DECODE_REPLICAS:-1}" +PREFILL_ATTN_TP="${PREFILL_ATTN_TP:-2}" +PREFILL_ATTN_DP="${PREFILL_ATTN_DP:-1}" +PREFILL_MOE_TP="${PREFILL_MOE_TP:-1}" +PREFILL_MOE_EP="${PREFILL_MOE_EP:-2}" +PREFILL_PP="${PREFILL_PP:-1}" +PREFILL_DEVICE="${PREFILL_DEVICE:-a800}" +PREFILL_MEMORY_MARGIN_FRACTION="${PREFILL_MEMORY_MARGIN_FRACTION:-0.2}" +DECODE_ATTN_TP="${DECODE_ATTN_TP:-2}" +DECODE_ATTN_DP="${DECODE_ATTN_DP:-1}" +DECODE_MOE_TP="${DECODE_MOE_TP:-1}" +DECODE_MOE_EP="${DECODE_MOE_EP:-2}" +DECODE_PP="${DECODE_PP:-1}" +DECODE_DEVICE="${DECODE_DEVICE:-a800}" +DECODE_MEMORY_MARGIN_FRACTION="${DECODE_MEMORY_MARGIN_FRACTION:-0.2}" +TOTAL_EXPERTS="${TOTAL_EXPERTS:-8}" +ROUTER_TOPK="${ROUTER_TOPK:-2}" +MOE_ROUTING_MODE="${MOE_ROUTING_MODE:-simulation}" +MOE_ROUTING_SEED="${MOE_ROUTING_SEED:-42}" +REPLICA_SCHEDULER="${REPLICA_SCHEDULER:-vllm_v1}" +NUM_REQUESTS="${NUM_REQUESTS:-8}" +PREFILL_TOKENS="${PREFILL_TOKENS:-256}" +DECODE_TOKENS="${DECODE_TOKENS:-32}" +QPS="${QPS:-1.0}" +ENABLE_DUMMY_MODE="${ENABLE_DUMMY_MODE:-true}" +DUMMY_EXEC_TIME_MS="${DUMMY_EXEC_TIME_MS:-1.0}" +DECODE_CUDA_GRAPH_MODE="${DECODE_CUDA_GRAPH_MODE:-none}" +ENABLE_CHUNKED_PREFILL="${ENABLE_CHUNKED_PREFILL:-true}" +MAX_TOKENS_IN_BATCH="${MAX_TOKENS_IN_BATCH:-1024}" +LONG_PREFILL_TOKEN_THRESHOLD="${LONG_PREFILL_TOKEN_THRESHOLD:-64}" +KV_TRANSFER_BANDWIDTH_GBPS="${KV_TRANSFER_BANDWIDTH_GBPS:-200.0}" +KV_TRANSFER_LATENCY_MS="${KV_TRANSFER_LATENCY_MS:-0.5}" +SPEC_METHOD="${SPEC_METHOD:-ngram}" +SPEC_MODEL_NAME="${SPEC_MODEL_NAME:-}" +NUM_SPECULATIVE_TOKENS="${NUM_SPECULATIVE_TOKENS:-2}" +COMMITTED_TOKENS_PER_ITERATION="${COMMITTED_TOKENS_PER_ITERATION:-2}" 
+PROPOSER_OVERHEAD_MS_BY_METHOD="${PROPOSER_OVERHEAD_MS_BY_METHOD:-{\"ngram\":0.0,\"qwen3_next_mtp\":0.0,\"deepseek_mtp\":0.0,\"ernie_mtp\":0.0}}" +MTP_N_PREDICT="${MTP_N_PREDICT:-0}" +MTP_NUM_LAYERS="${MTP_NUM_LAYERS:-0}" +METRICS_OUTPUT_DIR="${METRICS_OUTPUT_DIR:-$REPO_ROOT/outputs/examples/pdd/offline}" +RUN_ID="${RUN_ID:-moe_spec_dec}" + +require_bool() { + local name="$1" + local value="$2" + if [ "$value" != "true" ] && [ "$value" != "false" ]; then + echo "ERROR: $name must be true or false; got $value" >&2 + exit 2 + fi +} + +require_non_negative_integer() { + local name="$1" + local value="$2" + if [[ ! "$value" =~ ^[0-9]+$ ]]; then + echo "ERROR: $name must be a non-negative integer; got $value" >&2 + exit 2 + fi +} + +require_positive_integer() { + local name="$1" + local value="$2" + [[ "$value" =~ ^[1-9][0-9]*$ ]] +} + +require_bool "ENABLE_DUMMY_MODE" "$ENABLE_DUMMY_MODE" +require_bool "ENABLE_CHUNKED_PREFILL" "$ENABLE_CHUNKED_PREFILL" + +if [ "$SYS_ARCH" != "pd-disaggregation" ]; then + echo "ERROR: this example only supports SYS_ARCH=pd-disaggregation; got SYS_ARCH=$SYS_ARCH" >&2 + exit 2 +fi + +if [ "$DECODE_CUDA_GRAPH_MODE" = "none" ]; then + echo "INFO: Decode CUDA Graph modeling is disabled by DECODE_CUDA_GRAPH_MODE=none." 
+elif [ "$DECODE_CUDA_GRAPH_MODE" != "full_decode_only" ] && [ "$DECODE_CUDA_GRAPH_MODE" != "piecewise" ]; then + echo "ERROR: DECODE_CUDA_GRAPH_MODE must be none, full_decode_only, or piecewise; got $DECODE_CUDA_GRAPH_MODE" >&2 + exit 2 +fi + +if [ "$ENABLE_CHUNKED_PREFILL" = "false" ] && [ "$LONG_PREFILL_TOKEN_THRESHOLD" != "0" ]; then + echo "ERROR: LONG_PREFILL_TOKEN_THRESHOLD must be 0 when ENABLE_CHUNKED_PREFILL=false" >&2 + exit 2 +fi + +if (( PREFILL_ATTN_TP * PREFILL_ATTN_DP != PREFILL_MOE_TP * PREFILL_MOE_EP )); then + echo "ERROR: shared-domain prefill MoE requires PREFILL_ATTN_TP * PREFILL_ATTN_DP == PREFILL_MOE_TP * PREFILL_MOE_EP" >&2 + echo " got PREFILL_ATTN_TP=$PREFILL_ATTN_TP, PREFILL_ATTN_DP=$PREFILL_ATTN_DP, PREFILL_MOE_TP=$PREFILL_MOE_TP, PREFILL_MOE_EP=$PREFILL_MOE_EP" >&2 + exit 2 +fi + +if (( DECODE_ATTN_TP * DECODE_ATTN_DP != DECODE_MOE_TP * DECODE_MOE_EP )); then + echo "ERROR: shared-domain decode MoE requires DECODE_ATTN_TP * DECODE_ATTN_DP == DECODE_MOE_TP * DECODE_MOE_EP" >&2 + echo " got DECODE_ATTN_TP=$DECODE_ATTN_TP, DECODE_ATTN_DP=$DECODE_ATTN_DP, DECODE_MOE_TP=$DECODE_MOE_TP, DECODE_MOE_EP=$DECODE_MOE_EP" >&2 + exit 2 +fi + +require_non_negative_integer "MTP_N_PREDICT" "$MTP_N_PREDICT" +require_non_negative_integer "MTP_NUM_LAYERS" "$MTP_NUM_LAYERS" + +if [ "$DECODE_CUDA_GRAPH_MODE" != "none" ]; then + echo "ERROR: speculative decoding currently requires DECODE_CUDA_GRAPH_MODE=none in production recipes; got $DECODE_CUDA_GRAPH_MODE" >&2 + exit 2 +fi + +case "$SPEC_METHOD" in + deepseek_mtp|ernie_mtp|qwen3_moe_mtp|qwen3_next_mtp) + if ! require_positive_integer "MTP_N_PREDICT" "$MTP_N_PREDICT" || ! 
require_positive_integer "MTP_NUM_LAYERS" "$MTP_NUM_LAYERS"; then + echo "ERROR: SPEC_METHOD=$SPEC_METHOD requires MTP_N_PREDICT>0 and MTP_NUM_LAYERS>0" >&2 + exit 2 + fi + ;; + *) + if [ "$MTP_N_PREDICT" -ne 0 ] || [ "$MTP_NUM_LAYERS" -ne 0 ]; then + echo "ERROR: MTP_N_PREDICT/MTP_NUM_LAYERS are only valid for MTP SPEC_METHOD values" >&2 + exit 2 + fi + ;; +esac + +if ! command -v "$PYTHON_BIN" >/dev/null 2>&1; then + echo "ERROR: PYTHON_BIN is not executable or not on PATH: $PYTHON_BIN" >&2 + exit 2 +fi + +CMD=( + "$PYTHON_BIN" -m frontier.main + --simulation_mode offline + --sys_arch "$SYS_ARCH" + --no-enable_parallel_clusters + --cluster_config_prefill_cluster_num_replicas "$PREFILL_REPLICAS" + --cluster_config_decode_cluster_num_replicas "$DECODE_REPLICAS" + --cluster_config_prefill_replica_config_num_pipeline_stages "$PREFILL_PP" + --cluster_config_prefill_replica_config_attn_tensor_parallel_size "$PREFILL_ATTN_TP" + --cluster_config_prefill_replica_config_attn_data_parallel_size "$PREFILL_ATTN_DP" + --cluster_config_prefill_replica_config_moe_tensor_parallel_size "$PREFILL_MOE_TP" + --cluster_config_prefill_replica_config_moe_expert_parallel_size "$PREFILL_MOE_EP" + --cluster_config_prefill_replica_config_total_expert_num "$TOTAL_EXPERTS" + --cluster_config_prefill_replica_config_router_topk "$ROUTER_TOPK" + --cluster_config_prefill_replica_config_device "$PREFILL_DEVICE" + --cluster_config_prefill_replica_config_memory_margin_fraction "$PREFILL_MEMORY_MARGIN_FRACTION" + --cluster_config_decode_replica_config_num_pipeline_stages "$DECODE_PP" + --cluster_config_decode_replica_config_attn_tensor_parallel_size "$DECODE_ATTN_TP" + --cluster_config_decode_replica_config_attn_data_parallel_size "$DECODE_ATTN_DP" + --cluster_config_decode_replica_config_moe_tensor_parallel_size "$DECODE_MOE_TP" + --cluster_config_decode_replica_config_moe_expert_parallel_size "$DECODE_MOE_EP" + --cluster_config_decode_replica_config_total_expert_num "$TOTAL_EXPERTS" + 
--cluster_config_decode_replica_config_router_topk "$ROUTER_TOPK" + --cluster_config_decode_replica_config_device "$DECODE_DEVICE" + --cluster_config_decode_replica_config_memory_margin_fraction "$DECODE_MEMORY_MARGIN_FRACTION" + --cc_backend_config_type analytical + --replica_config_model_name "$MODEL_NAME" + --replica_config_moe_routing_mode "$MOE_ROUTING_MODE" + --replica_config_moe_routing_seed "$MOE_ROUTING_SEED" + --replica_scheduler_config_type "$REPLICA_SCHEDULER" + --decode_cuda_graph_mode "$DECODE_CUDA_GRAPH_MODE" + --vllm_v1_scheduler_config_max_tokens_in_batch "$MAX_TOKENS_IN_BATCH" + --vllm_v1_scheduler_config_long_prefill_token_threshold "$LONG_PREFILL_TOKEN_THRESHOLD" + --vllm_v1_scheduler_config_block_size "${BLOCK_SIZE:-16}" + --vllm_v1_scheduler_config_num_blocks "${NUM_BLOCKS:-128}" + --request_generator_config_type synthetic + --synthetic_request_generator_config_num_requests "$NUM_REQUESTS" + --length_generator_config_type fixed + --fixed_request_length_generator_config_prefill_tokens "$PREFILL_TOKENS" + --fixed_request_length_generator_config_decode_tokens "$DECODE_TOKENS" + --interval_generator_config_type poisson + --poisson_request_interval_generator_config_qps "$QPS" + --analytical_kv_cache_transfer_config_network_bandwidth_gbps "$KV_TRANSFER_BANDWIDTH_GBPS" + --analytical_kv_cache_transfer_config_network_latency_ms "$KV_TRANSFER_LATENCY_MS" + --metrics_config_output_dir "$METRICS_OUTPUT_DIR" + --metrics_config_run_id "$RUN_ID" + --metrics_config_write_metrics + --metrics_config_store_request_metrics + --metrics_config_store_batch_metrics + --metrics_config_store_token_completion_metrics + --metrics_config_store_utilization_metrics + --no-metrics_config_store_plots + --no-metrics_config_enable_chrome_trace + --no-metrics_config_write_json_trace + +) + +if [ "$ENABLE_CHUNKED_PREFILL" = "true" ]; then + CMD+=(--vllm_v1_scheduler_config_enable_chunked_prefill) +else + CMD+=(--no-vllm_v1_scheduler_config_enable_chunked_prefill) +fi + +CMD+=( + 
--speculative_decoding_config_enabled + --speculative_decoding_config_method "$SPEC_METHOD" + --speculative_decoding_config_spec_model_name "$SPEC_MODEL_NAME" + --speculative_decoding_config_num_speculative_tokens "$NUM_SPECULATIVE_TOKENS" + --speculative_decoding_config_committed_tokens_per_iteration "$COMMITTED_TOKENS_PER_ITERATION" + --speculative_decoding_config_proposer_overhead_ms_by_method "$PROPOSER_OVERHEAD_MS_BY_METHOD" +) + +if [ "$MTP_N_PREDICT" -gt 0 ]; then + CMD+=(--speculative_decoding_config_mtp_n_predict "$MTP_N_PREDICT") +fi + +if [ "$MTP_NUM_LAYERS" -gt 0 ]; then + CMD+=(--speculative_decoding_config_mtp_num_layers "$MTP_NUM_LAYERS") +fi + +if [ "$ENABLE_DUMMY_MODE" = "true" ]; then + CMD+=( + --random_forrest_execution_time_predictor_config_enable_dummy_mode + --random_forrest_execution_time_predictor_config_dummy_execution_time_ms "$DUMMY_EXEC_TIME_MS" + ) +fi + +if [ "$#" -gt 0 ]; then + if [ "$1" = "--" ]; then + shift + fi + CMD+=("$@") +fi + +cat <&2 + exit "$exit_code" +fi diff --git a/examples/architecture/pdd/offline/thinking_mode_basic.sh b/examples/architecture/pdd/offline/thinking_mode_basic.sh new file mode 100755 index 0000000..7cf4a9e --- /dev/null +++ b/examples/architecture/pdd/offline/thinking_mode_basic.sh @@ -0,0 +1,225 @@ +#!/bin/bash +# ============================================================================= +# PDD / pd-disaggregation Offline Mode - Dense Thinking Mode Example +# ============================================================================= +# This script mirrors the co-location example surface while using the +# pre-release-v0.2 PDD / pd-disaggregation architecture: prefill runs in the PREFILL cluster, +# decode runs in the DECODE cluster, and KV cache is transferred between them. +# +# This script demonstrates Thinking Mode on the release-supported sequential +# pd-disaggregation path. 
+## Override any uppercase variable from the shell, and append extra Frontier CLI +# flags after "--" if you need to customize the run. +# ============================================================================= + +set -euo pipefail + +REPO_ROOT="$(cd "$(dirname "$0")/../../../.." && pwd)" +export PYTHONPATH="$REPO_ROOT${PYTHONPATH:+:$PYTHONPATH}" +export WANDB_DISABLED=true +export VIDUR_DISABLE_WANDB=1 +PYTHON_BIN="${PYTHON_BIN:-python3}" + +MODEL_NAME="${MODEL_NAME:-meta-llama/Llama-2-7b-hf}" +SYS_ARCH="${SYS_ARCH:-pd-disaggregation}" +PREFILL_REPLICAS="${PREFILL_REPLICAS:-2}" +DECODE_REPLICAS="${DECODE_REPLICAS:-2}" +PREFILL_ATTN_TP="${PREFILL_ATTN_TP:-1}" +PREFILL_ATTN_DP="${PREFILL_ATTN_DP:-1}" +PREFILL_MOE_TP="${PREFILL_MOE_TP:-1}" +PREFILL_MOE_EP="${PREFILL_MOE_EP:-1}" +PREFILL_PP="${PREFILL_PP:-1}" +PREFILL_DEVICE="${PREFILL_DEVICE:-a800}" +PREFILL_MEMORY_MARGIN_FRACTION="${PREFILL_MEMORY_MARGIN_FRACTION:-0.2}" +DECODE_ATTN_TP="${DECODE_ATTN_TP:-1}" +DECODE_ATTN_DP="${DECODE_ATTN_DP:-1}" +DECODE_MOE_TP="${DECODE_MOE_TP:-1}" +DECODE_MOE_EP="${DECODE_MOE_EP:-1}" +DECODE_PP="${DECODE_PP:-1}" +DECODE_DEVICE="${DECODE_DEVICE:-a800}" +DECODE_MEMORY_MARGIN_FRACTION="${DECODE_MEMORY_MARGIN_FRACTION:-0.2}" +TOTAL_EXPERTS="${TOTAL_EXPERTS:-1}" +ROUTER_TOPK="${ROUTER_TOPK:-1}" +MOE_ROUTING_MODE="${MOE_ROUTING_MODE:-simulation}" +MOE_ROUTING_SEED="${MOE_ROUTING_SEED:-42}" +REPLICA_SCHEDULER="${REPLICA_SCHEDULER:-vllm_v1}" +NUM_REQUESTS="${NUM_REQUESTS:-1}" +PREFILL_TOKENS="${PREFILL_TOKENS:-8}" +DECODE_TOKENS="${DECODE_TOKENS:-2}" +QPS="${QPS:-1.0}" +ENABLE_DUMMY_MODE="${ENABLE_DUMMY_MODE:-true}" +DUMMY_EXEC_TIME_MS="${DUMMY_EXEC_TIME_MS:-1.0}" +DECODE_CUDA_GRAPH_MODE="${DECODE_CUDA_GRAPH_MODE:-none}" +ENABLE_CHUNKED_PREFILL="${ENABLE_CHUNKED_PREFILL:-true}" +MAX_TOKENS_IN_BATCH="${MAX_TOKENS_IN_BATCH:-1024}" +LONG_PREFILL_TOKEN_THRESHOLD="${LONG_PREFILL_TOKEN_THRESHOLD:-64}" +KV_TRANSFER_BANDWIDTH_GBPS="${KV_TRANSFER_BANDWIDTH_GBPS:-200.0}" 
+KV_TRANSFER_LATENCY_MS="${KV_TRANSFER_LATENCY_MS:-0.5}" +THINKING_DEPTH="${THINKING_DEPTH:-2}" +TOOL_CALL_LATENCY="${TOOL_CALL_LATENCY:-0.001}" +THINKING_ROUND_PREFILL_TOKENS="${THINKING_ROUND_PREFILL_TOKENS:-3}" +THINKING_ROUND_DECODE_TOKENS="${THINKING_ROUND_DECODE_TOKENS:-1}" +METRICS_OUTPUT_DIR="${METRICS_OUTPUT_DIR:-$REPO_ROOT/outputs/examples/pdd/offline}" +RUN_ID="${RUN_ID:-thinking_mode_basic}" + +require_bool() { + local name="$1" + local value="$2" + if [ "$value" != "true" ] && [ "$value" != "false" ]; then + echo "ERROR: $name must be true or false; got $value" >&2 + exit 2 + fi +} + +require_non_negative_integer() { + local name="$1" + local value="$2" + if [[ ! "$value" =~ ^[0-9]+$ ]]; then + echo "ERROR: $name must be a non-negative integer; got $value" >&2 + exit 2 + fi +} + +require_positive_integer() { + local name="$1" + local value="$2" + [[ "$value" =~ ^[1-9][0-9]*$ ]] +} + +require_bool "ENABLE_DUMMY_MODE" "$ENABLE_DUMMY_MODE" +require_bool "ENABLE_CHUNKED_PREFILL" "$ENABLE_CHUNKED_PREFILL" + +if [ "$SYS_ARCH" != "pd-disaggregation" ]; then + echo "ERROR: this example only supports SYS_ARCH=pd-disaggregation; got SYS_ARCH=$SYS_ARCH" >&2 + exit 2 +fi + +if [ "$DECODE_CUDA_GRAPH_MODE" = "none" ]; then + echo "INFO: Decode CUDA Graph modeling is disabled by DECODE_CUDA_GRAPH_MODE=none." +elif [ "$DECODE_CUDA_GRAPH_MODE" != "full_decode_only" ] && [ "$DECODE_CUDA_GRAPH_MODE" != "piecewise" ]; then + echo "ERROR: DECODE_CUDA_GRAPH_MODE must be none, full_decode_only, or piecewise; got $DECODE_CUDA_GRAPH_MODE" >&2 + exit 2 +fi + +if [ "$ENABLE_CHUNKED_PREFILL" = "false" ] && [ "$LONG_PREFILL_TOKEN_THRESHOLD" != "0" ]; then + echo "ERROR: LONG_PREFILL_TOKEN_THRESHOLD must be 0 when ENABLE_CHUNKED_PREFILL=false" >&2 + exit 2 +fi + +if ! 
command -v "$PYTHON_BIN" >/dev/null 2>&1; then + echo "ERROR: PYTHON_BIN is not executable or not on PATH: $PYTHON_BIN" >&2 + exit 2 +fi + +CMD=( + "$PYTHON_BIN" -m frontier.main + --simulation_mode offline + --sys_arch "$SYS_ARCH" + --no-enable_parallel_clusters + --cluster_config_prefill_cluster_num_replicas "$PREFILL_REPLICAS" + --cluster_config_decode_cluster_num_replicas "$DECODE_REPLICAS" + --cluster_config_prefill_replica_config_num_pipeline_stages "$PREFILL_PP" + --cluster_config_prefill_replica_config_attn_tensor_parallel_size "$PREFILL_ATTN_TP" + --cluster_config_prefill_replica_config_attn_data_parallel_size "$PREFILL_ATTN_DP" + --cluster_config_prefill_replica_config_moe_tensor_parallel_size "$PREFILL_MOE_TP" + --cluster_config_prefill_replica_config_moe_expert_parallel_size "$PREFILL_MOE_EP" + --cluster_config_prefill_replica_config_total_expert_num "$TOTAL_EXPERTS" + --cluster_config_prefill_replica_config_router_topk "$ROUTER_TOPK" + --cluster_config_prefill_replica_config_device "$PREFILL_DEVICE" + --cluster_config_prefill_replica_config_memory_margin_fraction "$PREFILL_MEMORY_MARGIN_FRACTION" + --cluster_config_decode_replica_config_num_pipeline_stages "$DECODE_PP" + --cluster_config_decode_replica_config_attn_tensor_parallel_size "$DECODE_ATTN_TP" + --cluster_config_decode_replica_config_attn_data_parallel_size "$DECODE_ATTN_DP" + --cluster_config_decode_replica_config_moe_tensor_parallel_size "$DECODE_MOE_TP" + --cluster_config_decode_replica_config_moe_expert_parallel_size "$DECODE_MOE_EP" + --cluster_config_decode_replica_config_total_expert_num "$TOTAL_EXPERTS" + --cluster_config_decode_replica_config_router_topk "$ROUTER_TOPK" + --cluster_config_decode_replica_config_device "$DECODE_DEVICE" + --cluster_config_decode_replica_config_memory_margin_fraction "$DECODE_MEMORY_MARGIN_FRACTION" + --cc_backend_config_type analytical + --replica_config_model_name "$MODEL_NAME" + --replica_config_moe_routing_mode "$MOE_ROUTING_MODE" + 
--replica_config_moe_routing_seed "$MOE_ROUTING_SEED" + --replica_scheduler_config_type "$REPLICA_SCHEDULER" + --decode_cuda_graph_mode "$DECODE_CUDA_GRAPH_MODE" + --vllm_v1_scheduler_config_max_tokens_in_batch "$MAX_TOKENS_IN_BATCH" + --vllm_v1_scheduler_config_long_prefill_token_threshold "$LONG_PREFILL_TOKEN_THRESHOLD" + --vllm_v1_scheduler_config_block_size "${BLOCK_SIZE:-16}" + --vllm_v1_scheduler_config_num_blocks "${NUM_BLOCKS:-128}" + --request_generator_config_type synthetic + --synthetic_request_generator_config_num_requests "$NUM_REQUESTS" + --length_generator_config_type fixed + --fixed_request_length_generator_config_prefill_tokens "$PREFILL_TOKENS" + --fixed_request_length_generator_config_decode_tokens "$DECODE_TOKENS" + --interval_generator_config_type poisson + --poisson_request_interval_generator_config_qps "$QPS" + --analytical_kv_cache_transfer_config_network_bandwidth_gbps "$KV_TRANSFER_BANDWIDTH_GBPS" + --analytical_kv_cache_transfer_config_network_latency_ms "$KV_TRANSFER_LATENCY_MS" + --metrics_config_output_dir "$METRICS_OUTPUT_DIR" + --metrics_config_run_id "$RUN_ID" + --metrics_config_write_metrics + --metrics_config_store_request_metrics + --metrics_config_store_batch_metrics + --metrics_config_store_token_completion_metrics + --metrics_config_store_utilization_metrics + --no-metrics_config_store_plots + --no-metrics_config_enable_chrome_trace + --no-metrics_config_write_json_trace + --enable_thinking_mode + --thinking_depth "$THINKING_DEPTH" + --tool_call_latency "$TOOL_CALL_LATENCY" + --thinking_round_prefill_tokens "$THINKING_ROUND_PREFILL_TOKENS" + --thinking_round_decode_tokens "$THINKING_ROUND_DECODE_TOKENS" +) + +if [ "$ENABLE_CHUNKED_PREFILL" = "true" ]; then + CMD+=(--vllm_v1_scheduler_config_enable_chunked_prefill) +else + CMD+=(--no-vllm_v1_scheduler_config_enable_chunked_prefill) +fi + +if [ "$ENABLE_DUMMY_MODE" = "true" ]; then + CMD+=( + --random_forrest_execution_time_predictor_config_enable_dummy_mode + 
--random_forrest_execution_time_predictor_config_dummy_execution_time_ms "$DUMMY_EXEC_TIME_MS" + ) +fi + +if [ "$#" -gt 0 ]; then + if [ "$1" = "--" ]; then + shift + fi + CMD+=("$@") +fi + +cat <