diff --git a/CODE_IMPROVEMENT_PLAN.md b/CODE_IMPROVEMENT_PLAN.md index 2b181f29..d7904251 100644 --- a/CODE_IMPROVEMENT_PLAN.md +++ b/CODE_IMPROVEMENT_PLAN.md @@ -12,7 +12,8 @@ This document outlines a comprehensive, phased improvement plan for the mlpstora 6. [Phase 4: Test Infrastructure Enhancement](#phase-4-test-infrastructure-enhancement) 7. [Phase 5: Documentation and Type Annotations](#phase-5-documentation-and-type-annotations) 8. [Phase 6: KV Cache Benchmark Integration](#phase-6-kv-cache-benchmark-integration) -9. [Phase 7: Plugin Architecture](#phase-7-plugin-architecture) +9. [Phase 7: Reporting System Refactoring](#phase-7-reporting-system-refactoring) +10. [Phase 8: Error Handling and User Messaging](#phase-8-error-handling-and-user-messaging) --- @@ -234,27 +235,38 @@ class Benchmark(BenchmarkInterface, abc.ABC): ## Phase 2: Modular Rules Engine -**Goal:** Split the monolithic `rules.py` (1,434 LOC) into focused modules. +**Goal:** Split the monolithic `rules.py` (1,434 LOC) into focused modules with a clear separation between single-run rule checkers (`RunRulesChecker`) and multi-run submission checkers (`MultiRunRulesChecker`). ### Step 2.1: Create Rules Package Structure +The rules package should separate concerns into distinct subpackages: + ``` mlpstorage/rules/ -├── __init__.py # Public API exports -├── base.py # RulesChecker ABC, RuleState enum -├── issues.py # Issue dataclass -├── models.py # Data classes (HostInfo, ClusterInformation, etc.) -├── training.py # TrainingRunRulesChecker, TrainingSubmissionRulesChecker -├── checkpointing.py # CheckpointingRunRulesChecker, CheckpointSubmissionRulesChecker -├── vectordb.py # VectorDBRulesChecker (new) -├── kvcache.py # KVCacheRulesChecker (placeholder) -├── verifier.py # BenchmarkVerifier -└── utils.py # calculate_training_data_size, generate_output_location +├── __init__.py # Public API exports +├── base.py # RulesChecker ABC, RuleState enum, RunRulesChecker, MultiRunRulesChecker +├── issues.py # Issue dataclass +├── models.py # Data classes (HostInfo, ClusterInformation, etc.) 
+├── run_checkers/ # Single-run validation checkers +│ ├── __init__.py +│ ├── base.py # RunRulesChecker base class +│ ├── training.py # TrainingRunRulesChecker +│ ├── checkpointing.py # CheckpointingRunRulesChecker +│ ├── vectordb.py # VectorDBRunRulesChecker +│ └── kvcache.py # KVCacheRunRulesChecker +├── submission_checkers/ # Multi-run submission validation checkers +│ ├── __init__.py +│ ├── base.py # MultiRunRulesChecker base class +│ ├── training.py # TrainingSubmissionRulesChecker +│ └── checkpointing.py # CheckpointSubmissionRulesChecker +├── verifier.py # BenchmarkVerifier orchestrator +└── utils.py # calculate_training_data_size, generate_output_location ``` ### Step 2.2: Extract Data Classes to models.py Move to `mlpstorage/rules/models.py`: + - `HostMemoryInfo` - `HostCPUInfo` - `HostInfo` @@ -263,98 +275,479 @@ Move to `mlpstorage/rules/models.py`: - `ProcessedRun` - `BenchmarkResult` - `BenchmarkRun` +- `BenchmarkRunData` + +### Step 2.3: Define Base Classes with Clear Contracts + +Create `mlpstorage/rules/base.py`: + +```python +from abc import ABC, abstractmethod +from typing import List, Optional +from mlpstorage.rules.issues import Issue + +class RulesChecker(ABC): + """Base class for all rule checkers.""" + + def __init__(self, logger, *args, **kwargs): + self.logger = logger + self.issues = [] + self.check_methods = [ + getattr(self, method) for method in dir(self) + if callable(getattr(self, method)) and method.startswith('check_') + ] + + def run_checks(self) -> List[Issue]: + """Run all check methods and return issues.""" + self.issues = [] + for check_method in self.check_methods: + try: + result = check_method() + if result: + if isinstance(result, list): + self.issues.extend(result) + else: + self.issues.append(result) + except Exception as e: + self.logger.error(f"Check {check_method.__name__} failed: {e}") + self.issues.append(Issue( + validation=PARAM_VALIDATION.INVALID, + message=f"Check failed: {e}", + severity="error" + )) + return self.issues +``` + +### Step 2.4: Create RunRulesChecker Base Class + +Create `mlpstorage/rules/run_checkers/base.py`: + +```python +from mlpstorage.rules.base import RulesChecker +from mlpstorage.rules.models import BenchmarkRun + +class RunRulesChecker(RulesChecker): + """ + Base class for single-run validation. + + RunRulesCheckers validate individual benchmark runs for: + - Parameter correctness (model, accelerator, batch size, etc.) + - System requirements (memory, CPU, etc.) + - Configuration validity (allowed parameter overrides) + + Each check method should: + 1. Return None if the check passes for CLOSED submission + 2. Return Issue with PARAM_VALIDATION.OPEN if the check fails but is allowed for OPEN + 3. 
Return Issue with PARAM_VALIDATION.INVALID if the check fails completely + """ + + def __init__(self, benchmark_run: BenchmarkRun, logger, *args, **kwargs): + super().__init__(logger, *args, **kwargs) + self.benchmark_run = benchmark_run + + @property + def parameters(self): + """Access benchmark run parameters.""" + return self.benchmark_run.parameters -### Step 2.3: Extract Training Rules + @property + def override_parameters(self): + """Access override parameters.""" + return self.benchmark_run.override_parameters + + @property + def system_info(self): + """Access system information.""" + return self.benchmark_run.system_info +``` -Create `mlpstorage/rules/training.py`: +### Step 2.5: Create MultiRunRulesChecker Base Class + +Create `mlpstorage/rules/submission_checkers/base.py`: ```python -from mlpstorage.rules.base import RulesChecker, RunRulesChecker, MultiRunRulesChecker +from typing import List +from mlpstorage.rules.base import RulesChecker +from mlpstorage.rules.models import BenchmarkRun from mlpstorage.rules.issues import Issue +from mlpstorage.config import PARAM_VALIDATION + +class MultiRunRulesChecker(RulesChecker): + """ + Base class for multi-run submission validation. + + MultiRunRulesCheckers validate groups of benchmark runs as a submission: + - Required number of runs per workload + - Consistency across runs (same model, accelerator, etc.) + - Aggregate metrics requirements + + These checkers operate on already-validated individual runs. + """ + + def __init__(self, benchmark_runs: List[BenchmarkRun], logger, *args, **kwargs): + super().__init__(logger, *args, **kwargs) + if not isinstance(benchmark_runs, (list, tuple)): + raise TypeError("benchmark_runs must be a list or tuple") + self.benchmark_runs = benchmark_runs + + def check_runs_valid(self) -> Optional[Issue]: + """Verify all individual runs passed validation.""" + category_set = {run.category for run in self.benchmark_runs} + + if PARAM_VALIDATION.INVALID in category_set: + return Issue( + validation=PARAM_VALIDATION.INVALID, + message="Submission contains INVALID runs", + parameter="run_categories", + expected="All runs OPEN or CLOSED", + actual=[cat.value.upper() for cat in category_set] + ) + elif PARAM_VALIDATION.OPEN in category_set: + return Issue( + validation=PARAM_VALIDATION.OPEN, + message="Submission contains OPEN runs - qualifies for OPEN category only", + parameter="run_categories", + actual=[cat.value.upper() for cat in category_set] + ) + return None # All runs are CLOSED + + def check_run_consistency(self) -> Optional[Issue]: + """Verify all runs have consistent configuration.""" + models = {run.model for run in self.benchmark_runs} + if len(models) > 1: + return Issue( + validation=PARAM_VALIDATION.INVALID, + message="Inconsistent models across runs in submission", + parameter="model", + expected="Single model", + actual=list(models) + ) + return None +``` + +### Step 2.6: Extract Training Run Rules + +Create `mlpstorage/rules/run_checkers/training.py`: + +```python +from typing import Optional, List +from mlpstorage.rules.run_checkers.base import RunRulesChecker +from mlpstorage.rules.issues import Issue +from mlpstorage.config import PARAM_VALIDATION, BENCHMARK_TYPES class TrainingRunRulesChecker(RunRulesChecker): - """Validate training benchmark parameters.""" + """Validate training benchmark parameters for a single run.""" + + # Parameters allowed for CLOSED submission + CLOSED_ALLOWED_PARAMS = [ + 'dataset.num_files_train', + 'dataset.num_subfolders_train', + 'dataset.data_folder', + 
'reader.read_threads', + 'reader.computation_threads', + 'reader.transfer_size', + 'reader.odirect', + 'reader.prefetch_size', + 'checkpoint.checkpoint_folder', + 'storage.storage_type', + 'storage.storage_root', + ] - def check_model(self) -> Optional[Issue]: - # Move from rules.py - pass + # Parameters allowed for OPEN submission (but not CLOSED) + OPEN_ALLOWED_PARAMS = [ + 'framework', + 'dataset.format', + 'dataset.num_samples_per_file', + 'reader.data_loader', + ] + + def check_benchmark_type(self) -> Optional[Issue]: + """Verify this is a training benchmark.""" + if self.benchmark_run.benchmark_type != BENCHMARK_TYPES.training: + return Issue( + validation=PARAM_VALIDATION.INVALID, + message=f"Expected training benchmark, got {self.benchmark_run.benchmark_type}", + parameter="benchmark_type", + expected=BENCHMARK_TYPES.training.name, + actual=self.benchmark_run.benchmark_type.name if self.benchmark_run.benchmark_type else None + ) + return None - def check_accelerator(self) -> Optional[Issue]: - # Move from rules.py + def check_allowed_params(self) -> List[Issue]: + """Check if parameter overrides are allowed for CLOSED submission.""" + issues = [] + for param, value in self.override_parameters.items(): + if param.startswith("workflow"): + continue # Handled separately + + if param in self.CLOSED_ALLOWED_PARAMS: + issues.append(Issue( + validation=PARAM_VALIDATION.CLOSED, + message=f"Parameter override allowed for CLOSED: {param}", + parameter=param, + actual=value + )) + elif param in self.OPEN_ALLOWED_PARAMS: + issues.append(Issue( + validation=PARAM_VALIDATION.OPEN, + message=f"Parameter override only allowed for OPEN: {param}", + parameter=param, + actual=value + )) + else: + issues.append(Issue( + validation=PARAM_VALIDATION.INVALID, + message=f"Parameter override not allowed: {param}", + parameter=param, + expected="Allowed parameter", + actual=value + )) + return issues + + def check_num_files_train(self) -> Optional[Issue]: + """Check if training file count meets requirements.""" + # Implementation moved from rules.py pass +``` - # ... 
other training-specific checks +### Step 2.7: Extract Training Submission Rules + +Create `mlpstorage/rules/submission_checkers/training.py`: + +```python +from typing import Optional +from mlpstorage.rules.submission_checkers.base import MultiRunRulesChecker +from mlpstorage.rules.issues import Issue +from mlpstorage.config import PARAM_VALIDATION, MODELS + +class TrainingSubmissionRulesChecker(MultiRunRulesChecker): + """Validate training benchmark submissions (multiple runs).""" + + REQUIRED_RUNS = 5 # Required number of runs for closed submission + supported_models = MODELS + + def check_num_runs(self) -> Optional[Issue]: + """Require 5 runs for training benchmark closed submission.""" + num_runs = len(self.benchmark_runs) + if num_runs < self.REQUIRED_RUNS: + return Issue( + validation=PARAM_VALIDATION.INVALID, + message=f"Training submission requires {self.REQUIRED_RUNS} runs", + parameter="num_runs", + expected=self.REQUIRED_RUNS, + actual=num_runs + ) + return Issue( + validation=PARAM_VALIDATION.CLOSED, + message=f"Training submission has required {self.REQUIRED_RUNS} runs", + parameter="num_runs", + expected=self.REQUIRED_RUNS, + actual=num_runs + ) ``` -### Step 2.4: Extract Checkpointing Rules +### Step 2.8: Extract Checkpointing Run Rules -Create `mlpstorage/rules/checkpointing.py`: +Create `mlpstorage/rules/run_checkers/checkpointing.py`: ```python +from typing import Optional +from mlpstorage.rules.run_checkers.base import RunRulesChecker +from mlpstorage.rules.issues import Issue +from mlpstorage.config import PARAM_VALIDATION, BENCHMARK_TYPES, LLM_MODELS + class CheckpointingRunRulesChecker(RunRulesChecker): - """Validate checkpointing benchmark parameters.""" + """Validate checkpointing benchmark parameters for a single run.""" + + def check_benchmark_type(self) -> Optional[Issue]: + """Verify this is a checkpointing benchmark.""" + if self.benchmark_run.benchmark_type != BENCHMARK_TYPES.checkpointing: + return Issue( + validation=PARAM_VALIDATION.INVALID, + message=f"Expected checkpointing benchmark", + parameter="benchmark_type", + expected=BENCHMARK_TYPES.checkpointing.name, + actual=self.benchmark_run.benchmark_type.name if self.benchmark_run.benchmark_type else None + ) + return None def check_model(self) -> Optional[Issue]: - # LLM model validation - pass + """Verify model is a valid LLM model.""" + if self.benchmark_run.model not in [m.value for m in LLM_MODELS]: + return Issue( + validation=PARAM_VALIDATION.INVALID, + message=f"Invalid model for checkpointing benchmark", + parameter="model", + expected=[m.value for m in LLM_MODELS], + actual=self.benchmark_run.model + ) + return None +``` - def check_zero_level(self) -> Optional[Issue]: - # DeepSpeed Zero level validation - pass +### Step 2.9: Extract Checkpointing Submission Rules + +Create `mlpstorage/rules/submission_checkers/checkpointing.py`: + +```python +from typing import Optional, List +from mlpstorage.rules.submission_checkers.base import MultiRunRulesChecker +from mlpstorage.rules.issues import Issue +from mlpstorage.config import PARAM_VALIDATION, BENCHMARK_TYPES, LLM_MODELS + +class CheckpointSubmissionRulesChecker(MultiRunRulesChecker): + """Validate checkpointing benchmark submissions.""" + + REQUIRED_WRITES = 10 + REQUIRED_READS = 10 + supported_models = LLM_MODELS + + def check_num_operations(self) -> List[Issue]: + """ + Require 10 total writes and 10 total reads for checkpointing submissions. + + Operations can be spread across multiple runs. 
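+
+        For example, two runs that each perform five checkpoint writes and
+        five checkpoint reads together satisfy the 10-write / 10-read
+        requirement, since the counts below are summed across all runs.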
+ """ + issues = [] + num_writes = num_reads = 0 + + for run in self.benchmark_runs: + if run.benchmark_type == BENCHMARK_TYPES.checkpointing: + checkpoint_params = run.parameters.get('checkpoint', {}) + num_writes += checkpoint_params.get('num_checkpoints_write', 0) + num_reads += checkpoint_params.get('num_checkpoints_read', 0) + + # Check writes + if num_writes < self.REQUIRED_WRITES: + issues.append(Issue( + validation=PARAM_VALIDATION.INVALID, + message=f"Insufficient checkpoint write operations", + parameter="num_checkpoints_write", + expected=self.REQUIRED_WRITES, + actual=num_writes + )) + else: + issues.append(Issue( + validation=PARAM_VALIDATION.CLOSED, + message=f"Checkpoint write operations meet requirement", + parameter="num_checkpoints_write", + expected=self.REQUIRED_WRITES, + actual=num_writes + )) + + # Check reads + if num_reads < self.REQUIRED_READS: + issues.append(Issue( + validation=PARAM_VALIDATION.INVALID, + message=f"Insufficient checkpoint read operations", + parameter="num_checkpoints_read", + expected=self.REQUIRED_READS, + actual=num_reads + )) + else: + issues.append(Issue( + validation=PARAM_VALIDATION.CLOSED, + message=f"Checkpoint read operations meet requirement", + parameter="num_checkpoints_read", + expected=self.REQUIRED_READS, + actual=num_reads + )) + + return issues ``` -### Step 2.5: Create VectorDB Rules (New) +### Step 2.10: Create KV Cache Rules -Create `mlpstorage/rules/vectordb.py`: +Create `mlpstorage/rules/run_checkers/kvcache.py`: ```python -class VectorDBRunRulesChecker(RunRulesChecker): - """Validate vector database benchmark parameters.""" +from typing import Optional +from mlpstorage.rules.run_checkers.base import RunRulesChecker +from mlpstorage.rules.issues import Issue +from mlpstorage.config import PARAM_VALIDATION - def check_dimension(self) -> Optional[Issue]: - pass +class KVCacheRunRulesChecker(RunRulesChecker): + """Validation rules for KV cache benchmark runs.""" - def check_index_type(self) -> Optional[Issue]: - pass + CLOSED_REQUIREMENTS = { + 'min_runtime': 60, # seconds + 'allowed_backends': ['disk', 'lmcache'], + } - def check_metric_type(self) -> Optional[Issue]: - pass + def check_runtime(self) -> Optional[Issue]: + """Check runtime meets CLOSED requirements.""" + runtime = getattr(self.benchmark_run.args, 'runtime', 0) + min_runtime = self.CLOSED_REQUIREMENTS['min_runtime'] + + if runtime < min_runtime: + return Issue( + validation=PARAM_VALIDATION.OPEN, + message=f"Runtime below CLOSED requirement", + parameter='runtime', + expected=f">= {min_runtime}s", + actual=f"{runtime}s" + ) + return None + + def check_cache_backend(self) -> Optional[Issue]: + """Check cache backend is allowed for CLOSED submission.""" + backend = getattr(self.benchmark_run.args, 'cache_backend', 'memory') + allowed = self.CLOSED_REQUIREMENTS['allowed_backends'] + + if backend not in allowed: + return Issue( + validation=PARAM_VALIDATION.OPEN, + message=f"Cache backend not allowed for CLOSED submission", + parameter='cache_backend', + expected=allowed, + actual=backend + ) + return None ``` -### Step 2.6: Update __init__.py for Backwards Compatibility +### Step 2.11: Update __init__.py for Backwards Compatibility Create `mlpstorage/rules/__init__.py`: ```python # Maintain backwards compatibility from mlpstorage.rules.models import ( - HostMemoryInfo, - HostCPUInfo, - HostInfo, - ClusterInformation, - RunID, - ProcessedRun, - BenchmarkResult, - BenchmarkRun, + HostMemoryInfo, HostCPUInfo, HostInfo, ClusterInformation, + RunID, ProcessedRun, 
BenchmarkResult, BenchmarkRun, BenchmarkRunData, ) from mlpstorage.rules.base import RulesChecker, RuleState from mlpstorage.rules.issues import Issue -from mlpstorage.rules.training import TrainingRunRulesChecker, TrainingSubmissionRulesChecker -from mlpstorage.rules.checkpointing import CheckpointingRunRulesChecker, CheckpointSubmissionRulesChecker -from mlpstorage.rules.vectordb import VectorDBRunRulesChecker + +# Run checkers (single-run validation) +from mlpstorage.rules.run_checkers import ( + RunRulesChecker, + TrainingRunRulesChecker, + CheckpointingRunRulesChecker, + VectorDBRunRulesChecker, + KVCacheRunRulesChecker, +) + +# Submission checkers (multi-run validation) +from mlpstorage.rules.submission_checkers import ( + MultiRunRulesChecker, + TrainingSubmissionRulesChecker, + CheckpointSubmissionRulesChecker, +) + from mlpstorage.rules.verifier import BenchmarkVerifier from mlpstorage.rules.utils import calculate_training_data_size, generate_output_location, get_runs_files __all__ = [ # Models 'HostMemoryInfo', 'HostCPUInfo', 'HostInfo', 'ClusterInformation', - 'RunID', 'ProcessedRun', 'BenchmarkResult', 'BenchmarkRun', + 'RunID', 'ProcessedRun', 'BenchmarkResult', 'BenchmarkRun', 'BenchmarkRunData', # Base 'RulesChecker', 'RuleState', 'Issue', - # Validators - 'TrainingRunRulesChecker', 'TrainingSubmissionRulesChecker', - 'CheckpointingRunRulesChecker', 'CheckpointSubmissionRulesChecker', - 'VectorDBRunRulesChecker', + # Run Checkers + 'RunRulesChecker', 'TrainingRunRulesChecker', 'CheckpointingRunRulesChecker', + 'VectorDBRunRulesChecker', 'KVCacheRunRulesChecker', + # Submission Checkers + 'MultiRunRulesChecker', 'TrainingSubmissionRulesChecker', 'CheckpointSubmissionRulesChecker', + # Verifier 'BenchmarkVerifier', # Utils 'calculate_training_data_size', 'generate_output_location', 'get_runs_files', @@ -946,11 +1339,19 @@ Create `mlpstorage/tests/benchmarks/test_mybenchmark.py` ### Step 6.1: Analyze Existing KV Cache Code -The `kv_cache_benchmark/` directory contains: -- `kv-cache.py` (168K) - Main benchmark implementation -- `kv-cache_sharegpt_replay.py` (141K) - ShareGPT replay variant -- `kv-cache-wrapper.sh` (51K) - Shell wrapper -- `validate.sh` - Validation script +The `kv_cache_benchmark/` directory contains several files, but only the main benchmark script should be wrapped: + +**Main Entrypoint (to be wrapped):** + +- `kv-cache.py` - Main benchmark implementation for KV cache benchmarking + +**Test/Development Scripts (NOT to be wrapped):** + +- `kv-cache_sharegpt_replay.py` - ShareGPT replay variant (for testing only) +- `kv-cache-wrapper.sh` - Shell wrapper (for development/testing) +- `validate.sh` - Validation script (for testing) + +The KVCacheBenchmark class should only invoke `kv-cache.py`. The other scripts in the directory are for development, testing, and validation purposes and should not be exposed through the mlpstorage CLI. ### Step 6.2: Create KVCacheBenchmark Class @@ -1119,122 +1520,770 @@ BenchmarkRegistry.register('kvcache', KVCacheBenchmark, add_kvcache_arguments) --- -## Phase 7: Plugin Architecture +## Phase 7: Reporting System Refactoring + +**Goal:** Improve the reporting system to provide clear validation feedback, better error handling for malformed directory structures, and explicit OPEN vs CLOSED submission messaging. + +### Step 7.1: Analyze Current Reporting Issues -**Goal:** Enable third-party benchmarks to be added without modifying core code. 
+The current `reporting.py` has several areas for improvement: -### Step 7.1: Create Plugin Discovery +**Current Architecture:** -Create `mlpstorage/plugins/__init__.py`: +- `ReportGenerator` class handles all reporting logic +- `accumulate_results()` walks directory tree and validates runs +- `print_results()` outputs to console +- Limited error handling for malformed directories + +**Issues Identified:** + +1. **Directory Structure Validation**: No explicit validation of expected directory structure +2. **Error Recovery**: Errors during directory walking can halt the entire process +3. **Message Clarity**: OPEN vs CLOSED distinction not always clear to users +4. **Workload Grouping**: Implicit grouping by (model, accelerator) without validation + +### Step 7.2: Create Directory Structure Validator + +Create `mlpstorage/reporting/directory_validator.py`: ```python -import importlib -import pkgutil +from dataclasses import dataclass +from typing import List, Optional, Dict, Any from pathlib import Path -from typing import List, Type +import os -from mlpstorage.interfaces import BenchmarkInterface -from mlpstorage.registry import BenchmarkRegistry +@dataclass +class DirectoryValidationError: + """Represents an error in the results directory structure.""" + path: str + error_type: str # 'missing', 'malformed', 'unexpected' + message: str + suggestion: str # How to fix the issue + +class ResultsDirectoryValidator: + """ + Validates the structure of a results directory. + + Expected structure: + results_dir/ + / # training, checkpointing, vector_database + / # unet3d, resnet50, llama3-8b, etc. + / # run, datagen (for training) + / # YYYYMMDD_HHMMSS format + *_metadata.json + summary.json (for DLIO runs) + """ -def discover_plugins(plugin_dirs: List[str] = None) -> List[str]: - """Discover and load benchmark plugins. + EXPECTED_BENCHMARK_TYPES = ['training', 'checkpointing', 'vector_database', 'kv_cache'] + + def __init__(self, results_dir: str, logger=None): + self.results_dir = Path(results_dir) + self.logger = logger + self.errors: List[DirectoryValidationError] = [] + self.warnings: List[str] = [] + + def validate(self) -> bool: + """ + Validate the directory structure. + + Returns: + True if structure is valid, False otherwise. + """ + self.errors = [] + self.warnings = [] + + if not self.results_dir.exists(): + self.errors.append(DirectoryValidationError( + path=str(self.results_dir), + error_type='missing', + message=f"Results directory does not exist: {self.results_dir}", + suggestion="Create the directory or specify a different --results-dir path" + )) + return False + + if not self.results_dir.is_dir(): + self.errors.append(DirectoryValidationError( + path=str(self.results_dir), + error_type='malformed', + message=f"Results path is not a directory: {self.results_dir}", + suggestion="Specify a directory path, not a file" + )) + return False + + # Check for benchmark type directories + found_benchmark_dirs = False + for entry in self.results_dir.iterdir(): + if entry.is_dir(): + if entry.name in self.EXPECTED_BENCHMARK_TYPES: + found_benchmark_dirs = True + self._validate_benchmark_type_dir(entry) + else: + self.warnings.append( + f"Unexpected directory '{entry.name}' in results root. 
" + f"Expected benchmark types: {self.EXPECTED_BENCHMARK_TYPES}" + ) + + if not found_benchmark_dirs: + self.errors.append(DirectoryValidationError( + path=str(self.results_dir), + error_type='malformed', + message="No benchmark type directories found", + suggestion=f"Results should contain directories named: {self.EXPECTED_BENCHMARK_TYPES}" + )) + + return len(self.errors) == 0 + + def _validate_benchmark_type_dir(self, benchmark_dir: Path): + """Validate a benchmark type directory (e.g., training/).""" + for model_dir in benchmark_dir.iterdir(): + if model_dir.is_dir(): + self._validate_model_dir(model_dir, benchmark_dir.name) + + def _validate_model_dir(self, model_dir: Path, benchmark_type: str): + """Validate a model directory.""" + has_valid_runs = False + + for entry in model_dir.iterdir(): + if entry.is_dir(): + # Check if this is a datetime directory or a command directory + if self._is_datetime_dir(entry.name): + self._validate_run_dir(entry) + has_valid_runs = True + elif entry.name in ['run', 'datagen', 'datasize']: + # Training has command subdirectories + for datetime_dir in entry.iterdir(): + if datetime_dir.is_dir() and self._is_datetime_dir(datetime_dir.name): + self._validate_run_dir(datetime_dir) + has_valid_runs = True + + if not has_valid_runs: + self.warnings.append( + f"No valid run directories found in {model_dir}" + ) - Args: - plugin_dirs: Additional directories to search for plugins. + def _validate_run_dir(self, run_dir: Path): + """Validate a single run directory.""" + files = list(run_dir.iterdir()) + file_names = [f.name for f in files if f.is_file()] + + # Check for metadata file + metadata_files = [f for f in file_names if f.endswith('_metadata.json')] + if not metadata_files: + self.errors.append(DirectoryValidationError( + path=str(run_dir), + error_type='malformed', + message=f"Missing metadata file in {run_dir.name}", + suggestion="Run directory should contain a *_metadata.json file" + )) + + # Check for summary.json (required for completed DLIO runs) + if 'summary.json' not in file_names: + self.warnings.append( + f"Missing summary.json in {run_dir} - run may be incomplete" + ) - Returns: - List of loaded plugin names. 
- """ - loaded = [] + def _is_datetime_dir(self, name: str) -> bool: + """Check if directory name matches expected datetime format.""" + # Expected format: YYYYMMDD_HHMMSS or similar + import re + return bool(re.match(r'^\d{8}_\d{6}', name)) + + def get_error_report(self) -> str: + """Generate a human-readable error report.""" + lines = [] + + if self.errors: + lines.append("=== Directory Structure Errors ===\n") + for error in self.errors: + lines.append(f"ERROR: {error.message}") + lines.append(f" Path: {error.path}") + lines.append(f" Fix: {error.suggestion}") + lines.append("") + + if self.warnings: + lines.append("=== Warnings ===\n") + for warning in self.warnings: + lines.append(f"WARNING: {warning}") + + return "\n".join(lines) +``` - # Default plugin locations - search_paths = [ - Path(__file__).parent / "builtin", # Built-in plugins - Path.home() / ".mlpstorage" / "plugins", # User plugins - ] +### Step 7.3: Refactor ReportGenerator with Better Error Handling + +Update `mlpstorage/reporting.py`: - if plugin_dirs: - search_paths.extend(Path(d) for d in plugin_dirs) +```python +from mlpstorage.reporting.directory_validator import ResultsDirectoryValidator, DirectoryValidationError + +class ReportGenerator: + """Generate validation reports for benchmark results.""" + + def __init__(self, results_dir, args=None, logger=None): + self.args = args + self.logger = logger or setup_logging(name="mlpstorage_reporter") + self.results_dir = results_dir + + # Validate directory structure first + self.validator = ResultsDirectoryValidator(results_dir, logger=self.logger) + if not self.validator.validate(): + self._report_directory_errors() + raise ValueError(f"Invalid results directory structure: {results_dir}") + + self.run_results = dict() + self.workload_results = dict() + + def _report_directory_errors(self): + """Report directory structure errors to user.""" + self.logger.error("Results directory structure validation failed:") + self.logger.error(self.validator.get_error_report()) + + # Provide specific guidance + self.logger.error("\nExpected directory structure:") + self.logger.error(" results_dir/") + self.logger.error(" training/") + self.logger.error(" /") + self.logger.error(" run/") + self.logger.error(" /") + self.logger.error(" *_metadata.json") + self.logger.error(" summary.json") + + def accumulate_results(self): + """Accumulate results with improved error handling.""" + try: + benchmark_runs = get_runs_files(self.results_dir, logger=self.logger) + except Exception as e: + self.logger.error(f"Failed to scan results directory: {e}") + raise + + if not benchmark_runs: + self.logger.warning( + f"No valid benchmark runs found in {self.results_dir}. " + "Ensure runs have completed and contain summary.json files." + ) + return - for path in search_paths: - if not path.exists(): - continue + self.logger.info(f"Found {len(benchmark_runs)} benchmark runs") - for finder, name, ispkg in pkgutil.iter_modules([str(path)]): + # Process each run with error isolation + for benchmark_run in benchmark_runs: try: - module = importlib.import_module(f"mlpstorage.plugins.{name}") - if hasattr(module, 'register_benchmark'): - module.register_benchmark(BenchmarkRegistry) - loaded.append(name) - except ImportError as e: - print(f"Warning: Failed to load plugin {name}: {e}") - - return loaded + self._process_single_run(benchmark_run) + except Exception as e: + self.logger.error( + f"Failed to process run {benchmark_run.run_id}: {e}. Skipping." 
+ ) + continue + + # Process workload groups + self._process_workload_groups(benchmark_runs) + + def _process_single_run(self, benchmark_run): + """Process a single benchmark run with clear error handling.""" + verifier = BenchmarkVerifier(benchmark_run, logger=self.logger) + category = verifier.verify() + issues = verifier.issues + + result = Result( + multi=False, + benchmark_run=benchmark_run, + benchmark_type=benchmark_run.benchmark_type, + benchmark_command=benchmark_run.command, + benchmark_model=benchmark_run.model, + issues=issues, + category=category, + metrics=benchmark_run.metrics + ) + self.run_results[benchmark_run.run_id] = result ``` -### Step 7.2: Create Plugin Template +### Step 7.4: Add Clear OPEN vs CLOSED Messaging -Create `mlpstorage/plugins/template.py`: +Create `mlpstorage/reporting/formatters.py`: ```python -""" -Template for creating a new benchmark plugin. +from typing import List +from mlpstorage.rules.issues import Issue +from mlpstorage.config import PARAM_VALIDATION + +class ValidationMessageFormatter: + """Format validation results with clear OPEN vs CLOSED messaging.""" -To create a plugin: -1. Copy this file to ~/.mlpstorage/plugins/my_plugin.py -2. Implement the required classes and functions -3. The plugin will be auto-discovered on next run + @staticmethod + def format_category_summary(category: PARAM_VALIDATION, issues: List[Issue]) -> str: + """Generate a clear summary of why a run is in a particular category.""" + if category == PARAM_VALIDATION.CLOSED: + return ( + "This run qualifies for CLOSED submission.\n" + "All parameters meet the strict requirements for closed division." + ) + + elif category == PARAM_VALIDATION.OPEN: + open_issues = [i for i in issues if i.validation == PARAM_VALIDATION.OPEN] + reasons = [] + for issue in open_issues: + reasons.append(f" - {issue.parameter}: {issue.message}") + + return ( + "This run qualifies for OPEN submission only.\n" + "The following parameters do not meet CLOSED requirements:\n" + + "\n".join(reasons) + + "\n\nTo qualify for CLOSED submission, modify these parameters." + ) + + else: # INVALID + invalid_issues = [i for i in issues if i.validation == PARAM_VALIDATION.INVALID] + reasons = [] + for issue in invalid_issues: + reasons.append(f" - {issue.parameter}: {issue.message}") + if issue.expected and issue.actual: + reasons.append(f" Expected: {issue.expected}") + reasons.append(f" Actual: {issue.actual}") + + return ( + "This run is INVALID and cannot be submitted.\n" + "The following issues must be resolved:\n" + + "\n".join(reasons) + + "\n\nFix these issues and re-run the benchmark." 
+ ) + + @staticmethod + def format_closed_requirements_checklist(benchmark_type: str) -> str: + """Generate a checklist of CLOSED submission requirements.""" + if benchmark_type == "training": + return """ +CLOSED Submission Requirements for Training: + [ ] Dataset size >= 5x total cluster memory + [ ] At least 500 steps per epoch + [ ] 5 complete runs required + [ ] Only allowed parameter overrides used + [ ] No modifications to core workload parameters + +Allowed Parameter Overrides (CLOSED): + - dataset.num_files_train + - dataset.num_subfolders_train + - dataset.data_folder + - reader.read_threads + - reader.computation_threads + - reader.transfer_size + - storage.storage_type + - storage.storage_root +""" + elif benchmark_type == "checkpointing": + return """ +CLOSED Submission Requirements for Checkpointing: + [ ] 10 checkpoint write operations + [ ] 10 checkpoint read operations + [ ] Valid LLM model (llama3-8b, llama3-70b, llama3-405b) + [ ] Only allowed parameter overrides used """ + return "" +``` -from mlpstorage.benchmarks.base import Benchmark -from mlpstorage.interfaces import BenchmarkInterface -from mlpstorage.registry import BenchmarkRegistry +### Step 7.5: Update Print Methods with Better Formatting -class MyPluginBenchmark(Benchmark): - """Example plugin benchmark implementation.""" +```python +def print_results(self): + """Print results with clear OPEN/CLOSED distinction.""" + formatter = ValidationMessageFormatter() + + print("\n" + "=" * 60) + print("BENCHMARK VALIDATION REPORT") + print("=" * 60) + + # Summary counts + closed_count = sum(1 for r in self.run_results.values() + if r.category == PARAM_VALIDATION.CLOSED) + open_count = sum(1 for r in self.run_results.values() + if r.category == PARAM_VALIDATION.OPEN) + invalid_count = sum(1 for r in self.run_results.values() + if r.category == PARAM_VALIDATION.INVALID) + + print(f"\nSummary: {len(self.run_results)} runs analyzed") + print(f" CLOSED: {closed_count} runs") + print(f" OPEN: {open_count} runs") + print(f" INVALID: {invalid_count} runs") + + # Print INVALID runs first (most important) + if invalid_count > 0: + print("\n" + "-" * 60) + print("INVALID RUNS - These runs cannot be submitted") + print("-" * 60) + for result in self.run_results.values(): + if result.category == PARAM_VALIDATION.INVALID: + self._print_run_details(result, formatter) + + # Print OPEN runs + if open_count > 0: + print("\n" + "-" * 60) + print("OPEN RUNS - These runs qualify for OPEN division only") + print("-" * 60) + for result in self.run_results.values(): + if result.category == PARAM_VALIDATION.OPEN: + self._print_run_details(result, formatter) + + # Print CLOSED runs + if closed_count > 0: + print("\n" + "-" * 60) + print("CLOSED RUNS - These runs qualify for CLOSED division") + print("-" * 60) + for result in self.run_results.values(): + if result.category == PARAM_VALIDATION.CLOSED: + self._print_run_details(result, formatter) +``` - BENCHMARK_TYPE = None # Will be set dynamically +--- - def __init__(self, args, **kwargs): - super().__init__(args, **kwargs) +## Phase 8: Error Handling and User Messaging - def _run(self): - # Implement benchmark logic - pass +**Goal:** Improve error handling throughout the project to provide clear, actionable messages when users make mistakes. 
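+
+The exception handlers sketched in Step 8.3 assume a set of distinct process exit codes (`EXIT_CODE.CONFIG_ERROR`, `EXIT_CODE.BENCHMARK_FAILED`, and so on). If `mlpstorage/config.py` does not already define such an enum, a minimal sketch could look like this (the names and values are illustrative, chosen to match the handlers in Step 8.3 rather than any existing mlpstorage API):
+
+```python
+from enum import IntEnum
+
+class EXIT_CODE(IntEnum):
+    """Illustrative process exit codes for the CLI handlers in Step 8.3."""
+    SUCCESS = 0
+    UNKNOWN_ERROR = 1
+    CONFIG_ERROR = 2
+    BENCHMARK_FAILED = 3
+    VALIDATION_FAILED = 4
+    FILE_NOT_FOUND = 5
+    USER_INTERRUPTED = 130  # conventional exit status after SIGINT
+```
+
+Defining the codes in one place lets wrapper scripts branch on the exit status of `mlpstorage` without parsing log output.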
-def add_cli_arguments(parser): - """Add CLI arguments for this benchmark.""" - subparsers = parser.add_subparsers(dest='command') - run = subparsers.add_parser('run') - run.add_argument('--my-option', help='Plugin-specific option') - -def register_benchmark(registry: BenchmarkRegistry): - """Called by plugin discovery to register this benchmark.""" - registry.register( - name='my-plugin', - benchmark_class=MyPluginBenchmark, - cli_builder=add_cli_arguments - ) +### Step 8.1: Create Error Handling Framework + +Create `mlpstorage/errors.py`: + +```python +from typing import Optional, List +from dataclasses import dataclass + +@dataclass +class MLPSError: + """Base error class with user-friendly messaging.""" + code: str # Machine-readable error code + message: str # User-facing error message + details: str # Technical details + suggestion: str # How to fix the issue + related_docs: Optional[str] = None # Link to documentation + +class ConfigurationError(Exception): + """Raised when configuration is invalid.""" + + def __init__(self, message: str, parameter: str = None, + expected: str = None, actual: str = None, + suggestion: str = None): + self.parameter = parameter + self.expected = expected + self.actual = actual + self.suggestion = suggestion + + full_message = f"Configuration Error: {message}" + if parameter: + full_message += f"\n Parameter: {parameter}" + if expected: + full_message += f"\n Expected: {expected}" + if actual: + full_message += f"\n Actual: {actual}" + if suggestion: + full_message += f"\n Suggestion: {suggestion}" + + super().__init__(full_message) + +class BenchmarkExecutionError(Exception): + """Raised when benchmark execution fails.""" + + def __init__(self, message: str, command: str = None, + exit_code: int = None, stderr: str = None, + suggestion: str = None): + self.command = command + self.exit_code = exit_code + self.stderr = stderr + self.suggestion = suggestion + + full_message = f"Benchmark Execution Failed: {message}" + if command: + full_message += f"\n Command: {command}" + if exit_code is not None: + full_message += f"\n Exit Code: {exit_code}" + if stderr: + full_message += f"\n Error Output: {stderr[:500]}..." + if suggestion: + full_message += f"\n Suggestion: {suggestion}" + + super().__init__(full_message) + +class ValidationError(Exception): + """Raised when validation fails.""" + + def __init__(self, message: str, issues: List = None, + category: str = None, suggestion: str = None): + self.issues = issues or [] + self.category = category + self.suggestion = suggestion + + full_message = f"Validation Failed: {message}" + if category: + full_message += f"\n Category: {category}" + if issues: + full_message += "\n Issues:" + for issue in issues[:5]: # Limit to first 5 + full_message += f"\n - {issue}" + if len(issues) > 5: + full_message += f"\n ... 
and {len(issues) - 5} more" + if suggestion: + full_message += f"\n Suggestion: {suggestion}" + + super().__init__(full_message) + +class FileSystemError(Exception): + """Raised when file system operations fail.""" + + def __init__(self, message: str, path: str = None, + operation: str = None, suggestion: str = None): + self.path = path + self.operation = operation + self.suggestion = suggestion + + full_message = f"File System Error: {message}" + if path: + full_message += f"\n Path: {path}" + if operation: + full_message += f"\n Operation: {operation}" + if suggestion: + full_message += f"\n Suggestion: {suggestion}" + + super().__init__(full_message) +``` + +### Step 8.2: Create Error Message Templates + +Create `mlpstorage/error_messages.py`: + +```python +""" +Centralized error message templates for consistent user messaging. +""" + +ERROR_MESSAGES = { + # Configuration Errors + 'CONFIG_MISSING_REQUIRED': ( + "Required parameter '{param}' is missing.\n" + "Please provide this parameter via command line or configuration file.\n" + "Example: mlpstorage {benchmark} run --{param} " + ), + + 'CONFIG_INVALID_VALUE': ( + "Invalid value for parameter '{param}': {actual}\n" + "Expected: {expected}\n" + "Please correct this value and try again." + ), + + 'CONFIG_FILE_NOT_FOUND': ( + "Configuration file not found: {path}\n" + "Please ensure the file exists and the path is correct.\n" + "You can create a default config with: mlpstorage init --config" + ), + + # Benchmark Execution Errors + 'BENCHMARK_COMMAND_FAILED': ( + "Benchmark command failed with exit code {exit_code}.\n" + "Command: {command}\n" + "This may indicate:\n" + " - Missing dependencies (check that DLIO/benchmark tool is installed)\n" + " - Insufficient permissions\n" + " - Invalid benchmark parameters\n" + "Check the error output above for specific details." + ), + + 'BENCHMARK_TIMEOUT': ( + "Benchmark timed out after {timeout} seconds.\n" + "The benchmark may still be running in the background.\n" + "Consider:\n" + " - Increasing the timeout with --timeout\n" + " - Reducing the workload size\n" + " - Checking system resources" + ), + + # Validation Errors + 'VALIDATION_NOT_CLOSED': ( + "This run does not qualify for CLOSED submission.\n" + "Category: {category}\n" + "To submit in the CLOSED division, the following must be addressed:\n" + "{issues}\n" + "See the MLPerf Storage rules for CLOSED submission requirements." + ), + + 'VALIDATION_INVALID': ( + "This run is INVALID and cannot be submitted.\n" + "The following critical issues were found:\n" + "{issues}\n" + "These issues must be resolved before submission." + ), + + # File System Errors + 'RESULTS_DIR_NOT_FOUND': ( + "Results directory not found: {path}\n" + "Please ensure:\n" + " - The directory exists\n" + " - You have read permissions\n" + " - The path is correct" + ), + + 'RESULTS_DIR_EMPTY': ( + "No benchmark results found in: {path}\n" + "This directory should contain completed benchmark runs.\n" + "Expected structure:\n" + " {path}/\n" + " training/\n" + " /\n" + " run/\n" + " /\n" + " summary.json\n" + " *_metadata.json" + ), + + 'METADATA_FILE_MISSING': ( + "Metadata file not found in: {path}\n" + "Each run directory should contain a *_metadata.json file.\n" + "This file is created automatically when running benchmarks.\n" + "If missing, the run may have failed or been interrupted." 
+ ), + + # MPI/Cluster Errors + 'MPI_NOT_AVAILABLE': ( + "MPI is not available on this system.\n" + "MPI is required for distributed benchmarks.\n" + "Install MPI:\n" + " - Ubuntu/Debian: apt-get install openmpi-bin libopenmpi-dev\n" + " - RHEL/CentOS: yum install openmpi openmpi-devel\n" + "Or use --skip-cluster-info to skip cluster information collection." + ), + + 'HOST_UNREACHABLE': ( + "Cannot reach host: {host}\n" + "Please verify:\n" + " - The hostname is correct\n" + " - The host is online and reachable\n" + " - SSH access is configured (passwordless SSH recommended)" + ), +} + +def format_error(error_key: str, **kwargs) -> str: + """Format an error message with the given parameters.""" + template = ERROR_MESSAGES.get(error_key, f"Unknown error: {error_key}") + try: + return template.format(**kwargs) + except KeyError as e: + return f"{template}\n(Missing format parameter: {e})" ``` -### Step 7.3: Update Main Entry Point for Plugin Loading +### Step 8.3: Add Error Handling to CLI -Modify `mlpstorage/main.py`: +Update `mlpstorage/cli.py`: ```python -from mlpstorage.plugins import discover_plugins +from mlpstorage.errors import ConfigurationError, BenchmarkExecutionError, ValidationError +from mlpstorage.error_messages import format_error def main(): - # Discover and load plugins before parsing arguments - plugin_dirs = os.environ.get('MLPS_PLUGIN_DIRS', '').split(':') - loaded_plugins = discover_plugins(plugin_dirs) + try: + args = parse_arguments() + validate_args(args) + run_benchmark(args) + except ConfigurationError as e: + logger.error(str(e)) + sys.exit(EXIT_CODE.CONFIG_ERROR) + except BenchmarkExecutionError as e: + logger.error(str(e)) + sys.exit(EXIT_CODE.BENCHMARK_FAILED) + except ValidationError as e: + logger.error(str(e)) + sys.exit(EXIT_CODE.VALIDATION_FAILED) + except FileNotFoundError as e: + logger.error(format_error('RESULTS_DIR_NOT_FOUND', path=str(e))) + sys.exit(EXIT_CODE.FILE_NOT_FOUND) + except KeyboardInterrupt: + logger.info("Benchmark interrupted by user") + sys.exit(EXIT_CODE.USER_INTERRUPTED) + except Exception as e: + logger.error(f"Unexpected error: {e}") + logger.debug("Stack trace:", exc_info=True) + sys.exit(EXIT_CODE.UNKNOWN_ERROR) +``` - if loaded_plugins: - logging.debug(f"Loaded plugins: {loaded_plugins}") +### Step 8.4: Add User-Friendly Validation Messages - # Continue with normal argument parsing - args = parse_arguments() - # ... +Update benchmark validation to provide clear guidance: + +```python +def validate_for_closed_submission(benchmark, logger): + """ + Validate benchmark for CLOSED submission with clear messaging. 
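+
+    Returns True only if the run qualifies for CLOSED submission; for OPEN or
+    INVALID runs the blocking issues are logged and False is returned.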
+ """ + verifier = BenchmarkVerifier(benchmark, logger=logger) + result = verifier.verify() + + if result == PARAM_VALIDATION.CLOSED: + logger.success("Benchmark meets CLOSED submission requirements") + return True + + elif result == PARAM_VALIDATION.OPEN: + logger.warning("Benchmark qualifies for OPEN submission only") + logger.warning("") + logger.warning("To qualify for CLOSED submission, address these items:") + for issue in verifier.issues: + if issue.validation == PARAM_VALIDATION.OPEN: + logger.warning(f" - {issue.parameter}: {issue.message}") + if issue.expected: + logger.warning(f" Required: {issue.expected}") + if issue.actual: + logger.warning(f" Current: {issue.actual}") + logger.warning("") + logger.warning("See MLPerf Storage rules for CLOSED requirements.") + return False + + else: # INVALID + logger.error("Benchmark is INVALID - cannot be submitted") + logger.error("") + logger.error("The following errors must be fixed:") + for issue in verifier.issues: + if issue.validation == PARAM_VALIDATION.INVALID: + logger.error(f" - {issue.parameter}: {issue.message}") + if issue.expected: + logger.error(f" Required: {issue.expected}") + if issue.actual: + logger.error(f" Actual: {issue.actual}") + logger.error("") + logger.error("Fix these issues and re-run the benchmark.") + return False +``` + +### Step 8.5: Add Pre-Run Validation with Clear Messages + +```python +def validate_pre_run(args, logger): + """ + Validate configuration before running benchmark. + Provides clear error messages for common mistakes. + """ + errors = [] + + # Check required parameters + if not args.model: + errors.append(format_error('CONFIG_MISSING_REQUIRED', param='model')) + + if not args.results_dir: + errors.append(format_error('CONFIG_MISSING_REQUIRED', param='results-dir')) + + # Check hosts are reachable (if provided) + if args.hosts: + for host in args.hosts: + if not is_host_reachable(host): + errors.append(format_error('HOST_UNREACHABLE', host=host)) + + # Check data directory exists (for run command) + if args.command == 'run' and hasattr(args, 'data_dir'): + if not os.path.exists(args.data_dir): + errors.append(format_error('RESULTS_DIR_NOT_FOUND', path=args.data_dir)) + + if errors: + logger.error("Pre-run validation failed:") + for error in errors: + logger.error(error) + raise ConfigurationError( + "Configuration validation failed", + suggestion="Fix the above errors and try again" + ) + + logger.info("Pre-run validation passed") ``` --- @@ -1247,19 +2296,21 @@ def main(): | Phase 2: Rules Modularization | High | High | Better organization, testability | | Phase 4: Test Infrastructure | High | Medium | Enables safe refactoring | | Phase 6: KV Cache Integration | High | Medium | Immediate user value | +| Phase 7: Reporting Refactoring | High | Medium | Better validation feedback | +| Phase 8: Error Handling | High | Medium | Improved user experience | | Phase 3: CLI Refactoring | Medium | High | Extensibility | | Phase 5: Documentation | Medium | Low | Long-term maintainability | -| Phase 7: Plugin Architecture | Low | Medium | Future extensibility | ## Recommended Order 1. **Phase 1** (Interfaces) - Create foundation 2. **Phase 4** (Tests) - Ensure safety net for refactoring -3. **Phase 2** (Rules) - Modularize largest file -4. **Phase 6** (KV Cache) - Add new benchmark using new patterns -5. **Phase 3** (CLI) - Improve extensibility -6. **Phase 5** (Docs) - Document everything -7. **Phase 7** (Plugins) - Enable external extensions +3. 
**Phase 2** (Rules) - Modularize largest file, separate RunRulesChecker from MultiRunRulesChecker +4. **Phase 8** (Error Handling) - Improve user-facing error messages +5. **Phase 7** (Reporting) - Improve validation reporting and directory handling +6. **Phase 6** (KV Cache) - Add new benchmark using new patterns +7. **Phase 3** (CLI) - Improve extensibility +8. **Phase 5** (Docs) - Document everything --- @@ -1268,31 +2319,38 @@ def main(): After completing all phases: - [ ] All benchmarks implement `BenchmarkInterface` -- [ ] `rules.py` split into <300 LOC modules +- [ ] `rules.py` split into <300 LOC modules with clear separation between RunRulesChecker and MultiRunRulesChecker hierarchies - [ ] New benchmarks can be added without modifying `cli.py` - [ ] Test coverage >80% - [ ] All public functions have docstrings and type annotations - [ ] KV Cache benchmark integrated and working -- [ ] Plugin system allows external benchmark registration +- [ ] Reporting system provides clear feedback for OPEN vs CLOSED validation failures +- [ ] All error messages clearly indicate what failed and how to fix it +- [ ] Malformed directory structures are handled gracefully with actionable error messages --- ## Appendix: File Changes Summary ### New Files Created + - `mlpstorage/interfaces/__init__.py` - `mlpstorage/interfaces/benchmark.py` - `mlpstorage/interfaces/validator.py` - `mlpstorage/interfaces/collector.py` - `mlpstorage/registry.py` - `mlpstorage/rules/__init__.py` (package) -- `mlpstorage/rules/base.py` +- `mlpstorage/rules/base.py` - RulesChecker, RunRulesChecker, MultiRunRulesChecker base classes - `mlpstorage/rules/issues.py` - `mlpstorage/rules/models.py` -- `mlpstorage/rules/training.py` -- `mlpstorage/rules/checkpointing.py` -- `mlpstorage/rules/vectordb.py` -- `mlpstorage/rules/kvcache.py` +- `mlpstorage/rules/run_checkers/__init__.py` - Package for single-run rule checkers +- `mlpstorage/rules/run_checkers/training.py` - TrainingRunRulesChecker +- `mlpstorage/rules/run_checkers/checkpointing.py` - CheckpointingRunRulesChecker +- `mlpstorage/rules/run_checkers/vectordb.py` - VectorDBRunRulesChecker +- `mlpstorage/rules/run_checkers/kvcache.py` - KVCacheRunRulesChecker +- `mlpstorage/rules/submission_checkers/__init__.py` - Package for multi-run submission checkers +- `mlpstorage/rules/submission_checkers/training.py` - TrainingSubmissionRulesChecker +- `mlpstorage/rules/submission_checkers/checkpointing.py` - CheckpointSubmissionRulesChecker - `mlpstorage/rules/verifier.py` - `mlpstorage/rules/utils.py` - `mlpstorage/cli/__init__.py` (package) @@ -1301,8 +2359,6 @@ After completing all phases: - `mlpstorage/cli/vectordb_args.py` - `mlpstorage/cli/kvcache_args.py` - `mlpstorage/benchmarks/kvcache.py` -- `mlpstorage/plugins/__init__.py` -- `mlpstorage/plugins/template.py` - `mlpstorage/tests/fixtures/__init__.py` - `mlpstorage/tests/fixtures/mock_*.py` - `mlpstorage/tests/conftest.py` @@ -1310,12 +2366,14 @@ After completing all phases: - `mlpstorage/docs/ADDING_BENCHMARKS.md` ### Files Modified + - `mlpstorage/benchmarks/base.py` - Implement interface - `mlpstorage/benchmarks/__init__.py` - Registry integration - `mlpstorage/cli.py` - Dynamic parser building - `mlpstorage/config.py` - Add kv_cache benchmark type -- `mlpstorage/main.py` - Plugin discovery +- `mlpstorage/reporting.py` - Refactored with better error handling and directory validation - `mlpstorage/rules.py` - Converted to package (deprecated) ### Files Deprecated + - `mlpstorage/rules.py` - Replaced by 
`mlpstorage/rules/` package