diff --git a/pyproject.toml b/pyproject.toml index 76bf7b22..e98eeb69 100755 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "mlpstorage" -version = "2.0.0b1" +version = "3.0" description = "MLPerf Storage Benchmark Suite" readme = "README.md" license = {text = "Apache-2.0"} @@ -20,6 +20,7 @@ dependencies = [ "rich>=13.0", "s3dlio>=0.9.86", "dlio-benchmark", # Required dependency + "yamale", ] [project.optional-dependencies] diff --git a/system_configuration.yaml b/system_configuration.yaml deleted file mode 100755 index 11332345..00000000 --- a/system_configuration.yaml +++ /dev/null @@ -1,110 +0,0 @@ -System: - name: FastAmazingAcmeStorage 9000 - description: - storage_location: [ remote | local | hyper-converged ] - client_software: [ in-box | proprietary ] - storage_interface: [ block | file | object ] - required_rack_units: - shared_capabilities: - multi_host_support: True # False is used for local storage - simultaneous_write_support: False # Are simultaneous writes by multiple hosts supported in the submitted configuration - simultaneous_read__support: True # Are simultaneous reads by multiple hosts supported in the submitted configuration - max_sequential_read: # Optional - GiB/s - max_sequential_write: # Optional - GiB/s - max_random_read: # Optional - GiB/s - max_random_write: # Optional - GiB/s - -PowerRequirements: - dlio_client: - quantity: # number of dlio_client nodes - psu1_nameplate_power: # power in watts - psu2_nameplate_power: # power in watts - psu3_nameplate_power: # power in watts - design_power: 2400 - num_active_psus: 2 - num_passive_psus: 1 - - # All storage nodes need to be listed (data, metadata, etc) as well as any required backed switching - storage_data_node: - quantity: # number of storage data nodes - psu1_nameplate_power: # power in watts - psu2_nameplate_power: # power in watts - psu3_nameplate_power: # power in watts - design_power: 2400 - num_active_psus: 2 - 
num_passive_psus: 1 - backend_switch: - quantity: 1 - psu1_nameplate_power: 700 # network PSU - psu2_nameplate_power: 700 # network PSU - design_power: 700 - num_active_psus: 1 - num_passive_psus: 1 - - -# All nodes used need to be listed. Clients, Data storage, metadata, front-end, back-end, etc -Nodes: - # Useful name for the client describing it's role in the system under test - dlio_client: # This can be DLIO Client, storage node, storage controller, AwesomeMarketingName_Type1, etc - quantity: 8 # How many of this node - hardware: - model: SMC - rack_units: 2 - power_supplies: 2 - psu_configuration: active/passive - psu_rating: 1200 - memory_capacity: 256GB - memory_configuration: 8x32GB - cpu_qty: 2 - cpu_model: AMD 9555 - cpu_cores: 96 - networking: - management: - model: intel i210 - speed: 1Gbps - qty: 1 - operating_system: - name: Ubuntu - version: 22.04 LTS - release_date: 2022-04-12 - kernel_version: 5.15.0-56-generic - cpu_architecture: x86_64 - tuning: - # All non-default tunings for OS need to be listed - mpi_configuration: - environment_variables: - version: Open MPI 4.1.4 - sysctl_parameters: - - storage_data_node: # This can be DLIO Client, storage node, storage controller, AwesomeMarketingName_Type1, etc - quantity: 8 # How many of this node - hardware: - model: SMC - rack_units: 2 - power_supplies: 2 - psu_configuration: active/passive - psu_rating: 1200 - memory_capacity: 256GB - memory_configuration: 8x32GB - cpu_qty: 2 - cpu_model: AMD 9555 - cpu_cores: 96 - networking: - management: - model: intel i210 - speed: 1Gbps - qty: 1 - operating_system: - name: Ubuntu - version: 22.04 LTS - release_date: 2022-04-12 - kernel_version: 5.15.0-56-generic - cpu_architecture: x86_64 - tuning: - mpi_configuration: - environment_variables: - version: Open MPI 4.1.4 - sysctl_parameters: - - - diff --git a/system_description/example_NAS.yaml b/system_description/example_NAS.yaml new file mode 100644 index 00000000..79d15ee5 --- /dev/null +++ 
b/system_description/example_NAS.yaml @@ -0,0 +1,107 @@ +system_under_test: + system: + submission_name: NAS_Example + friendly_description: A generic example of an Enterprise NAS solution + architecture: + storage_location: remote + benchmark_API: file + product_API: file + client_footprint: open_source + client_installation: in_box + capabilities: + multi_host: true + simultaneous_write: true + simultaneous_read: true + remap_time_in_seconds: 0 + storage: + - friendly_description: NAS Node + quantity: 4 + chassis: + model_name: FAS A100 + rack_units: 1 + cpu_model: XEON 90000 + cpu_qty: 2 + cpu_cores: 96 + memory_capacity: 256 + power: + min_psus_active: 1 + psus_configured: + - Name: Sparke 100 + PowerCapacityWatts: 1800 + Efficiency: Platinum + - Name: Sparke 100 + PowerCapacityWatts: 1800 + Efficiency: Platinum + networking: + - type: ethernet + speed: 100 + traffic: + - management + - metadata + - data + unit_count: 2 + operating_system: + name: NAS OS + version: "9.5" + environment: + - name: foobar + value: "12" + - name: foobarbaz + value: "13" + sysctl: + - name: barfoo + value: "21" + - name: bazbarfoo + value: "31" + switches: + - vendor_name: Cisco + model_name: Mongo 5000 + ports: + - type: ethernet + speed: 100 + traffic: + - data + unit_count: 128 + configuration: + - name: LACP + value: layer 3 + rack_units: 2 + power: + min_psus_active: 1 + psus_configured: + - Name: Sparke 100 + PowerCapacityWatts: 1800 + Efficiency: Platinum + - Name: Sparke 100 + PowerCapacityWatts: 1800 + Efficiency: Platinum + unit_count: 2 + total_rack_units: 4 + clients: + - friendly_description: Benchmark load system + quantity: 16 + chassis: + model_name: Dell7680 + rack_units: 1 + cpu_model: XEON 90000 + cpu_qty: 2 + cpu_cores: 96 + memory_capacity: 256 + power: + min_psus_active: 1 + psus_configured: + - Name: Sparke 100 + PowerCapacityWatts: 1800 + Efficiency: Platinum + - Name: Sparke 100 + PowerCapacityWatts: 1800 + Efficiency: Platinum + networking: + - type: 
ethernet + speed: 100 + traffic: + - data + unit_count: 1 + operating_system: + name: RHEL + version: "10.1" diff --git a/system_description/example_NFS.yaml b/system_description/example_NFS.yaml new file mode 100644 index 00000000..1dfb9147 --- /dev/null +++ b/system_description/example_NFS.yaml @@ -0,0 +1,95 @@ +system_under_test: + system: + submission_name: NFS_example + friendly_description: A generic example based upon NFS + architecture: + storage_location: remote + benchmark_API: file + product_API: file + client_footprint: open_source + client_installation: in_box + capabilities: + multi_host: true + simultaneous_write: true + simultaneous_read: true + remap_time_in_seconds: 0 + storage: + - friendly_description: NFS Server + quantity: 1 + chassis: + model_name: Dell7680 + rack_units: 1 + cpu_model: XEON 90000 + cpu_qty: 2 + cpu_cores: 96 + memory_capacity: 256 + power: + min_psus_active: 1 + psus_configured: + - Name: Sparke 100 + PowerCapacityWatts: 1800 + Efficiency: Platinum + - Name: Sparke 100 + PowerCapacityWatts: 1800 + Efficiency: Platinum + networking: + - type: ethernet + speed: 100 + traffic: + - management + - metadata + - data + unit_count: 1 + drives: + - vendor_name: Micron + model_name: X9000 TLC + interface: nvme + capacity_in_GB: 8000 + unit_count: 8 + - vendor_name: Micron + model_name: X1650 QLC + interface: nvme + capacity_in_GB: 24000 + unit_count: 8 + operating_system: + name: Rocky Linux + version: "9.5" + environment: + - name: foobar + value: "12" + - name: foobarbaz + value: "13" + sysctl: + - name: barfoo + value: "21" + - name: bazbarfoo + value: "31" + total_rack_units: 2 + clients: + - friendly_description: Benchmark load system + quantity: 16 + chassis: + model_name: Dell7680 + rack_units: 1 + cpu_model: XEON 90000 + cpu_qty: 2 + cpu_cores: 96 + memory_capacity: 256 + power: + min_psus_active: 1 + psus_configured: + - Name: Sparke 100 + PowerCapacityWatts: 1800 + Efficiency: Platinum + - Name: Sparke 100 + 
PowerCapacityWatts: 1800 + Efficiency: Platinum + networking: + - type: ethernet + speed: 100 + traffic: + - data + unit_count: 1 + operating_system: + name: RHEL + version: "10.1" diff --git a/system_description/example_PFS.yaml b/system_description/example_PFS.yaml new file mode 100644 index 00000000..fd2aeb19 --- /dev/null +++ b/system_description/example_PFS.yaml @@ -0,0 +1,122 @@ +system_under_test: + system: + submission_name: PFS_example + friendly_description: A generic example based upon a Parallel filesystem + architecture: + storage_location: remote_and_local + benchmark_API: file + product_API: file + client_footprint: open_source + client_installation: installable + capabilities: + multi_host: true + simultaneous_write: true + simultaneous_read: true + remap_time_in_seconds: 0 + storage: + - friendly_description: Metadata Processing + quantity: 2 + chassis: + model_name: Dell7680 + rack_units: 1 + cpu_model: XEON 90000 + cpu_qty: 2 + cpu_cores: 96 + memory_capacity: 256 + power: + min_psus_active: 1 + psus_configured: + - Name: Sparke 100 + PowerCapacityWatts: 1800 + Efficiency: Platinum + - Name: Sparke 100 + PowerCapacityWatts: 1800 + Efficiency: Platinum + networking: + - type: ethernet + speed: 100 + traffic: + - management + - metadata + unit_count: 1 + operating_system: + name: Rocky Linux + version: "9.5" + environment: + - name: foobar + value: "12" + - name: foobarbaz + value: "13" + sysctl: + - name: barfoo + value: "21" + - name: bazbarfoo + value: "31" + - friendly_description: Data Storage Node + quantity: 12 + chassis: + model_name: Dell7680 + rack_units: 1 + cpu_model: XEON 90000 + cpu_qty: 2 + cpu_cores: 96 + memory_capacity: 256 + power: + min_psus_active: 1 + psus_configured: + - Name: Sparke 100 + PowerCapacityWatts: 1800 + Efficiency: Platinum + - Name: Sparke 100 + PowerCapacityWatts: 1800 + Efficiency: Platinum + networking: + - type: ethernet + speed: 100 + traffic: + - management + - data + unit_count: 1 + drives: + - 
vendor_name: Micron + model_name: X9000 TLC + interface: nvme + capacity_in_GB: 8000 + unit_count: 8 + - vendor_name: Micron + model_name: X1650 QLC + interface: nvme + capacity_in_GB: 24000 + unit_count: 8 + operating_system: + name: Rocky Linux + version: "9.5" + total_rack_units: 14 + clients: + - friendly_description: Benchmark load system + quantity: 16 + chassis: + model_name: Dell7680 + rack_units: 1 + cpu_model: XEON 90000 + cpu_qty: 2 + cpu_cores: 96 + memory_capacity: 256 + power: + min_psus_active: 1 + psus_configured: + - Name: Sparke 100 + PowerCapacityWatts: 1800 + Efficiency: Platinum + - Name: Sparke 100 + PowerCapacityWatts: 1800 + Efficiency: Platinum + networking: + - type: ethernet + speed: 100 + traffic: + - data + unit_count: 1 + operating_system: + name: RHEL + version: "10.1" diff --git a/system_description/example_drive.yaml b/system_description/example_drive.yaml new file mode 100644 index 00000000..7eeb1c8a --- /dev/null +++ b/system_description/example_drive.yaml @@ -0,0 +1,60 @@ +system_under_test: + system: + submission_name: NVMe_Example + friendly_description: A generic example of a single-drive submission + architecture: + storage_location: local + benchmark_API: file + product_API: block + client_footprint: open_source + client_installation: in_box + capabilities: + multi_host: false + simultaneous_write: false + simultaneous_read: false + remap_time_in_seconds: 0 + storage: + - friendly_description: Data + quantity: 1 + chassis: + model_name: Dell7680 + rack_units: 1 + cpu_model: XEON 90000 + cpu_qty: 2 + cpu_cores: 96 + memory_capacity: 256 + power: + min_psus_active: 1 + psus_configured: + - Name: Sparke 100 + PowerCapacityWatts: 1800 + Efficiency: Platinum + - Name: Sparke 100 + PowerCapacityWatts: 1800 + Efficiency: Platinum + operating_system: + name: Rocky Linux + version: "9.5" + total_rack_units: 4 + clients: + - friendly_description: Benchmark load system + quantity: 16 + chassis: + model_name: Dell7680 + rack_units: 1 + 
cpu_model: XEON 90000 + cpu_qty: 2 + cpu_cores: 96 + memory_capacity: 256 + power: + min_psus_active: 1 + psus_configured: + - Name: Sparke 100 + PowerCapacityWatts: 1800 + Efficiency: Platinum + - Name: Sparke 100 + PowerCapacityWatts: 1800 + Efficiency: Platinum + operating_system: + name: RHEL + version: "10.1" diff --git a/system_description/example_remote_block.yaml b/system_description/example_remote_block.yaml new file mode 100644 index 00000000..3388044d --- /dev/null +++ b/system_description/example_remote_block.yaml @@ -0,0 +1,93 @@ +system_under_test: + system: + submission_name: remote_block_example + friendly_description: A generic example based upon a remote-block solution + architecture: + storage_location: remote + benchmark_API: file + product_API: block + client_footprint: closed_source + client_installation: installable + capabilities: + multi_host: true + simultaneous_write: false + simultaneous_read: true + remap_time_in_seconds: 15 + storage: + - friendly_description: Block Server + quantity: 1 + chassis: + model_name: Dell7680 + rack_units: 1 + cpu_model: XEON 90000 + cpu_qty: 2 + cpu_cores: 96 + memory_capacity: 256 + power: + min_psus_active: 1 + psus_configured: + - Name: Sparke 100 + PowerCapacityWatts: 1800 + Efficiency: Platinum + - Name: Sparke 100 + PowerCapacityWatts: 1800 + Efficiency: Platinum + networking: + - type: ethernet + speed: 100 + traffic: + - data + unit_count: 1 + drives: + - vendor_name: Micron + model_name: X9000 TLC + interface: nvme + capacity_in_GB: 8000 + unit_count: 8 + - vendor_name: Micron + model_name: X1650 QLC + interface: nvme + capacity_in_GB: 24000 + unit_count: 8 + operating_system: + name: Rocky Linux + version: "9.5" + environment: + - name: foobar + value: "12" + - name: foobarbaz + value: "13" + sysctl: + - name: barfoo + value: "21" + - name: bazbarfoo + value: "31" + total_rack_units: 2 + clients: + - friendly_description: Benchmark load system + quantity: 16 + chassis: + model_name: Dell7680 + 
rack_units: 1 + cpu_model: XEON 90000 + cpu_qty: 2 + cpu_cores: 96 + memory_capacity: 256 + power: + min_psus_active: 1 + psus_configured: + - Name: Sparke 100 + PowerCapacityWatts: 1800 + Efficiency: Platinum + - Name: Sparke 100 + PowerCapacityWatts: 1800 + Efficiency: Platinum + networking: + - type: ethernet + speed: 100 + traffic: + - data + unit_count: 1 + operating_system: + name: RHEL + version: "10.1" diff --git a/system_description/schema.yaml b/system_description/schema.yaml new file mode 100644 index 00000000..6e799349 --- /dev/null +++ b/system_description/schema.yaml @@ -0,0 +1,160 @@ +# +# The root of the yaml tree. +# +system_under_test: + system: + submission_name: str( min=1 ) + friendly_description: str( min=1 ) + architecture: + storage_location: enum( 'remote', 'local', 'remote_and_local' ) + benchmark_API: enum( 'file', 'object', 'block' ) # what the benchmark uses + product_API: enum( 'file', 'object', 'block' ) # What the product supports + client_footprint: enum( 'open_source', 'closed_source', 'none' ) + client_installation: enum( 'in_box', 'installable', 'n/a' ) + capabilities: + multi_host: bool( ) # False is used for local storage + simultaneous_write: bool( ) # Are simultaneous writes by multiple hosts supported? + simultaneous_read: bool( ) # Are simultaneous reads by multiple hosts supported? 
+ remap_time_in_seconds: int( min=0 ) # Time required to go from writing to reading + storage: list( include( 'node_description' ), required=False ) + switches: list( include( 'switch_description' ), required=False ) + total_rack_units: int( min=0 ) # Not including clients + clients: list( include( 'node_description' ), required=True ) + +--- +# +# +# +node_description: + friendly_description: str( min=1 ) + quantity: int( min=1 ) + chassis: + model_name: str( min=1 ) + rack_units: int( min=1 ) + cpu_model: str( min=1 ) + cpu_qty: int( min=1 ) + cpu_cores: int( min=1 ) + memory_capacity: int( min=1 ) # in GiB, eg: 256 + power: include( 'power_device' ) + networking: list( include( 'network_instance' ), required=False ) + drives: list( include( 'drive_instance' ), required=False ) + operating_system: + name: str( min=1 ) + version: str( min=1 ) + environment: list( include( 'key_value' ), required=False ) # Just non-default values + sysctl: list( include( 'key_value' ), required=False ) # Just non-default values + +--- +# +# All non-default tunings for OS need to be listed +# +key_value: + name: str( min=1 ) + value: str( min=1 ) + +--- +# +# +# +network_instance: + type: enum( 'ethernet', 'infiniband', 'other' ) + speed: int( min=1 ) # in Gigabits/s + traffic: list( enum( 'management', 'data', 'metadata', 'backend' ), min=1 ) + unit_count: int( min=1 ) # Number of ports that look like this + +--- +# +# Not including boot drives. 
+# +drive_instance: + vendor_name: str( min=1 ) + model_name: str( min=1 ) + interface: enum( 'nvme', 'SAS', 'SATA', 'other' ) + capacity_in_GB: int( min=1 ) # in base 10 GB, not in GiB + unit_count: int( min=1 ) # Number of drives that look like this + +--- +# +# +# +switch_description: + vendor_name: str( min=1 ) + model_name: str( min=1 ) + ports: list( include( 'network_instance' ), min=1 ) + configuration: list( include( 'key_value' ), required=False ) # Just non-default values + rack_units: int( min=1 ) + power: include( 'power_device' ) + unit_count: int( min=1 ) # Number of switches that look like this + +--- +# +# +# +power_device: + min_psus_active: int( min=1 ) + psus_configured: list( include( 'power_supply' ), min=1 ) + +--- +# +# +# +power_supply: + Name: str( min=1 ) + PowerCapacityWatts: int( min=1 ) + Efficiency: enum( 'Gold', 'Platinum', 'Titanium', 'Ruby' ) + + + +# Each on-prem MLPerf submission should include the nameplate / design power of the submitted System Under Test (SUT). +# This information will complement existing measured-power methodologies and provide a uniform baseline reference across systems and vendors. +# The intent is not to replace measured power but to give additional visibility into the designed power capacity of submitted systems. +# By including design power, MLPerf can help stakeholders—submitters, reviewers, and end users—better contextualize energy efficiency results, +# improve reproducibility, and align with related MLCommons initiatives such as Storage and Training benchmarks that already collect rated +# power information. We propose to include the following: +# Option 1 (Preferred): Submit the aggregate nameplate / design power for the SUT, listing all major components (accelerators, CPUs, storage, networking, etc.). +# Option 2: Provide per-component TDP/TGP data and a summarized total for the SUT. 
+# === Reporting YAML Format === The reported design power (total of all needed PSU capacities in watts) will appear alongside the measured-power and non-power views, +# clearly labeled as “Design Power.” This data is reported through a YAML file. +# The structure of the YAML file is a hierarchy of components, with the power capacity based on the DMTF Redfish PowerSupply schema. +# For example, a simple single-device system can have a single level: +# +# My Device: +# - Description: 'Optional Description' +# Min PSUs Needed: 1 +# PSUs: +# - Name: PSU 1 +# PowerCapacityWatts: 1200 +# - Name: PSU 2 +# PowerCapacityWatts: 1200 +# +# A more complex system can have two (or more) levels: +# My System: +# - My Rack 1: +# - My Server 1: +# - Description: 'Optional Description' +# Min PSUs Needed: 1 +# PSUs: +# - Name: PSU 1 +# PowerCapacityWatts: 1200 +# - Name: PSU 2 +# PowerCapacityWatts: 1200 +# - My Switch 1: +# - Description: 'Optional Description' +# Min PSUs Needed: 1 +# PSUs: +# - Name: PSU 1 +# PowerCapacityWatts: 1200 +# - Name: PSU 2 +# PowerCapacityWatts: 1200 +# +# The labels “My System”, “My Rack 1”, “My Server 1”, etc. are submitter-defined labels corresponding to their system topology. +# Rack-level systems are also supported: +# My System: +# - My Rack 1: +# - Description: 'Optional Description' +# Min PSUs Needed: 1 +# PSUs: +# - Name: PSU 1 +# PowerCapacityWatts: 12000 +# - Name: PSU 2 +# PowerCapacityWatts: 12000 diff --git a/uv.lock b/uv.lock index aa532e41..5658f94b 100755 --- a/uv.lock +++ b/uv.lock @@ -480,7 +480,7 @@ wheels = [ [[package]] name = "mlpstorage" -version = "2.0.0b1" +version = "3.0" source = { editable = "." 
} dependencies = [ { name = "dlio-benchmark" }, @@ -490,6 +490,7 @@ dependencies = [ { name = "pyyaml" }, { name = "rich" }, { name = "s3dlio" }, + { name = "yamale" }, ] [package.optional-dependencies] @@ -524,6 +525,7 @@ requires-dist = [ { name = "pyyaml", specifier = ">=6.0" }, { name = "rich", specifier = ">=13.0" }, { name = "s3dlio", specifier = ">=0.9.86" }, + { name = "yamale" }, { name = "tabulate", marker = "extra == 'vectordb'", specifier = ">=0.9" }, ] provides-extras = ["test", "full", "vectordb"] @@ -1406,6 +1408,18 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/1a/c7/8528ac2dfa2c1e6708f647df7ae144ead13f0a31146f43c7264b4942bf12/wrapt-2.1.2-py3-none-any.whl", hash = "sha256:b8fd6fa2b2c4e7621808f8c62e8317f4aae56e59721ad933bac5239d913cf0e8", size = 43993, upload-time = "2026-03-06T02:53:12.905Z" }, ] +[[package]] +name = "yamale" +version = "6.1.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "pyyaml" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/da/64/9e5de0e829920b848dcf5fe3ff64936d83cc7471babd264588b08bca97e0/yamale-6.1.0.tar.gz", hash = "sha256:fd435aa7b830c73e89a9ef548c0ace2d3d8dc3e5e180e6b57ff70b31495672fd", size = 42402, upload-time = "2025-11-20T16:52:30.258Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/c9/fc/cbad39af7e761525077690ddff1ae19ace7e2f54552e90fb848a43a270fa/yamale-6.1.0-py3-none-any.whl", hash = "sha256:7e109c9d83e3a7e42703516cb2b70b9c7aa5b7a738019c4a6c202b6b0b9096c5", size = 58215, upload-time = "2025-11-20T16:52:28.806Z" }, +] + [[package]] name = "zstandard" version = "0.25.0"