diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 03c6a7c..f9a708c 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -42,6 +42,10 @@ jobs: run: | pytest tests/test_gridder.py -v --tb=short + - name: Run conversion tests + run: | + pytest tests/test_conversion.py -v --tb=short + - name: Run integration tests run: | bash tests/run_tests.sh --integration @@ -132,6 +136,7 @@ jobs: - name: Run tests with debug build run: | pytest tests/test_gridder.py -v --tb=short + pytest tests/test_conversion.py -v --tb=short - name: Upload test artifacts on failure if: failure() diff --git a/README.md b/README.md index b8fb642..328085e 100644 --- a/README.md +++ b/README.md @@ -203,6 +203,26 @@ The gridder produces HDF5 files containing: - Kernel radius information - Processing metadata and timestamps +## Converting Arbitrary Snapshots + +Use the conversion tool to process snapshots from any simulation code: + +```bash +python tools/convert_to_gridder_format.py input.hdf5 output.hdf5 \ + --coordinates-key PartType1/Coordinates \ + --masses-key PartType1/Masses \ + --copy-header +``` + +**Key features:** +- Handles arbitrary HDF5 dataset names +- Creates required cell structure automatically +- Supports both cubic and non-cubic simulation boxes +- MPI parallelization for large files +- Configurable cell grid resolution (default: 16×16×16) + +**See [Conversion Guide](docs/conversion.md) for detailed documentation.** + ## Generating Test Data Use the included Python script to create test snapshots: diff --git a/docs/conversion.md b/docs/conversion.md new file mode 100644 index 0000000..de00af3 --- /dev/null +++ b/docs/conversion.md @@ -0,0 +1,308 @@ +# Converting Arbitrary Snapshots + +The `convert_to_gridder_format.py` tool converts HDF5 simulation snapshots from arbitrary formats into the format required by the gridder. This allows you to process simulations from any code (not just SWIFT) with the FLARES-2 Gridder. 
+ +## Overview + +The conversion process: + +1. Reads particle coordinates and masses from arbitrary HDF5 dataset keys +2. Creates a spatial cell structure for efficient particle lookup +3. Sorts particles by cell index +4. Writes output in the standardized gridder format + +The gridder requires a hierarchical cell structure for spatial indexing. The conversion script creates a regular grid of cells (default: 16×16×16 = 4,096 cells) and assigns each particle to a cell based on its position. + +## Requirements + +- Python 3.7+ +- h5py +- numpy +- mpi4py (optional, for MPI mode) + +## Basic Usage + +```bash +python tools/convert_to_gridder_format.py input.hdf5 output.hdf5 \ + --coordinates-key PartType1/Coordinates \ + --masses-key PartType1/Masses \ + --copy-header +``` + +### Required Arguments + +- `input_file`: Path to input HDF5 snapshot +- `output_file`: Path to output HDF5 file +- `--coordinates-key`: HDF5 dataset path for particle coordinates +- `--masses-key`: HDF5 dataset path for particle masses + +### Optional Arguments + +- `--copy-header`: Copy Header group from input to output (recommended) +- `--header-key`: HDF5 key for header group (default: `Header`) +- `--particle-type`: Output particle type group name (default: `PartType1`) +- `--cdim`: Number of cells per dimension for spatial indexing (default: **16**) +- `--boxsize X Y Z`: Manually specify box size (if not in Header) + +## Examples + +### SWIFT-like Format + +If your snapshot already uses SWIFT-style keys: + +```bash +python tools/convert_to_gridder_format.py \ + swift_snapshot.hdf5 \ + gridder_snapshot.hdf5 \ + --coordinates-key PartType1/Coordinates \ + --masses-key PartType1/Masses \ + --copy-header +``` + +### Custom Format with Non-Standard Keys + +For simulations with different naming conventions: + +```bash +python tools/convert_to_gridder_format.py \ + gadget_snapshot.hdf5 \ + gridder_snapshot.hdf5 \ + --coordinates-key DarkMatter/Positions \ + --masses-key DarkMatter/Mass \ + 
--copy-header \ + --cdim 32 +``` + +### Without Header (Manual BoxSize) + +If your input file doesn't have a Header group: + +```bash +python tools/convert_to_gridder_format.py \ + custom_snapshot.hdf5 \ + gridder_snapshot.hdf5 \ + --coordinates-key Coordinates \ + --masses-key Masses \ + --boxsize 100.0 100.0 100.0 \ + --cdim 16 +``` + +### Non-Cubic Simulation Box + +For simulations with different box sizes in each dimension: + +```bash +python tools/convert_to_gridder_format.py \ + noncubic_snapshot.hdf5 \ + gridder_snapshot.hdf5 \ + --coordinates-key Coords \ + --masses-key Mass \ + --boxsize 100.0 200.0 150.0 \ + --copy-header +``` + +### Fine Cell Grid for Large Simulations + +For very large simulations, increase `cdim` for better spatial indexing: + +```bash +python tools/convert_to_gridder_format.py \ + large_simulation.hdf5 \ + gridder_snapshot.hdf5 \ + --coordinates-key PartType1/Coordinates \ + --masses-key PartType1/Masses \ + --cdim 64 \ + --copy-header +``` + +## MPI Mode + +For very large snapshots, use MPI to parallelize the conversion: + +```bash +mpirun -np 4 python tools/convert_to_gridder_format.py \ + huge_snapshot.hdf5 \ + gridder_snapshot.hdf5 \ + --coordinates-key PartType1/Coordinates \ + --masses-key PartType1/Masses \ + --copy-header \ + --cdim 32 +``` + +**MPI mode creates:** +- Per-rank files: `gridder_snapshot_rank_0.hdf5`, `gridder_snapshot_rank_1.hdf5`, ... +- Virtual file: `gridder_snapshot.hdf5` (combines all ranks) + +Use the virtual file (`gridder_snapshot.hdf5`) as input to the gridder. 
+ +## Output HDF5 Structure + +The conversion script produces files with this structure: + +``` +/Header # Simulation metadata (if --copy-header used) + BoxSize: [X, Y, Z] # Box dimensions + NumPart_Total: [0, N, 0, 0, 0, 0] # Total particle counts + Redshift: float # Redshift value + +/PartType1 # Dark matter particles + /Coordinates # Particle positions (sorted by cell) + shape: (N, 3) + dtype: float64 + /Masses # Particle masses (sorted by cell) + shape: (N,) + dtype: float64 + +/Cells # Spatial indexing structure + /Meta-data + dimension: [cdim, cdim, cdim] # Number of cells per dimension + size: [dx, dy, dz] # Physical size of each cell + /Counts + /PartType1 # Number of particles in each cell + shape: (cdim³,) + dtype: int32 + /OffsetsInFile + /PartType1 # Starting index for particles in each cell + shape: (cdim³,) + dtype: int64 +``` + +## Choosing `cdim` + +The `cdim` parameter controls the cell grid resolution. Choose based on your simulation size: + +| Simulation Size | Recommended cdim | Total Cells | Use Case | +|----------------|------------------|-------------|----------| +| < 1M particles | 8-16 | 512-4,096 | Small test simulations | +| 1-10M particles | 16-32 | 4,096-32,768 | Medium simulations | +| 10-100M particles | 32-64 | 32,768-262,144 | Large simulations | +| > 100M particles | 64-128 | 262,144-2M | Very large simulations | + +**Guidelines:** +- Higher `cdim` → More cells → Better spatial locality → Faster gridder +- But: Very high `cdim` with sparse distributions may waste memory +- For uniform distributions: `cdim ≈ (nparticles^(1/3)) / 10` is a good starting point + +## Input File Requirements + +Your input HDF5 file must contain: + +**Required:** +- Particle coordinates as a (N, 3) array +- Particle masses as a (N,) array + +**Optional (but recommended):** +- `Header/BoxSize`: Box dimensions [X, Y, Z] + - If missing, use `--boxsize` argument +- `Header/NumPart_Total`: Total particle counts +- `Header/Redshift`: Redshift value + +**Not 
required:** +- Cell structure (created by conversion script) +- Velocities +- Particle IDs (will be created if missing) + +## Common Issues + +### Missing BoxSize + +**Error:** +``` +ValueError: BoxSize not found in input file and not provided via --boxsize +``` + +**Solution:** +Provide BoxSize manually: +```bash +--boxsize 100.0 100.0 100.0 +``` + +### Particles Outside Box + +Particles with coordinates outside `[0, BoxSize]` will be clamped to the nearest cell boundary. + +### Memory Usage + +For very large simulations: +- Serial mode: Entire particle array loaded into memory +- MPI mode: Particles split across ranks, reducing per-rank memory + +## Verification + +After conversion, verify the output structure: + +```python +import h5py + +with h5py.File('gridder_snapshot.hdf5', 'r') as f: + # Check required groups + assert '/PartType1/Coordinates' in f + assert '/PartType1/Masses' in f + assert '/Cells/Counts/PartType1' in f + assert '/Cells/OffsetsInFile/PartType1' in f + + # Check cell structure + coords = f['/PartType1/Coordinates'][:] + cell_counts = f['/Cells/Counts/PartType1'][:] + + print(f"Total particles: {len(coords)}") + print(f"Particles in cells: {cell_counts.sum()}") + print(f"Non-empty cells: {(cell_counts > 0).sum()}") +``` + +## Full Workflow Example + +Complete example converting a Gadget snapshot: + +```bash +# 1. Convert Gadget snapshot to gridder format +python tools/convert_to_gridder_format.py \ + gadget_snapshot_099.hdf5 \ + gridder_input.hdf5 \ + --coordinates-key PartType1/Coordinates \ + --masses-key PartType1/Masses \ + --boxsize 100.0 100.0 100.0 \ + --cdim 32 \ + --copy-header + +# 2. 
Create parameter file (params.yml) +cat > params.yml << EOF +Kernels: + nkernels: 3 + kernel_radius_1: 1.0 + kernel_radius_2: 2.0 + kernel_radius_3: 5.0 + +Grid: + type: uniform + cdim: 50 + +Cosmology: + h: 0.7 + Omega_cdm: 0.25 + Omega_b: 0.05 + +Tree: + max_leaf_count: 200 + +Input: + filepath: gridder_input.hdf5 + +Output: + filepath: output/ + basename: gridded_output.hdf5 + write_masses: 1 +EOF + +# 3. Run gridder +./build/parent_gridder params.yml 8 + +# 4. Verify output +ls -lh output/gridded_output.hdf5 +``` + +## See Also + +- [Parameter Reference](parameters.md) - Gridder parameter file documentation +- [Quick Start](quickstart.md) - Getting started with the gridder +- [Installation Guide](installation.md) - Building the gridder diff --git a/docs/index.md b/docs/index.md index 6da3572..04d0ad4 100644 --- a/docs/index.md +++ b/docs/index.md @@ -41,12 +41,14 @@ mpirun -n 4 ./build_mpi/parent_gridder params.yml 1 - **[Getting Started](getting-started/installation.md)**: Installation, quick start, and configuration - **[Parameter Reference](getting-started/parameters.md)**: Detailed parameter file documentation - **[Performance](performance/openmp.md)**: OpenMP and MPI optimization guides +- **[Conversion Tool](conversion.md)**: Converting arbitrary simulation snapshots to gridder format ## Quick Links - [Installation Guide](getting-started/installation.md) - [Quick Start Tutorial](getting-started/quickstart.md) - [Parameter File Reference](getting-started/parameters.md) +- [Snapshot Conversion Guide](conversion.md) - [OpenMP Threading](performance/openmp.md) - [MPI Parallelization](performance/mpi.md) diff --git a/docs/quickstart.md b/docs/quickstart.md index c27e33b..538a036 100644 --- a/docs/quickstart.md +++ b/docs/quickstart.md @@ -22,6 +22,7 @@ Before starting, ensure you have: - Built the gridder (see [Installation](installation.md)) - Access to a SWIFT simulation snapshot (HDF5 format) + - **Note:** For non-SWIFT snapshots, use the [Conversion 
Tool](conversion.md) to create compatible HDF5 files - Basic understanding of YAML syntax ## Minimal Example diff --git a/test_conversion_manual.hdf5 b/test_conversion_manual.hdf5 new file mode 100644 index 0000000..dacddca Binary files /dev/null and b/test_conversion_manual.hdf5 differ diff --git a/tests/test_conversion.py b/tests/test_conversion.py new file mode 100644 index 0000000..95b2fb6 --- /dev/null +++ b/tests/test_conversion.py @@ -0,0 +1,571 @@ +#!/usr/bin/env python3 +""" +Test suite for convert_to_gridder_format.py + +Tests the conversion script's ability to: +1. Create proper cell structures +2. Handle various input formats +3. Produce files that the gridder can read successfully +""" + +import subprocess +import sys +import tempfile +from pathlib import Path +import textwrap + +import h5py +import numpy as np +import pytest + +# Get paths +PROJECT_ROOT = Path(__file__).parent.parent +CONVERSION_SCRIPT = PROJECT_ROOT / "tools" / "convert_to_gridder_format.py" +GRIDDER_EXE = PROJECT_ROOT / "build" / "parent_gridder" + + +@pytest.fixture(scope="session") +def build_gridder(): + """Ensure gridder is built before running tests.""" + if not GRIDDER_EXE.exists(): + print("Building gridder...") + subprocess.run( + ["cmake", "-B", str(PROJECT_ROOT / "build")], cwd=PROJECT_ROOT, check=True + ) + subprocess.run( + ["cmake", "--build", str(PROJECT_ROOT / "build")], + cwd=PROJECT_ROOT, + check=True, + ) + assert GRIDDER_EXE.exists(), f"Gridder executable not found at {GRIDDER_EXE}" + return GRIDDER_EXE + + +@pytest.fixture +def temp_dir(): + """Create a temporary directory for test files.""" + with tempfile.TemporaryDirectory() as tmpdir: + yield Path(tmpdir) + + +def run_gridder_on_converted( + build_gridder, temp_dir, input_file, basename="gridded_output.hdf5", grid_cdim=3 +): + """ + Run the C++ gridder against a converted snapshot to ensure it is readable. 
+ """ + param_file = temp_dir / f"{basename}.yml" + param_content = textwrap.dedent( + f""" + Kernels: + nkernels: 1 + kernel_radius_1: 1.0 + + Grid: + type: uniform + cdim: {grid_cdim} + + Cosmology: + h: 0.681 + Omega_cdm: 0.256011 + Omega_b: 0.048600 + + Tree: + max_leaf_count: 200 + + Input: + filepath: {input_file} + + Output: + filepath: {temp_dir}/ + basename: {basename} + write_masses: 1 + """ + ).strip() + param_file.write_text(param_content) + + result = subprocess.run( + [str(build_gridder), str(param_file), "1"], capture_output=True, text=True + ) + assert result.returncode == 0, ( + f"Gridder failed on converted file {input_file}:\n" + f"stdout: {result.stdout}\nstderr: {result.stderr}" + ) + + output_path = temp_dir / basename + assert output_path.exists(), f"Gridder output missing at {output_path}" + return output_path + + +def create_arbitrary_snapshot( + filepath, + coords_key="Coords", + masses_key="Masses", + npart=1000, + boxsize=10.0, + include_header=True, +): + """ + Create a test snapshot with arbitrary key names. + + Args: + filepath: Output file path + coords_key: Key name for coordinates + masses_key: Key name for masses + npart: Number of particles + boxsize: Box size + include_header: Whether to include Header group + """ + np.random.seed(42) + coords = np.random.uniform(0, boxsize, (npart, 3)) + masses = np.ones(npart) * 0.5 + + with h5py.File(filepath, "w") as f: + # Write particles with arbitrary keys + f.create_dataset(coords_key, data=coords) + f.create_dataset(masses_key, data=masses) + + # Add header if requested + if include_header: + header = f.create_group("Header") + header.attrs["BoxSize"] = np.array([boxsize, boxsize, boxsize]) + header.attrs["NumPart_Total"] = np.array( + [0, npart, 0, 0, 0, 0], dtype=np.uint64 + ) + header.attrs["Redshift"] = 0.0 + + +def create_uniform_snapshot(filepath, npart_per_dim=5, boxsize=10.0): + """ + Create a snapshot with particles on a uniform grid. 
+ + Args: + filepath: Output file path + npart_per_dim: Number of particles per dimension + boxsize: Box size + """ + spacing = boxsize / npart_per_dim + coords = [] + for i in range(npart_per_dim): + for j in range(npart_per_dim): + for k in range(npart_per_dim): + x = (i + 0.5) * spacing + y = (j + 0.5) * spacing + z = (k + 0.5) * spacing + coords.append([x, y, z]) + + coords = np.array(coords) + npart = len(coords) + masses = np.ones(npart) + + with h5py.File(filepath, "w") as f: + f.create_dataset("DarkMatter/Positions", data=coords) + f.create_dataset("DarkMatter/Masses", data=masses) + + header = f.create_group("Header") + header.attrs["BoxSize"] = np.array([boxsize, boxsize, boxsize]) + header.attrs["NumPart_Total"] = np.array( + [0, npart, 0, 0, 0, 0], dtype=np.uint64 + ) + header.attrs["Redshift"] = 0.0 + + +def create_noncubic_snapshot(filepath, boxsize_xyz, npart=500): + """Create a snapshot with non-cubic box.""" + np.random.seed(123) + coords = np.random.uniform([0, 0, 0], boxsize_xyz, (npart, 3)) + masses = np.ones(npart) * 2.0 + + with h5py.File(filepath, "w") as f: + f.create_dataset("PartType1/Coordinates", data=coords) + f.create_dataset("PartType1/Masses", data=masses) + + header = f.create_group("Header") + header.attrs["BoxSize"] = np.array(boxsize_xyz) + header.attrs["NumPart_Total"] = np.array( + [0, npart, 0, 0, 0, 0], dtype=np.uint64 + ) + header.attrs["Redshift"] = 0.0 + + +def verify_cell_structure(hdf_file, expected_npart, cdim): + """ + Verify that an HDF5 file has correct cell structure. 
+ + Args: + hdf_file: Path to HDF5 file + expected_npart: Expected total number of particles + cdim: Expected cell dimension + + Returns: + True if structure is valid, raises AssertionError otherwise + """ + with h5py.File(hdf_file, "r") as f: + # Check required groups exist + assert "/PartType1" in f, "Missing PartType1 group" + assert "/Cells" in f, "Missing Cells group" + + # Check particle data + assert "/PartType1/Coordinates" in f, "Missing Coordinates dataset" + assert "/PartType1/Masses" in f, "Missing Masses dataset" + + coords = f["/PartType1/Coordinates"][:] + masses = f["/PartType1/Masses"][:] + + assert coords.shape == (expected_npart, 3), ( + f"Wrong coordinates shape: {coords.shape} vs expected ({expected_npart}, 3)" + ) + assert masses.shape == (expected_npart,), ( + f"Wrong masses shape: {masses.shape} vs expected ({expected_npart},)" + ) + + # Check cell metadata + assert "/Cells/Meta-data" in f, "Missing Cells/Meta-data group" + metadata = f["/Cells/Meta-data"] + + assert "dimension" in metadata.attrs, "Missing dimension attribute" + assert "size" in metadata.attrs, "Missing size attribute" + + dimension = metadata.attrs["dimension"] + assert np.array_equal(dimension, [cdim, cdim, cdim]), ( + f"Wrong dimension: {dimension} vs expected [{cdim}, {cdim}, {cdim}]" + ) + + # Check cell counts and offsets + assert "/Cells/Counts/PartType1" in f, "Missing Counts dataset" + assert "/Cells/OffsetsInFile/PartType1" in f, "Missing OffsetsInFile dataset" + + cell_counts = f["/Cells/Counts/PartType1"][:] + cell_offsets = f["/Cells/OffsetsInFile/PartType1"][:] + + ncells = cdim**3 + assert cell_counts.shape == (ncells,), ( + f"Wrong cell_counts shape: {cell_counts.shape} vs expected ({ncells},)" + ) + assert cell_offsets.shape == (ncells,), ( + f"Wrong cell_offsets shape: {cell_offsets.shape} vs expected ({ncells},)" + ) + + # Verify counts sum to total particles + assert np.sum(cell_counts) == expected_npart, ( + f"Cell counts sum to {np.sum(cell_counts)}, 
expected {expected_npart}" + ) + + # Verify offsets are cumulative + expected_offsets = np.zeros(ncells, dtype=np.int64) + expected_offsets[1:] = np.cumsum(cell_counts[:-1]) + assert np.array_equal(cell_offsets, expected_offsets), ( + "Cell offsets are not cumulative" + ) + + # Verify particles are sorted by cell + # Recompute cell assignments + cell_size = metadata.attrs["size"] + i = np.floor(coords[:, 0] / cell_size[0]).astype(np.int32) + j = np.floor(coords[:, 1] / cell_size[1]).astype(np.int32) + k = np.floor(coords[:, 2] / cell_size[2]).astype(np.int32) + i = np.clip(i, 0, cdim - 1) + j = np.clip(j, 0, cdim - 1) + k = np.clip(k, 0, cdim - 1) + cell_indices = k + j * cdim + i * cdim * cdim + + # Check that particles are sorted + for idx in range(1, len(cell_indices)): + assert cell_indices[idx] >= cell_indices[idx - 1], ( + f"Particles not sorted by cell at index {idx}" + ) + + return True + + +class TestConversion: + """Test conversion script functionality.""" + + def test_uniform_distribution(self, build_gridder, temp_dir): + """Test conversion with uniform particle distribution.""" + input_file = temp_dir / "uniform_input.hdf5" + output_file = temp_dir / "uniform_output.hdf5" + + # Create test input + npart_per_dim = 5 + npart = npart_per_dim**3 + create_uniform_snapshot(input_file, npart_per_dim=npart_per_dim) + + # Run conversion + cmd = [ + sys.executable, + str(CONVERSION_SCRIPT), + str(input_file), + str(output_file), + "--coordinates-key", + "DarkMatter/Positions", + "--masses-key", + "DarkMatter/Masses", + "--copy-header", + "--cdim", + "4", + ] + + result = subprocess.run(cmd, capture_output=True, text=True) + assert result.returncode == 0, f"Conversion failed: {result.stderr}" + + # Verify output structure + verify_cell_structure(output_file, npart, cdim=4) + + # Test with gridder + run_gridder_on_converted( + build_gridder, + temp_dir, + output_file, + basename="uniform_gridded_output.hdf5", + grid_cdim=3, + ) + + def 
test_random_distribution(self, build_gridder, temp_dir): + """Test conversion with random particle distribution.""" + input_file = temp_dir / "random_input.hdf5" + output_file = temp_dir / "random_output.hdf5" + + # Create test input with arbitrary keys + npart = 1000 + create_arbitrary_snapshot( + input_file, + coords_key="MyCoordinates", + masses_key="MyMasses", + npart=npart, + boxsize=20.0, + ) + + # Run conversion + cmd = [ + sys.executable, + str(CONVERSION_SCRIPT), + str(input_file), + str(output_file), + "--coordinates-key", + "MyCoordinates", + "--masses-key", + "MyMasses", + "--copy-header", + "--cdim", + "10", + ] + + result = subprocess.run(cmd, capture_output=True, text=True) + assert result.returncode == 0, f"Conversion failed: {result.stderr}" + + # Verify output structure + verify_cell_structure(output_file, npart, cdim=10) + + # Ensure gridder can read converted file + run_gridder_on_converted( + build_gridder, + temp_dir, + output_file, + basename="random_gridded_output.hdf5", + grid_cdim=4, + ) + + def test_noncubic_box(self, build_gridder, temp_dir): + """Test conversion with non-cubic box.""" + input_file = temp_dir / "noncubic_input.hdf5" + output_file = temp_dir / "noncubic_output.hdf5" + + # Create test input with non-cubic box + npart = 500 + boxsize_xyz = [10.0, 20.0, 15.0] + create_noncubic_snapshot(input_file, boxsize_xyz, npart) + + # Run conversion + cmd = [ + sys.executable, + str(CONVERSION_SCRIPT), + str(input_file), + str(output_file), + "--coordinates-key", + "PartType1/Coordinates", + "--masses-key", + "PartType1/Masses", + "--copy-header", + "--cdim", + "8", + ] + + result = subprocess.run(cmd, capture_output=True, text=True) + assert result.returncode == 0, f"Conversion failed: {result.stderr}" + + # Verify output structure + verify_cell_structure(output_file, npart, cdim=8) + + # Verify cell sizes match non-cubic box + with h5py.File(output_file, "r") as f: + cell_size = f["/Cells/Meta-data"].attrs["size"] + expected_size = 
np.array(boxsize_xyz) / 8 + assert np.allclose(cell_size, expected_size), ( + f"Cell sizes incorrect: {cell_size} vs expected {expected_size}" + ) + + run_gridder_on_converted( + build_gridder, + temp_dir, + output_file, + basename="noncubic_gridded_output.hdf5", + grid_cdim=4, + ) + + def test_boundary_particles(self, build_gridder, temp_dir): + """Test handling of particles at box boundaries.""" + input_file = temp_dir / "boundary_input.hdf5" + output_file = temp_dir / "boundary_output.hdf5" + + boxsize = 10.0 + + # Create particles at boundaries and random interior + np.random.seed(789) + boundary_coords = np.array( + [ + [0.0, 5.0, 5.0], # x=0 + [10.0, 5.0, 5.0], # x=boxsize + [5.0, 0.0, 5.0], # y=0 + [5.0, 10.0, 5.0], # y=boxsize + [5.0, 5.0, 0.0], # z=0 + [5.0, 5.0, 10.0], # z=boxsize + [0.0, 0.0, 0.0], # corner + [10.0, 10.0, 10.0], # opposite corner + ] + ) + interior_coords = np.random.uniform(0.1, 9.9, (100, 3)) + coords = np.vstack([boundary_coords, interior_coords]) + npart = len(coords) + masses = np.ones(npart) + + with h5py.File(input_file, "w") as f: + f.create_dataset("Coordinates", data=coords) + f.create_dataset("Masses", data=masses) + + header = f.create_group("Header") + header.attrs["BoxSize"] = np.array([boxsize, boxsize, boxsize]) + header.attrs["NumPart_Total"] = np.array( + [0, npart, 0, 0, 0, 0], dtype=np.uint64 + ) + header.attrs["Redshift"] = 0.0 + + # Run conversion + cmd = [ + sys.executable, + str(CONVERSION_SCRIPT), + str(input_file), + str(output_file), + "--coordinates-key", + "Coordinates", + "--masses-key", + "Masses", + "--copy-header", + "--cdim", + "5", + ] + + result = subprocess.run(cmd, capture_output=True, text=True) + assert result.returncode == 0, f"Conversion failed: {result.stderr}" + + # Verify output structure + verify_cell_structure(output_file, npart, cdim=5) + + run_gridder_on_converted( + build_gridder, + temp_dir, + output_file, + basename="boundary_gridded_output.hdf5", + grid_cdim=3, + ) + + def 
test_manual_boxsize(self, build_gridder, temp_dir): + """Test specifying BoxSize manually.""" + input_file = temp_dir / "no_header_input.hdf5" + output_file = temp_dir / "no_header_output.hdf5" + + # Create input without header + npart = 200 + boxsize = 50.0 + np.random.seed(456) + coords = np.random.uniform(0, boxsize, (npart, 3)) + masses = np.ones(npart) * 1.5 + + with h5py.File(input_file, "w") as f: + f.create_dataset("Positions", data=coords) + f.create_dataset("ParticleMasses", data=masses) + # No Header group! + + # Run conversion with manual BoxSize + cmd = [ + sys.executable, + str(CONVERSION_SCRIPT), + str(input_file), + str(output_file), + "--coordinates-key", + "Positions", + "--masses-key", + "ParticleMasses", + "--boxsize", + "50.0", + "50.0", + "50.0", + "--cdim", + "8", + ] + + result = subprocess.run(cmd, capture_output=True, text=True) + assert result.returncode == 0, f"Conversion failed: {result.stderr}" + + # Verify output structure + verify_cell_structure(output_file, npart, cdim=8) + + run_gridder_on_converted( + build_gridder, + temp_dir, + output_file, + basename="manual_boxsize_gridded_output.hdf5", + grid_cdim=4, + ) + + def test_different_cdim_values(self, build_gridder, temp_dir): + """Test various cell dimension values.""" + for cdim in [4, 10, 16, 32]: + input_file = temp_dir / f"cdim{cdim}_input.hdf5" + output_file = temp_dir / f"cdim{cdim}_output.hdf5" + + npart = 500 + create_arbitrary_snapshot(input_file, npart=npart, boxsize=10.0) + + cmd = [ + sys.executable, + str(CONVERSION_SCRIPT), + str(input_file), + str(output_file), + "--coordinates-key", + "Coords", + "--masses-key", + "Masses", + "--copy-header", + "--cdim", + str(cdim), + ] + + result = subprocess.run(cmd, capture_output=True, text=True) + assert result.returncode == 0, ( + f"Conversion failed for cdim={cdim}: {result.stderr}" + ) + + verify_cell_structure(output_file, npart, cdim=cdim) + + run_gridder_on_converted( + build_gridder, + temp_dir, + output_file, + 
basename=f"cdim{cdim}_gridded_output.hdf5", + grid_cdim=3, + ) + + +if __name__ == "__main__": + pytest.main([__file__, "-v"]) diff --git a/tools/convert_to_gridder_format.py b/tools/convert_to_gridder_format.py index 6e6dff7..40f299d 100755 --- a/tools/convert_to_gridder_format.py +++ b/tools/convert_to_gridder_format.py @@ -3,11 +3,35 @@ Convert HDF5 simulation snapshots to gridder-compatible format. This script converts HDF5 files with arbitrary key names to the format -expected by the parent_gridder code. It supports both serial and MPI -execution for processing large files efficiently. - -In MPI mode, each rank writes a separate file and a virtual HDF5 file -is created to provide a unified view of the data. +expected by the parent_gridder code. The conversion includes: + +1. Reading particle coordinates and masses from arbitrary HDF5 keys +2. Creating a spatial cell structure for efficient particle lookup +3. Sorting particles by cell index +4. Writing output in the standardized gridder format + +The gridder requires a hierarchical cell structure for spatial indexing. +This script creates a regular grid of cells (default: 16x16x16 = 4096 cells) +and assigns each particle to a cell based on its position. Particles are +then sorted by cell index before being written to the output file. + +Output HDF5 Structure: + /Header # Simulation metadata (copied if --copy-header) + /PartType1 + /Coordinates # Particle positions (sorted by cell) + /Masses # Particle masses (sorted by cell) + /Cells + /Meta-data + dimension: [cdim, cdim, cdim] # Number of cells per dimension + size: [dx, dy, dz] # Physical size of each cell + /Counts + /PartType1 # Number of particles in each cell + /OffsetsInFile + /PartType1 # Starting index for particles in each cell + +Supports both serial and MPI execution for processing large files efficiently. +In MPI mode, each rank writes a separate file and a virtual HDF5 file is +created to provide a unified view of the data. 
""" import argparse @@ -31,18 +55,25 @@ def parse_args(): formatter_class=argparse.RawDescriptionHelpFormatter, epilog=""" Examples: - # Serial conversion + # Serial conversion (BoxSize read from Header) python convert_to_gridder_format.py input.hdf5 output.hdf5 \\ - --coordinates-key Coordinates --masses-key Masses + --coordinates-key Coordinates --masses-key Masses \\ + --copy-header # MPI conversion (creates output_rank_*.hdf5 files + virtual file) mpirun -np 4 python convert_to_gridder_format.py input.hdf5 output.hdf5 \\ - --coordinates-key PartType1/Coordinates --masses-key PartType1/Masses + --coordinates-key PartType1/Coordinates --masses-key PartType1/Masses \\ + --copy-header - # With custom particle type prefix + # Specify BoxSize and cell dimension manually python convert_to_gridder_format.py input.hdf5 output.hdf5 \\ --coordinates-key MyCoords --masses-key MyMasses \\ - --particle-type PartType1 + --boxsize 100.0 100.0 100.0 --cdim 32 + + # With custom particle type prefix and finer cell grid + python convert_to_gridder_format.py input.hdf5 output.hdf5 \\ + --coordinates-key DarkMatter/Positions --masses-key DarkMatter/Masses \\ + --particle-type PartType1 --cdim 64 --copy-header """ ) @@ -86,9 +117,34 @@ def parse_args(): help="HDF5 key for header group in input file (default: Header)" ) + parser.add_argument( + "--cdim", + type=int, + default=16, + help="Number of cells per dimension for spatial indexing (default: 16)" + ) + + parser.add_argument( + "--boxsize", + type=float, + nargs=3, + help="Box size [X, Y, Z] in same units as coordinates. 
def write_minimal_header(h5file, boxsize, npart, redshift=0.0, header_name="Header"):
    """Write a minimal Header group expected by the gridder.

    Args:
        h5file: Open, writable h5py.File handle.
        boxsize: Box dimensions [X, Y, Z] in the same units as the coordinates.
        npart: Total particle count; recorded in the PartType1 slot.
        redshift: Redshift to record in the header (default: 0.0).
        header_name: Name of the header group to create (default: "Header").
    """
    hdr = h5file.require_group(header_name)
    hdr.attrs["BoxSize"] = np.array(boxsize, dtype=np.float64)

    # Gadget/SWIFT-style 6-slot particle count array; this tool only ever
    # writes one particle species, stored in the PartType1 slot.
    numpart = np.zeros(6, dtype=np.uint64)
    numpart[1] = npart  # PartType1 slot
    hdr.attrs["NumPart_Total"] = numpart
    hdr.attrs["Redshift"] = np.float64(redshift)


def get_boxsize(input_file, args, rank=0, coords=None):
    """
    Get box size from command line args or Header in input file. If missing,
    optionally infer from coordinates (max extent).

    Resolution order: --boxsize on the command line, then the
    ``args.header_key``/BoxSize attribute in the input file, then (if
    ``coords`` is given) the per-axis maximum of the coordinates.

    Args:
        input_file: Path to the input HDF5 snapshot.
        args: Parsed argparse namespace; uses ``args.boxsize`` and
            ``args.header_key``.
        rank: MPI rank; only rank 0 prints progress messages.
        coords: Optional (N, 3) coordinate array used as a last resort.

    Returns:
        boxsize: numpy float64 array [X, Y, Z] box dimensions

    Raises:
        ValueError: If the box size cannot be determined from any source,
            or if ``coords`` has the wrong shape / non-positive extents.
    """
    # 1) An explicit command-line value always wins.
    if args.boxsize is not None:
        boxsize = np.array(args.boxsize, dtype=np.float64)
        if rank == 0:
            print(f" Using provided BoxSize: {boxsize}")
        return boxsize

    # 2) Try to read from the input file's header.
    with h5py.File(input_file, 'r') as f:
        if args.header_key in f and 'BoxSize' in f[args.header_key].attrs:
            boxsize = np.array(f[args.header_key].attrs['BoxSize'], dtype=np.float64)
            # Scalar or single-element BoxSize -> cubic box.
            if boxsize.size == 1:
                boxsize = np.full(3, boxsize.item(), dtype=np.float64)
            if rank == 0:
                print(f" Read BoxSize from {args.header_key}/BoxSize: {boxsize}")
            return boxsize

    # 3) Last resort: infer from the particle coordinates themselves.
    #    NOTE: only meaningful when coords covers the whole simulation
    #    volume (e.g. serial mode, all particles loaded).
    if coords is not None:
        if coords.ndim != 2 or coords.shape[1] != 3:
            raise ValueError("Coordinates must be shape (N, 3) to infer BoxSize")
        # Cast to float64 so every branch returns the same dtype.
        inferred = np.asarray(np.max(coords, axis=0), dtype=np.float64)
        if np.any(inferred <= 0):
            raise ValueError(
                "Failed to infer BoxSize from coordinates (non-positive extents). "
                "Provide --boxsize or add Header/BoxSize."
            )
        if rank == 0:
            print(f" Inferring BoxSize from coordinates: {inferred}")
        return inferred

    raise ValueError(
        "BoxSize not found in input file and not provided via --boxsize. "
        "Please specify --boxsize X Y Z or ensure Header/BoxSize exists in input file."
    )


def create_cell_structure(coords, masses, boxsize, cdim):
    """
    Create cell structure for spatial indexing.

    This function:
    1. Assigns each particle to a cell based on position
    2. Counts particles per cell
    3. Sorts particles by cell index (stable, so within-cell input order
       is preserved and the output is fully deterministic)
    4. Computes cumulative offsets

    Args:
        coords: Particle coordinates (N, 3)
        masses: Particle masses (N,)
        boxsize: Box dimensions [X, Y, Z]
        cdim: Number of cells per dimension

    Returns:
        sorted_coords: Coordinates sorted by cell index
        sorted_masses: Masses sorted by cell index
        cell_counts: Number of particles in each cell (int32, length cdim**3)
        cell_offsets: Starting index for each cell (int64, length cdim**3)
        cell_size: Physical size of cells [X, Y, Z]
    """
    ncells = cdim ** 3

    # Clamp coordinates to stay strictly within [0, boxsize) to avoid
    # boundary issues (particles exactly at the upper edge).  np.clip
    # returns a copy, so the caller's array is never mutated.
    upper = np.nextafter(boxsize, np.full_like(boxsize, -np.inf))
    coords = np.clip(coords, 0.0, upper)

    # Physical size of one cell along each axis.
    cell_size = boxsize / cdim

    # Assign particles to cells.
    # cell_id = k + j*cdim + i*cdim*cdim (row-major order)
    i = np.floor(coords[:, 0] / cell_size[0]).astype(np.int32)
    j = np.floor(coords[:, 1] / cell_size[1]).astype(np.int32)
    k = np.floor(coords[:, 2] / cell_size[2]).astype(np.int32)

    # Belt-and-braces clamp to valid cell indices (float rounding at edges).
    i = np.clip(i, 0, cdim - 1)
    j = np.clip(j, 0, cdim - 1)
    k = np.clip(k, 0, cdim - 1)

    # Flattened cell index per particle.
    cell_indices = k + j * cdim + i * cdim * cdim

    # Count particles per cell (minlength guarantees length ncells even
    # when the last cells are empty).
    cell_counts = np.bincount(cell_indices, minlength=ncells).astype(np.int32)

    # Stable sort: particles within the same cell keep their input order,
    # so coords/masses pairing and output ordering are reproducible.
    sort_idx = np.argsort(cell_indices, kind="stable")
    sorted_coords = coords[sort_idx]
    sorted_masses = masses[sort_idx]

    # Exclusive prefix sum of counts gives each cell's start index.
    cell_offsets = np.zeros(ncells, dtype=np.int64)
    cell_offsets[1:] = np.cumsum(cell_counts[:-1])

    return sorted_coords, sorted_masses, cell_counts, cell_offsets, cell_size


def write_cell_structure(f_out, cell_counts, cell_offsets, cdim, cell_size):
    """
    Write cell structure to HDF5 file.

    Layout mirrors the SWIFT snapshot cell metadata:
    Cells/Counts/PartType1, Cells/OffsetsInFile/PartType1 and a
    Cells/Meta-data group carrying the grid dimension and cell size.

    Args:
        f_out: Output HDF5 file handle
        cell_counts: Particle counts per cell
        cell_offsets: Cumulative offsets per cell
        cdim: Number of cells per dimension
        cell_size: Physical size of cells [X, Y, Z]
    """
    # Create Cells group
    cells_group = f_out.create_group('Cells')

    # Per-cell particle counts.
    # NOTE(review): dataset name is fixed to 'PartType1' while the particle
    # data group is configurable via --particle-type — presumably the
    # gridder only reads PartType1; confirm against the gridder.
    counts_group = cells_group.create_group('Counts')
    counts_group.create_dataset('PartType1', data=cell_counts, compression='gzip', compression_opts=4)

    # Per-cell start offsets into the (cell-sorted) particle datasets.
    offsets_group = cells_group.create_group('OffsetsInFile')
    offsets_group.create_dataset('PartType1', data=cell_offsets, compression='gzip', compression_opts=4)

    # Grid geometry metadata.
    metadata_group = cells_group.create_group('Meta-data')
    metadata_group.attrs['dimension'] = np.array([cdim, cdim, cdim], dtype=np.int32)
    metadata_group.attrs['size'] = cell_size
args.cdim) + + ncells = args.cdim ** 3 + non_empty = np.count_nonzero(cell_counts) + print(f" Cells: {ncells} total, {non_empty} non-empty") + print(f" Cell size: {cell_size}") + # Create output file with h5py.File(args.output_file, 'w') as f_out: - # Create particle type group + # Create particle type group with sorted data pt_group = f_out.create_group(args.particle_type) - # Write coordinates and masses + # Write sorted coordinates and masses pt_group.create_dataset( 'Coordinates', - data=coords, + data=sorted_coords, compression='gzip', compression_opts=4 ) pt_group.create_dataset( 'Masses', - data=masses, + data=sorted_masses, compression='gzip', compression_opts=4 ) - # Copy header if requested + # Write cell structure + write_cell_structure(f_out, cell_counts, cell_offsets, args.cdim, cell_size) + + # Copy header if requested, otherwise synthesize when boxsize is provided if args.copy_header and args.header_key in f_in: print(f" Copying header from {args.header_key}") f_in.copy(args.header_key, f_out, 'Header') + elif args.boxsize is not None: + print(" Writing minimal Header from provided BoxSize") + write_minimal_header(f_out, boxsize, npart) print(f"✓ Conversion complete: {args.output_file}") def convert_file_mpi(args, comm, rank, size): - """Convert file in MPI mode (one file per rank + virtual file).""" + """ + Convert file in MPI mode (one file per rank + virtual file). + + Note: In MPI mode, particles are sorted by cell within each rank's file, + but not globally across all ranks. This is acceptable for the gridder + which can handle per-rank cell structures. 
+ """ # Generate output filenames base_name = args.output_file.replace('.hdf5', '') rank_file = f"{base_name}_rank_{rank}.hdf5" @@ -205,6 +422,9 @@ def convert_file_mpi(args, comm, rank, size): args.input_file, args.masses_key, rank, size ) + # Get box size (all ranks need this) + boxsize = get_boxsize(args.input_file, args, rank) + if rank == 0: print(f"Converting {args.input_file} -> {base_name}_rank_*.hdf5") print(f" Total particles: {total_particles}") @@ -232,26 +452,41 @@ def convert_file_mpi(args, comm, rank, size): f"coordinates shape {coords.shape}" ) + # Create cell structure for this rank's particles + print(f"Rank {rank}: Creating cell structure (cdim={args.cdim})...") + sorted_coords, sorted_masses, cell_counts, cell_offsets, cell_size = \ + create_cell_structure(coords, masses, boxsize, args.cdim) + + ncells = args.cdim ** 3 + non_empty = np.count_nonzero(cell_counts) + print(f"Rank {rank}: Cells: {ncells} total, {non_empty} non-empty in this rank") + # Write to rank-specific file with h5py.File(rank_file, 'w') as f_out: pt_group = f_out.create_group(args.particle_type) pt_group.create_dataset( 'Coordinates', - data=coords, + data=sorted_coords, compression='gzip', compression_opts=4 ) pt_group.create_dataset( 'Masses', - data=masses, + data=sorted_masses, compression='gzip', compression_opts=4 ) - # Copy header to first rank's file - if rank == 0 and args.copy_header and args.header_key in f_in: - f_in.copy(args.header_key, f_out, 'Header') + # Write cell structure for this rank + write_cell_structure(f_out, cell_counts, cell_offsets, args.cdim, cell_size) + + # Copy header to first rank's file, or synthesize if BoxSize provided + if rank == 0: + if args.copy_header and args.header_key in f_in: + f_in.copy(args.header_key, f_out, 'Header') + elif args.boxsize is not None: + write_minimal_header(f_out, boxsize, total_particles) print(f"Rank {rank}: Wrote {count} particles to {rank_file}") @@ -262,13 +497,13 @@ def convert_file_mpi(args, comm, rank, 
size): print(f"Rank 0: Creating virtual file {virtual_file}") create_virtual_file( base_name, size, total_particles, args.particle_type, - args.copy_header + args.copy_header or args.boxsize is not None, args.cdim, boxsize ) print(f"✓ Conversion complete: {virtual_file}") def create_virtual_file(base_name, nranks, total_particles, particle_type, - include_header): + include_header, cdim, boxsize): """ Create a virtual HDF5 file that combines all rank files. @@ -278,6 +513,8 @@ def create_virtual_file(base_name, nranks, total_particles, particle_type, total_particles: Total number of particles across all ranks particle_type: Particle type group name include_header: Whether to include header from rank 0 + cdim: Number of cells per dimension + boxsize: Box dimensions [X, Y, Z] """ virtual_file = f"{base_name}.hdf5" @@ -306,14 +543,25 @@ def create_virtual_file(base_name, nranks, total_particles, particle_type, dtype=dtype ) - # Get particle counts from each rank file + # Get particle counts from each rank file and merge cell structures particle_counts = [] + ncells = cdim ** 3 + merged_cell_counts = np.zeros(ncells, dtype=np.int32) + for i in range(nranks): rank_file = f"{base_name}_rank_{i}.hdf5" with h5py.File(rank_file, 'r') as f: npart = f[f'{particle_type}/Masses'].shape[0] particle_counts.append(npart) + # Merge cell counts from each rank + if 'Cells/Counts/PartType1' in f: + merged_cell_counts += f['Cells/Counts/PartType1'][:] + + # Recompute global offsets from merged counts + merged_cell_offsets = np.zeros(ncells, dtype=np.int64) + merged_cell_offsets[1:] = np.cumsum(merged_cell_counts[:-1]) + # Map each rank's data into the virtual datasets offset = 0 for i in range(nranks): @@ -346,6 +594,10 @@ def create_virtual_file(base_name, nranks, total_particles, particle_type, pt_group.create_virtual_dataset('Coordinates', coord_layout) pt_group.create_virtual_dataset('Masses', mass_layout) + # Write merged cell structure + cell_size = boxsize / cdim + 
write_cell_structure(f, merged_cell_counts, merged_cell_offsets, cdim, cell_size) + # Copy header from rank 0 file if requested if include_header: rank0_file = f"{base_name}_rank_0.hdf5"