dalia-project · lisa-gm · Sep 9, 2025 · Sep 12, 2025 · Sep 21, 2025 · Sep 24, 2025
@@ -2,3 +2,4 @@
 examples/**/*.npz filter=lfs diff=lfs merge=lfs -text
 examples/**/*.npy filter=lfs diff=lfs merge=lfs -text
 examples/**/*.dat filter=lfs diff=lfs merge=lfs -text
+
@@ -171,6 +171,16 @@ runs_sc25/*
 settings.json
 *.out
 
+# data files that can easily be generated
+examples/brainiac/**/*.npy
+examples/brainiac/**/*.npz
+
+examples/g_ar1/**/*.npy
+examples/g_ar1/**/*.npz
+
+examples/p_ar1/**/*.npy
+examples/p_ar1/**/*.npz
+
 # Profiler outputs
 *.nsys-rep
 *.qdstrm

@@ -0,0 +1,32 @@
+@inproceedings{10.1145/3712285.3759832,
+      author = {Gaedke-Merzh\"{a}user, Lisa and Maillou, Vincent and Rodriguez Avellaneda, Fernando and Schenk, Olaf and Moraga, Paula and Luisier, Mathieu and Ziogas, Alexandros Nikolaos and Rue, H\r{a}vard},
+      title = {Accelerated Spatio-Temporal Bayesian Modeling for Multivariate Gaussian Processes},
+      year = {2025},
+      isbn = {9798400714665},
+      publisher = {Association for Computing Machinery},
+      address = {New York, NY, USA},
+      url = {https://doi.org/10.1145/3712285.3759832},
+      doi = {10.1145/3712285.3759832},
+      booktitle = {Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis},
+      pages = {949–972},
+      numpages = {24},
+      keywords = {Large-Scale Bayesian Inference, Spatio-Temporal Modeling, Distributed Memory Computing},
+      location = {},
+      series = {SC '25}
+}
+
+@inproceedings{11186484,
+      author = {Maillou, Vincent and Gaedke-Merzhauser, Lisa and Ziogas, Alexandros Nikolaos and Schenk, Olaf and Luisier, Mathieu},
+      booktitle = { 2025 IEEE International Conference on Cluster Computing (CLUSTER) },
+      title = {{ Parallel Selected Inversion of Block-Tridiagonal with Arrowhead Matrices }},
+      year = {2025},
+      volume = {},
+      ISSN = {},
+      pages = {1-12},
+      keywords = {Materials science and technology;Temperature distribution;Computational modeling;Graphics processing units;Linear algebra;Predictive models;Libraries;Supercomputers;Sparse matrices;Parallel algorithms},
+      doi = {10.1109/CLUSTER59342.2025.11186484},
+      url = {https://doi.ieeecomputersociety.org/10.1109/CLUSTER59342.2025.11186484},
+      publisher = {IEEE Computer Society},
+      address = {Los Alamitos, CA, USA},
+      month =sep
+}
@@ -0,0 +1,123 @@
+import os
+from pathlib import Path
+
+import numpy as np
+from scipy import sparse
+
+
+BASE_DIR = Path(__file__).resolve().parent / "synthetic_data"
+
+
+def make_balanced_group_indices(n_obs: int, n_groups: int) -> np.ndarray:
+    """Create near-balanced group assignments and shuffle them."""
+    base = np.repeat(np.arange(n_groups), n_obs // n_groups)
+    remainder = np.arange(n_obs % n_groups)
+    groups = np.concatenate([base, remainder])
+    rng = np.random.default_rng(123)
+    rng.shuffle(groups)
+    return groups
+
+
+def main() -> None:
+    rng = np.random.default_rng(41)
+
+    # Keep model structure identical to run.py, but generate stable synthetic data.
+    n_obs = 4000
+    n_fixed = 10
+    n_iid = 40
+    n_dense = 40
+
+    tau_iid_true = 3.0
+    tau_dense_true = 10.0
+    prec_o_true = 50.0
+
+    # Directories expected by run.py
+    inputs_iid_dir = BASE_DIR / "inputs_iid"
+    inputs_dense_dir = BASE_DIR / "inputs_queenGRMinv"
+    inputs_fixed_dir = BASE_DIR / "inputs_fixed_effects"
+    ref_dir = BASE_DIR / "reference_outputs"
+
+    for p in [inputs_iid_dir, inputs_dense_dir, inputs_fixed_dir, ref_dir]:
+        p.mkdir(parents=True, exist_ok=True)
+
+    # 1) IID generic component
+    group_idx = make_balanced_group_indices(n_obs, n_iid)
+    rows = np.arange(n_obs)
+    cols = group_idx
+    data = np.ones(n_obs, dtype=float)
+    a_iid = sparse.coo_matrix((data, (rows, cols)), shape=(n_obs, n_iid)).tocsc()
+    q_iid = sparse.identity(n_iid, format="csc")
+
+    # True latent effects
+    u_iid = rng.normal(loc=0.0, scale=np.sqrt(1.0 / tau_iid_true), size=n_iid)
+
+    # 2) Dense generic component: random design and SPD precision matrix.
+    a_dense = rng.normal(loc=0.0, scale=1.0 / np.sqrt(n_dense), size=(n_obs, n_dense))
+
+    m = rng.normal(loc=0.0, scale=0.2, size=(n_dense, n_dense))
+    q_dense_np = m.T @ m + np.eye(n_dense)
+    q_dense = sparse.csc_matrix(q_dense_np)
+
+    cov_dense = np.linalg.inv(tau_dense_true * q_dense_np)
+    l_dense = np.linalg.cholesky(cov_dense)
+    u_dense = l_dense @ rng.normal(size=n_dense)
+
+    # 3) Regression component: intercept + near-orthonormal covariates.
+    x_raw = rng.normal(size=(n_obs, n_fixed - 1))
+    q_cov, _ = np.linalg.qr(x_raw, mode="reduced")
+    a_fixed = np.column_stack([np.ones(n_obs), q_cov])
+
+    # Sparse signal setting: only a few fixed effects are nonzero.
+    beta_true = np.zeros(n_fixed)
+    beta_true[0] = 0.5
+    n_nonzero = 8
+    nonzero_idx = rng.choice(np.arange(1, n_fixed), size=n_nonzero, replace=False)
+    beta_true[nonzero_idx] = rng.normal(loc=0.0, scale=0.2, size=n_nonzero)
+
+    eta = a_iid @ u_iid + a_dense @ u_dense + a_fixed @ beta_true
+    y = eta + rng.normal(loc=0.0, scale=np.sqrt(1.0 / prec_o_true), size=n_obs)
+
+    # Save run.py inputs
+    np.save(BASE_DIR / "y.npy", y)
+
+    sparse.save_npz(inputs_iid_dir / "a.npz", a_iid)
+    sparse.save_npz(inputs_iid_dir / "q.npz", q_iid)
+
+    np.save(inputs_dense_dir / "a.npy", a_dense)
+    sparse.save_npz(inputs_dense_dir / "q.npz", q_dense)
+
+    np.save(inputs_fixed_dir / "a.npy", a_fixed)
+
+    # Save consistent reference outputs used by run.py debug checks.
+    theta_external_true = np.array([tau_iid_true, tau_dense_true, prec_o_true])
+    theta_internal_true = np.log(theta_external_true)
+    x_true = np.concatenate([u_iid, u_dense, beta_true])
+
+    np.save(ref_dir / "theta_internal.npy", theta_internal_true)
+    np.save(ref_dir / "theta_external.npy", theta_external_true)
+    np.save(ref_dir / "x.npy", x_true)
+
+    q_prior = sparse.block_diag(
+        [
+            tau_iid_true * q_iid,
+            tau_dense_true * q_dense,
+            0.001 * sparse.identity(n_fixed),
+        ],
+        format="csc",
+    )
+
+    a_full = sparse.hstack(
+        [a_iid, sparse.csc_matrix(a_dense), sparse.csc_matrix(a_fixed)], format="csc"
+    )
+    q_cond = q_prior + prec_o_true * (a_full.T @ a_full)
+
+    sparse.save_npz(ref_dir / "qprior.npz", q_prior)
+    sparse.save_npz(ref_dir / "qcond.npz", q_cond)
+
+    print("Generated stable synthetic bee_genomics data.")
+    print(f"n_obs={n_obs}, n_fixed={n_fixed}, n_iid={n_iid}, n_dense={n_dense}")
+    print(f"Saved y to {BASE_DIR / 'y.npy'}")
+
+
+if __name__ == "__main__":
+    main()
@@ -0,0 +1,167 @@
+import os
+import sys
+
+import numpy as np
+
+from pathlib import Path
+
+
+from dalia import xp, sp
+from dalia.configs import dalia_config, likelihood_config, submodels_config
+from dalia.core.dalia import DALIA
+from dalia.core.model import Model
+from dalia.submodels import GenericSubModel, RegressionSubModel
+from dalia.utils import (
+    print_msg,
+)
+
+parent_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
+sys.path.append(parent_dir)
+from examples_utils.parser_utils import parse_args  # noqa: E402
+
+# BASE_DIR = os.path.dirname(os.path.abspath(__file__))
+BASE_DIR = Path(__file__).resolve().parent / "synthetic_data"
+
+if __name__ == "__main__":
+    print_msg("--- Example: Bee Genomics  ---")
+    # consists of 2 generic submodels (iid + dense component) and 1 regression submodel (covariates + covariates) with a Gaussian likelihood
+
+    # Check for parsed parameters
+    args = parse_args()
+
+    # Configurations of the generic submodel
+    generic_dict_iid = {
+        "type": "generic",
+        "input_dir": f"{BASE_DIR}/inputs_iid",
+        "tau": 4,  # has to be positive
+        "ph_tau": {"type": "gamma", "alpha": 1.0, "beta": 1e-1},
+    }
+    generic_iid = GenericSubModel(
+        config=submodels_config.parse_config(generic_dict_iid),
+    )
+
+    # Configurations of the generic submodel
+    generic_dict_queenGRMinv = {
+        "type": "generic",
+        "input_dir": f"{BASE_DIR}/inputs_queenGRMinv",
+        "ph_tau": {"type": "gamma", "alpha": 1.0, "beta": 1e-1},
+    }
+    generic_queenGRMinv = GenericSubModel(
+        config=submodels_config.parse_config(generic_dict_queenGRMinv),
+    )
+
+    # fixed effects + intercept
+    regression_dict = {
+        "type": "regression",
+        "input_dir": f"{BASE_DIR}/inputs_fixed_effects",
+    }
+    regression = RegressionSubModel(
+        config=submodels_config.parse_config(regression_dict),
+    )
+
+    # Likelihood
+    likelihood_dict = {
+        "type": "gaussian",
+        "prior_hyperparameters": {"type": "gamma", "alpha": 1.0, "beta": 1e-1},
+    }
+    # Creation of the first model by combining the Generic submodel and the likelihood
+    model = Model(
+        submodels=[generic_iid, generic_queenGRMinv, regression],
+        likelihood_config=likelihood_config.parse_config(likelihood_dict),
+    )
+    print_msg(model)
+
+    # Configurations of DALIA
+    dalia_dict = {
+        "solver": {"type": "dense"},
+    }
+    dalia = DALIA(
+        model=model,
+        config=dalia_config.parse_config(dalia_dict),
+    )
+
+    results = dalia.run()
+
+    # load theta reference and set theta_internal to reference values
+    theta_ref_internal = xp.load(f"{BASE_DIR}/reference_outputs/theta_internal.npy")
+    theta_ref_external = xp.load(f"{BASE_DIR}/reference_outputs/theta_external.npy")
+    x_ref = xp.load(f"{BASE_DIR}/reference_outputs/x.npy")
+
+    print_msg("\n--- Results ---")
+    print_msg("Theta values external:\n", results["theta"])
+    print_msg("Theta values internal:\n", results["theta_internal"])
+    print_msg("Internal Covariance of theta:\n", results["cov_theta_internal"])
+
+    print_msg("\n--- Comparisons ---")
+    # Compare hyperparameters
+    print_msg("Theta reference external:\n", theta_ref_external)
+    print_msg("Theta values external:\n", results["theta"])
+    print_msg(
+        "Norm (theta external - theta_ref_external):        ",
+        f"{xp.linalg.norm(results['theta'] - theta_ref_external):.4e}",
+    )
+    print_msg(
+        "Norm (theta internal - theta_ref_internal):        ",
+        f"{xp.linalg.norm(results['theta_internal'] - theta_ref_internal):.4e}",
+    )
+
+    # Compare latent parameters
+    print_msg(
+        "Norm (x - x_ref)/Norm(x_ref):                ",
+        f"{xp.linalg.norm(results['x'] - x_ref) / xp.linalg.norm(x_ref):.4e}",
+    )
+
+    # Compare marginal variances of latent parameters
+    var_latent_params = results["marginal_variances_latent"]
+    Qconditional = dalia.model.construct_Q_conditional(eta=model.a @ model.x)
+    Qinv_ref = xp.linalg.inv(Qconditional)
+    print_msg(
+        "Norm (marg var latent - ref):    ",
+        f"{np.linalg.norm(var_latent_params - xp.diag(Qinv_ref)):.4e}",
+    )
+
+    print_msg("\n--- Marginal distributions of the hyperparameters ---")
+    marginals_hp = dalia.marginal_distributions_hp()
+
+    prec_obs = marginals_hp["hyperparameters"]["prec_o"]
+    quantile_pairs = prec_obs["quantiles"]["external"]["pairs"]
+
+    print("Quantile pairs of prec_o:")
+    for p, q in quantile_pairs:
+        print(f"   {p:.3f} quantile: {q:.4f}")
+
+    no_samples = 5000
+    samples = dalia.sample_posterior_latent_parameters(n_samples=no_samples)
+
+    n_iid_latent = generic_iid.n_latent_parameters
+
+    # randomly sample subset of indices
+    no_sub_indices = 10
+    iid_indices = np.sort(
+        np.random.choice(np.arange(n_iid_latent), no_sub_indices, replace=False)
+    )
+    print_msg("Subindices: ", iid_indices)
+
+    # compute generic indices corresponding to the same latent variables
+    # NOTE: check that the iid submodel is actually first in the model, then generic & that they have the same number
+    if generic_queenGRMinv.n_latent_parameters != n_iid_latent:
+        raise ValueError(
+            "The number of latent parameters in the generic submodel should be the same as in the iid submodel for this."
+        )
+    generic_indices = iid_indices + n_iid_latent
+    print_msg("Randomly sampled iid indices: ", iid_indices)
+    print_msg("Corresponding generic indices: ", generic_indices)
+
+    # extract subset of relevant samples from different latent components
+    relevant_latent_idd = results["x"][iid_indices]
+    relevant_latent_generic = results["x"][generic_indices]
+    samples_iid = samples[iid_indices, :]
+    samples_generic = samples[generic_indices, :]
+
+    # check sample means are close to the relevant latent parameters
+    print_msg("Mean of samples (iid)           : ", np.mean(samples_iid, axis=1))
+    print_msg("Relevant latent parameters (iid): ", relevant_latent_idd)
+    print_msg("Mean of samples (generic)       : ", np.mean(samples_generic, axis=1))
+    print_msg("Relevant latent parameters (generic): ", relevant_latent_generic)
+
+    print_msg("\n--- Finished ---")
Original file line number	Diff line number	Diff line change
Expand Up		@@ -2,3 +2,4 @@
		examples/*/.npz filter=lfs diff=lfs merge=lfs -text
		examples/*/.npy filter=lfs diff=lfs merge=lfs -text
		examples/*/.dat filter=lfs diff=lfs merge=lfs -text