This project is a "vibe-coding" project, meaning that the package has been entirely generated by Claude-Code AI, from design to implementation and documentation, based on my original ideas and specifications. While I have reviewed and tested the code, there may still be issues or limitations. Use at your own risk.
Synthetic data generation using DAG-based structural causal models.
pip install .

For full functionality (plotting, DataFrames, probit link):

pip install ".[full]"

from datagenerator import DAG, DataGenerator
# Create a DAG with confounding
dag = DAG()
dag.add_node("Z", noise_std=1.0) # Confounder
dag.add_node("X", noise_std=0.5)
dag.add_node("Y", noise_std=0.5)
dag.add_edge("Z", "X", weight=0.8)
dag.add_edge("Z", "Y", weight=0.6)
dag.add_edge("X", "Y", weight=1.0, transform="quadratic")
# Generate data
generator = DataGenerator(dag, seed=42)
data = generator.sample(n=1000)
# Or as a dictionary
data_dict = generator.sample(n=1000, return_dict=True)

from datagenerator import ClassificationDataGenerator, FeatureSpec
# Generative mode: control class balance directly
gen = ClassificationDataGenerator(
mode="generative",
class_balance=0.1, # 10% positive class
feature_specs=[
FeatureSpec("f0", loc_by_class=(0.0, 2.0), noise_std=1.0),
FeatureSpec("f1", loc_by_class=(-0.5, 1.0), noise_std=1.0),
FeatureSpec("f2", parents=["f0", "f1"], parent_weights=[1.0, 0.5],
output_transform="tanh", noise_std=0.5),
],
n_noise_features=3,
seed=42
)
X, y = gen.generate_batch(1000)
# Or generate randomly configured data
gen = ClassificationDataGenerator.from_random(
n_features=10,
n_informative=6,
n_direct_to_y=3,
connectivity=0.4,
class_balance=0.2,
mode="causal",
seed=42
)
X, y = gen.generate_batch(1000)

from datagenerator import (
create_chain,
create_fork,
create_collider,
create_mediator,
create_instrument,
create_random_dag,
)
# Chain: X0 -> X1 -> X2
chain = create_chain(n_nodes=3)
# Fork (confounder): Z -> X, Z -> Y
fork = create_fork(n_children=2)
# Collider: X -> Y, Z -> Y
collider = create_collider(n_parents=2)
# Mediation: X -> M -> Y and X -> Y
mediator = create_mediator(direct_effect=0.5, indirect_effect_xm=1.0, indirect_effect_my=0.8)
# Instrumental variable: Z -> X -> Y with U -> X and U -> Y
iv = create_instrument(x_y_weight=2.0)
# Random DAG
random_dag = create_random_dag(n_nodes=5, edge_probability=0.4, seed=42)

from datagenerator import DAG, DataGenerator
dag = DAG()
dag.add_node("X", noise_std=1.0)
dag.add_node("Y", noise_std=0.5)
dag.add_edge("X", "Y", weight=1.0)
generator = DataGenerator(dag, seed=42)
# Sample with intervention do(X=2)
interventional_data = generator.sample_interventional(
n=1000,
interventions={"X": 2.0},
return_dict=True
)

Available transforms: linear, quadratic, cubic, sigmoid, tanh, sin, exp, log, relu, leaky_relu, threshold
from datagenerator import DAG, PolynomialTransform, CompositeTransform, SigmoidTransform
dag = DAG()
dag.add_node("X")
dag.add_node("Y")
# Using string name
dag.add_edge("X", "Y", weight=1.0, transform="quadratic")
# Using transform instance
dag.add_edge("X", "Y", weight=1.0, transform=PolynomialTransform(degrees=[1, 2, 3]))
# Composing transforms
dag.add_edge("X", "Y", weight=1.0, transform=CompositeTransform([
PolynomialTransform(degrees=[2]),
SigmoidTransform(scale=0.5)
]))

from datagenerator import DAG, GaussianNoise, LaplacianNoise, StudentTNoise, MixtureNoise
dag = DAG()
# Using convenience parameters
dag.add_node("X", noise_std=1.0) # Gaussian
dag.add_node("Y", noise_type="laplacian", noise_std=0.5)
dag.add_node("Z", noise_type="student_t", noise_std=1.0, noise_params={"df": 3})
# Using noise generator instances
dag.add_node("W", noise=MixtureNoise(
components=[GaussianNoise(std=1.0), LaplacianNoise(scale=2.0)],
weights=[0.7, 0.3]
))

# Requires matplotlib
dag.plot(figsize=(10, 8), show_weights=True)
# ASCII representation
print(dag.to_ascii())
# Detailed description
print(dag.describe())
# For ClassificationDataGenerator
gen.plot_dag()

Features:
- Flexible DAG construction with automatic cycle detection
- Multiple noise distributions: Gaussian, Uniform, Laplacian, Student's t, Mixture
- Non-linear edge transformations: polynomial, sigmoid, tanh, sinusoidal, exponential, log, ReLU, threshold
- Interventional sampling for causal inference experiments
- Classification data generation with generative or causal modes
- Common DAG patterns: chain, fork, collider, mediator, instrumental variable
- Visualization with matplotlib or ASCII
License: MIT

Planned features:
- Add categorical variable support
- Add free text variable support