phnazari · phnazari · Feb 28, 2026 · Feb 28, 2026 · Feb 28, 2026 · Feb 28, 2026
diff --git a/.github/ISSUE_TEMPLATE/bug_report.yml b/.github/ISSUE_TEMPLATE/bug_report.yml
@@ -0,0 +1,84 @@
+name: Bug Report  # GitHub issue form used for bug reports
+description: File a bug report to help us improve Quark
+title: "[Bug]: "  # pre-filled title prefix for issues opened from this form
+labels: ["bug", "triage"]  # labels requested for issues opened from this form
+body:
+  - type: markdown  # static intro text shown at the top of the form
+    attributes:
+      value: |
+        Thanks for taking the time to fill out this bug report!
+
+  - type: textarea  # required: what happened vs. what was expected
+    id: description
+    attributes:
+      label: Bug Description
+      description: What happened? What did you expect to happen?
+      placeholder: I tried to... but instead...
+    validations:
+      required: true
+
+  - type: textarea  # required: reproduction steps
+    id: reproduction
+    attributes:
+      label: Steps to Reproduce
+      description: How can we reproduce this issue?
+      placeholder: |
+        1. Configure '...'
+        2. Run training with '...'
+        3. See error
+    validations:
+      required: true
+
+  - type: textarea  # optional: minimal reproducing snippet
+    id: code
+    attributes:
+      label: Minimal Code Example
+      description: Please provide a minimal code example that reproduces the issue
+      render: python  # submitted text is formatted as a Python code block
+      placeholder: |
+        import torch
+        from models.transformer import Transformer, TransformerConfig
+
+        # Your code here...
+    validations:
+      required: false
+
+  - type: textarea  # optional: error output
+    id: traceback
+    attributes:
+      label: Error Message / Traceback
+      description: If applicable, paste the full error message or traceback
+      render: shell  # submitted text is formatted as a shell code block
+      placeholder: Paste error message here...
+    validations:
+      required: false
+
+  - type: textarea  # required: runtime environment details
+    id: environment
+    attributes:
+      label: Environment
+      description: Python version, PyTorch version, OS, hardware (GPU/CPU)
+      placeholder: |
+        Python 3.12
+        PyTorch 2.10
+        Ubuntu 22.04
+        NVIDIA A100
+    validations:
+      required: true
+
+  - type: textarea  # optional: anything that does not fit the fields above
+    id: additional-context
+    attributes:
+      label: Additional Context
+      description: Add any other context about the problem here
+      placeholder: Any additional information...
+    validations:
+      required: false
+
+  - type: checkboxes  # duplicate-search acknowledgement
+    id: terms
+    attributes:
+      label: Checklist
+      options:
+        - label: I have searched existing issues to ensure this is not a duplicate
+          required: true  # this box is marked as mandatory
diff --git a/.github/ISSUE_TEMPLATE/config.yml b/.github/ISSUE_TEMPLATE/config.yml
@@ -0,0 +1,5 @@
+blank_issues_enabled: true  # issues can still be opened without choosing a template
+contact_links:  # external links offered alongside the issue templates
+  - name: GitHub Discussions
+    url: https://github.com/phnazari/quark/discussions
+    about: Ask questions or start a discussion about Quark
diff --git a/.github/ISSUE_TEMPLATE/feature_request.yml b/.github/ISSUE_TEMPLATE/feature_request.yml
@@ -0,0 +1,57 @@
+name: Feature Request  # GitHub issue form used for feature requests
+description: Suggest a new feature or enhancement for Quark
+title: "[Feature]: "  # pre-filled title prefix for issues opened from this form
+labels: ["enhancement"]  # label requested for issues opened from this form
+body:
+  - type: markdown  # static intro text shown at the top of the form
+    attributes:
+      value: |
+        Thanks for suggesting a new feature!
+
+  - type: textarea  # required: the problem motivating the request
+    id: problem
+    attributes:
+      label: Problem Statement
+      description: What problem would this feature solve? What are you trying to achieve?
+      placeholder: I'm trying to... but currently...
+    validations:
+      required: true
+
+  - type: textarea  # required: the behaviour being proposed
+    id: solution
+    attributes:
+      label: Proposed Solution
+      description: Describe how you'd like this to work
+      placeholder: I would like Quark to...
+    validations:
+      required: true
+
+  - type: textarea  # optional: usage sketch for the proposed feature
+    id: example
+    attributes:
+      label: Example Code
+      description: Show us how you'd like to use this feature
+      render: python  # submitted text is formatted as a Python code block
+      placeholder: |
+        from models.transformer import Transformer, TransformerConfig
+
+        # Example of how the feature would be used
+    validations:
+      required: false
+
+  - type: textarea  # optional: references and related work
+    id: additional-context
+    attributes:
+      label: Additional Context
+      description: Any other context, references to papers, implementations in other frameworks, etc.
+      placeholder: Related papers, implementations, etc.
+    validations:
+      required: false
+
+  - type: checkboxes  # duplicate-search acknowledgement
+    id: terms
+    attributes:
+      label: Checklist
+      options:
+        - label: I have searched existing issues to ensure this feature hasn't been requested
+          required: true  # this box is marked as mandatory
diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md
@@ -0,0 +1,19 @@
+## Description
+
+<!-- Provide a brief description of the changes in this PR -->
+
+## Type of Change
+
+- [ ] 🐛 Bug fix
+- [ ] ✨ New feature
+- [ ] 💥 Breaking change
+- [ ] 📚 Documentation
+- [ ] Other
+
+## Changes Made
+
+<!-- Provide a detailed list of changes -->
+
+-
+-
+-
diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml
@@ -0,0 +1,72 @@
+# =============================
+# Adapted from:
+#   https://github.com/fla-org/flash-linear-attention
+#
+# Runs the repository's pre-commit hooks on the files changed by a push or
+# pull request targeting main; can also be started manually.
+# =============================
+name: Lint
+
+on:
+  workflow_dispatch:
+  pull_request:
+    branches: [main]
+  push:
+    branches: [main]
+
+jobs:
+  lint:
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Check out repo
+        uses: actions/checkout@v4
+        with:
+          submodules: true
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version-file: .python-version
+
+      - name: Set up uv
+        uses: astral-sh/setup-uv@v3
+
+      - name: Cache uv environment
+        uses: actions/cache@v4
+        with:
+          path: .venv
+          key: ${{ runner.os }}-uv-${{ hashFiles('pyproject.toml', 'uv.lock') }}
+
+      - name: Cache pre-commit hooks
+        uses: actions/cache@v4
+        with:
+          path: ~/.cache/pre-commit
+          key: ${{ runner.os }}-precommit-${{ hashFiles('.pre-commit-config.yaml') }}
+
+      - name: Sync environment
+        run: uv sync --extra dev
+
+      # NOTE(review): third-party action referenced by a mutable tag; consider
+      # pinning it to a full commit SHA.
+      - name: Get changed files
+        id: changed-files
+        uses: tj-actions/changed-files@v46.0.5
+
+      # The file list reaches the script through an environment variable
+      # instead of inline expression interpolation: expressions are substituted
+      # into the script text before the shell parses it, so a crafted file
+      # name could otherwise be executed as shell code. The variable is left
+      # unquoted on purpose so the space-separated list splits into one
+      # argument per file.
+      - name: Lint only changed files
+        if: ${{ steps.changed-files.outputs.all_changed_files != '' }}
+        env:
+          ALL_CHANGED_FILES: ${{ steps.changed-files.outputs.all_changed_files }}
+        run: |
+          echo "Changed files: ${ALL_CHANGED_FILES}"
+          uv run pre-commit run --files ${ALL_CHANGED_FILES}
+
+      - name: No files changed
+        if: ${{ steps.changed-files.outputs.all_changed_files == '' }}
+        run: echo "No changed files to lint."
diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
@@ -0,0 +1,38 @@
+name: Tests  # runs the pytest suite under tests/
+
+on:
+  workflow_dispatch:  # allows manual runs
+  pull_request:
+    branches: [main]  # pull requests targeting main
+  push:
+    branches: [main]  # pushes to main
+
+jobs:
+  test:
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Check out repo
+        uses: actions/checkout@v4
+        with:
+          submodules: true  # also check out git submodules
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version-file: .python-version  # interpreter version is read from this file
+
+      - name: Set up uv
+        uses: astral-sh/setup-uv@v3
+
+      - name: Cache uv environment  # key: runner OS + hash of pyproject.toml and uv.lock
+        uses: actions/cache@v4
+        with:
+          path: .venv
+          key: ${{ runner.os }}-uv-${{ hashFiles('pyproject.toml', 'uv.lock') }}
+
+      - name: Sync environment  # installs the project including its "dev" extra
+        run: uv sync --extra dev
+
+      - name: Run tests  # verbose pytest run inside the uv-managed environment
+        run: uv run pytest tests/ -v
diff --git a/checkpoint_utils.py b/checkpoint_utils.py
@@ -31,7 +31,7 @@ def save_checkpoint(step, model, engine, cfg, metrics=None):
         "scaler": engine.scaler.state_dict(),
     }
 
-    exp_dir = os.path.join(cfg.out_dir, cfg.exp_name)
+    exp_dir = os.path.join(cfg.out_dir, cfg.checkpoint.exp_name)
     os.makedirs(exp_dir, exist_ok=True)
 
     save_path = os.path.join(exp_dir, f"ckpt_step_{step}.pth")
@@ -46,14 +46,18 @@ def save_checkpoint(step, model, engine, cfg, metrics=None):
 
 def maybe_load_checkpoint(cfg):
     """Load a checkpoint if resuming, else return None."""
-    if not cfg.resume:
+    if not cfg.checkpoint.resume:
         return None
 
-    resume_exp_name = cfg.resume_exp_name if cfg.resume_exp_name is not None else cfg.exp_name
+    resume_exp_name = (
+        cfg.checkpoint.resume_exp_name
+        if cfg.checkpoint.resume_exp_name is not None
+        else cfg.checkpoint.exp_name
+    )
     ckpt_dir = os.path.join(cfg.out_dir, resume_exp_name)
 
-    if cfg.resume_step is not None:
-        ckpt_path = os.path.join(ckpt_dir, f"ckpt_step_{cfg.resume_step}.pth")
+    if cfg.checkpoint.resume_step is not None:
+        ckpt_path = os.path.join(ckpt_dir, f"ckpt_step_{cfg.checkpoint.resume_step}.pth")
     else:
         ckpt_path = _latest_checkpoint(ckpt_dir, prefix="ckpt_step_")
 

diff --git a/configs/checkpoint/default.yaml b/configs/checkpoint/default.yaml
@@ -0,0 +1,8 @@
+save_last_checkpoint: true  # NOTE(review): consumer not visible here; presumably saves a final checkpoint — confirm
+save_intermediate_checkpoints: true  # NOTE(review): presumably enables periodic saves — confirm
+save_every_steps: 1000  # NOTE(review): presumably the step interval between intermediate saves — confirm
+resume: false  # when false, maybe_load_checkpoint returns None and nothing is loaded
+resume_step: null  # step N selects ckpt_step_N.pth; null defers to _latest_checkpoint
+resume_exp_name: null  # experiment directory to resume from; null falls back to exp_name
+over_write: true  # NOTE(review): usage not visible here — confirm semantics
+exp_name: default  # checkpoints are written to <out_dir>/<exp_name>/ckpt_step_<step>.pth
diff --git a/configs/config.yaml b/configs/config.yaml
@@ -1,59 +1,14 @@
 defaults:
   - model: transformer
   - data: fineweb10B
+  - training: default
+  - system: default
+  - logging: default
+  - checkpoint: default
   - _self_
 
-training:
-  steps_budget: 19064
-  eval_every_steps: 200
-  log_every_steps: 100
-  grad_accumulation_steps: 2
-  # Optimizer
-  optim: adamw
-  fused_optim: false
-  lr: 7e-4
-  weight_decay: 0.1
-  beta1: 0.9
-  beta2: 0.95
-  grad_clip: 1.0
-  eps: 1e-15
-  # Scheduler
-  scheduler: warmup_cosine
-  warmup_steps: 950
-  cooldown_steps: null
-  lr_start: 0.0
-  lr_end: null
-  lr_end_pct: 0.1
-  # Early stopping
-  early_stopping_patience: 0
-
-system:
-  dtype: bfloat16
-  compile_model: false
-  seed: 42
-  ddp_backend: nccl
-
-logging:
-  wandb_log: true
-  wandb_project: quark
-  wandb_log_layer_stats: false
-
-checkpoint:
-  save_last_checkpoint: true
-  save_intermediate_checkpoints: true
-  save_every_steps: 1000
-  resume: false
-  resume_step: null
-  resume_exp_name: null
-  over_write: true
-  exp_name: default
-
 out_dir: /fast/pnazari/quark
 
-data:
-  seq_len: 2048
-  micro_batch_size: 16
-
 hydra:
   job:
     chdir: false

diff --git a/configs/data/fineweb10B.yaml b/configs/data/fineweb10B.yaml
@@ -1,10 +1,7 @@
 dataset: fineweb10B
+vocab_size: 50304
 trainset_path: /fast/pnazari/data/fwedu_sample_10BT/tokenized_EleutherAI_gpt-neox-20b/ctx_2048/train
 validset_path: /fast/pnazari/data/fwedu_sample_10BT/tokenized_EleutherAI_gpt-neox-20b/ctx_2048/valid
-seq_len: 1024
-micro_batch_size: 32
-num_workers: 17
-sampler: stateful_random
-sampler_seed: 42
+seq_len: 2048
 eval: true
 valid_tokens: 10000000
diff --git a/configs/logging/default.yaml b/configs/logging/default.yaml
@@ -0,0 +1,3 @@
+wandb_log: true  # NOTE(review): presumably toggles Weights & Biases logging — confirm against the training code
+wandb_project: quark  # NOTE(review): presumably the W&B project name — confirm
+wandb_log_layer_stats: false  # NOTE(review): presumably enables per-layer statistics logging — confirm
diff --git a/configs/system/default.yaml b/configs/system/default.yaml
@@ -0,0 +1,4 @@
+dtype: bfloat16  # NOTE(review): presumably the compute dtype used for training — confirm
+compile_model: false  # NOTE(review): presumably gates model compilation — confirm
+seed: 42  # NOTE(review): presumably the global RNG seed — confirm
+ddp_backend: nccl  # NOTE(review): presumably the distributed process-group backend — confirm