Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
84 changes: 84 additions & 0 deletions .github/ISSUE_TEMPLATE/bug_report.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
---
# GitHub issue form shown when a user chooses "Bug Report".
name: Bug Report
description: File a bug report to help us improve Quark
title: '[Bug]: '
labels:
  - bug
  - triage
body:
  # Static greeting rendered at the top of the form.
  - type: markdown
    attributes:
      value: |
        Thanks for taking the time to fill out this bug report!

  # Required: observed versus expected behaviour.
  - type: textarea
    id: description
    attributes:
      label: Bug Description
      description: 'What happened? What did you expect to happen?'
      placeholder: 'I tried to... but instead...'
    validations:
      required: true

  # Required: numbered reproduction steps.
  - type: textarea
    id: reproduction
    attributes:
      label: Steps to Reproduce
      description: 'How can we reproduce this issue?'
      placeholder: |
        1. Configure '...'
        2. Run training with '...'
        3. See error
    validations:
      required: true

  # Optional: minimal reproducer, rendered as a Python code block.
  - type: textarea
    id: code
    attributes:
      label: Minimal Code Example
      description: Please provide a minimal code example that reproduces the issue
      render: python
      placeholder: |
        import torch
        from models.transformer import Transformer, TransformerConfig

        # Your code here...
    validations:
      required: false

  # Optional: raw error output, rendered as a shell code block.
  - type: textarea
    id: traceback
    attributes:
      label: Error Message / Traceback
      description: If applicable, paste the full error message or traceback
      render: shell
      placeholder: 'Paste error message here...'
    validations:
      required: false

  # Required: interpreter, framework, OS and hardware details.
  - type: textarea
    id: environment
    attributes:
      label: Environment
      description: 'Python version, PyTorch version, OS, hardware (GPU/CPU)'
      placeholder: |
        Python 3.12
        PyTorch 2.10
        Ubuntu 22.04
        NVIDIA A100
    validations:
      required: true

  # Optional: anything that does not fit the fields above.
  - type: textarea
    id: additional-context
    attributes:
      label: Additional Context
      description: Add any other context about the problem here
      placeholder: 'Any additional information...'
    validations:
      required: false

  # The reporter must confirm they searched for duplicates before submitting.
  - type: checkboxes
    id: terms
    attributes:
      label: Checklist
      options:
        - label: I have searched existing issues to ensure this is not a duplicate
          required: true
5 changes: 5 additions & 0 deletions .github/ISSUE_TEMPLATE/config.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
---
# Issue-template chooser settings.
# Template-free ("blank") issues remain available next to the forms.
blank_issues_enabled: true
# Extra links listed in the chooser alongside the issue forms.
contact_links:
  - name: GitHub Discussions
    url: 'https://github.com/phnazari/quark/discussions'
    about: Ask questions or start a discussion about Quark
57 changes: 57 additions & 0 deletions .github/ISSUE_TEMPLATE/feature_request.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
---
# GitHub issue form shown when a user chooses "Feature Request".
name: Feature Request
description: Suggest a new feature or enhancement for Quark
title: '[Feature]: '
labels:
  - enhancement
body:
  # Static greeting rendered at the top of the form.
  - type: markdown
    attributes:
      value: |
        Thanks for suggesting a new feature!

  # Required: the motivation behind the request.
  - type: textarea
    id: problem
    attributes:
      label: Problem Statement
      description: >-
        What problem would this feature solve?
        What are you trying to achieve?
      placeholder: "I'm trying to... but currently..."
    validations:
      required: true

  # Required: the behaviour the requester would like to see.
  - type: textarea
    id: solution
    attributes:
      label: Proposed Solution
      description: "Describe how you'd like this to work"
      placeholder: 'I would like Quark to...'
    validations:
      required: true

  # Optional: usage sketch, rendered as a Python code block.
  - type: textarea
    id: example
    attributes:
      label: Example Code
      description: "Show us how you'd like to use this feature"
      render: python
      placeholder: |
        from models.transformer import Transformer, TransformerConfig

        # Example of how the feature would be used
    validations:
      required: false

  # Optional: papers, prior art, related implementations.
  - type: textarea
    id: additional-context
    attributes:
      label: Additional Context
      description: >-
        Any other context, references to papers,
        implementations in other frameworks, etc.
      placeholder: 'Related papers, implementations, etc.'
    validations:
      required: false

  # The requester must confirm they searched for existing requests.
  - type: checkboxes
    id: terms
    attributes:
      label: Checklist
      options:
        - label: I have searched existing issues to ensure this feature hasn't been requested
          required: true
19 changes: 19 additions & 0 deletions .github/PULL_REQUEST_TEMPLATE.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
## Description

<!-- Provide a brief description of the changes in this PR -->

## Type of Change

- [ ] 🐛 Bug fix
- [ ] ✨ New feature
- [ ] 💥 Breaking change
- [ ] 📚 Documentation
- [ ] Other

## Changes Made

<!-- Provide a detailed list of changes -->

-
-
-
59 changes: 59 additions & 0 deletions .github/workflows/lint.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
---
# =============================
# Adapted from:
# https://github.com/fla-org/flash-linear-attention
# =============================
# Runs the repository's pre-commit hooks against the files touched by a push
# to main, a pull request targeting main, or a manual dispatch.
name: Lint

on:
  workflow_dispatch:
  pull_request:
    branches: [main]
  push:
    branches: [main]

jobs:
  lint:
    runs-on: ubuntu-latest

    steps:
      - name: Check out repo
        uses: actions/checkout@v4
        with:
          submodules: true

      # Interpreter version is read from the repo's .python-version file.
      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version-file: .python-version

      - name: Set up uv
        uses: astral-sh/setup-uv@v3

      # Reuse the virtualenv as long as the dependency manifests are unchanged.
      - name: Cache uv environment
        uses: actions/cache@v4
        with:
          path: .venv
          key: ${{ runner.os }}-uv-${{ hashFiles('pyproject.toml', 'uv.lock') }}

      # Reuse installed hook environments while the hook config is unchanged.
      - name: Cache pre-commit hooks
        uses: actions/cache@v4
        with:
          path: ~/.cache/pre-commit
          key: ${{ runner.os }}-precommit-${{ hashFiles('.pre-commit-config.yaml') }}

      - name: Sync environment
        run: uv sync --extra dev

      # NOTE(review): third-party action referenced by mutable tag; consider
      # pinning to a full commit SHA.
      - name: Get changed files
        id: changed-files
        uses: tj-actions/changed-files@v46.0.5

      - name: Lint only changed files
        if: ${{ steps.changed-files.outputs.all_changed_files != '' }}
        # File names come from the pushed/PR'd commits and are therefore
        # untrusted. Hand them to the shell through an environment variable
        # instead of expanding the ${{ }} expression inside the script text,
        # which would let a crafted file name inject shell commands.
        env:
          ALL_CHANGED_FILES: ${{ steps.changed-files.outputs.all_changed_files }}
        run: |
          echo "Changed files: ${ALL_CHANGED_FILES}"
          # Deliberately unquoted: the action emits a space-separated list and
          # each path must reach pre-commit as its own argument.
          uv run pre-commit run --files ${ALL_CHANGED_FILES}

      - name: No files changed
        if: ${{ steps.changed-files.outputs.all_changed_files == '' }}
        run: echo "No changed files to lint."
38 changes: 38 additions & 0 deletions .github/workflows/tests.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
---
# Runs the pytest suite on pushes to main, pull requests targeting main,
# and manual dispatch.
name: Tests

on:
  workflow_dispatch:
  pull_request:
    branches:
      - main
  push:
    branches:
      - main

jobs:
  test:
    runs-on: ubuntu-latest

    steps:
      - name: Check out repo
        uses: actions/checkout@v4
        with:
          submodules: true

      # Interpreter version is read from the repo's .python-version file.
      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version-file: .python-version

      - name: Set up uv
        uses: astral-sh/setup-uv@v3

      # Reuse the virtualenv as long as the dependency manifests are unchanged.
      - name: Cache uv environment
        uses: actions/cache@v4
        with:
          path: .venv
          key: ${{ runner.os }}-uv-${{ hashFiles('pyproject.toml', 'uv.lock') }}

      - name: Sync environment
        run: uv sync --extra dev

      - name: Run tests
        run: uv run pytest tests/ -v
14 changes: 9 additions & 5 deletions checkpoint_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ def save_checkpoint(step, model, engine, cfg, metrics=None):
"scaler": engine.scaler.state_dict(),
}

exp_dir = os.path.join(cfg.out_dir, cfg.exp_name)
exp_dir = os.path.join(cfg.out_dir, cfg.checkpoint.exp_name)
os.makedirs(exp_dir, exist_ok=True)

save_path = os.path.join(exp_dir, f"ckpt_step_{step}.pth")
Expand All @@ -46,14 +46,18 @@ def save_checkpoint(step, model, engine, cfg, metrics=None):

def maybe_load_checkpoint(cfg):
"""Load a checkpoint if resuming, else return None."""
if not cfg.resume:
if not cfg.checkpoint.resume:
return None

resume_exp_name = cfg.resume_exp_name if cfg.resume_exp_name is not None else cfg.exp_name
resume_exp_name = (
cfg.checkpoint.resume_exp_name
if cfg.checkpoint.resume_exp_name is not None
else cfg.checkpoint.exp_name
)
ckpt_dir = os.path.join(cfg.out_dir, resume_exp_name)

if cfg.resume_step is not None:
ckpt_path = os.path.join(ckpt_dir, f"ckpt_step_{cfg.resume_step}.pth")
if cfg.checkpoint.resume_step is not None:
ckpt_path = os.path.join(ckpt_dir, f"ckpt_step_{cfg.checkpoint.resume_step}.pth")
else:
ckpt_path = _latest_checkpoint(ckpt_dir, prefix="ckpt_step_")

Expand Down
8 changes: 8 additions & 0 deletions configs/checkpoint/default.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
# Checkpoint options, exposed to code as `cfg.checkpoint.*`.
# NOTE(review): presumably writes a checkpoint when training ends - confirm in the trainer.
save_last_checkpoint: true
# NOTE(review): presumably enables periodic saves at the save_every_steps interval - confirm.
save_intermediate_checkpoints: true
save_every_steps: 1000
# When false, maybe_load_checkpoint returns None without loading anything.
resume: false
# Step whose ckpt_step_<step>.pth is loaded on resume; null defers to _latest_checkpoint.
resume_step: null
# Experiment directory (under out_dir) to resume from; null falls back to exp_name.
resume_exp_name: null
# NOTE(review): presumably permits overwriting existing checkpoint files - confirm.
over_write: true
# Sub-directory of out_dir into which save_checkpoint writes ckpt_step_<step>.pth files.
exp_name: default
53 changes: 4 additions & 49 deletions configs/config.yaml
Original file line number Diff line number Diff line change
@@ -1,59 +1,14 @@
defaults:
- model: transformer
- data: fineweb10B
- training: default
- system: default
- logging: default
- checkpoint: default
- _self_

training:
steps_budget: 19064
eval_every_steps: 200
log_every_steps: 100
grad_accumulation_steps: 2
# Optimizer
optim: adamw
fused_optim: false
lr: 7e-4
weight_decay: 0.1
beta1: 0.9
beta2: 0.95
grad_clip: 1.0
eps: 1e-15
# Scheduler
scheduler: warmup_cosine
warmup_steps: 950
cooldown_steps: null
lr_start: 0.0
lr_end: null
lr_end_pct: 0.1
# Early stopping
early_stopping_patience: 0

system:
dtype: bfloat16
compile_model: false
seed: 42
ddp_backend: nccl

logging:
wandb_log: true
wandb_project: quark
wandb_log_layer_stats: false

checkpoint:
save_last_checkpoint: true
save_intermediate_checkpoints: true
save_every_steps: 1000
resume: false
resume_step: null
resume_exp_name: null
over_write: true
exp_name: default

out_dir: /fast/pnazari/quark

data:
seq_len: 2048
micro_batch_size: 16

hydra:
job:
chdir: false
Expand Down
7 changes: 2 additions & 5 deletions configs/data/fineweb10B.yaml
Original file line number Diff line number Diff line change
@@ -1,10 +1,7 @@
dataset: fineweb10B
vocab_size: 50304
trainset_path: /fast/pnazari/data/fwedu_sample_10BT/tokenized_EleutherAI_gpt-neox-20b/ctx_2048/train
validset_path: /fast/pnazari/data/fwedu_sample_10BT/tokenized_EleutherAI_gpt-neox-20b/ctx_2048/valid
seq_len: 1024
micro_batch_size: 32
num_workers: 17
sampler: stateful_random
sampler_seed: 42
seq_len: 2048
eval: true
valid_tokens: 10000000
3 changes: 3 additions & 0 deletions configs/logging/default.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# Logging options.
# NOTE(review): consumers are not visible here; the notes below are inferred from key names - confirm.
# Presumably toggles Weights & Biases logging.
wandb_log: true
# Presumably the Weights & Biases project that runs are reported to.
wandb_project: quark
# Presumably enables additional per-layer statistics logging.
wandb_log_layer_stats: false
4 changes: 4 additions & 0 deletions configs/system/default.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# System/runtime options.
# NOTE(review): consumers are not visible here; the notes below are inferred from key names - confirm.
# Presumably the compute precision used for training.
dtype: bfloat16
# Presumably toggles model compilation (e.g. torch.compile).
compile_model: false
# Presumably the global random seed.
seed: 42
# Presumably the distributed (DDP) communication backend.
ddp_backend: nccl
Loading