# Meridian.AI Train — GitHub Actions workflow (run #1412)
# Captured from the "Workflow file for this run" view on GitHub.
name: Meridian.AI Train

on:
  schedule:
    # Every hour, on the hour, every day (UTC).
    - cron: '0 * * * *'
  workflow_dispatch:
    inputs:
      force_seed:
        description: 'Nuke & re-seed HF repo with fresh model?'
        type: boolean
        default: false
      max_steps:
        description: 'Training steps per run (default: 150)'
        type: string
        default: '150'

env:
  # Quoted: env-var values are strings; an unquoted `true` is a YAML boolean.
  FORCE_JAVASCRIPT_ACTIONS_TO_NODE24: 'true'

# One pipeline at a time per ref; never cancel a run mid-training
# (an interrupted run would lose up to an hour of checkpoint progress).
concurrency:
  group: meridian-train-${{ github.ref }}
  cancel-in-progress: false
jobs:
  seed:
    name: "Nuke & Seed HF"
    # Manual-dispatch only: guarded by the force_seed boolean input.
    if: github.event.inputs.force_seed == 'true'
    runs-on: ubuntu-latest
    environment: 'HuggingFace Hub'
    steps:
      - uses: actions/checkout@v4
        with:
          token: ${{ secrets.GH_PAT }}

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.11'

      - name: Install deps
        run: |
          pip install huggingface_hub transformers torch safetensors sentencepiece python-dotenv

      - name: Nuke & Seed
        env:
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
          TOKENIZER_ID: 'Qwen/Qwen2.5-0.5B'
          PYTHONPATH: .
        run: python scripts/seed_hf_repo.py

  train:
    name: "Hourly Training Run"
    needs: seed
    # Run when seed succeeded OR was skipped (scheduled runs skip it),
    # but not when seed failed or was cancelled.
    if: |
      always() &&
      (needs.seed.result == 'success' || needs.seed.result == 'skipped')
    runs-on: ubuntu-latest
    timeout-minutes: 90  # 1.5 hours max
    permissions:
      contents: write  # push dataset-state commits back to the repo
      issues: write    # auto-file training-failure reports
    steps:
      - uses: actions/checkout@v4
        with:
          fetch-depth: 0  # full history needed for the rebase in the sync step
          token: ${{ secrets.GH_PAT }}

      - name: Set up Python 3.11
        uses: actions/setup-python@v5
        with:
          python-version: '3.11'
          cache: 'pip'

      - name: Install Dependencies
        run: |
          python -m pip install --upgrade pip
          pip install -r requirements.txt
          pip install ruff black

      - name: Lint & Format
        run: |
          black . --quiet
          ruff check . --fix --quiet

      - name: Pull Checkpoint from HuggingFace
        env:
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
        # Quoted heredoc ('PY') keeps the shell from expanding $ or backticks
        # inside the Python source — safer than `python -c "..."`.
        run: |
          python - <<'PY'
          from huggingface_hub import snapshot_download, list_repo_files
          import os, shutil

          repo_id = 'meridianal/FinAI'
          token = os.getenv('HF_TOKEN')
          try:
              files = list(list_repo_files(repo_id=repo_id, token=token))
              checkpoint_files = [f for f in files if f.startswith('checkpoint/')]
              if checkpoint_files:
                  print(f'Found {len(checkpoint_files)} checkpoint files')
                  temp_dir = './temp_download'
                  snapshot_download(
                      repo_id=repo_id,
                      local_dir=temp_dir,
                      token=token
                  )
                  if os.path.exists(os.path.join(temp_dir, 'checkpoint')):
                      # Replace any stale local checkpoint wholesale.
                      if os.path.exists('./checkpoint'):
                          shutil.rmtree('./checkpoint')
                      shutil.move(os.path.join(temp_dir, 'checkpoint'), './checkpoint')
                      print('Checkpoint pulled')
                      for cf in ['model.safetensors', 'trainer_state.pt', 'config.json', 'ewc_state.pt']:
                          path = os.path.join('./checkpoint', cf)
                          if os.path.exists(path):
                              size_mb = os.path.getsize(path) / (1024 * 1024)
                              print(f' - {cf} ({size_mb:.2f} MB)')
                  if os.path.exists(temp_dir):
                      shutil.rmtree(temp_dir)
              else:
                  print('No checkpoint found — will start fresh')
          except Exception as e:
              # Pull failures are non-fatal: training starts from scratch.
              print(f'Pull error: {e}')
              import traceback
              traceback.print_exc()
              print('Will start fresh.')
          PY

      - name: Train
        id: training
        env:
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
          COMET_API_KEY: ${{ secrets.COMET_API_KEY }}
          TOKENIZER_ID: 'Qwen/Qwen2.5-0.5B'
          DTYPE: 'bfloat16'
          SKIP_OPTIMIZER_SAVE: '1'
          HARD_RAM_GUARD: '1'
          MAX_RAM_GB: '14.5'
          SOFT_RAM_GB: '12.5'
          SOFT_RAM_PCT: '80'
          MIN_THROTTLE_SEQ_LEN: '64'
          DEBUG_STEPS: '0'
          # Dispatch input wins; scheduled runs fall back to 150.
          MAX_STEPS: ${{ github.event.inputs.max_steps || '150' }}
          TOTAL_STEPS: '100000'
          GRAD_ACCUM: '8'
          BATCH_SIZE: '1'
          LEARNING_RATE: '5e-5'
          BLOCK_SIZE: '256'
          MAX_BYTES: '15728640'
          USE_EWC: '1'
          EWC_SAMPLES: '5'
          GRADIENT_CHECKPOINTING: '1'
          OPTIMIZER: 'adafactor'
          FREE_OPTIMIZER_BEFORE_FISHER: '1'
          FISHER_SEQ_LEN: '64'
          FISHER_THRESHOLD: '1e-6'
          GC_EVERY_STEPS: '5'
          PYTHONPATH: .
          PYTHONUNBUFFERED: '1'
        run: |
          # 4800 s = 80 min: leaves headroom inside the 90-min job timeout
          # for the checkpoint upload and git sync steps.
          timeout 4800 python train.py 2>&1 | tee train_output.log
          TRAIN_EXIT="${PIPESTATUS[0]}"
          echo "train_exit_code=$TRAIN_EXIT" >> "$GITHUB_OUTPUT"
          # Count [ERROR] lines (grep -c exits non-zero on no matches, hence the fallback).
          ERROR_COUNT=$(grep -c '\[ERROR\]' train_output.log 2>/dev/null) || ERROR_COUNT=0
          echo "error_count=$ERROR_COUNT" >> "$GITHUB_OUTPUT"
          # Extract unique error messages for the issue body.
          grep '\[ERROR\]' train_output.log | sort -u | head -10 > train_errors.txt 2>/dev/null || true
          # Detect fatal patterns (NaN loss explosion, OOM, etc.).
          FATAL=$(grep -cE 'CUDA out of memory|Loss is NaN|RuntimeError|FATAL' train_output.log 2>/dev/null) || FATAL=0
          echo "fatal_count=$FATAL" >> "$GITHUB_OUTPUT"
          # More than 50 repeated errors, or any fatal pattern, marks the run as failed.
          if [ "$ERROR_COUNT" -gt 50 ] || [ "$FATAL" -gt 0 ]; then
            echo "train_failed=true" >> "$GITHUB_OUTPUT"
            echo "[FAIL] Training had $ERROR_COUNT errors, $FATAL fatal issues."
          else
            echo "train_failed=false" >> "$GITHUB_OUTPUT"
          fi
          # Exit 0 regardless so the checkpoint still gets uploaded on errors.
          exit 0

      - name: Auto-create issue on training failure
        if: steps.training.outputs.train_failed == 'true'
        env:
          GH_TOKEN: ${{ secrets.GH_PAT }}
        run: |
          # Quoted assignments: an empty step output must not mis-parse the script.
          ERROR_COUNT="${{ steps.training.outputs.error_count }}"
          FATAL_COUNT="${{ steps.training.outputs.fatal_count }}"
          EXIT_CODE="${{ steps.training.outputs.train_exit_code }}"
          RUN_URL="${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
          # Read unique errors captured by the training step.
          ERRORS=$(cat train_errors.txt 2>/dev/null | head -10 || echo "No error details captured")
          # Check for existing open issue to avoid duplicates.
          EXISTING=$(gh issue list --state open --label "training-failure" --limit 1 --json number -q '.[0].number' 2>/dev/null || echo "")
          # Unquoted EOF delimiter is deliberate: ${RUN_URL} etc. must expand;
          # the \` escapes keep the markdown code-fence backticks literal.
          BODY="$(cat <<EOF
          ## Training Failure Report
          **Run:** ${RUN_URL}
          **Date:** $(date -u +%Y-%m-%dT%H:%M:%SZ)
          **Exit Code:** ${EXIT_CODE}
          **Error Count:** ${ERROR_COUNT}
          **Fatal Count:** ${FATAL_COUNT}
          ### Unique Errors
          \`\`\`
          ${ERRORS}
          \`\`\`
          ### Action Required
          - [ ] Investigate root cause
          - [ ] Check if EWC state file is stale (shape mismatch)
          - [ ] Check model architecture hasn't changed
          - [ ] Re-run training after fix
          *Auto-generated by CI*
          EOF
          )"
          if [ -n "$EXISTING" ]; then
            echo "Appending to existing issue #$EXISTING"
            gh issue comment "$EXISTING" --body "$BODY"
          else
            gh issue create \
              --title "Training failure: ${ERROR_COUNT} errors detected ($(date -u +%Y-%m-%d))" \
              --body "$BODY" \
              --label "training-failure,bug"
          fi

      - name: Upload Checkpoint to HuggingFace
        if: always()  # upload whatever checkpoint exists, even after errors
        env:
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
        run: |
          python - <<'PY'
          from huggingface_hub import HfApi
          import os
          import sys

          if not os.path.exists('./checkpoint'):
              print('No checkpoint found, skipping')
              sys.exit(0)

          api = HfApi()
          token = os.getenv('HF_TOKEN')
          repo_id = 'meridianal/FinAI'
          print(f'Uploading checkpoint to {repo_id}...')
          try:
              api.upload_folder(
                  folder_path='./checkpoint',
                  repo_id=repo_id,
                  path_in_repo='checkpoint',
                  commit_message='Hourly training update [skip ci]',
                  token=token,
              )
              print('Checkpoint upload successful')
          except Exception as e:
              # A failed checkpoint upload fails the step (exit 1).
              print(f'Checkpoint upload failed: {e}')
              sys.exit(1)

          # Upload model card to HF repo root (best-effort; non-fatal).
          model_card_path = './FinAI/README.md'
          if os.path.exists(model_card_path):
              print('Uploading model card...')
              try:
                  api.upload_file(
                      path_or_fileobj=model_card_path,
                      path_in_repo='README.md',
                      repo_id=repo_id,
                      commit_message='Update model card [skip ci]',
                      token=token,
                  )
                  print('Model card upload successful')
              except Exception as e:
                  print(f'Model card upload failed (non-fatal): {e}')
          else:
              print('No model card found at FinAI/README.md, skipping')
          PY

      - name: Sync Dataset State
        if: always()
        env:
          GH_PAT: ${{ secrets.GH_PAT }}
        run: |
          git config --local user.email "action@github.com"
          git config --local user.name "Meridian.AI Bot"
          # Exclude transient training artifacts from the sync commit.
          git add -- . ':!train_output.log' ':!train_errors.txt'
          git diff --staged --quiet || git commit -m "chore: sync dataset state & formatting [skip ci]"
          # Prefer our side on rebase conflicts; fall back to a merge pull.
          git pull --rebase -X theirs origin main || (git rebase --abort && git pull --no-rebase origin main)
          git push "https://${GH_PAT}@github.com/MeridianAlgo/FinAI.git" main