# Meridian.AI Train — hourly training workflow (#1412)
---
name: Meridian.AI Train

on:
  schedule:
    # Every hour, on the hour, every day
    - cron: '0 * * * *'
  workflow_dispatch:
    inputs:
      force_seed:
        description: 'Nuke & re-seed HF repo with fresh model?'
        type: boolean
        default: false
      max_steps:
        description: 'Training steps per run (default: 150)'
        type: string
        default: '150'

env:
  # Quoted: environment-variable values are strings; an unquoted `true` is a
  # YAML boolean and may be retyped by tooling.
  FORCE_JAVASCRIPT_ACTIONS_TO_NODE24: 'true'

# One train pipeline per ref; queued runs wait rather than cancelling the
# in-flight one (a killed run would lose the checkpoint).
concurrency:
  group: meridian-train-${{ github.ref }}
  cancel-in-progress: false
jobs:
  seed:
    name: "Nuke & Seed HF"
    # Only runs when manually dispatched with force_seed checked; on scheduled
    # triggers `github.event.inputs` is empty, so this job is skipped.
    if: github.event.inputs.force_seed == 'true'
    runs-on: ubuntu-latest
    # Installing torch + seeding a model repo is heavyweight; cap the job so a
    # hung upload can't run forever.
    timeout-minutes: 30
    environment: 'HuggingFace Hub'
    steps:
      - uses: actions/checkout@v4
        with:
          token: ${{ secrets.GH_PAT }}
      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.11'
      - name: Install deps
        run: |
          python -m pip install --upgrade pip
          pip install huggingface_hub transformers torch safetensors sentencepiece python-dotenv
      - name: Nuke & Seed
        env:
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
          TOKENIZER_ID: 'Qwen/Qwen2.5-0.5B'
          PYTHONPATH: .
        run: python scripts/seed_hf_repo.py
| train: | |
| name: "Hourly Training Run" | |
| needs: seed | |
| if: | | |
| always() && | |
| (needs.seed.result == 'success' || needs.seed.result == 'skipped') | |
| runs-on: ubuntu-latest | |
| timeout-minutes: 90 # 1.5 hours max | |
| permissions: | |
| contents: write | |
| issues: write | |
| steps: | |
| - uses: actions/checkout@v4 | |
| with: | |
| fetch-depth: 0 | |
| token: ${{ secrets.GH_PAT }} | |
| - name: Set up Python 3.11 | |
| uses: actions/setup-python@v5 | |
| with: | |
| python-version: '3.11' | |
| cache: 'pip' | |
| - name: Install Dependencies | |
| run: | | |
| python -m pip install --upgrade pip | |
| pip install -r requirements.txt | |
| pip install ruff black | |
| - name: Lint & Format | |
| run: | | |
| black . --quiet | |
| ruff check . --fix --quiet | |
| - name: Pull Checkpoint from HuggingFace | |
| env: | |
| HF_TOKEN: ${{ secrets.HF_TOKEN }} | |
| run: | | |
| python -c " | |
| from huggingface_hub import snapshot_download, list_repo_files | |
| import os, shutil | |
| repo_id = 'meridianal/FinAI' | |
| token = os.getenv('HF_TOKEN') | |
| try: | |
| files = list(list_repo_files(repo_id=repo_id, token=token)) | |
| checkpoint_files = [f for f in files if f.startswith('checkpoint/')] | |
| if checkpoint_files: | |
| print(f'Found {len(checkpoint_files)} checkpoint files') | |
| temp_dir = './temp_download' | |
| snapshot_download( | |
| repo_id=repo_id, | |
| local_dir=temp_dir, | |
| token=token | |
| ) | |
| if os.path.exists(os.path.join(temp_dir, 'checkpoint')): | |
| if os.path.exists('./checkpoint'): | |
| shutil.rmtree('./checkpoint') | |
| shutil.move(os.path.join(temp_dir, 'checkpoint'), './checkpoint') | |
| print('Checkpoint pulled') | |
| for cf in ['model.safetensors', 'trainer_state.pt', 'config.json', 'ewc_state.pt']: | |
| path = os.path.join('./checkpoint', cf) | |
| if os.path.exists(path): | |
| size_mb = os.path.getsize(path) / (1024 * 1024) | |
| print(f' - {cf} ({size_mb:.2f} MB)') | |
| if os.path.exists(temp_dir): | |
| shutil.rmtree(temp_dir) | |
| else: | |
| print('No checkpoint found — will start fresh') | |
| except Exception as e: | |
| print(f'Pull error: {e}') | |
| import traceback | |
| traceback.print_exc() | |
| print('Will start fresh.') | |
| " | |
| - name: Train | |
| id: training | |
| env: | |
| HF_TOKEN: ${{ secrets.HF_TOKEN }} | |
| COMET_API_KEY: ${{ secrets.COMET_API_KEY }} | |
| TOKENIZER_ID: 'Qwen/Qwen2.5-0.5B' | |
| DTYPE: 'bfloat16' | |
| SKIP_OPTIMIZER_SAVE: '1' | |
| HARD_RAM_GUARD: '1' | |
| MAX_RAM_GB: '14.5' | |
| SOFT_RAM_GB: '12.5' | |
| SOFT_RAM_PCT: '80' | |
| MIN_THROTTLE_SEQ_LEN: '64' | |
| DEBUG_STEPS: '0' | |
| MAX_STEPS: ${{ github.event.inputs.max_steps || '150' }} | |
| TOTAL_STEPS: '100000' | |
| GRAD_ACCUM: '8' | |
| BATCH_SIZE: '1' | |
| LEARNING_RATE: '5e-5' | |
| BLOCK_SIZE: '256' | |
| MAX_BYTES: '15728640' | |
| USE_EWC: '1' | |
| EWC_SAMPLES: '5' | |
| GRADIENT_CHECKPOINTING: '1' | |
| OPTIMIZER: 'adafactor' | |
| FREE_OPTIMIZER_BEFORE_FISHER: '1' | |
| FISHER_SEQ_LEN: '64' | |
| FISHER_THRESHOLD: '1e-6' | |
| GC_EVERY_STEPS: '5' | |
| PYTHONPATH: . | |
| PYTHONUNBUFFERED: '1' | |
| run: | | |
| timeout 4800 python train.py 2>&1 | tee train_output.log | |
| TRAIN_EXIT=${PIPESTATUS[0]} | |
| echo "train_exit_code=$TRAIN_EXIT" >> "$GITHUB_OUTPUT" | |
| # Detect repeated errors (10+ occurrences of same error = systemic failure) | |
| ERROR_COUNT=$(grep -c '\[ERROR\]' train_output.log 2>/dev/null) || ERROR_COUNT=0 | |
| echo "error_count=$ERROR_COUNT" >> "$GITHUB_OUTPUT" | |
| # Extract unique error messages for the issue body | |
| grep '\[ERROR\]' train_output.log | sort -u | head -10 > train_errors.txt 2>/dev/null || true | |
| # Detect fatal patterns (NaN loss explosion, OOM, etc.) | |
| FATAL=$(grep -cE 'CUDA out of memory|Loss is NaN|RuntimeError|FATAL' train_output.log 2>/dev/null) || FATAL=0 | |
| echo "fatal_count=$FATAL" >> "$GITHUB_OUTPUT" | |
| if [ "$ERROR_COUNT" -gt 50 ] || [ "$FATAL" -gt 0 ]; then | |
| echo "train_failed=true" >> "$GITHUB_OUTPUT" | |
| echo "[FAIL] Training had $ERROR_COUNT errors, $FATAL fatal issues." | |
| else | |
| echo "train_failed=false" >> "$GITHUB_OUTPUT" | |
| fi | |
| # Still save checkpoint even on errors | |
| exit 0 | |
| - name: Auto-create issue on training failure | |
| if: steps.training.outputs.train_failed == 'true' | |
| env: | |
| GH_TOKEN: ${{ secrets.GH_PAT }} | |
| run: | | |
| ERROR_COUNT=${{ steps.training.outputs.error_count }} | |
| FATAL_COUNT=${{ steps.training.outputs.fatal_count }} | |
| EXIT_CODE=${{ steps.training.outputs.train_exit_code }} | |
| RUN_URL="${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}" | |
| # Read unique errors | |
| ERRORS=$(cat train_errors.txt 2>/dev/null | head -10 || echo "No error details captured") | |
| # Check for existing open issue to avoid duplicates | |
| EXISTING=$(gh issue list --state open --label "training-failure" --limit 1 --json number -q '.[0].number' 2>/dev/null || echo "") | |
| BODY="$(cat <<EOF | |
| ## Training Failure Report | |
| **Run:** ${RUN_URL} | |
| **Date:** $(date -u +%Y-%m-%dT%H:%M:%SZ) | |
| **Exit Code:** ${EXIT_CODE} | |
| **Error Count:** ${ERROR_COUNT} | |
| **Fatal Count:** ${FATAL_COUNT} | |
| ### Unique Errors | |
| \`\`\` | |
| ${ERRORS} | |
| \`\`\` | |
| ### Action Required | |
| - [ ] Investigate root cause | |
| - [ ] Check if EWC state file is stale (shape mismatch) | |
| - [ ] Check model architecture hasn't changed | |
| - [ ] Re-run training after fix | |
| *Auto-generated by CI* | |
| EOF | |
| )" | |
| if [ -n "$EXISTING" ]; then | |
| echo "Appending to existing issue #$EXISTING" | |
| gh issue comment "$EXISTING" --body "$BODY" | |
| else | |
| gh issue create \ | |
| --title "Training failure: ${ERROR_COUNT} errors detected ($(date -u +%Y-%m-%d))" \ | |
| --body "$BODY" \ | |
| --label "training-failure,bug" | |
| fi | |
| - name: Upload Checkpoint to HuggingFace | |
| if: always() | |
| env: | |
| HF_TOKEN: ${{ secrets.HF_TOKEN }} | |
| run: | | |
| python -c " | |
| from huggingface_hub import HfApi | |
| import os | |
| if not os.path.exists('./checkpoint'): | |
| print('No checkpoint found, skipping') | |
| exit(0) | |
| api = HfApi() | |
| token = os.getenv('HF_TOKEN') | |
| repo_id = 'meridianal/FinAI' | |
| print(f'Uploading checkpoint to {repo_id}...') | |
| try: | |
| api.upload_folder( | |
| folder_path='./checkpoint', | |
| repo_id=repo_id, | |
| path_in_repo='checkpoint', | |
| commit_message='Hourly training update [skip ci]', | |
| token=token, | |
| ) | |
| print('Checkpoint upload successful') | |
| except Exception as e: | |
| print(f'Checkpoint upload failed: {e}') | |
| exit(1) | |
| # Upload model card to HF repo root | |
| model_card_path = './FinAI/README.md' | |
| if os.path.exists(model_card_path): | |
| print('Uploading model card...') | |
| try: | |
| api.upload_file( | |
| path_or_fileobj=model_card_path, | |
| path_in_repo='README.md', | |
| repo_id=repo_id, | |
| commit_message='Update model card [skip ci]', | |
| token=token, | |
| ) | |
| print('Model card upload successful') | |
| except Exception as e: | |
| print(f'Model card upload failed (non-fatal): {e}') | |
| else: | |
| print('No model card found at FinAI/README.md, skipping') | |
| " | |
| - name: Sync Dataset State | |
| if: always() | |
| env: | |
| GH_PAT: ${{ secrets.GH_PAT }} | |
| run: | | |
| git config --local user.email "action@github.com" | |
| git config --local user.name "Meridian.AI Bot" | |
| git add . | |
| git diff --staged --quiet || git commit -m "chore: sync dataset state & formatting [skip ci]" | |
| git pull --rebase -X theirs origin main || (git rebase --abort && git pull --no-rebase origin main) | |
| git push https://$GH_PAT@github.com/MeridianAlgo/FinAI.git main |