# Meridian.AI Train — GitHub Actions workflow (run #1412)
# Captured from the "Workflow file for this run" view on GitHub.
name: Meridian.AI Train

on:
  schedule:
    # Every hour, on the hour, every day (UTC).
    - cron: '0 * * * *'
  workflow_dispatch:
    inputs:
      force_seed:
        description: 'Nuke & re-seed HF repo with fresh model?'
        type: boolean
        default: false
      max_steps:
        description: 'Training steps per run (default: 150)'
        type: string
        default: '150'

env:
  # Quoted: env-var values are strings; an unquoted `true` is a YAML boolean.
  FORCE_JAVASCRIPT_ACTIONS_TO_NODE24: 'true'

# One pipeline at a time per ref; never cancel a run mid-training
# (an interrupted run would lose up to an hour of checkpoint progress).
concurrency:
  group: meridian-train-${{ github.ref }}
  cancel-in-progress: false
jobs:
  seed:
    name: "Nuke & Seed HF"
    # Manual-dispatch only: guarded by the force_seed boolean input.
    if: github.event.inputs.force_seed == 'true'
    runs-on: ubuntu-latest
    environment: 'HuggingFace Hub'
    steps:
      - uses: actions/checkout@v4
        with:
          token: ${{ secrets.GH_PAT }}

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.11'

      - name: Install deps
        run: |
          pip install huggingface_hub transformers torch safetensors sentencepiece python-dotenv

      - name: Nuke & Seed
        env:
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
          TOKENIZER_ID: 'Qwen/Qwen2.5-0.5B'
          PYTHONPATH: .
        run: python scripts/seed_hf_repo.py

  train:
    name: "Hourly Training Run"
    needs: seed
    # Run when seed succeeded OR was skipped (scheduled runs skip it),
    # but not when seed failed or was cancelled.
    if: |
      always() &&
      (needs.seed.result == 'success' || needs.seed.result == 'skipped')
    runs-on: ubuntu-latest
    timeout-minutes: 90  # 1.5 hours max
    permissions:
      contents: write  # push dataset-state commits back to the repo
      issues: write    # auto-file training-failure reports
    steps:
      - uses: actions/checkout@v4
        with:
          fetch-depth: 0  # full history needed for the rebase in the sync step
          token: ${{ secrets.GH_PAT }}

      - name: Set up Python 3.11
        uses: actions/setup-python@v5
        with:
          python-version: '3.11'
          cache: 'pip'

      - name: Install Dependencies
        run: |
          python -m pip install --upgrade pip
          pip install -r requirements.txt
          pip install ruff black

      - name: Lint & Format
        run: |
          black . --quiet
          ruff check . --fix --quiet

      - name: Pull Checkpoint from HuggingFace
        env:
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
        # Quoted heredoc ('PY') keeps the shell from expanding $ or backticks
        # inside the Python source — safer than `python -c "..."`.
        run: |
          python - <<'PY'
          from huggingface_hub import snapshot_download, list_repo_files
          import os, shutil

          repo_id = 'meridianal/FinAI'
          token = os.getenv('HF_TOKEN')
          try:
              files = list(list_repo_files(repo_id=repo_id, token=token))
              checkpoint_files = [f for f in files if f.startswith('checkpoint/')]
              if checkpoint_files:
                  print(f'Found {len(checkpoint_files)} checkpoint files')
                  temp_dir = './temp_download'
                  snapshot_download(
                      repo_id=repo_id,
                      local_dir=temp_dir,
                      token=token
                  )
                  if os.path.exists(os.path.join(temp_dir, 'checkpoint')):
                      # Replace any stale local checkpoint wholesale.
                      if os.path.exists('./checkpoint'):
                          shutil.rmtree('./checkpoint')
                      shutil.move(os.path.join(temp_dir, 'checkpoint'), './checkpoint')
                      print('Checkpoint pulled')
                      for cf in ['model.safetensors', 'trainer_state.pt', 'config.json', 'ewc_state.pt']:
                          path = os.path.join('./checkpoint', cf)
                          if os.path.exists(path):
                              size_mb = os.path.getsize(path) / (1024 * 1024)
                              print(f' - {cf} ({size_mb:.2f} MB)')
                  if os.path.exists(temp_dir):
                      shutil.rmtree(temp_dir)
              else:
                  print('No checkpoint found — will start fresh')
          except Exception as e:
              # Pull failures are non-fatal: training starts from scratch.
              print(f'Pull error: {e}')
              import traceback
              traceback.print_exc()
              print('Will start fresh.')
          PY

      - name: Train
        id: training
        env:
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
          COMET_API_KEY: ${{ secrets.COMET_API_KEY }}
          TOKENIZER_ID: 'Qwen/Qwen2.5-0.5B'
          DTYPE: 'bfloat16'
          SKIP_OPTIMIZER_SAVE: '1'
          HARD_RAM_GUARD: '1'
          MAX_RAM_GB: '14.5'
          SOFT_RAM_GB: '12.5'
          SOFT_RAM_PCT: '80'
          MIN_THROTTLE_SEQ_LEN: '64'
          DEBUG_STEPS: '0'
          # Dispatch input wins; scheduled runs fall back to 150.
          MAX_STEPS: ${{ github.event.inputs.max_steps || '150' }}
          TOTAL_STEPS: '100000'
          GRAD_ACCUM: '8'
          BATCH_SIZE: '1'
          LEARNING_RATE: '5e-5'
          BLOCK_SIZE: '256'
          MAX_BYTES: '15728640'
          USE_EWC: '1'
          EWC_SAMPLES: '5'
          GRADIENT_CHECKPOINTING: '1'
          OPTIMIZER: 'adafactor'
          FREE_OPTIMIZER_BEFORE_FISHER: '1'
          FISHER_SEQ_LEN: '64'
          FISHER_THRESHOLD: '1e-6'
          GC_EVERY_STEPS: '5'
          PYTHONPATH: .
          PYTHONUNBUFFERED: '1'
        run: |
          # 4800 s = 80 min: leaves headroom inside the 90-min job timeout
          # for the checkpoint upload and git sync steps.
          timeout 4800 python train.py 2>&1 | tee train_output.log
          TRAIN_EXIT="${PIPESTATUS[0]}"
          echo "train_exit_code=$TRAIN_EXIT" >> "$GITHUB_OUTPUT"
          # Count [ERROR] lines (grep -c exits non-zero on no matches, hence the fallback).
          ERROR_COUNT=$(grep -c '\[ERROR\]' train_output.log 2>/dev/null) || ERROR_COUNT=0
          echo "error_count=$ERROR_COUNT" >> "$GITHUB_OUTPUT"
          # Extract unique error messages for the issue body.
          grep '\[ERROR\]' train_output.log | sort -u | head -10 > train_errors.txt 2>/dev/null || true
          # Detect fatal patterns (NaN loss explosion, OOM, etc.).
          FATAL=$(grep -cE 'CUDA out of memory|Loss is NaN|RuntimeError|FATAL' train_output.log 2>/dev/null) || FATAL=0
          echo "fatal_count=$FATAL" >> "$GITHUB_OUTPUT"
          # More than 50 repeated errors, or any fatal pattern, marks the run as failed.
          if [ "$ERROR_COUNT" -gt 50 ] || [ "$FATAL" -gt 0 ]; then
            echo "train_failed=true" >> "$GITHUB_OUTPUT"
            echo "[FAIL] Training had $ERROR_COUNT errors, $FATAL fatal issues."
          else
            echo "train_failed=false" >> "$GITHUB_OUTPUT"
          fi
          # Exit 0 regardless so the checkpoint still gets uploaded on errors.
          exit 0

      - name: Auto-create issue on training failure
        if: steps.training.outputs.train_failed == 'true'
        env:
          GH_TOKEN: ${{ secrets.GH_PAT }}
        run: |
          # Quoted assignments: an empty step output must not mis-parse the script.
          ERROR_COUNT="${{ steps.training.outputs.error_count }}"
          FATAL_COUNT="${{ steps.training.outputs.fatal_count }}"
          EXIT_CODE="${{ steps.training.outputs.train_exit_code }}"
          RUN_URL="${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
          # Read unique errors captured by the training step.
          ERRORS=$(cat train_errors.txt 2>/dev/null | head -10 || echo "No error details captured")
          # Check for existing open issue to avoid duplicates.
          EXISTING=$(gh issue list --state open --label "training-failure" --limit 1 --json number -q '.[0].number' 2>/dev/null || echo "")
          # Unquoted EOF delimiter is deliberate: ${RUN_URL} etc. must expand;
          # the \` escapes keep the markdown code-fence backticks literal.
          BODY="$(cat <<EOF
          ## Training Failure Report
          **Run:** ${RUN_URL}
          **Date:** $(date -u +%Y-%m-%dT%H:%M:%SZ)
          **Exit Code:** ${EXIT_CODE}
          **Error Count:** ${ERROR_COUNT}
          **Fatal Count:** ${FATAL_COUNT}
          ### Unique Errors
          \`\`\`
          ${ERRORS}
          \`\`\`
          ### Action Required
          - [ ] Investigate root cause
          - [ ] Check if EWC state file is stale (shape mismatch)
          - [ ] Check model architecture hasn't changed
          - [ ] Re-run training after fix
          *Auto-generated by CI*
          EOF
          )"
          if [ -n "$EXISTING" ]; then
            echo "Appending to existing issue #$EXISTING"
            gh issue comment "$EXISTING" --body "$BODY"
          else
            gh issue create \
              --title "Training failure: ${ERROR_COUNT} errors detected ($(date -u +%Y-%m-%d))" \
              --body "$BODY" \
              --label "training-failure,bug"
          fi

      - name: Upload Checkpoint to HuggingFace
        if: always()  # upload whatever checkpoint exists, even after errors
        env:
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
        run: |
          python - <<'PY'
          from huggingface_hub import HfApi
          import os
          import sys

          if not os.path.exists('./checkpoint'):
              print('No checkpoint found, skipping')
              sys.exit(0)

          api = HfApi()
          token = os.getenv('HF_TOKEN')
          repo_id = 'meridianal/FinAI'
          print(f'Uploading checkpoint to {repo_id}...')
          try:
              api.upload_folder(
                  folder_path='./checkpoint',
                  repo_id=repo_id,
                  path_in_repo='checkpoint',
                  commit_message='Hourly training update [skip ci]',
                  token=token,
              )
              print('Checkpoint upload successful')
          except Exception as e:
              # A failed checkpoint upload fails the step (exit 1).
              print(f'Checkpoint upload failed: {e}')
              sys.exit(1)

          # Upload model card to HF repo root (best-effort; non-fatal).
          model_card_path = './FinAI/README.md'
          if os.path.exists(model_card_path):
              print('Uploading model card...')
              try:
                  api.upload_file(
                      path_or_fileobj=model_card_path,
                      path_in_repo='README.md',
                      repo_id=repo_id,
                      commit_message='Update model card [skip ci]',
                      token=token,
                  )
                  print('Model card upload successful')
              except Exception as e:
                  print(f'Model card upload failed (non-fatal): {e}')
          else:
              print('No model card found at FinAI/README.md, skipping')
          PY

      - name: Sync Dataset State
        if: always()
        env:
          GH_PAT: ${{ secrets.GH_PAT }}
        run: |
          git config --local user.email "action@github.com"
          git config --local user.name "Meridian.AI Bot"
          # Exclude transient training artifacts from the sync commit.
          git add -- . ':!train_output.log' ':!train_errors.txt'
          git diff --staged --quiet || git commit -m "chore: sync dataset state & formatting [skip ci]"
          # Prefer our side on rebase conflicts; fall back to a merge pull.
          git pull --rebase -X theirs origin main || (git rebase --abort && git pull --no-rebase origin main)
          git push "https://${GH_PAT}@github.com/MeridianAlgo/FinAI.git" main