@@ -7,7 +7,6 @@
 import hashlib
 import json
 import logging
-import random
 import tempfile
 from datetime import date, datetime, timezone
 from pathlib import Path
@@ -51,6 +50,11 @@
 # training data (~500 dates × 500 tickers = 250k samples).
 TRAINING_LOOKBACK = "2y"
 
+# IC quality gate: if validation IC is below this threshold, the model
+# is too weak to trust for live trading. Fall back to equal-weight.
+# 0.02 is conservative — typical cross-sectional ICs are 0.03-0.08.
+MIN_VALIDATION_IC = 0.02
+
 # R3-P-9/P-10 fix: resolve paths relative to project root, not CWD
 _PROJECT_ROOT = Path(__file__).resolve().parent.parent.parent
 MODEL_CACHE_DIR = _PROJECT_ROOT / "data" / "models"
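For context on MIN_VALIDATION_IC: "validation IC" here is the information coefficient, i.e. the per-date Spearman rank correlation between model scores and realized forward returns, averaged over the validation window. A minimal sketch of that computation, assuming a (date, ticker) MultiIndex (the helper name is hypothetical; the module's actual IC code is not shown in this diff):

import pandas as pd

def mean_daily_ic(preds: pd.Series, targets: pd.Series) -> float:
    """Mean daily Spearman rank IC over a (date, ticker) MultiIndex."""
    df = pd.DataFrame({"pred": preds, "target": targets}).dropna()
    # Rank-correlate scores with realized returns within each date,
    # then average the per-date correlations.
    daily = df.groupby(level=0).apply(
        lambda g: g["pred"].corr(g["target"], method="spearman")
    )
    return float(daily.mean())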
@@ -392,12 +396,12 @@ def train_model(
392396 "Use a point-in-time membership table for unbiased backtests."
393397 )
394398
395- # M12 fix: use a random sample instead of the first 100 alphabetically.
396- # Alphabetical slicing (A-D) introduces systematic sector/name bias.
397- # Seed is fixed for reproducibility across runs on the same day.
399+ # Train on the full S&P 500 universe. Previous versions sampled 100
400+ # tickers for speed, but this causes distribution mismatch at inference
401+ # time when the model scores all ~500 stocks. LightGBM handles 500
402+ # tickers × 2yr daily data (~250k rows) in under a minute.
398403 all_tickers = fetch_sp500_tickers ()
399- random .seed (42 )
400- tickers = training_tickers or random .sample (all_tickers , min (100 , len (all_tickers )))
404+ tickers = training_tickers or all_tickers
401405 raw = fetch_ohlcv (tickers , period = TRAINING_LOOKBACK )
402406 long = reshape_ohlcv_wide_to_long (raw )
403407
@@ -438,15 +442,23 @@ def train_model(
-    # with 5-day embargo to prevent target leakage (same approach as train.py)
+    # with an embargo to prevent target leakage (same approach as train.py)
     dates = labeled.index.get_level_values(0).unique().sort_values()
     split_date = dates[int(len(dates) * 0.8)]
-    embargo_offset = pd.tseries.offsets.BDay(5)
+    # The embargo must cover the longest feature lookback window to prevent
+    # information leakage from features that straddle the train/val boundary.
+    # The feature set includes 20-day returns, 20-day vol, 20-day Bollinger,
+    # and 60-day moving averages. 22 business days (~1 calendar month) gives
+    # a safe margin above the 20-day features without discarding too much
+    # data. (The 60-day MA creates backward dependence only, not forward
+    # leakage, so 22 days is sufficient.)
+    embargo_offset = pd.tseries.offsets.BDay(22)
     embargo_date = split_date + embargo_offset
 
     train_data = labeled.loc[labeled.index.get_level_values(0) <= split_date]
     val_data = labeled.loc[labeled.index.get_level_values(0) >= embargo_date]
 
     logger.info(
         f"Training on {len(train_data)} samples (up to {split_date.date()}), "
-        f"validating on {len(val_data)} samples (from {embargo_date.date()})"
+        f"validating on {len(val_data)} samples (from {embargo_date.date()}, "
+        f"embargo=22 business days)"
     )
 
     model = CrossSectionalModel(model_type="lightgbm", feature_cols=available_cols)
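To see why 22 business days clears the 20-day features, a self-contained check (the dates are illustrative):

import pandas as pd

split_date = pd.Timestamp("2025-06-30")
first_val_date = split_date + pd.tseries.offsets.BDay(22)

# A 20-day rolling feature computed on the first validation date reaches
# back 20 business days, which still lands strictly after split_date, so
# no validation feature window straddles the boundary.
lookback_start = first_val_date - pd.tseries.offsets.BDay(20)
assert lookback_start > split_date  # 2 business days of slack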
@@ -462,6 +474,9 @@ def train_model(
         logger.warning("No validation data available — training without early stopping")
         model.fit(train_data, target_col="target_5d")
+    # Attach IC to model so callers can gate on quality
+    model.validation_ic = ic  # type: ignore[attr-defined]
+
     # --- MLflow tracking (best-effort: never crash training) ---
     try:
         import mlflow
@@ -488,6 +503,15 @@ def train_model(
         except Exception as e:
             logger.debug(f"Could not log feature importance artifact: {e}")
 
+        # Log winsorize bounds alongside the model so rollbacks keep
+        # bounds in sync with the model version that produced them.
+        try:
+            bounds_path = MODEL_CACHE_DIR / "winsorize_bounds.json"
+            if bounds_path.exists():
+                mlflow.log_artifact(str(bounds_path), artifact_path="winsorize_bounds")
+        except Exception as e:
+            logger.debug(f"Could not log winsorize bounds artifact: {e}")
+
         logger.info("MLflow run logged successfully for live_lgbm_alpha")
     except Exception as e:
         logger.warning(f"MLflow tracking failed (training unaffected): {e}")
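On the rollback side, the tracked bounds can be pulled back out of the run so inference-time clipping matches the restored model. A sketch assuming MLflow 2.x and a known run id (the id value is a placeholder):

import json
import mlflow

# Download the "winsorize_bounds" artifact directory logged above.
local_dir = mlflow.artifacts.download_artifacts(
    run_id="<run-id-from-tracking>",
    artifact_path="winsorize_bounds",
)
with open(f"{local_dir}/winsorize_bounds.json") as f:
    bounds = json.load(f)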
@@ -707,6 +731,22 @@ def get_ml_weights(
     logger.info("Step 1/4: Training model...")
     model = train_model(data_path=training_data_path)
 
+    # IC quality gate: if the model's validation IC is too low, fall back
+    # to an equal-weight portfolio. A weak model is worse than no model.
+    model_ic = getattr(model, "validation_ic", None)
+    if isinstance(model_ic, (int, float)) and model_ic < MIN_VALIDATION_IC:
+        logger.warning(
+            f"Model validation IC ({model_ic:.4f}) below minimum "
+            f"({MIN_VALIDATION_IC}). Falling back to equal-weight."
+        )
+        # Return equal weight across up to top_n current holdings if any
+        # exist; otherwise return empty (no trades).
+        if current_weights:
+            tickers = list(current_weights.keys())[:top_n]
+            eq_wt = 1.0 / len(tickers) if tickers else 0.0
+            return {t: eq_wt for t in tickers}
+        return {}
+
     # Step 2: Fetch recent data for the full universe to rank
     logger.info("Step 2/4: Fetching recent data for universe ranking...")
     from python.data.ingestion import fetch_sp500_tickers
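Caller-side, get_ml_weights can now return an equal-weight dict or an empty dict instead of model-driven weights. A usage sketch (keyword names taken from the hunk above; the holdings and top_n value are illustrative):

weights = get_ml_weights(
    current_weights={"AAPL": 0.5, "MSFT": 0.5},
    top_n=10,
)
if not weights:
    # Model failed the IC gate and there were no current holdings:
    # skip trading this cycle rather than act on a weak signal.
    pass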