train.py
"""LOKAN: Localized Oscillatory Kolmogorov-Arnold Network — Training Pipeline.
Usage:
python train.py --config config/train/etth1.yaml
"""
import argparse
import json
import os
import tempfile
import warnings

import mlflow
import pytorch_lightning as pl
import torch
import yaml

from src.data import prepare_data, create_dataloaders, create_classification_dataloaders
from src.training import build_model, create_loggers, build_trainer
from src.evaluation import freeze_and_eval, custom_output_dim_eval, run_classification_eval
from src.visualization import (
    plot_loss_curve, plot_kan_graph, plot_predicted_vs_actual,
    plot_predictions_histogram, plot_timeseries_forecast, plot_confusion_matrix,
)
from src.post_training import run_post_training


def _log_artifact(path):
    """Log a file as an MLflow artifact in the active run."""
    mlflow.log_artifact(path)


def export_model(model, ctx, tmpdir):
    """Export model state dict, TorchScript, scaler, and metadata.

    Each export is wrapped individually so a single failure doesn't prevent the rest.
    """
    # 1. State dict + hparams
    try:
        sd_path = os.path.join(tmpdir, "model_state_dict.pt")
        torch.save({
            "model_state_dict": model.state_dict(),
            "hparams": dict(model.hparams),
        }, sd_path)
        _log_artifact(sd_path)
        print("Logged model_state_dict.pt")
    except Exception as e:
        warnings.warn(f"Failed to save state dict: {e}")

    # 2. TorchScript (traced)
    try:
        model.eval()
        dummy_input = torch.randn(1, ctx.X.shape[1])
        traced = torch.jit.trace(model.model, dummy_input)
        ts_path = os.path.join(tmpdir, "model_traced.pt")
        traced.save(ts_path)
        _log_artifact(ts_path)
        print("Logged model_traced.pt")
    except Exception as e:
        warnings.warn(f"Failed to save TorchScript model: {e}")

    # 3. Feature scaler
    try:
        import joblib
        scaler_path = os.path.join(tmpdir, "feature_scaler.joblib")
        joblib.dump(ctx.scaler, scaler_path)
        _log_artifact(scaler_path)
        print("Logged feature_scaler.joblib")
    except Exception as e:
        warnings.warn(f"Failed to save feature scaler: {e}")

    # 4. Metadata
    try:
        metadata = {
            "all_cols": ctx.all_cols,
            "target_idx": ctx.target_idx,
            "n_cols": ctx.n_cols,
            "input_indices": ctx.input_indices,
            "in_features": int(ctx.X.shape[1]),
        }
        meta_path = os.path.join(tmpdir, "metadata.json")
        with open(meta_path, "w") as f:
            json.dump(metadata, f, indent=2)
        _log_artifact(meta_path)
        print("Logged metadata.json")
    except Exception as e:
        warnings.warn(f"Failed to save metadata: {e}")


def run_pipeline(config: dict):
    seed = config["experiment"].get("seed", 42)
    experiment_name = config["experiment"]["name"]
    run_name = config["experiment"].get("run_name") or None
    dcfg = config["data"]
    tcfg = config["training"]
    ecfg = config["evaluation"]
    data_type = dcfg.get("type", "timeseries")
    is_timeseries = data_type == "timeseries"
    is_classification = data_type == "classification"
    mcfg = config["model"]
    output_dim = mcfg.get("output_dim", 1)
    horizon = dcfg.get("horizon", output_dim)
    lookback = dcfg.get("lookback", 1)
    target_name = dcfg.get("target_col", "Value")

    # 1. MLflow setup
    mlflow_uri = f"sqlite:///{os.path.abspath('mlflow.db')}"
    mlflow.set_tracking_uri(mlflow_uri)
    mlflow.pytorch.autolog(disable=True)
    enable_mlflow = tcfg.get("enable_mlflow", False)
    enable_tb = tcfg.get("enable_tensorboard", False)

    # 2. MLflow experiment (set up before seeding so the run name stays random)
    mlflow.set_experiment(experiment_name)

    # 3. Single MLflow run for training metrics + artifacts
    with mlflow.start_run(run_name=run_name) as run:
        # 4. Seed AFTER run creation so MLflow gets a unique random name
        pl.seed_everything(seed)

        # 5. Data
        ctx = prepare_data(config)

        # 6. DataLoaders
        if is_classification:
            train_loader, val_loader, test_loader = create_classification_dataloaders(ctx, dcfg)
        else:
            train_loader, val_loader, test_loader = create_dataloaders(
                ctx,
                split=dcfg.get("split", [60, 20, 20]),
                batch_size=dcfg.get("batch_size", 256),
            )

        # 7. Model
        model = build_model(config, in_features=ctx.X.shape[1])
        total_params = sum(p.numel() for p in model.parameters())
        trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
        print(f"Total parameters: {total_params:,} | Trainable: {trainable_params:,}")
        if is_classification:
            print(f"Classification: {ctx.X.shape[1]} features -> {dcfg.get('num_classes', 10)} classes")
        else:
            print(f"Lookback: {lookback} -> Horizon: {horizon} | "
                  f"Normalize target: {dcfg.get('normalize_target', False)}")

        # Log config as params
        flat_config = {}
        for section, values in config.items():
            if isinstance(values, dict):
                for k, v in values.items():
                    flat_config[f"{section}.{k}"] = str(v) if isinstance(v, (list, dict)) else v
        mlflow.log_params(flat_config)
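
        # e.g. flat_config might look like (hypothetical values):
        #   {"experiment.seed": 42, "data.split": "[60, 20, 20]", "training.max_epochs": 100}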

        # Create PL loggers using the same MLflow run_id
        mlf_logger, tb_logger = create_loggers(
            experiment_name,
            mlflow_uri,
            os.path.join("tb_logs", experiment_name),
            enable_mlflow=enable_mlflow,
            enable_tensorboard=enable_tb,
            run_id=run.info.run_id if enable_mlflow else None,
        )
        loggers = [lg for lg in [mlf_logger, tb_logger] if lg is not None]

        trainer = build_trainer(
            max_epochs=tcfg["max_epochs"],
            loggers=loggers,
            gradient_clip=tcfg.get("gradient_clip"),
            early_stopping=tcfg.get("early_stopping", False),
            patience=tcfg.get("early_stop_patience", 15),
        )
        trainer.fit(model, train_loader, val_loader)

        with tempfile.TemporaryDirectory() as tmpdir:
            # 7.1 Export model artifacts
            export_model(model, ctx, tmpdir)

            # 7.2 Loss curve
            loss_path = os.path.join(tmpdir, "loss_curve.png")
            plot_loss_curve(model.train_losses, model.val_losses, filepath=loss_path)
            _log_artifact(loss_path)

            # 7.3 LOKAN graph
            input_labels = []
            if is_timeseries:
                for t in range(lookback):
                    for col in dcfg["input_cols"]:
                        input_labels.append(f"t-{lookback - t}:{col}")
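                # e.g. lookback=3 with input_cols=["OT"] (hypothetical) yields
                # input_labels == ["t-3:OT", "t-2:OT", "t-1:OT"]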

            graph_path = os.path.join(tmpdir, "lokan_network_graph.png")
            basis_range = tuple(config["model"]["basis_range"])
            graph_path, edges_path = plot_kan_graph(
                model.model, input_labels=input_labels,
                filepath=graph_path, spline_range=basis_range,
            )
            _log_artifact(graph_path)
            _log_artifact(edges_path)

            # 7.4 Evaluate
            if is_classification:
                logits, cls_preds, cls_targets, cls_metrics = run_classification_eval(
                    model, test_loader)
                mlflow.log_metrics({"test_accuracy": cls_metrics["accuracy"]})
                print(f"\n--- {experiment_name} ---")
                print(f"Test Accuracy: {cls_metrics['accuracy']:.4f}")
                for c, acc in sorted(cls_metrics["per_class_accuracy"].items()):
                    print(f" Class {c}: {acc:.4f}")
                num_classes = dcfg.get("num_classes", 10)
                cm_path = os.path.join(tmpdir, "confusion_matrix.png")
                plot_confusion_matrix(cls_preds, cls_targets, num_classes=num_classes,
                                      filepath=cm_path)
                _log_artifact(cm_path)
            else:
                preds, targets, metrics, scale_label = freeze_and_eval(
                    model, train_loader, test_loader, ctx,
                    inverse_target=ecfg.get("inverse_target", False),
                    normalize_target=dcfg.get("normalize_target", False),
                )
                mlflow.log_metrics({
                    "test_mse": metrics["mse"], "test_rmse": metrics["rmse"],
                    "test_mae": metrics["mae"], "test_r2": metrics["r2"],
                })
                print(f"\n--- {experiment_name} ({scale_label}) ---")
                print(f"Test MSE: {metrics['mse']:.4f}")
                print(f"Test RMSE: {metrics['rmse']:.4f}")
                print(f"Test MAE: {metrics['mae']:.4f}")
                print(f"Test R2: {metrics['r2']:.4f}")

                hist_path = os.path.join(tmpdir, "predictions_histogram.png")
                plot_predictions_histogram(preds, targets, output_dim, metrics,
                                           scale_label=scale_label, target_name=target_name,
                                           filepath=hist_path)
                _log_artifact(hist_path)

                scatter_path = os.path.join(tmpdir, "predicted_vs_actual.png")
                plot_predicted_vs_actual(preds.flatten(), targets.flatten(), filepath=scatter_path)
                _log_artifact(scatter_path)

                if is_timeseries:
                    ts_path = os.path.join(tmpdir, "timeseries_forecast.png")
                    plot_timeseries_forecast(preds, targets, output_dim, lookback, metrics,
                                             scale_label=scale_label, target_name=target_name,
                                             filepath=ts_path)
                    _log_artifact(ts_path)

            # Custom output_dim evaluation
            custom_output_dims = ecfg.get("custom_output_dims", [])
            if is_timeseries and custom_output_dims:
                print(f"\n{'=' * 70}")
                print(f"Testing model on custom output_dims: {custom_output_dims}")
                print(f"Model was trained with output_dim = {output_dim}")
                print(f"{'=' * 70}\n")
                for ch in custom_output_dims:
                    if ch >= output_dim:
                        print(f"Skipping custom output_dim {ch} (>= trained output_dim {output_dim})")
                        continue
                    print(f"\n--- Testing Horizon {ch} ---")
                    ch_preds, ch_targets, ch_metrics, ch_scale = custom_output_dim_eval(
                        model, ctx, ch, config)
                    mlflow.log_metrics({
                        f"custom_h{ch}_mse": ch_metrics["mse"],
                        f"custom_h{ch}_rmse": ch_metrics["rmse"],
                        f"custom_h{ch}_mae": ch_metrics["mae"],
                        f"custom_h{ch}_r2": ch_metrics["r2"],
                    })
                    print(f"MSE: {ch_metrics['mse']:.4f} | RMSE: {ch_metrics['rmse']:.4f} | "
                          f"MAE: {ch_metrics['mae']:.4f} | R2: {ch_metrics['r2']:.4f}")

                    ch_hist = os.path.join(tmpdir, f"predictions_histogram_h{ch}.png")
                    plot_predictions_histogram(ch_preds, ch_targets, ch, ch_metrics,
                                               scale_label=ch_scale, trained_output_dim=output_dim,
                                               target_name=target_name, filepath=ch_hist)
                    _log_artifact(ch_hist)

                    ch_scatter = os.path.join(tmpdir, f"predicted_vs_actual_h{ch}.png")
                    plot_predicted_vs_actual(ch_preds.flatten(), ch_targets.flatten(),
                                             filepath=ch_scatter)
                    _log_artifact(ch_scatter)

                    ch_ts = os.path.join(tmpdir, f"timeseries_forecast_h{ch}.png")
                    plot_timeseries_forecast(ch_preds, ch_targets, ch, lookback, ch_metrics,
                                             scale_label=ch_scale, trained_output_dim=output_dim,
                                             target_name=target_name, filepath=ch_ts)
                    _log_artifact(ch_ts)
                    print(f"Saved plots for output_dim {ch}")

                print(f"\n{'=' * 70}")
                print("Custom output_dim testing complete!")
                print(f"{'=' * 70}")

            # 7.5 Post-training analysis (regression only: uses preds/targets from above)
            post_cfg = config.get("post_training", {})
            if post_cfg.get("enabled", False) and not is_classification:
                post_dir = os.path.join(tmpdir, "post_training")

                # Build test-split coordinates for spatial tasks
                coords = None
                if "lon" in dcfg.get("input_cols", []) and "lat" in dcfg.get("input_cols", []):
                    from src.data import load_csv
                    df_full = load_csv(dcfg["path"], dcfg.get("sort_col"))
                    n_total = len(ctx.X)
                    split = dcfg.get("split", [60, 20, 20])
                    n_train = int(n_total * split[0] / 100)
                    n_val = int(n_total * split[1] / 100)
                    # Test split indices in the original DataFrame
                    test_start = n_train + n_val
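                    # e.g. n_total=10_000, split=[60, 20, 20] (hypothetical)
                    #   -> n_train=6_000, n_val=2_000, test_start=8_000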
                    df_test = df_full.iloc[test_start:test_start + len(preds)]
                    coords = df_test[["lon", "lat"]].reset_index(drop=True)

                run_post_training(
                    preds, targets, coords=coords,
                    output_dir=post_dir,
                    tasks=post_cfg.get("tasks"),
                    resolution=post_cfg.get("resolution", 0.5),
                )

                # Log post-training artifacts
                for fname in os.listdir(post_dir):
                    fpath = os.path.join(post_dir, fname)
                    if os.path.isfile(fpath):
                        _log_artifact(fpath)

        print(f"\nRun ID: {run.info.run_id}")
        print("Artifacts stored in mlruns/")


def main():
    parser = argparse.ArgumentParser(description="LOKAN Training Pipeline")
    parser.add_argument("--config", type=str, default="config/train/etth1.yaml",
                        help="Path to YAML config file")
    args = parser.parse_args()
    with open(args.config, "r") as f:
        config = yaml.safe_load(f)
    run_pipeline(config)


if __name__ == "__main__":
    main()