From eee73d8470c6ed497a693e625bf049d7b254d66b Mon Sep 17 00:00:00 2001
From: Lorenzo Mangani <lorenzo.mangani@gmail.com>
Date: Thu, 18 Jun 2026 19:56:16 +0200
Subject: [PATCH 1/7] Add TRAIN.md research plan for /train LoRA
 experimentation UI.

Documents ltx-2-mlx slice/preprocess/train pipeline, upstream APIs, RAM
constraints, and phased implementation for a Web UI training lab.

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 TRAIN.md | 261 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 261 insertions(+)
 create mode 100644 TRAIN.md

diff --git a/TRAIN.md b/TRAIN.md
new file mode 100644
index 0000000..d507b1e
--- /dev/null
+++ b/TRAIN.md
@@ -0,0 +1,261 @@
+# Training UI plan (`/train`)
+
+Research-backed plan for an **ltx-ws** training lab on top of [ltx-2-mlx](https://github.com/dgrauet/ltx-2-mlx) **v0.14.12** (`ltx-trainer-mlx`).
+
+## What “training” means upstream
+
+ltx-2-mlx is a three-package monorepo:
+
+| Package | Role |
+|---------|------|
+| `ltx-core-mlx` | Model weights, VAE, DiT, Gemma connectors |
+| `ltx-pipelines-mlx` | Inference CLI (`generate`, `retake`, …) |
+| `ltx-trainer-mlx` | **LoRA / full fine-tune** via flow matching |
+
+Training is **not** online learning during inference. It is an offline pipeline:
+
+```
+raw videos  →  [slice]  →  clips + captions  →  [preprocess]  →  latents + conditions  →  [train]  →  LoRA .safetensors
+```
+
+### CLI entry points (from `ltx_pipelines_mlx/cli.py`)
+
+All require optional package install:
+
+```bash
+uv pip install \
+  "ltx-trainer-mlx @ git+https://github.com/dgrauet/ltx-2-mlx.git@v0.14.12#subdirectory=packages/ltx-trainer"
+```
+
+| Command | Python API | Purpose |
+|---------|------------|---------|
+| `ltx-2-mlx slice` | `ltx_trainer_mlx.slice_clips.slice_videos` | Cut long sources into fixed-length, 32-aligned clips (ffmpeg; audio retained) |
+| `ltx-2-mlx preprocess` | `ltx_trainer_mlx.preprocess.preprocess_dataset` | Encode clips → `.precomputed/latents/`, `conditions/`, optional `audio_latents/` |
+| `ltx-2-mlx train` | `LtxvTrainer(config).train()` | Flow-matching LoRA (or full) training from preprocessed data |
+
+### Training strategies (validated in `tests/test_trainer_core.py`)
+
+| Strategy | Config `training_strategy.name` | Notes |
+|----------|----------------------------------|-------|
+| Text-to-video LoRA | `text_to_video` | Default; `generate_audio: false` for video-only style |
+| Joint AV LoRA | `text_to_video` + `generate_audio: true` | Needs `preprocess --with-audio`; v0.14.12 audio path |
+| Video-to-video (IC-LoRA) | `video_to_video` | Requires reference latents in preprocessed data; LoRA only |
+
+Example configs ship in upstream `packages/ltx-trainer/configs/`:
+
+- `lora_t2v.yaml` — basic T2V style LoRA
+- `lora_v2v.yaml` — IC-LoRA / reference-video conditioning
+- `lora_av_whisper.yaml` — joint audio+video (whisper/ASMR); uses `transformer-dev.safetensors`, gradient checkpointing
+
+### Preprocessed data layout
+
+```
+<data_root>/
+  .precomputed/
+    latents/latent_0000.safetensors      # video VAE latents + dims/fps
+    conditions/condition_0000.safetensors # Gemma prompt embeds
+    audio_latents/latent_0000.safetensors # optional; paired filenames
+```
+
+Captions: sibling `.txt` per clip (or `--captions` dir with matching stems).
+
+### Training runtime characteristics
+
+- **Single-device MLX** on Apple Silicon (unified memory); no DDP.
+- **Heavy RAM**: dev transformer + Gemma + activations. `enable_gradient_checkpointing` / CLI `--low-ram` needed on ≤64 GB for dev-base LoRAs.
+- **Long-running**: thousands of steps; checkpoints + validation renders on interval.
+- **Outputs**: `output_dir/` with checkpoints (`.safetensors`), validation MP4s, saved YAML config.
+- **Progress hook**: `LtxvTrainer.train(step_callback=fn)` — `(global_step, total_steps, validation_paths)`.
+- **Conflicts with inference**: training and generation both want GPU/RAM; must not run concurrently with `server.py` generation lock.
+
+### Hardware guidance (from upstream configs + changelog)
+
+| Workflow | Typical RAM | Resolution / frames |
+|----------|-------------|-------------------|
+| T2V LoRA (distilled base) | 32–48 GB | 704×480 × 25 frames validation |
+| AV style LoRA (dev base + checkpointing) | 64 GB | 192×192 × 97 frames |
+| Preprocess only | ~16 GB peak | Encoder + Gemma partial download (v0.14.12) |
+
+Frame counts must stay **8k+1**; spatial dims **÷32**; training fps should stay near **24** (LTX training distribution).
+
+---
+
+## Gap in ltx-ws today
+
+- Inference stack only: `ltx_mlx_backend.py`, `/api/generate`, main React UI.
+- LoRA **inference** presets exist; no slice/preprocess/train orchestration.
+- `ltx-trainer-mlx` not in `requirements.txt` (optional extra).
+- Single generation worker; no training job queue.
+
+---
+
+## `/train` page — product goals
+
+**Experimentation lab**, not a full MLOps platform:
+
+1. Prepare a small dataset (upload or point at folder).
+2. Run preprocess with sensible defaults.
+3. Configure and launch a LoRA run (T2V first; AV/V2V later).
+4. Watch step progress + validation previews.
+5. Register finished LoRA into existing Web UI preset list for inference smoke tests.
+
+---
+
+## Proposed architecture
+
+### Frontend (`web/`)
+
+Add client routing (e.g. `react-router-dom`):
+
+| Route | Page |
+|-------|------|
+| `/` | Existing generator (`App.tsx` → `GeneratePage`) |
+| `/train` | New `TrainPage.tsx` |
+
+`TrainPage` sections (wizard or tabs):
+
+1. **Dataset** — upload videos + caption `.txt`; or path to local folder; optional slice settings (interval, res, caption template).
+2. **Preprocess** — model id, H×W, max frames, `with_audio`, frame rate; start + progress.
+3. **Train** — preset picker (T2V / AV / V2V), hyperparams (rank, steps, LR, checkpoint/val intervals), validation prompts; advanced YAML toggle.
+4. **Runs** — list jobs, live step/loss, validation video thumbnails, cancel, download LoRA, **“Add to LoRA library”**.
+
+Nav link in header: `Generate` | `Train`.
+
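+SPA fallback: the Vite dev server already serves `index.html` for unknown routes (no `historyApiFallback` setting is needed). In production, confirm that a direct request for `/train` returns `index.html` — FastAPI/Starlette `StaticFiles(html=True)` only serves `index.html` for directories (and `404.html` on a miss), so a catch-all route may be required.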
+
+### Backend (`web_ui.py` + new `ltx_train_backend.py`)
+
+Optional dependency gate: if `ltx_trainer_mlx` missing, `/api/train/health` returns `ok: false` with install hint.
+
+```
+web_outputs/
+  train/
+    <job_id>/
+      uploads/          # raw uploads
+      clips/            # post-slice
+      preprocessed/     # .precomputed
+      outputs/          # trainer output_dir
+      config.yaml       # resolved trainer config
+      status.json       # phase, step, logs
+```
+
+**API sketch:**
+
+| Method | Path | Purpose |
+|--------|------|---------|
+| GET | `/api/train/health` | Trainer package installed? ffmpeg? active MLX model path? |
+| GET | `/api/train/presets` | Built-in config templates (T2V / AV / V2V) |
+| POST | `/api/train/datasets` | Create dataset job dir; accept multipart uploads |
+| POST | `/api/train/slice` | `{ dataset_id, interval, res, ... }` |
+| POST | `/api/train/preprocess` | `{ dataset_id, model, height, width, with_audio, ... }` |
+| POST | `/api/train/runs` | Build `LtxTrainerConfig`, start training |
+| GET | `/api/train/runs` | List runs |
+| GET | `/api/train/runs/{id}` | Status + stats |
+| GET | `/api/train/runs/{id}/events` | SSE: step, loss, validation paths (mirror `/api/runs/{id}/events`) |
+| POST | `/api/train/runs/{id}/cancel` | Cooperative cancel |
+| GET | `/api/train/runs/{id}/artifacts/{path}` | Validation MP4s, final LoRA |
+| POST | `/api/train/runs/{id}/register-lora` | Copy LoRA into `web_outputs/loras` + preset entry |
+
+**Worker model:**
+
+- Separate `TrainingWorker` thread pool (`max_workers=1`), analogous to generation executor.
+- **Global mutex** with generation: starting train rejects if `server` busy generating (and vice versa) — surface clear UI message.
+- Long steps run in `asyncio.to_thread()` / dedicated thread; `step_callback` pushes to asyncio queue for SSE.
+
+**Config builder:**
+
+- Start from upstream YAML templates embedded or loaded from repo `train_configs/`.
+- Override: `model.model_path` ← resolved from Web UI active MLX model snapshot.
+- Override: `data.preprocessed_data_root`, `output_dir`, `optimization.steps`, `lora.rank`, validation prompts.
+- Validate via `LtxTrainerConfig.model_validate()` before start.
+
+### Dependency update
+
+Add commented optional install in `requirements.txt`:
+
+```text
+# ltx-trainer-mlx @ git+https://github.com/dgrauet/ltx-2-mlx.git@v0.14.12#subdirectory=packages/ltx-trainer
+```
+
+Pin tag to `LTX2_MLX_GIT_TAG` in `ltx_mlx_backend.py`.
+
+---
+
+## Implementation phases
+
+### Phase 1 — Foundation (MVP)
+
+- [ ] `training` branch: router + empty `/train` shell + nav
+- [ ] `ltx_train_backend.py`: healthcheck, config templates, subprocess/thread wrapper for **preprocess only**
+- [ ] Dataset upload API + folder layout
+- [ ] Train page: upload videos + captions → preprocess button → log/progress panel
+- [ ] Docs in README: optional trainer install
+
+**Exit criteria:** User can preprocess clips from Web UI; no training yet.
+
+### Phase 2 — T2V LoRA training
+
+- [ ] `POST /api/train/runs` wrapping `LtxvTrainer.train(step_callback=...)`
+- [ ] SSE progress (step / total / ETA from `TrainingStats`)
+- [ ] Cancel flag checked between steps
+- [ ] Validation MP4 serving from `output_dir`
+- [ ] **Register LoRA** → existing `/api/loras/custom` flow
+
+**Exit criteria:** End-to-end T2V LoRA on a toy dataset (≥2 clips); use in generator.
+
+### Phase 3 — Slice + AV presets
+
+- [ ] Slice API (ffmpeg dependency check)
+- [ ] `with_audio` preprocess toggle
+- [ ] Presets: `lora_av_whisper` simplified form (audio-only target modules hidden behind preset)
+- [ ] RAM warning banners (`--low-ram` → `enable_gradient_checkpointing`)
+
+### Phase 4 — V2V / IC-LoRA training
+
+- [ ] Reference video upload + reference latent preprocess path
+- [ ] `video_to_video` strategy UI
+- [ ] Validation with `reference_videos`
+
+### Phase 5 — Polish
+
+- [ ] Resume from checkpoint (`model.load_checkpoint`)
+- [ ] W&B optional (`wandb` extra)
+- [ ] MCP tool `ltx_train_lora` for agents (optional)
+
+---
+
+## Risks and constraints
+
+| Risk | Mitigation |
+|------|------------|
+| OOM during train | Default to q8 model path; expose checkpointing; block train if free RAM estimate low |
+| Train + generate concurrent | Global `mlx_busy` lock shared with `LocalVideoGenerator` |
+| Preprocess partial HF download | Use same resolved `model_path` as inference (full snapshot already cached) |
+| V2V reference latents | Defer to Phase 4; document manual preprocess steps until automated |
+| Long jobs lost on server restart | Persist `status.json`; optional resume; warn user |
+| Alpha trainer API | Pin v0.14.12; thin adapter layer in `ltx_train_backend.py` |
+
+---
+
+## Testing strategy
+
+1. **Unit** (no MLX): config builder, path layout, `_sync` job state, YAML merge.
+2. **Integration** (MLX machine): preprocess 2 clips → 50-step T2V LoRA → load in generator.
+3. **Manual**: `/train` SSE progress, cancel mid-run, register LoRA preset.
+
+Upstream tests to mirror behavior: `tests/test_trainer_core.py`, `tests/test_trainer_datasets.py`.
+
+---
+
+## Open questions (decide before Phase 2)
+
+1. **Default base weights for training** — always `transformer-dev.safetensors` (inference-compatible CFG pipelines) vs distilled (faster but different inference path)?
+2. **Dataset size limits** — cap uploads (e.g. 2 GB / 50 clips) for Web UI?
+3. **Separate process** — run trainer in child process for crash isolation vs in-process thread?
+4. **Standalone `web_server.py`** — should `/train` work without full `server.py` WS stack? (Recommend: yes, train-only via FastAPI.)
+
+---
+
+## Immediate next step
+
+Implement **Phase 1** on `training` branch: routing, `ltx_train_backend.py` skeleton, preprocess job API, minimal `/train` UI.

From 2a5cccfada13b5d0efcf4b31ef3d3bbf90831239 Mon Sep 17 00:00:00 2001
From: Lorenzo Mangani <lorenzo.mangani@gmail.com>
Date: Thu, 18 Jun 2026 20:00:24 +0200
Subject: [PATCH 2/7] Expand TRAIN.md with input requirements and SSE job
 wiring plan.

Documents slice/preprocess/train inputs, preset matrix, TrainJob phases,
and how to mirror generation SSE for long-running background training.

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 TRAIN.md | 299 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 299 insertions(+)

diff --git a/TRAIN.md b/TRAIN.md
index d507b1e..5274631 100644
--- a/TRAIN.md
+++ b/TRAIN.md
@@ -259,3 +259,302 @@ Upstream tests to mirror behavior: `tests/test_trainer_core.py`, `tests/test_tra
 ## Immediate next step
 
 Implement **Phase 1** on `training` branch: routing, `ltx_train_backend.py` skeleton, preprocess job API, minimal `/train` UI.
+
+---
+
+## Training inputs (what ltx-2-mlx accepts & requires)
+
+Training has three stages: slicing is optional, while preprocessing and training are required. **Train** only consumes **preprocessed** data; everything before that is dataset prep.
+
+### Stage A — Slice (optional)
+
+**API:** `ltx_trainer_mlx.slice_clips.slice_videos` · **Requires:** `ffmpeg` on PATH
+
+| Input | Required | Notes |
+|-------|----------|-------|
+| `sources` | yes | One or more video files or directories |
+| `out_dir` | yes | Per-source subfolders of clips |
+| `interval` | no (default 4s) | Clip length; ignored if `timecodes_file` set |
+| `timecodes_file` | no | `start,end` per line |
+| `res` | no (default `384x384`) | `WxH`, both **÷32** |
+| `fps` | no (default 24) | Output fps |
+| `fit` | no | `crop` or `pad` |
+| `min_length` | no | Drop clips shorter than N seconds |
+| `max_clips` / `sample` | no | Cap + even/sequential sampling |
+| `skip_start` / `skip_end` | no | Trim intros/outros |
+| `caption_template` | no | Writes identical `.txt` beside each clip |
+| `crf` | no | x264 quality |
+
+**Outputs:** `clip_XXX.mp4` + optional `clip_XXX.txt` (caption seed for editing).
+
+---
+
+### Stage B — Preprocess (required before train)
+
+**API:** `ltx_trainer_mlx.preprocess.preprocess_dataset` · **Requires:** local MLX model dir, Gemma (HF id)
+
+| Input | Required | Notes |
+|-------|----------|-------|
+| `videos_dir` | yes | `.mp4/.mov/.avi/.mkv/.webm`; recursive (slice subfolders OK) |
+| `output_dir` | yes | Creates `output_dir/.precomputed/` |
+| `model_dir` | yes | **Local path** to MLX snapshot (encoders only; partial HF download OK in v0.14.12) |
+| `gemma_model_id` | no | Default `mlx-community/gemma-3-12b-it-4bit` |
+| `target_height` / `target_width` | no | **÷32**; default = native per clip |
+| `max_frames` | no | Default 97; must be **8k+1** |
+| `captions_dir` | no | `.txt` per video stem; else **filename stem** used as prompt |
+| `caption_ext` | no | Default `.txt` |
+| `with_audio` | no | Adds `audio_latents/`; **required** if training with `generate_audio: true` |
+| `frame_rate` | no | Written into latent metadata; default = probed fps |
+
+**Outputs:**
+
+```
+<preprocessed>/.precomputed/
+  latents/latent_0000.safetensors       # video VAE latent + dims/fps metadata
+  conditions/condition_0000.safetensors # Gemma video+audio prompt embeds
+  audio_latents/latent_0000.safetensors # optional; same index as video latent
+```
+
+**V2V add-on (Phase 4):** IC-LoRA also needs `reference_latents/latent_XXXX.safetensors` paired by index (separate encode pass — not in basic `preprocess_dataset` today; manual or custom script per upstream `lora_v2v.yaml`).
+
+---
+
+### Stage C — Train (LoRA / full)
+
+**API:** `LtxvTrainer(LtxTrainerConfig).train(step_callback=…)` · **Requires:** `ltx-trainer-mlx`, preprocessed data, **local** `model.model_path`
+
+#### Hard requirements (`LtxTrainerConfig` validation)
+
+| Field | Required | Notes |
+|-------|----------|-------|
+| `model.model_path` | yes | Existing **local** directory (not URL) |
+| `model.text_encoder_path` | yes* | Gemma id/path; *skipped if no validation prompts |
+| `model.training_mode` | yes | `lora` (UI default) or `full` |
+| `lora` block | yes if `lora` mode | `rank`, `alpha`, `dropout`, `target_modules` |
+| `data.preprocessed_data_root` | yes | Path to dataset root (parent of `.precomputed`) |
+| `training_strategy.name` | yes | `text_to_video` or `video_to_video` |
+| `optimization.steps` | yes | Often 1000–3000+ |
+| `output_dir` | yes | Checkpoints + validation MP4s |
+
+#### Common optional / preset fields
+
+| Field | Purpose |
+|-------|---------|
+| `model.transformer_file` | e.g. `transformer-dev.safetensors` (AV/style LoRAs) |
+| `model.load_checkpoint` | Resume from prior checkpoint dir/file |
+| `training_strategy.generate_audio` | Joint AV training (needs audio latents) |
+| `optimization.enable_gradient_checkpointing` | **Required** on 64 GB for dev-base; maps to CLI `--low-ram` |
+| `optimization.batch_size`, `gradient_accumulation_steps`, `learning_rate`, schedulers | Standard training knobs |
+| `validation.*` | Prompts, `video_dims` (W,H,F), `interval`, `inference_steps`, `reference_videos` (V2V) |
+| `checkpoints.interval` / `keep_last_n` | Intermediate `.safetensors` |
+| `flow_matching.timestep_sampling_mode` | Default `shifted_logit_normal` |
+| `seed` | Reproducibility |
+| `wandb.*` / `hub.*` | Off by default in UI |
+
+#### Strategy matrix (what we wire first)
+
+| Preset | `training_strategy` | Preprocess | `transformer_file` | RAM hint |
+|--------|---------------------|------------|----------------------|----------|
+| **T2V style** | `text_to_video`, `generate_audio: false` | standard | auto (distilled OK) | 32–48 GB |
+| **AV style** | `text_to_video`, `generate_audio: true` | `--with-audio` | `transformer-dev.safetensors` | 64 GB + checkpointing |
+| **IC-LoRA V2V** | `video_to_video` (LoRA only) | standard + `reference_latents/` | TBD (per upstream `lora_v2v.yaml`) | TBD (deferred to Phase 4) |
+
+#### Trainer outputs
+
+- `output_dir/checkpoint-XXXX.safetensors` (LoRA weights)
+- `output_dir/validation_step_XXXX_*.mp4` (when `validation.interval` set)
+- `output_dir/config.yaml` (resolved config copy)
+- Final return: `(saved_path: Path, TrainingStats)` — steps/sec, peak GB, total time
+
+#### Trainer progress hooks (for our adapter)
+
+| Hook | Data available |
+|------|----------------|
+| `step_callback(global_step, total_steps, validation_paths)` | Step index, validation MP4 paths after val steps |
+| `TrainingProgress.update_training` | `loss`, `lr`, `step_time` (internal — we patch or subclass to expose) |
+| `disable_progress_bars=True` | Logs loss every 5 steps to logger (fallback) |
+
+**No built-in cancel** — cooperative cancel via `step_callback` raising `TrainingCancelledError` between steps.
+
+---
+
+## What we wire in ltx-ws (scope by preset)
+
+### Phase 1–2 UI fields → upstream mapping
+
+| UI control | Maps to |
+|------------|---------|
+| Upload videos + caption files | `videos_dir` (+ optional `captions_dir`) |
+| “Slice first” toggle + interval/res/fps/template | `slice_videos(...)` → `clips/` |
+| Model picker | `model_dir` = same resolved snapshot as inference (`state.active_model`) |
+| Resolution / max frames / with audio | `preprocess_dataset(...)` |
+| Preset: T2V / AV | Load embedded YAML template → override paths & steps |
+| Rank, steps, LR, val interval, val prompts | `LtxTrainerConfig` overrides |
+| Low RAM toggle | `optimization.enable_gradient_checkpointing=true` |
+| Run name | `output_dir` subfolder + preset label |
+
+### Phase 4 additions (V2V)
+
+| UI control | Maps to |
+|------------|---------|
+| Reference video per target clip | `reference_latents/` preprocess + `validation.reference_videos` |
+| `reference_downscale_factor` | validation config |
+
+### Out of scope for v1 UI (CLI / advanced YAML only)
+
+- `training_mode: full` (full fine-tune)
+- W&B / Hub push
+- Custom `target_modules` (expose in “Advanced YAML” panel later)
+- Timecode-list slicing
+
+---
+
+## Long-running background jobs & client updates
+
+Reuse the **generation run pattern** in `web_ui.py` — it already solves queueing, SSE, cancel, and persistence. Training jobs are longer and multi-phase but fit the same model.
+
+### Job model: `TrainJob` (extends run concepts)
+
+One **`job_id`** spans all phases (not three separate IDs):
+
+```text
+phase: queued → slicing → preprocessing → training → done | failed | cancelled
+```
+
+Persisted to `web_outputs/train/<job_id>/status.json` (+ index in `settings.json`).
+
+```json
+{
+  "job_id": "...",
+  "phase": "training",
+  "preset": "t2v",
+  "created_at": "...",
+  "step": 420,
+  "total_steps": 3000,
+  "loss": 0.0842,
+  "lr": 0.00035,
+  "eta_s": 3600,
+  "peak_memory_gb": 28.4,
+  "validation_clips": [{"step": 400, "url": "/api/train/jobs/.../validation/400_0.mp4"}],
+  "artifact_lora": "/api/train/jobs/.../lora.safetensors",
+  "error": null
+}
+```
+
+### Worker architecture
+
+```
+┌─────────────────────────────────────────────────────────┐
+│  FastAPI (web_ui)                                       │
+│  POST /api/train/jobs  → enqueue job_id                 │
+│  GET  /api/train/jobs/{id}/events  → SSE (EventSource)  │
+└───────────────────────┬─────────────────────────────────┘
+                        │
+        ┌───────────────▼───────────────┐
+        │  _train_worker_loop (async)    │  ← mirror _worker_loop
+        │  asyncio.Queue[job_id]         │
+        └───────────────┬───────────────┘
+                        │ asyncio.to_thread()
+        ┌───────────────▼───────────────┐
+        │  ltx_train_backend.py          │
+        │  · slice_videos (ffmpeg)       │
+        │  · preprocess_dataset (MLX)    │
+        │  · LtxvTrainer.train (MLX)     │
+        └───────────────────────────────┘
+```
+
+**MLX exclusivity:** shared `AppState.mlx_busy: asyncio.Lock` — training and generation cannot overlap (same as today’s single gen executor). `POST /api/generate` returns 409 if train active; `POST /api/train/jobs` returns 409 if gen active.
+
+**Threading:** MLX training blocks the GIL/Metal for minutes–hours; run entire `slice` / `preprocess` / `train` in **`asyncio.to_thread()`** (or dedicated `ThreadPoolExecutor(max_workers=1)`), same as LoRA downloads. Main asyncio loop stays responsive for SSE pings.
+
+### SSE event schema (mirror `/api/runs/{id}/events`)
+
+Client uses **`EventSource`** on `/api/train/jobs/{job_id}/events` (same as `subscribeRun` in `App.tsx`). Optional later: WS `train_progress` for raw `server.py` clients.
+
+| Event `type` | When | Payload |
+|--------------|------|---------|
+| `job_started` | Job dequeued | `job_id`, `preset`, `phases` |
+| `phase_started` | slice / preprocess / train begin | `phase`, `message` |
+| `phase_progress` | preprocess clip N/M | `phase`, `current`, `total`, `message` |
+| `train_step` | each optim step | `step`, `total`, `loss`, `lr`, `step_time_s`, `eta_s`, `peak_memory_gb` |
+| `train_validation` | val interval | `step`, `videos: [{url, prompt}]` |
+| `train_checkpoint` | checkpoint saved | `step`, `path` |
+| `ping` | 120s idle | `{}` |
+| `job_done` | success | `artifact_lora`, `stats`, `register_lora_url` |
+| `error` | failure | `message`, `phase` |
+| `job_complete` | always (finally) | `job_id`, `status` |
+
+**Loss streaming:** wrap `TrainingProgress.update_training` in `ltx_train_backend.py` to push `loss`/`lr` into a thread-safe queue drained by the training thread’s `step_callback`. Avoid duplicating the 200-line train loop.
+
+**Cancel:** `POST /api/train/jobs/{id}/cancel` sets `job.cancelled=True`; `step_callback` checks flag and raises `TrainingCancelledError` → `phase: cancelled`, emit `job_complete`.
+
+**Reconnect:** SSE handler replays `status.json` snapshot then attaches to live queue (same pattern as completed runs in `run_events`).
+
+### Frontend (`/train`)
+
+- `subscribeTrainJob(jobId)` — clone of `subscribeRun` with `train_step` / `train_validation` handlers
+- Progress bar: reuse `formatProgressMessage` / `formatMmSs` from `progress.ts`
+- Phase stepper: Slice → Preprocess → Train
+- Validation gallery: thumbnails from `train_validation` events
+- **Background-friendly:** user can navigate away; job continues; reconnect via job list + SSE
+- Header badge: “Training step 420/3000” when job active (poll `/api/train/jobs/active` or keep SSE open globally)
+
+### WebSocket (optional Phase 2b)
+
+For `server.py` WS clients (videofentanyl), add message types parallel to generation:
+
+- `train_job_status` — polled or pushed during training
+- Not required for Web UI (SSE is enough and already works through Vite proxy)
+
+---
+
+## Minimal API surface (revised)
+
+| Method | Path | Purpose |
+|--------|------|---------|
+| GET | `/api/train/health` | `ltx_trainer_mlx` installed, ffmpeg, model path resolved |
+| GET | `/api/train/presets` | T2V / AV templates + field metadata |
+| POST | `/api/train/jobs` | Create job: uploads refs OR multipart in same request |
+| GET | `/api/train/jobs` | List jobs (active + history) |
+| GET | `/api/train/jobs/{id}` | `status.json` snapshot |
+| GET | `/api/train/jobs/{id}/events` | **SSE** progress stream |
+| POST | `/api/train/jobs/{id}/cancel` | Cooperative cancel |
+| GET | `/api/train/jobs/{id}/artifacts/{name}` | LoRA, validation MP4s |
+| POST | `/api/train/jobs/{id}/register-lora` | → existing custom LoRA preset |
+
+Single **`POST /api/train/jobs`** body (Phase 2):
+
+```json
+{
+  "preset": "t2v",
+  "name": "my_style_lora",
+  "slice": { "enabled": false },
+  "preprocess": { "width": 704, "height": 480, "max_frames": 97, "with_audio": false, "frame_rate": 24 },
+  "train": { "steps": 2000, "rank": 64, "learning_rate": 5e-4, "validation_prompts": ["..."], "validation_interval": 500, "checkpoint_interval": 500, "low_ram": false },
+  "video_paths": ["uploaded-id-1", "uploaded-id-2"],
+  "caption_paths": ["uploaded-id-1.txt"]
+}
+```
+
+---
+
+## Revised implementation phases
+
+### Phase 1 — Job shell + preprocess
+- `TrainJob` + worker queue + SSE skeleton
+- Multipart upload → `web_outputs/train/<job_id>/raw/`
+- Preprocess phase only; `phase_progress` events
+
+### Phase 2 — T2V training end-to-end
+- `LtxvTrainer` wrapper + `train_step` / `train_validation` SSE
+- Cancel + `status.json` persistence
+- Register LoRA → inference presets
+- MLX lock vs generation
+
+### Phase 3 — Slice + AV preset
+- Slice phase in job pipeline
+- `with_audio` + `lora_av` simplified preset
+
+### Phase 4 — V2V
+- Reference latent preprocess + validation reference videos
+

From e7cbf9f98511189ef4b9e4aea7de39057978d806 Mon Sep 17 00:00:00 2001
From: Lorenzo Mangani <lorenzo.mangani@gmail.com>
Date: Thu, 18 Jun 2026 20:08:49 +0200
Subject: [PATCH 3/7] Use durable local paths for training; prefer HF hub cache
 for weights.

Drop /tmp placeholders from train presets; inject paths under web_outputs/train/.
Resolve MLX weights from local dirs, VIDEOFENTANYL_MODELS, or HF hub cache
before downloading. Copy finished LoRAs to repo loras/ or VIDEOFENTANYL_LORA_DIR.

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 TRAIN.md                    |  10 +
 ltx_mlx_backend.py          |  48 ++++
 ltx_train_backend.py        | 502 ++++++++++++++++++++++++++++++++++++
 requirements.txt            |   4 +
 train_configs/lora_av.yaml  |  53 ++++
 train_configs/lora_t2v.yaml |  56 ++++
 6 files changed, 673 insertions(+)
 create mode 100644 ltx_train_backend.py
 create mode 100644 train_configs/lora_av.yaml
 create mode 100644 train_configs/lora_t2v.yaml

diff --git a/TRAIN.md b/TRAIN.md
index 5274631..a54e2a0 100644
--- a/TRAIN.md
+++ b/TRAIN.md
@@ -423,6 +423,16 @@ phase: queued → slicing → preprocessing → training → done | failed | can
 
 Persisted to `web_outputs/train/<job_id>/status.json` (+ index in `settings.json`).
 
+**Storage policy (no `/tmp`):**
+
+| Asset | Location |
+|-------|----------|
+| Uploads, clips, preprocessed latents, checkpoints, validation MP4s | `<web_outputs>/train/<job_id>/` |
+| Base MLX weights (preprocess + train) | Local path, `$VIDEOFENTANYL_MODELS` / `<repo>/models/`, or existing **HF hub cache** (`HF_HOME` / `~/.cache/huggingface`) via `resolve_mlx_weights_directory` |
+| Finished LoRA for inference | Copied to `$VIDEOFENTANYL_LORA_DIR` or `<repo>/loras/` when registered |
+
+Preset YAML files under `train_configs/` hold **hyperparameters only** — paths are injected at job start.
+
 ```json
 {
   "job_id": "...",
diff --git a/ltx_mlx_backend.py b/ltx_mlx_backend.py
index 06d0ec9..b038533 100644
--- a/ltx_mlx_backend.py
+++ b/ltx_mlx_backend.py
@@ -141,6 +141,50 @@ def _model_snapshot_present(dest: Path) -> bool:
     return bool(has_config and has_weights)
 
 
+def _hf_hub_cache_roots() -> list[Path]:
+    """Return candidate Hugging Face *hub cache* directories (the dirs that hold ``models--*``).
+
+    Order: ``HF_HUB_CACHE`` / legacy ``HUGGINGFACE_HUB_CACHE`` (already the hub dir, used as-is),
+    then ``$HF_HOME/hub``, ``$XDG_CACHE_HOME/huggingface/hub`` and ``~/.cache/huggingface/hub``.
+    Paths are resolved and de-duplicated in order; they are not checked for existence.
+    """
+    candidates: list[Path] = []
+    hub_cache = os.environ.get("HF_HUB_CACHE", "").strip()
+    hub_cache = hub_cache or os.environ.get("HUGGINGFACE_HUB_CACHE", "").strip()
+    if hub_cache:
+        candidates.append(Path(hub_cache).expanduser().resolve())
+    hf_home = os.environ.get("HF_HOME", "").strip()
+    if hf_home:
+        candidates.append((Path(hf_home).expanduser() / "hub").resolve())
+    xdg = os.environ.get("XDG_CACHE_HOME", "").strip()
+    if xdg:
+        candidates.append((Path(xdg).expanduser() / "huggingface" / "hub").resolve())
+    candidates.append((Path.home() / ".cache" / "huggingface" / "hub").resolve())
+    return list(dict.fromkeys(candidates))  # order-preserving de-duplication
+
+
+def find_hf_hub_snapshot(repo_id: str) -> Path | None:
+    """Return the most recently modified complete snapshot dir of ``repo_id`` in any hub cache, or ``None``."""
+    slug = repo_id.strip().replace("/", "--")  # cache folders are named models--<org>--<name>
+    best: Path | None = None
+    best_mtime = -1.0
+    for hub_dir in _hf_hub_cache_roots():
+        snaps_dir = hub_dir / f"models--{slug}" / "snapshots"
+        if not snaps_dir.is_dir():
+            continue
+        try:
+            for snap in snaps_dir.iterdir():
+                if not snap.is_dir() or not _model_snapshot_present(snap):
+                    continue  # skip stray files and snapshots lacking config or weights
+                mtime = snap.stat().st_mtime
+                if mtime >= best_mtime:
+                    best_mtime = mtime
+                    best = snap.resolve()
+        except OSError:
+            continue  # unreadable cache location: best-effort, move on to the next one
+    return best
+
+
 def hf_local_weights_directory(repo_id: str, explicit_model_dir: str | None) -> Path:
     """
     Directory where we store a full ``snapshot_download`` for ``repo_id``.
@@ -253,6 +297,10 @@ def resolve_mlx_weights_directory(model: str, explicit_model_dir: str | None) ->
                 "Install with:  pip install huggingface_hub\n"
                 "Or use a local directory for --model."
             ) from e
+        hub_snap = find_hf_hub_snapshot(raw)
+        if hub_snap is not None:
+            log.info("Using Hugging Face hub cache snapshot for %r at %s", raw, hub_snap)
+            return str(hub_snap)
         dest = hf_local_weights_directory(raw, explicit_model_dir)
         dest.mkdir(parents=True, exist_ok=True)
         if _model_snapshot_present(dest):
diff --git a/ltx_train_backend.py b/ltx_train_backend.py
new file mode 100644
index 0000000..04d1d81
--- /dev/null
+++ b/ltx_train_backend.py
@@ -0,0 +1,502 @@
+# SPDX-License-Identifier: Apache-2.0
+"""MLX LoRA training adapter for ltx-trainer-mlx (optional dependency).
+
+Storage policy (no ``/tmp``):
+- Per-job artifacts live under ``<web_outputs>/train/<job_id>/`` (raw uploads, clips,
+  preprocessed latents, checkpoints, validation MP4s).
+- Base MLX weights resolve via :func:`resolve_mlx_weights_directory` — explicit local
+  path, ``$VIDEOFENTANYL_MODELS`` / ``<repo>/models/``, or an existing Hugging Face
+  hub cache snapshot before downloading.
+- Finished LoRAs copied for inference via :func:`register_trained_lora` into
+  ``$VIDEOFENTANYL_LORA_DIR`` or ``<repo>/loras/``.
+"""
+
+from __future__ import annotations
+
+import logging
+import re
+import shutil
+import time
+from contextlib import contextmanager
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import Any, Callable
+
+import yaml
+
+from ltx_mlx_backend import (
+    LTX2_MLX_GIT_TAG,
+    _local_lora_cache_dir,
+    _nearest_valid_frames,
+    resolve_mlx_weights_directory,
+)
+
+log = logging.getLogger("ltx_train")
+
+REPO_ROOT = Path(__file__).resolve().parent
+TRAIN_CONFIGS_DIR = REPO_ROOT / "train_configs"
+DEFAULT_GEMMA = "mlx-community/gemma-3-12b-it-4bit"
+
+TRAINER_INSTALL_HINT = (
+    f'uv pip install "ltx-trainer-mlx @ git+https://github.com/dgrauet/ltx-2-mlx.git@{LTX2_MLX_GIT_TAG}'
+    f'#subdirectory=packages/ltx-trainer"'
+)
+
+
+class TrainingCancelledError(Exception):
+    """Raised by ``_check_cancel`` when the job's ``should_cancel`` callback returns true."""
+
+
+@dataclass
+class TrainPresetInfo:
+    id: str  # matches the entry's key in TRAIN_PRESETS
+    label: str  # short human-readable name
+    description: str
+    ram_hint: str  # free-text memory guidance, e.g. "64 GB recommended"
+    with_audio: bool  # drives generate_audio in the trainer config and enables audio preprocessing
+    low_ram_default: bool  # presumably the UI default for TrainHyperparams.low_ram — TODO confirm
+
+
+TRAIN_PRESETS: dict[str, TrainPresetInfo] = {  # lookups in this module fall back to "t2v"
+    "t2v": TrainPresetInfo(
+        id="t2v",
+        label="Text-to-video style",
+        description="Video-only LoRA on the default distilled/dev stack.",
+        ram_hint="32–48 GB unified memory",
+        with_audio=False,
+        low_ram_default=False,
+    ),
+    "av": TrainPresetInfo(
+        id="av",
+        label="Audio + video style",
+        description="Joint AV LoRA (whisper/ASMR-style); dev transformer + checkpointing.",
+        ram_hint="64 GB recommended",
+        with_audio=True,
+        low_ram_default=True,
+    ),
+}
+
+
+@dataclass
+class SliceOptions:
+    enabled: bool = False  # run_train_job only calls slice_videos when this is true
+    interval: float = 4.0  # forwarded as slice_videos(interval=...)
+    res: str = "384x384"  # forwarded as slice_videos(res=...)
+    fps: float = 24.0  # forwarded as slice_videos(fps=...)
+    fit: str = "crop"  # forwarded as slice_videos(fit=...)
+    caption_template: str | None = None  # forwarded unchanged to slice_videos
+    max_clips: int | None = None  # forwarded unchanged to slice_videos
+
+
+@dataclass
+class PreprocessOptions:
+    width: int | None = 704  # preprocess target_width (None when falsy); validation falls back to 704
+    height: int | None = 480  # preprocess target_height (None when falsy); validation falls back to 480
+    max_frames: int = 97  # passed through _nearest_valid_frames() before use
+    with_audio: bool = False  # OR-ed with the preset's with_audio in run_train_job
+    frame_rate: float | None = 24.0  # None/0 passes frame_rate=None to preprocess_dataset
+
+
+@dataclass
+class TrainHyperparams:
+    steps: int = 2000  # written to optimization.steps
+    rank: int = 64  # written to both lora.rank and lora.alpha
+    learning_rate: float = 5e-4  # written to optimization.learning_rate
+    validation_prompts: list[str] = field(default_factory=lambda: ["a cinematic landscape at sunset"])
+    validation_interval: int = 500  # written to validation.interval
+    checkpoint_interval: int = 500  # written to checkpoints.interval
+    low_ram: bool = False  # when true, forces optimization.enable_gradient_checkpointing on
+    seed: int = 42  # written to the top-level config seed
+
+
+@dataclass
+class TrainJobRequest:
+    preset: str  # selects train_configs/lora_<preset>.yaml and the TRAIN_PRESETS entry
+    name: str  # recorded in the job status as "name"
+    model_id: str  # first argument to resolve_mlx_weights_directory()
+    model_dir: str | None  # second argument to resolve_mlx_weights_directory()
+    slice: SliceOptions = field(default_factory=SliceOptions)
+    preprocess: PreprocessOptions = field(default_factory=PreprocessOptions)
+    train: TrainHyperparams = field(default_factory=TrainHyperparams)
+
+
+EventCallback = Callable[[dict[str, Any]], None]
+CancelCheck = Callable[[], bool]
+
+
+def trainer_available() -> bool:
+    """Report whether the optional ``ltx_trainer_mlx`` package can be imported."""
+    try:
+        import ltx_trainer_mlx  # noqa: F401
+    except ImportError:
+        return False
+    return True
+
+
+def trainer_health(*, ffmpeg_required: bool = False) -> dict[str, Any]:
+    installed = trainer_available()
+    has_ffmpeg = shutil.which("ffmpeg") is not None
+    return {
+        "ok": installed and (has_ffmpeg or not ffmpeg_required),  # ffmpeg only counts when required
+        "trainer_installed": installed,
+        "ffmpeg_available": has_ffmpeg,
+        "install_hint": TRAINER_INSTALL_HINT if not installed else None,
+        "presets": [vars(preset) for preset in TRAIN_PRESETS.values()],
+        "configs_dir": str(TRAIN_CONFIGS_DIR),
+    }
+
+
+def job_root(output_dir: Path, job_id: str) -> Path:
+    return output_dir.resolve().joinpath("train", job_id)  # <output_dir>/train/<job_id>
+
+
+@dataclass(frozen=True)
+class TrainJobPaths:
+    """Filesystem layout of one training job; ``config`` is a file, the rest are directories."""
+
+    root: Path
+    raw: Path
+    clips: Path
+    captions: Path
+    preprocessed: Path
+    outputs: Path
+    config: Path
+
+    def ensure_dirs(self) -> None:
+        for name in ("root", "raw", "clips", "captions", "preprocessed", "outputs"):
+            getattr(self, name).mkdir(parents=True, exist_ok=True)
+
+
+def training_job_paths(output_dir: Path, job_id: str) -> TrainJobPaths:
+    """Build the ``TrainJobPaths`` layout rooted at ``job_root(output_dir, job_id)``."""
+    base = job_root(output_dir, job_id)
+    # Every artifact directory is a direct child of the job root, named after its field.
+    subdirs = {
+        name: base / name
+        for name in ("raw", "clips", "captions", "preprocessed", "outputs")
+    }
+    # The trainer config file sits next to those directories.
+    layout = TrainJobPaths(root=base, config=base / "config.yaml", **subdirs)
+    return layout
+
+
+def register_trained_lora(lora_path: Path, *, name: str) -> Path:
+    """Copy ``lora_path`` into the local LoRA cache as ``<sanitized name>.safetensors``."""
+    source = lora_path.expanduser().resolve()
+    if not source.is_file():
+        raise FileNotFoundError(f"LoRA weights not found: {source}")
+    cache_dir = _local_lora_cache_dir()
+    cache_dir.mkdir(parents=True, exist_ok=True)
+    cleaned = re.sub(r"[^\w.\-]+", "_", (name or "trained_lora").strip()).strip("._")
+    target = (cache_dir / f"{cleaned or 'trained_lora'}.safetensors").resolve()
+    shutil.copy2(source, target)  # an existing file with the same name is overwritten
+    return target
+
+
+def status_path(output_dir: Path, job_id: str) -> Path:
+    return job_root(output_dir, job_id).joinpath("status.json")  # status file inside the job root
+
+
+def load_status(output_dir: Path, job_id: str) -> dict[str, Any] | None:
+    """Read the job's ``status.json``; ``None`` when it is missing or cannot be parsed."""
+    import json
+    status_file = status_path(output_dir, job_id)
+    if status_file.is_file():
+        try:
+            return json.loads(status_file.read_text(encoding="utf-8"))
+        except (OSError, ValueError):
+            return None
+    return None
+
+
+def save_status(output_dir: Path, job_id: str, payload: dict[str, Any]) -> None:
+    """Serialize ``payload`` as indented JSON into the job's ``status.json``."""
+    import json
+
+    job_root(output_dir, job_id).mkdir(parents=True, exist_ok=True)
+    target = status_path(output_dir, job_id)
+    target.write_text(json.dumps(payload, indent=2), encoding="utf-8")
+
+
+def _preset_yaml_path(preset: str) -> Path:
+    """Locate ``lora_<preset>.yaml``, falling back to ``lora_t2v.yaml`` when it is absent."""
+    normalized = (preset or "t2v").strip().lower()
+    for filename in (f"lora_{normalized}.yaml", "lora_t2v.yaml"):
+        candidate = TRAIN_CONFIGS_DIR / filename
+        if candidate.is_file():
+            return candidate
+    raise FileNotFoundError(f"No training preset config for {preset!r}")
+
+
+def build_trainer_config(req: TrainJobRequest, *, paths: TrainJobPaths) -> Any:
+    """Merge the preset YAML with ``req`` overrides and return an ``LtxTrainerConfig``."""
+    from ltx_trainer_mlx.config import LtxTrainerConfig
+
+    raw = yaml.safe_load(_preset_yaml_path(req.preset).read_text(encoding="utf-8")) or {}  # empty file → {}
+    model_path = resolve_mlx_weights_directory(req.model_id, req.model_dir)
+    preset_key = (req.preset or "t2v").strip().lower()  # same normalisation as _preset_yaml_path()
+    preset_info = TRAIN_PRESETS.get(preset_key, TRAIN_PRESETS["t2v"])
+
+    model_block = dict(raw.get("model") or {})
+    model_block["model_path"] = model_path
+    raw["model"] = model_block
+    raw["data"] = {"preprocessed_data_root": str(paths.preprocessed.resolve())}
+    raw["output_dir"] = str(paths.outputs.resolve())
+    raw["seed"] = int(req.train.seed)
+
+    optim = raw.get("optimization") or {}  # tolerate a missing section, like the others below
+    optim["steps"] = int(req.train.steps)
+    optim["learning_rate"] = float(req.train.learning_rate)
+    if req.train.low_ram:
+        optim["enable_gradient_checkpointing"] = True
+    raw["optimization"] = optim
+
+    lora = raw.get("lora") or {}
+    lora["rank"] = int(req.train.rank)
+    lora["alpha"] = int(req.train.rank)  # alpha always mirrors the requested rank
+    raw["lora"] = lora
+
+    prompts = [str(p).strip() for p in req.train.validation_prompts if str(p).strip()]  # non-str safe
+    val = raw.get("validation") or {}
+    val["prompts"] = prompts or ["a cinematic landscape at sunset"]
+    val["interval"] = int(req.train.validation_interval)
+    val["skip_initial_validation"] = True
+    nf = _nearest_valid_frames(int(req.preprocess.max_frames))
+    val["video_dims"] = [int(req.preprocess.width or 704), int(req.preprocess.height or 480), nf]
+    val["frame_rate"] = float(req.preprocess.frame_rate or 24.0)
+    val["generate_audio"] = bool(preset_info.with_audio)
+    raw["validation"] = val
+
+    ckpt = raw.get("checkpoints") or {}
+    ckpt["interval"] = int(req.train.checkpoint_interval)
+    raw["checkpoints"] = ckpt
+
+    strat = raw.get("training_strategy") or {}
+    strat["generate_audio"] = preset_info.with_audio
+    raw["training_strategy"] = strat
+
+    return LtxTrainerConfig(**raw)
+
+
+@contextmanager
+def _metrics_hook(on_metrics: Callable[[dict[str, float]], None] | None):
+    """Temporarily wrap ``TrainingProgress.update_training`` to forward loss/lr/step time."""
+    if on_metrics is None:
+        yield
+        return
+    from ltx_trainer_mlx import progress as progress_mod
+
+    orig_update = progress_mod.TrainingProgress.update_training
+
+    def patched_update(
+        self,
+        *,
+        loss: float,
+        lr: float,
+        step_time: float,
+        advance: bool = True,
+    ) -> None:
+        orig_update(self, loss=loss, lr=lr, step_time=step_time, advance=advance)
+        if advance:
+            try:
+                on_metrics({"loss": float(loss), "lr": float(lr), "step_time_s": float(step_time)})
+            except Exception:  # best effort: a metrics consumer must never abort training
+                log.debug("on_metrics callback failed", exc_info=True)
+
+    progress_mod.TrainingProgress.update_training = patched_update  # type: ignore[method-assign]
+    try:
+        yield
+    finally:
+        progress_mod.TrainingProgress.update_training = orig_update  # type: ignore[method-assign]
+
+
+def _check_cancel(should_cancel: CancelCheck | None) -> None:
+    if should_cancel and should_cancel():  # a missing callback means nothing can request a cancel
+        raise TrainingCancelledError("Training cancelled")
+
+
+def run_train_job(
+    req: TrainJobRequest,
+    *,
+    output_dir: Path,
+    job_id: str,
+    on_event: EventCallback | None = None,
+    should_cancel: CancelCheck | None = None,
+) -> dict[str, Any]:
+    """Run slice → preprocess → train for one job, mirroring progress into ``status.json``."""
+    if not trainer_available():
+        raise RuntimeError(f"ltx-trainer-mlx is not installed. {TRAINER_INSTALL_HINT}")
+
+    paths = training_job_paths(output_dir, job_id)
+    paths.ensure_dirs()
+
+    status: dict[str, Any] = {
+        "job_id": job_id,
+        "name": req.name,
+        "preset": req.preset,
+        "phase": "queued",
+        "created_at": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()),
+        "step": 0,
+        "total_steps": int(req.train.steps),
+        "job_dir": str(paths.root),
+        "model_path": None,
+        "error": None,
+    }
+    save_status(output_dir, job_id, status)
+
+    def emit(event: dict[str, Any]) -> None:
+        # Mirror every event field except "type" into the status file, then notify the listener.
+        status.update({k: v for k, v in event.items() if k != "type"})
+        save_status(output_dir, job_id, status)
+        if on_event:
+            on_event(event)
+
+    videos_dir = paths.raw
+    captions_dir: str | None = None
+
+    try:
+        if req.slice.enabled:
+            _check_cancel(should_cancel)
+            status["phase"] = "slicing"
+            emit({"type": "phase_started", "phase": "slicing", "message": "Slicing source videos…"})
+            if not shutil.which("ffmpeg"):
+                raise RuntimeError("ffmpeg is required for slice")
+            from ltx_trainer_mlx.slice_clips import slice_videos
+
+            sources = sorted(
+                p
+                for p in paths.raw.iterdir()
+                if p.suffix.lower() in {".mp4", ".mov", ".avi", ".mkv", ".webm"}
+            )
+            if not sources:
+                raise ValueError("No video files found in upload")
+            count = slice_videos(
+                [str(p) for p in sources],
+                str(paths.clips),
+                interval=float(req.slice.interval),
+                res=str(req.slice.res),
+                fps=float(req.slice.fps),
+                fit=str(req.slice.fit),
+                caption_template=req.slice.caption_template,
+                max_clips=req.slice.max_clips,
+            )
+            emit({"type": "phase_progress", "phase": "slicing", "message": f"Created {count} clips"})
+            videos_dir = paths.clips  # preprocess the sliced clips instead of the raw uploads
+            captions_dir = None
+        else:
+            txts = list(paths.raw.glob("*.txt"))
+            if txts:
+                paths.captions.mkdir(parents=True, exist_ok=True)
+                for t in txts:
+                    shutil.copy2(t, paths.captions / t.name)
+                captions_dir = str(paths.captions)
+
+        _check_cancel(should_cancel)
+        status["phase"] = "preprocessing"
+        emit({"type": "phase_started", "phase": "preprocessing", "message": "Encoding latents…"})
+        from ltx_trainer_mlx.preprocess import preprocess_dataset
+
+        model_path = resolve_mlx_weights_directory(req.model_id, req.model_dir)
+        status["model_path"] = model_path
+        nf = _nearest_valid_frames(int(req.preprocess.max_frames))
+        preset_info = TRAIN_PRESETS.get(req.preset, TRAIN_PRESETS["t2v"])
+        with_audio = req.preprocess.with_audio or preset_info.with_audio
+        preprocess_dataset(
+            videos_dir=str(videos_dir),
+            output_dir=str(paths.preprocessed),
+            model_dir=model_path,
+            gemma_model_id=DEFAULT_GEMMA,
+            target_height=int(req.preprocess.height) if req.preprocess.height else None,
+            target_width=int(req.preprocess.width) if req.preprocess.width else None,
+            max_frames=nf,
+            captions_dir=captions_dir,
+            with_audio=with_audio,
+            frame_rate=float(req.preprocess.frame_rate) if req.preprocess.frame_rate else None,
+        )
+        emit({"type": "phase_progress", "phase": "preprocessing", "message": "Preprocess complete"})
+
+        _check_cancel(should_cancel)
+        status["phase"] = "training"
+        status["total_steps"] = int(req.train.steps)
+        emit({"type": "phase_started", "phase": "training", "message": "Training LoRA…"})
+
+        from ltx_trainer_mlx.trainer import LtxvTrainer
+
+        config = build_trainer_config(req, paths=paths)
+        paths.config.write_text(yaml.safe_dump(config.model_dump(mode="json")), encoding="utf-8")
+
+        train_t0 = time.time()
+        last_metrics: dict[str, float] = {}
+
+        def on_metrics(m: dict[str, float]) -> None:
+            last_metrics.update(m)
+
+        def step_callback(step: int, total: int, validation_paths: list) -> None:
+            _check_cancel(should_cancel)  # cooperative cancel point, checked on every callback
+            elapsed = max(time.time() - train_t0, 1e-6)
+            eta_s = (elapsed / max(step, 1)) * max(total - step, 0)
+            payload: dict[str, Any] = {
+                "type": "train_step",
+                "phase": "training",
+                "step": int(step),
+                "total_steps": int(total),
+                "eta_s": round(eta_s, 1),
+            }
+            if last_metrics:
+                payload.update(last_metrics)
+            emit(payload)
+            if validation_paths:
+                rels = []
+                for vp in validation_paths:
+                    p = Path(vp)
+                    try:
+                        rel = p.relative_to(paths.outputs)
+                    except ValueError:
+                        rel = Path(p.name)  # keep a Path: a bare str has no .as_posix() below
+                    rels.append(
+                        {
+                            "step": int(step),
+                            "filename": str(rel),
+                            "url": f"/api/train/jobs/{job_id}/artifacts/{rel.as_posix()}",
+                        }
+                    )
+                status.setdefault("validation_clips", []).extend(rels)
+                emit({"type": "train_validation", "step": int(step), "videos": rels})
+
+        with _metrics_hook(on_metrics):
+            trainer = LtxvTrainer(config)
+            saved_path, stats = trainer.train(
+                disable_progress_bars=True,
+                step_callback=step_callback,
+            )
+
+        lora_path = Path(saved_path)
+        artifact_url = f"/api/train/jobs/{job_id}/artifacts/{lora_path.name}"
+        status["phase"] = "done"
+        status["artifact_lora"] = str(lora_path)
+        status["artifact_url"] = artifact_url
+        status["stats"] = stats.model_dump() if hasattr(stats, "model_dump") else dict(stats)
+        save_status(output_dir, job_id, status)
+        emit(
+            {
+                "type": "job_done",
+                "artifact_url": artifact_url,
+                "artifact_name": lora_path.name,
+                "stats": status["stats"],
+            }
+        )
+        return status
+
+    except TrainingCancelledError:  # record the cancel in the status file, then re-raise
+        status["phase"] = "cancelled"
+        status["error"] = "Cancelled"
+        save_status(output_dir, job_id, status)
+        emit({"type": "error", "phase": status.get("phase"), "message": "Cancelled"})
+        raise
+    except Exception as exc:  # any other failure is logged, recorded and re-raised
+        log.exception("Train job %s failed", job_id)
+        status["phase"] = "failed"
+        status["error"] = str(exc)
+        save_status(output_dir, job_id, status)
+        emit({"type": "error", "phase": status.get("phase"), "message": str(exc)})
+        raise
diff --git a/requirements.txt b/requirements.txt
index b0f3f89..82dde9a 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -23,3 +23,7 @@ mcp>=1.0.0
 #
 # Tag must match ltx_mlx_backend.LTX2_MLX_GIT_TAG (v0.14.12).
 # Upstream: https://github.com/dgrauet/ltx-2-mlx
+#
+# Optional trainer (training branch / Web UI /train):
+#   uv pip install \
+#     "ltx-trainer-mlx @ git+https://github.com/dgrauet/ltx-2-mlx.git@v0.14.12#subdirectory=packages/ltx-trainer"
diff --git a/train_configs/lora_av.yaml b/train_configs/lora_av.yaml
new file mode 100644
index 0000000..dc879f6
--- /dev/null
+++ b/train_configs/lora_av.yaml
@@ -0,0 +1,53 @@
+# Hyperparameters only — paths injected at runtime (see lora_t2v.yaml header).
+model:
+  transformer_file: transformer-dev.safetensors
+  text_encoder_path: mlx-community/gemma-3-12b-it-4bit
+  training_mode: lora
+
+lora:
+  rank: 32
+  alpha: 32
+  dropout: 0.0
+  target_modules:
+    - audio_attn1
+    - audio_attn2
+    - video_to_audio_attn
+
+optimization:
+  learning_rate: 1.5e-4
+  steps: 2000
+  batch_size: 1
+  gradient_accumulation_steps: 1
+  max_grad_norm: 1.0
+  weight_decay: 0.0
+  enable_gradient_checkpointing: true
+  scheduler_type: linear
+  scheduler_params:
+    start_factor: 1.0
+    end_factor: 0.1
+
+training_strategy:
+  name: text_to_video
+  generate_audio: true
+
+flow_matching:
+  timestep_sampling_mode: shifted_logit_normal
+
+validation:
+  prompts:
+    - "a person speaking softly close to a microphone, intimate ASMR"
+  video_dims: [192, 192, 97]
+  frame_rate: 24.0
+  inference_steps: 8
+  interval: 200
+  guidance_scale: 4.0
+  stg_scale: 0.0
+  seed: 42
+  generate_audio: true
+  skip_initial_validation: true
+
+checkpoints:
+  interval: 100
+  keep_last_n: 10
+
+seed: 42
diff --git a/train_configs/lora_t2v.yaml b/train_configs/lora_t2v.yaml
new file mode 100644
index 0000000..eaf724d
--- /dev/null
+++ b/train_configs/lora_t2v.yaml
@@ -0,0 +1,56 @@
+# Hyperparameters only — paths are injected at runtime:
+#   model_path     → local dir / $VIDEOFENTANYL_MODELS / HF hub cache (resolve_mlx_weights_directory)
+#   preprocessed   → <web_outputs>/train/<job_id>/preprocessed
+#   output_dir     → <web_outputs>/train/<job_id>/outputs
+model:
+  text_encoder_path: mlx-community/gemma-3-12b-it-4bit
+  training_mode: lora
+
+lora:
+  rank: 64
+  alpha: 64
+  dropout: 0.0
+  target_modules:
+    - to_k
+    - to_q
+    - to_v
+    - to_out.0
+
+optimization:
+  learning_rate: 5.0e-4
+  steps: 2000
+  batch_size: 1
+  gradient_accumulation_steps: 1
+  max_grad_norm: 1.0
+  weight_decay: 0.0
+  enable_gradient_checkpointing: false
+  scheduler_type: linear
+  scheduler_params:
+    start_factor: 1.0
+    end_factor: 0.1
+
+training_strategy:
+  name: text_to_video
+  generate_audio: false
+
+flow_matching:
+  timestep_sampling_mode: shifted_logit_normal
+
+validation:
+  prompts:
+    - "a cinematic shot of a cat walking through a sunlit garden"
+  video_dims: [704, 480, 25]
+  frame_rate: 24.0
+  inference_steps: 8
+  interval: 500
+  guidance_scale: 4.0
+  stg_scale: 0.0
+  seed: 42
+  generate_audio: false
+  skip_initial_validation: true
+
+checkpoints:
+  interval: 500
+  keep_last_n: 3
+
+seed: 42

From 1ee5b5ac0393ff862324ffda5097cc99fe3c8229 Mon Sep 17 00:00:00 2001
From: Lorenzo Mangani <lorenzo.mangani@gmail.com>
Date: Thu, 18 Jun 2026 20:18:07 +0200
Subject: [PATCH 4/7] Add /train wizard UI and training job API with live SSE
 progress.

Wire web_train routes and worker queue into the Web UI, add a four-step LoRA training flow with preset picker, uploads, hyperparameters, and validation gallery plus one-click register to the generation library.

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 web/package-lock.json |  60 +++-
 web/package.json      |   3 +-
 web/src/App.tsx       |  12 +-
 web/src/Layout.tsx    |  25 ++
 web/src/TrainPage.tsx | 764 ++++++++++++++++++++++++++++++++++++++++++
 web/src/api/train.ts  | 113 +++++++
 web/src/index.css     | 648 +++++++++++++++++++++++++++++++++++
 web/src/main.tsx      |  12 +-
 web/src/types.ts      |  44 +++
 web_train.py          | 488 +++++++++++++++++++++++++++
 web_ui.py             |  15 +-
 11 files changed, 2172 insertions(+), 12 deletions(-)
 create mode 100644 web/src/Layout.tsx
 create mode 100644 web/src/TrainPage.tsx
 create mode 100644 web/src/api/train.ts
 create mode 100644 web_train.py

diff --git a/web/package-lock.json b/web/package-lock.json
index f728aab..d757da3 100644
--- a/web/package-lock.json
+++ b/web/package-lock.json
@@ -9,7 +9,8 @@
       "version": "0.1.0",
       "dependencies": {
         "react": "^19.1.0",
-        "react-dom": "^19.1.0"
+        "react-dom": "^19.1.0",
+        "react-router-dom": "^7.6.2"
       },
       "devDependencies": {
         "@types/react": "^19.1.0",
@@ -1318,6 +1319,19 @@
       "dev": true,
       "license": "MIT"
     },
+    "node_modules/cookie": {
+      "version": "1.1.1",
+      "resolved": "https://registry.npmjs.org/cookie/-/cookie-1.1.1.tgz",
+      "integrity": "sha512-ei8Aos7ja0weRpFzJnEA9UHJ/7XQmqglbRwnf2ATjcB9Wq874VKH9kfjjirM6UhU2/E5fFYadylyhFldcqSidQ==",
+      "license": "MIT",
+      "engines": {
+        "node": ">=18"
+      },
+      "funding": {
+        "type": "opencollective",
+        "url": "https://opencollective.com/express"
+      }
+    },
     "node_modules/csstype": {
       "version": "3.2.3",
       "resolved": "https://registry.npmjs.org/csstype/-/csstype-3.2.3.tgz",
@@ -1604,6 +1618,44 @@
         "node": ">=0.10.0"
       }
     },
+    "node_modules/react-router": {
+      "version": "7.18.0",
+      "resolved": "https://registry.npmjs.org/react-router/-/react-router-7.18.0.tgz",
+      "integrity": "sha512-pTTGt8J+ji1NOmYnjzT+bAJy/1zD+Jp4ziO6cL7T3ZLvXKtusO7BpFqlRXitqpcPVqllsIXFHRMt+2/k3Xn6HQ==",
+      "license": "MIT",
+      "dependencies": {
+        "cookie": "^1.0.1",
+        "set-cookie-parser": "^2.6.0"
+      },
+      "engines": {
+        "node": ">=20.0.0"
+      },
+      "peerDependencies": {
+        "react": ">=18",
+        "react-dom": ">=18"
+      },
+      "peerDependenciesMeta": {
+        "react-dom": {
+          "optional": true
+        }
+      }
+    },
+    "node_modules/react-router-dom": {
+      "version": "7.18.0",
+      "resolved": "https://registry.npmjs.org/react-router-dom/-/react-router-dom-7.18.0.tgz",
+      "integrity": "sha512-Fi0yY6kgtKae/Th2xibdWK0KSdYZ4B53Gyf6wRtomOKWgpNm7H7+DyfDhncdz9FKbpS+1jmDhg3F4WoGJ+yFOA==",
+      "license": "MIT",
+      "dependencies": {
+        "react-router": "7.18.0"
+      },
+      "engines": {
+        "node": ">=20.0.0"
+      },
+      "peerDependencies": {
+        "react": ">=18",
+        "react-dom": ">=18"
+      }
+    },
     "node_modules/rollup": {
       "version": "4.62.0",
       "resolved": "https://registry.npmjs.org/rollup/-/rollup-4.62.0.tgz",
@@ -1665,6 +1717,12 @@
         "semver": "bin/semver.js"
       }
     },
+    "node_modules/set-cookie-parser": {
+      "version": "2.7.2",
+      "resolved": "https://registry.npmjs.org/set-cookie-parser/-/set-cookie-parser-2.7.2.tgz",
+      "integrity": "sha512-oeM1lpU/UvhTxw+g3cIfxXHyJRc/uidd3yK1P242gzHds0udQBYzs3y8j4gCCW+ZJ7ad0yctld8RYO+bdurlvw==",
+      "license": "MIT"
+    },
     "node_modules/source-map-js": {
       "version": "1.2.1",
       "resolved": "https://registry.npmjs.org/source-map-js/-/source-map-js-1.2.1.tgz",
diff --git a/web/package.json b/web/package.json
index c9e5fba..7ba7a84 100644
--- a/web/package.json
+++ b/web/package.json
@@ -10,7 +10,8 @@
   },
   "dependencies": {
     "react": "^19.1.0",
-    "react-dom": "^19.1.0"
+    "react-dom": "^19.1.0",
+    "react-router-dom": "^7.6.2"
   },
   "devDependencies": {
     "@types/react": "^19.1.0",
diff --git a/web/src/App.tsx b/web/src/App.tsx
index 2121997..8f642e6 100644
--- a/web/src/App.tsx
+++ b/web/src/App.tsx
@@ -1398,12 +1398,8 @@ export default function App() {
   }, [prompt, fitPromptHeight]);
 
   return (
-    <div className="app">
-      <header className="header">
-        <div className="brand">
-          <span className="brand-mark">LTX-WS</span>
-          <span className="brand-sub">Videofentanyl</span>
-        </div>
+    <>
+      <div className="page-toolbar">
         <div className="header-status">
           <button
             type="button"
@@ -1419,7 +1415,7 @@ export default function App() {
           />
           {serverOk ? "Server connected" : "Server offline"}
         </div>
-      </header>
+      </div>
 
       <div className="app-body">
         <div className="app-main">
@@ -2227,6 +2223,6 @@ export default function App() {
           </div>
         </aside>
       </div>
-    </div>
+    </>
   );
 }
diff --git a/web/src/Layout.tsx b/web/src/Layout.tsx
new file mode 100644
index 0000000..1eccf02
--- /dev/null
+++ b/web/src/Layout.tsx
@@ -0,0 +1,25 @@
+import { NavLink, Outlet } from "react-router-dom";
+
+export default function Layout() {
+  // Shared by both nav entries: adds "active" when the link's route matches.
+  const linkClass = ({ isActive }: { isActive: boolean }) => (isActive ? "nav-link active" : "nav-link");
+  return (
+    <div className="app">
+      <header className="header">
+        <div className="brand-row">
+          <div className="brand">
+            <span className="brand-mark">LTX-WS</span>
+            <span className="brand-sub">Videofentanyl</span>
+          </div>
+          <nav className="main-nav" aria-label="Main">
+            {/* `end` stops "/" from also matching deeper paths such as /train. */}
+            <NavLink to="/" end className={linkClass}>Generate</NavLink>
+            <NavLink to="/train" className={linkClass}>Train LoRA</NavLink>
+          </nav>
+        </div>
+      </header>
+      {/* The matched child route renders here. */}
+      <Outlet />
+    </div>
+  );
+}
diff --git a/web/src/TrainPage.tsx b/web/src/TrainPage.tsx
new file mode 100644
index 0000000..7595a22
--- /dev/null
+++ b/web/src/TrainPage.tsx
@@ -0,0 +1,764 @@
+import { useCallback, useEffect, useMemo, useRef, useState } from "react";
+import type { Config, TrainHealth, TrainJob, TrainPreset } from "./types";
+import {
+  cancelTrainJob,
+  createTrainJob,
+  fetchTrainHealth,
+  fetchTrainJobs,
+  fetchTrainPresets,
+  registerTrainedLora,
+  subscribeTrainJob,
+  type TrainManifest,
+} from "./api/train";
+
+type WizardStep = "dataset" | "preprocess" | "train" | "runs";
+
+const STEPS: { id: WizardStep; label: string; hint: string }[] = [ // wizard tabs, rendered in this order
+  { id: "dataset", label: "Dataset", hint: "Videos & captions" },
+  { id: "preprocess", label: "Preprocess", hint: "Resolution & frames" },
+  { id: "train", label: "Train", hint: "Hyperparameters" },
+  { id: "runs", label: "Runs", hint: "Progress & output" },
+];
+
+/** Render a remaining-time estimate as "Xh Ym", "Ym Zs" or "Zs"; "—" when the value is missing or not finite. */
+function formatEta(seconds?: number): string {
+  if (seconds == null || !Number.isFinite(seconds)) return "—";
+  const total = Math.max(0, Math.round(seconds)); // negative estimates clamp to zero
+  const hours = Math.floor(total / 3600);
+  const minutes = Math.floor(total / 60) % 60;
+  const secs = total % 60;
+  if (hours > 0) return `${hours}h ${minutes}m`;
+  return minutes > 0 ? `${minutes}m ${secs}s` : `${secs}s`;
+}
+
+/**
+ * Map a job phase id to display text. Unrecognised phases are shown verbatim;
+ * a missing or empty phase reads "Queued".
+ */
+function phaseLabel(phase?: string): string {
+  const labels: Record<string, string> = {
+    slicing: "Slicing videos",
+    preprocessing: "Preprocessing latents",
+    training: "Training LoRA",
+    done: "Complete",
+    failed: "Failed",
+    cancelled: "Cancelled",
+  };
+  // Own-property check: inherited keys such as "toString" must count as unknown phases
+  // and fall through to the verbatim branch.
+  if (phase != null && Object.prototype.hasOwnProperty.call(labels, phase)) return labels[phase];
+  return phase || "Queued";
+}
+
+export default function TrainPage() {
+  const [step, setStep] = useState<WizardStep>("dataset");
+  const [health, setHealth] = useState<TrainHealth | null>(null);
+  const [presets, setPresets] = useState<TrainPreset[]>([]);
+  const [config, setConfig] = useState<Config | null>(null);
+  const [jobs, setJobs] = useState<TrainJob[]>([]);
+  const [activeJobId, setActiveJobId] = useState<string | null>(null);
+  const [error, setError] = useState<string | null>(null);
+  const [submitting, setSubmitting] = useState(false);
+  const [registering, setRegistering] = useState(false);
+
+  const [name, setName] = useState("My LoRA");
+  const [preset, setPreset] = useState("t2v");
+  const [videos, setVideos] = useState<File[]>([]);
+  const [dragOver, setDragOver] = useState(false);
+  const fileInputRef = useRef<HTMLInputElement>(null);
+
+  const [sliceEnabled, setSliceEnabled] = useState(false);
+  const [sliceInterval, setSliceInterval] = useState(4);
+  const [sliceRes, setSliceRes] = useState("384x384");
+  const [sliceFps, setSliceFps] = useState(24);
+  const [sliceFit, setSliceFit] = useState("crop");
+  const [captionTemplate, setCaptionTemplate] = useState("");
+
+  const [width, setWidth] = useState(704);
+  const [height, setHeight] = useState(480);
+  const [maxFrames, setMaxFrames] = useState(97);
+  const [withAudio, setWithAudio] = useState(false);
+
+  const [steps, setSteps] = useState(2000);
+  const [rank, setRank] = useState(64);
+  const [learningRate, setLearningRate] = useState(0.0005);
+  const [validationPrompts, setValidationPrompts] = useState(
+    "a cinematic landscape at sunset\na person walking through neon rain",
+  );
+  const [validationInterval, setValidationInterval] = useState(500);
+  const [checkpointInterval, setCheckpointInterval] = useState(500);
+  const [lowRam, setLowRam] = useState(false);
+  const [seed, setSeed] = useState(42);
+
+  const activeJob = useMemo(
+    () => jobs.find((j) => j.id === activeJobId) ?? null,
+    [jobs, activeJobId],
+  );
+
+  const selectedPreset = useMemo(
+    () => presets.find((p) => p.id === preset) ?? presets[0],
+    [presets, preset],
+  );
+
+  // Best-effort reload of the job list; when the request fails the current list is kept.
+  const refreshJobs = useCallback(async () => {
+    try {
+      setJobs(await fetchTrainJobs());
+    } catch {
+      /* ignore */
+    }
+  }, []);
+
+  useEffect(() => {
+    fetch("/api/config")
+      .then((r) => r.json())
+      .then((c: Config) => setConfig(c))
+      .catch(() => {});
+    fetchTrainHealth().then(setHealth).catch(() => {});
+    fetchTrainPresets().then(setPresets).catch(() => {});
+    refreshJobs();
+  }, [refreshJobs]);
+
+  useEffect(() => {
+    if (!selectedPreset) return;
+    setWithAudio(selectedPreset.with_audio);
+    setLowRam(selectedPreset.low_ram_default);
+  }, [selectedPreset?.id]);
+
+  const manifest = useMemo((): TrainManifest => {
+    const prompts = validationPrompts
+      .split("\n")
+      .map((p) => p.trim())
+      .filter(Boolean);
+    return {
+      name,
+      preset,
+      model_id: config?.preferred_model || config?.active_model || "auto",
+      slice: {
+        enabled: sliceEnabled,
+        interval: sliceInterval,
+        res: sliceRes,
+        fps: sliceFps,
+        fit: sliceFit,
+        caption_template: captionTemplate.trim() || undefined,
+      },
+      preprocess: {
+        width,
+        height,
+        max_frames: maxFrames,
+        with_audio: withAudio,
+        frame_rate: 24,
+      },
+      train: {
+        steps,
+        rank,
+        learning_rate: learningRate,
+        validation_prompts: prompts.length ? prompts : ["a cinematic landscape at sunset"],
+        validation_interval: validationInterval,
+        checkpoint_interval: checkpointInterval,
+        low_ram: lowRam,
+        seed,
+      },
+    };
+  }, [
+    name,
+    preset,
+    config,
+    sliceEnabled,
+    sliceInterval,
+    sliceRes,
+    sliceFps,
+    sliceFit,
+    captionTemplate,
+    width,
+    height,
+    maxFrames,
+    withAudio,
+    steps,
+    rank,
+    learningRate,
+    validationPrompts,
+    validationInterval,
+    checkpointInterval,
+    lowRam,
+    seed,
+  ]);
+
+  // Merge `patch` into the job whose id is `jobId`; every other job is left untouched.
+  const updateJob = useCallback((jobId: string, patch: Partial<TrainJob>) => {
+    const apply = (job: TrainJob) => (job.id !== jobId ? job : { ...job, ...patch });
+    setJobs((prev) => prev.map(apply));
+  }, []);
+
+  useEffect(() => {
+    if (!activeJobId) return;
+    const unsub = subscribeTrainJob(activeJobId, (event) => {
+      const type = String(event.type || "");
+      if (type === "phase_started") {
+        updateJob(activeJobId, { phase: String(event.phase || ""), status: "running" });
+      } else if (type === "train_step") {
+        updateJob(activeJobId, {
+          step: Number(event.step) || 0,
+          total_steps: Number(event.total_steps) || 0,
+          loss: event.loss != null ? Number(event.loss) : undefined,
+          lr: event.lr != null ? Number(event.lr) : undefined,
+          eta_s: event.eta_s != null ? Number(event.eta_s) : undefined,
+          phase: "training",
+          status: "running",
+        });
+      } else if (type === "train_validation") {
+        const videos = (event.videos as TrainJob["validation_clips"]) || [];
+        setJobs((prev) =>
+          prev.map((j) =>
+            j.id === activeJobId
+              ? { ...j, validation_clips: [...(j.validation_clips || []), ...videos] }
+              : j,
+          ),
+        );
+      } else if (type === "job_done") {
+        updateJob(activeJobId, {
+          status: "done",
+          phase: "done",
+          artifact_url: String(event.artifact_url || ""),
+          artifact_name: String(event.artifact_name || ""),
+        });
+      } else if (type === "error") {
+        updateJob(activeJobId, {
+          status: event.message === "Cancelled" ? "cancelled" : "failed",
+          phase: event.message === "Cancelled" ? "cancelled" : "failed",
+          error: String(event.message || "Error"),
+        });
+      } else if (type === "snapshot" && event.job) {
+        const snap = event.job as TrainJob;
+        updateJob(activeJobId, snap);
+      }
+      if (type === "job_complete") {
+        refreshJobs();
+      }
+    });
+    return unsub;
+  }, [activeJobId, updateJob, refreshJobs]);
+
+  /**
+   * Add dropped or browsed files to the dataset list. Only names ending in a supported
+   * video extension or .txt are kept, and names already in the list are skipped.
+   */
+  function addFiles(fileList: FileList | File[]) {
+    const supported = /\.(mp4|mov|avi|mkv|webm|txt)$/i;
+    const incoming = Array.from(fileList).filter((file) => supported.test(file.name));
+    if (incoming.length === 0) return;
+    setVideos((prev) => {
+      const known = new Set(prev.map((file) => file.name));
+      const fresh = incoming.filter((file) => !known.has(file.name));
+      return [...prev, ...fresh];
+    });
+  }
+
+  // Validate the form, upload the dataset with its manifest, then open the Runs tab on the new job.
+  // Validation failures and request errors are reported through the error banner.
+  async function startTraining() {
+    setError(null);
+    const hasVideo = videos.some((file) => !file.name.toLowerCase().endsWith(".txt"));
+    if (!hasVideo) {
+      setError("Add at least one video file (.mp4, .mov, …).");
+      setStep("dataset"); // send the user back to where files are added
+      return;
+    }
+    if (!health?.trainer_installed) {
+      setError("Install ltx-trainer-mlx on the server (see install hint below).");
+      return;
+    }
+    if (health.generation_active) {
+      setError("Wait for the current generation to finish before training.");
+      return;
+    }
+    setSubmitting(true);
+    try {
+      const { job_id, name: jobName, preset: jobPreset } = await createTrainJob(manifest, videos);
+      // Optimistic placeholder row; later stream events patch it.
+      const queued: TrainJob = {
+        id: job_id, name: jobName, preset: jobPreset,
+        status: "queued", phase: "queued",
+        created_at: new Date().toISOString(),
+        total_steps: steps,
+        validation_clips: [],
+      };
+      setJobs((prev) => [queued, ...prev]);
+      setActiveJobId(job_id);
+      setStep("runs");
+    } catch (exc) {
+      setError(exc instanceof Error ? exc.message : String(exc));
+    } finally {
+      setSubmitting(false);
+    }
+  }
+
+  // Cancel the selected job on the server, then mark it cancelled locally; failures go to the error banner.
+  async function handleCancel() {
+    const jobId = activeJobId;
+    if (!jobId) return;
+    const cancelled: Partial<TrainJob> = { status: "cancelled", phase: "cancelled" };
+    await cancelTrainJob(jobId)
+      .then(() => updateJob(jobId, cancelled))
+      .catch((exc) => setError(exc instanceof Error ? exc.message : String(exc)));
+  }
+
+  // Add the selected job's LoRA to the Generate library, labelled with that job's own name.
+  async function handleRegister() {
+    if (!activeJobId || !activeJob) return;
+    setRegistering(true);
+    setError(null);
+    try {
+      // The form's name may have changed (or reset on reload) since this job was created.
+      const result = await registerTrainedLora(activeJobId, activeJob.name || name, 1.0);
+      updateJob(activeJobId, { registered_lora_id: result.id });
+    } catch (exc) {
+      setError(exc instanceof Error ? exc.message : String(exc));
+    } finally { setRegistering(false); }
+  }
+
+  const trainProgress =
+    activeJob?.total_steps && activeJob.total_steps > 0
+      ? Math.min(100, ((activeJob.step || 0) / activeJob.total_steps) * 100)
+      : 0;
+
+  const videoCount = videos.filter((f) => !f.name.toLowerCase().endsWith(".txt")).length;
+  const captionCount = videos.filter((f) => f.name.toLowerCase().endsWith(".txt")).length;
+
+  return (
+    <div className="train-page">
+      <div className="train-hero">
+        <div>
+          <h1 className="train-title">Train a LoRA</h1>
+          <p className="train-subtitle">
+            Upload clips, preprocess latents, and fine-tune a style LoRA for generation — all on your Mac.
+          </p>
+        </div>
+        <div className="train-health">
+          <span
+            className={`status-dot ${health?.trainer_installed ? "ok" : "off"}`}
+            title={health?.trainer_installed ? "Trainer ready" : "Trainer not installed"}
+          />
+          <span>{health?.trainer_installed ? "Trainer ready" : "Trainer not installed"}</span>
+          {health?.training_active && <span className="train-badge">Training</span>}
+          {health?.generation_active && <span className="train-badge warn">Gen active</span>}
+        </div>
+      </div>
+
+      {!health?.trainer_installed && health?.install_hint && (
+        <div className="train-callout">
+          <strong>Install training support</strong>
+          <code className="train-install-hint">{health.install_hint}</code>
+        </div>
+      )}
+
+      {error && (
+        <div className="train-error" role="alert">
+          {error}
+        </div>
+      )}
+
+      <div className="train-wizard">
+        <nav className="train-steps" aria-label="Training wizard">
+          {STEPS.map((s, i) => (
+            <button
+              key={s.id}
+              type="button"
+              className={`train-step-tab ${step === s.id ? "active" : ""}`}
+              onClick={() => setStep(s.id)}
+            >
+              <span className="train-step-num">{i + 1}</span>
+              <span className="train-step-text">
+                <span className="train-step-label">{s.label}</span>
+                <span className="train-step-hint">{s.hint}</span>
+              </span>
+            </button>
+          ))}
+        </nav>
+
+        <div className="train-panel">
+          {step === "dataset" && (
+            <section className="train-section">
+              <h2>Dataset</h2>
+              <p className="train-section-lead">
+                Drop training videos here. Optional <code>.txt</code> caption files with matching names are used when
+                slicing is off.
+              </p>
+
+              <label className="field">
+                <span>LoRA name</span>
+                <input value={name} onChange={(e) => setName(e.target.value)} placeholder="My style LoRA" />
+              </label>
+
+              <div className="preset-grid">
+                {presets.map((p) => (
+                  <button
+                    key={p.id}
+                    type="button"
+                    className={`preset-card ${preset === p.id ? "selected" : ""}`}
+                    onClick={() => setPreset(p.id)}
+                  >
+                    <span className="preset-card-title">{p.label}</span>
+                    <span className="preset-card-desc">{p.description}</span>
+                    <span className="preset-card-meta">{p.ram_hint}</span>
+                  </button>
+                ))}
+              </div>
+
+              <div
+                className={`drop-zone ${dragOver ? "drag-over" : ""}`}
+                onDragOver={(e) => {
+                  e.preventDefault();
+                  setDragOver(true);
+                }}
+                onDragLeave={() => setDragOver(false)}
+                onDrop={(e) => {
+                  e.preventDefault();
+                  setDragOver(false);
+                  if (e.dataTransfer.files.length) addFiles(e.dataTransfer.files);
+                }}
+                onClick={() => fileInputRef.current?.click()}
+                role="button"
+                tabIndex={0}
+                onKeyDown={(e) => {
+                  if (e.key === "Enter" || e.key === " ") fileInputRef.current?.click();
+                }}
+              >
+                <input
+                  ref={fileInputRef}
+                  type="file"
+                  multiple
+                  accept="video/*,.txt"
+                  hidden
+                  onChange={(e) => e.target.files && addFiles(e.target.files)}
+                />
+                <div className="drop-zone-icon">↑</div>
+                <div className="drop-zone-title">Drop videos or click to browse</div>
+                <div className="drop-zone-meta">
+                  {videoCount} video{videoCount !== 1 ? "s" : ""}
+                  {captionCount > 0 ? ` · ${captionCount} caption file${captionCount !== 1 ? "s" : ""}` : ""}
+                </div>
+              </div>
+
+              {videos.length > 0 && (
+                <ul className="file-list">
+                  {videos.map((f) => (
+                    <li key={f.name}>
+                      <span>{f.name}</span>
+                      <span className="file-size">{(f.size / 1024 / 1024).toFixed(1)} MB</span>
+                      <button
+                        type="button"
+                        className="file-remove"
+                        aria-label={`Remove ${f.name}`}
+                        onClick={() => setVideos((prev) => prev.filter((x) => x !== f))}
+                      >
+                        ×
+                      </button>
+                    </li>
+                  ))}
+                </ul>
+              )}
+
+              <details className="train-advanced">
+                <summary>Auto-slice long videos</summary>
+                <div className="train-advanced-body">
+                  <label className="checkbox-row">
+                    <input
+                      type="checkbox"
+                      checked={sliceEnabled}
+                      onChange={(e) => setSliceEnabled(e.target.checked)}
+                    />
+                    Slice uploads into fixed-length clips (requires ffmpeg)
+                  </label>
+                  {sliceEnabled && (
+                    <div className="field-grid">
+                      <label className="field">
+                        <span>Interval (seconds)</span>
+                        <input
+                          type="number"
+                          min={1}
+                          step={0.5}
+                          value={sliceInterval}
+                          onChange={(e) => setSliceInterval(Number(e.target.value))}
+                        />
+                      </label>
+                      <label className="field">
+                        <span>Clip resolution</span>
+                        <input value={sliceRes} onChange={(e) => setSliceRes(e.target.value)} placeholder="384x384" />
+                      </label>
+                      <label className="field">
+                        <span>FPS</span>
+                        <input
+                          type="number"
+                          min={1}
+                          value={sliceFps}
+                          onChange={(e) => setSliceFps(Number(e.target.value))}
+                        />
+                      </label>
+                      <label className="field">
+                        <span>Fit</span>
+                        <select value={sliceFit} onChange={(e) => setSliceFit(e.target.value)}>
+                          <option value="crop">crop</option>
+                          <option value="pad">pad</option>
+                          <option value="stretch">stretch</option>
+                        </select>
+                      </label>
+                      <label className="field span-2">
+                        <span>Caption template (optional)</span>
+                        <input
+                          value={captionTemplate}
+                          onChange={(e) => setCaptionTemplate(e.target.value)}
+                          placeholder="a video of {filename}"
+                        />
+                      </label>
+                    </div>
+                  )}
+                </div>
+              </details>
+
+              <div className="train-actions">
+                <button type="button" className="btn-primary" onClick={() => setStep("preprocess")}>
+                  Continue
+                </button>
+              </div>
+            </section>
+          )}
+
+          {step === "preprocess" && (
+            <section className="train-section">
+              <h2>Preprocess</h2>
+              <p className="train-section-lead">
+                Latents are encoded at this resolution. Frames are rounded to valid LTX lengths (8k+1).
+              </p>
+              <div className="field-grid">
+                <label className="field">
+                  <span>Width</span>
+                  <input type="number" step={32} value={width} onChange={(e) => setWidth(Number(e.target.value))} />
+                </label>
+                <label className="field">
+                  <span>Height</span>
+                  <input type="number" step={32} value={height} onChange={(e) => setHeight(Number(e.target.value))} />
+                </label>
+                <label className="field">
+                  <span>Max frames</span>
+                  <input
+                    type="number"
+                    step={8}
+                    value={maxFrames}
+                    onChange={(e) => setMaxFrames(Number(e.target.value))}
+                  />
+                </label>
+                <label className="checkbox-row span-2">
+                  <input
+                    type="checkbox"
+                    checked={withAudio}
+                    disabled={selectedPreset?.with_audio}
+                    onChange={(e) => setWithAudio(e.target.checked)}
+                  />
+                  Encode audio latents
+                  {selectedPreset?.with_audio && <span className="field-note"> (required for AV preset)</span>}
+                </label>
+              </div>
+              <div className="train-actions">
+                <button type="button" className="btn-secondary" onClick={() => setStep("dataset")}>
+                  Back
+                </button>
+                <button type="button" className="btn-primary" onClick={() => setStep("train")}>
+                  Continue
+                </button>
+              </div>
+            </section>
+          )}
+
+          {step === "train" && (
+            <section className="train-section">
+              <h2>Training</h2>
+              <p className="train-section-lead">
+                Typical runs use 1k–3k steps. Validation clips appear during training so you can judge quality early.
+              </p>
+              <div className="field-grid">
+                <label className="field">
+                  <span>Steps</span>
+                  <input type="number" min={100} step={100} value={steps} onChange={(e) => setSteps(Number(e.target.value))} />
+                </label>
+                <label className="field">
+                  <span>LoRA rank</span>
+                  <input type="number" min={8} step={8} value={rank} onChange={(e) => setRank(Number(e.target.value))} />
+                </label>
+                <label className="field">
+                  <span>Learning rate</span>
+                  <input
+                    type="number"
+                    step={0.0001}
+                    value={learningRate}
+                    onChange={(e) => setLearningRate(Number(e.target.value))}
+                  />
+                </label>
+                <label className="field">
+                  <span>Seed</span>
+                  <input type="number" value={seed} onChange={(e) => setSeed(Number(e.target.value))} />
+                </label>
+                <label className="field">
+                  <span>Validation every (steps)</span>
+                  <input
+                    type="number"
+                    min={50}
+                    step={50}
+                    value={validationInterval}
+                    onChange={(e) => setValidationInterval(Number(e.target.value))}
+                  />
+                </label>
+                <label className="field">
+                  <span>Checkpoint every (steps)</span>
+                  <input
+                    type="number"
+                    min={50}
+                    step={50}
+                    value={checkpointInterval}
+                    onChange={(e) => setCheckpointInterval(Number(e.target.value))}
+                  />
+                </label>
+                <label className="checkbox-row span-2">
+                  <input type="checkbox" checked={lowRam} onChange={(e) => setLowRam(e.target.checked)} />
+                  Low RAM mode (gradient checkpointing)
+                </label>
+                <label className="field span-2">
+                  <span>Validation prompts (one per line)</span>
+                  <textarea
+                    rows={3}
+                    value={validationPrompts}
+                    onChange={(e) => setValidationPrompts(e.target.value)}
+                  />
+                </label>
+              </div>
+
+              <div className="train-summary">
+                <div>
+                  <strong>{name}</strong> · {selectedPreset?.label || preset} · {steps} steps · rank {rank}
+                </div>
+                <div className="train-summary-meta">
+                  {videoCount} video{videoCount !== 1 ? "s" : ""} · {width}×{height} · {maxFrames} frames
+                </div>
+              </div>
+
+              <div className="train-actions">
+                <button type="button" className="btn-secondary" onClick={() => setStep("preprocess")}>
+                  Back
+                </button>
+                <button
+                  type="button"
+                  className="btn-primary"
+                  disabled={submitting || !health?.trainer_installed}
+                  onClick={startTraining}
+                >
+                  {submitting ? "Starting…" : "Start training"}
+                </button>
+              </div>
+            </section>
+          )}
+
+          {step === "runs" && (
+            <section className="train-section">
+              <h2>Runs</h2>
+              {!activeJob && jobs.length === 0 && (
+                <p className="train-section-lead">No training jobs yet. Configure a run and start from the Train step.</p>
+              )}
+
+              {jobs.length > 0 && (
+                <div className="job-list">
+                  {jobs.map((j) => (
+                    <button
+                      key={j.id}
+                      type="button"
+                      className={`job-card ${activeJobId === j.id ? "active" : ""}`}
+                      onClick={() => setActiveJobId(j.id)}
+                    >
+                      <span className="job-card-name">{j.name}</span>
+                      <span className={`job-status status-${j.status}`}>{j.status}</span>
+                      <span className="job-card-meta">{new Date(j.created_at).toLocaleString()}</span>
+                    </button>
+                  ))}
+                </div>
+              )}
+
+              {activeJob && (
+                <div className="job-detail">
+                  <div className="job-detail-header">
+                    <div>
+                      <h3>{activeJob.name}</h3>
+                      <p className="job-phase">{phaseLabel(activeJob.phase)}</p>
+                    </div>
+                    {["queued", "running"].includes(activeJob.status) && (
+                      <button type="button" className="btn-danger" onClick={handleCancel}>
+                        Cancel
+                      </button>
+                    )}
+                  </div>
+
+                  {activeJob.phase === "training" && (
+                    <div className="progress-block">
+                      <div className="progress-bar">
+                        <div className="progress-fill" style={{ width: `${trainProgress}%` }} />
+                      </div>
+                      <div className="progress-stats">
+                        <span>
+                          Step {activeJob.step || 0} / {activeJob.total_steps || steps}
+                        </span>
+                        {activeJob.loss != null && <span>Loss {activeJob.loss.toFixed(4)}</span>}
+                        <span>ETA {formatEta(activeJob.eta_s)}</span>
+                      </div>
+                    </div>
+                  )}
+
+                  {activeJob.status === "done" && activeJob.artifact_url && (
+                    <div className="artifact-card">
+                      <div>
+                        <strong>LoRA ready</strong>
+                        <div className="artifact-name">{activeJob.artifact_name}</div>
+                      </div>
+                      <div className="artifact-actions">
+                        <a className="btn-secondary" href={activeJob.artifact_url} download>
+                          Download
+                        </a>
+                        <button
+                          type="button"
+                          className="btn-primary"
+                          disabled={registering || !!activeJob.registered_lora_id}
+                          onClick={handleRegister}
+                        >
+                          {activeJob.registered_lora_id ? "Added to library" : registering ? "Adding…" : "Use in Generate"}
+                        </button>
+                      </div>
+                    </div>
+                  )}
+
+                  {activeJob.error && <div className="train-error">{activeJob.error}</div>}
+
+                  {(activeJob.validation_clips?.length ?? 0) > 0 && (
+                    <div className="validation-gallery">
+                      <h4>Validation previews</h4>
+                      <div className="validation-grid">
+                        {activeJob.validation_clips!.map((v, i) => (
+                          <div key={`${v.url}-${i}`} className="validation-card">
+                            <video src={v.url} controls playsInline loop muted />
+                            <span className="validation-step">Step {v.step}</span>
+                          </div>
+                        ))}
+                      </div>
+                    </div>
+                  )}
+                </div>
+              )}
+
+              <div className="train-actions">
+                <button type="button" className="btn-secondary" onClick={() => setStep("train")}>
+                  New run setup
+                </button>
+              </div>
+            </section>
+          )}
+        </div>
+      </div>
+    </div>
+  );
+}
diff --git a/web/src/api/train.ts b/web/src/api/train.ts
new file mode 100644
index 0000000..b9df8cc
--- /dev/null
+++ b/web/src/api/train.ts
@@ -0,0 +1,113 @@
+import type { TrainHealth, TrainJob, TrainPreset } from "../types";
+
+export async function fetchTrainHealth(): Promise<TrainHealth> {
+  const response = await fetch("/api/train/health"); // server-side trainer status
+  if (response.ok) return response.json();
+  throw new Error(`Health check failed (${response.status})`); // non-OK reply rejects with its status
+}
+
+export async function fetchTrainPresets(): Promise<TrainPreset[]> {
+  const response = await fetch("/api/train/presets");
+  if (!response.ok) throw new Error(`Presets failed (${response.status})`);
+  const { presets } = await response.json(); // list is read from the "presets" key
+  return presets ?? []; // absent key yields an empty list
+}
+
+export async function fetchTrainJobs(): Promise<TrainJob[]> {
+  const response = await fetch("/api/train/jobs");
+  if (!response.ok) throw new Error(`Jobs list failed (${response.status})`);
+  const { jobs } = await response.json(); // list is read from the "jobs" key
+  return jobs ?? []; // absent key yields an empty list
+}
+
+export interface TrainManifest { // job settings; createTrainJob sends them JSON-encoded in the "manifest" form field
+  name: string;
+  preset: string;
+  model_id: string;
+  slice: {
+    enabled: boolean;
+    interval: number; // seconds, per the TrainPage field label
+    res: string; // e.g. "384x384"
+    fps: number;
+    fit: string; // TrainPage offers "crop", "pad" and "stretch"
+    caption_template?: string; // e.g. "a video of {filename}"
+    max_clips?: number;
+  };
+  preprocess: {
+    width: number;
+    height: number;
+    max_frames: number;
+    with_audio: boolean;
+    frame_rate: number;
+  };
+  train: {
+    steps: number;
+    rank: number;
+    learning_rate: number;
+    validation_prompts: string[]; // one prompt per entry
+    validation_interval: number; // in steps, per the TrainPage field label
+    checkpoint_interval: number; // in steps, per the TrainPage field label
+    low_ram: boolean;
+    seed: number;
+  };
+}
+
+/**
+ * POST a new training job as multipart form data: the manifest as a JSON string
+ * under "manifest" and every file under the repeated "videos" field.
+ * Rejects with the response body text, or a status message when the body is empty.
+ */
+export async function createTrainJob(
+  manifest: TrainManifest,
+  files: File[],
+): Promise<{ job_id: string; name: string; preset: string }> {
+  const payload = new FormData();
+  payload.append("manifest", JSON.stringify(manifest));
+  files.forEach((file) => payload.append("videos", file, file.name));
+  const response = await fetch("/api/train/jobs", { method: "POST", body: payload });
+  if (response.ok) return response.json();
+  throw new Error((await response.text()) || `Create job failed (${response.status})`);
+}
+
+export async function cancelTrainJob(jobId: string): Promise<void> {
+  const { ok, status } = await fetch(`/api/train/jobs/${jobId}/cancel`, { method: "POST" });
+  if (!ok) throw new Error(`Cancel failed (${status})`); // resolves with no value on success
+}
+
+/**
+ * POST { label, scale } as JSON to the job's register-lora endpoint.
+ * Rejects with the response body text, or a status message when the body is empty.
+ */
+export async function registerTrainedLora(
+  jobId: string,
+  label: string,
+  scale = 1.0,
+): Promise<{ id: string; spec: string }> {
+  const endpoint = `/api/train/jobs/${jobId}/register-lora`;
+  const headers = { "Content-Type": "application/json" };
+  const body = JSON.stringify({ label, scale });
+  const response = await fetch(endpoint, { method: "POST", headers, body });
+  if (response.ok) return response.json();
+  throw new Error((await response.text()) || `Register failed (${response.status})`);
+}
+
+export type TrainEventHandler = (event: Record<string, unknown>) => void;
+
+/**
+ * Open an EventSource on the job's events endpoint and pass each JSON-decoded message to `onEvent`.
+ * Messages that fail to parse, and exceptions thrown by `onEvent`, are silently dropped.
+ * The stream closes itself on the first error event; the returned function closes it on demand.
+ */
+export function subscribeTrainJob(jobId: string, onEvent: TrainEventHandler): () => void {
+  const source = new EventSource(`/api/train/jobs/${jobId}/events`);
+  const close = () => source.close();
+  source.onmessage = ({ data }) => {
+    try {
+      onEvent(JSON.parse(data) as Record<string, unknown>);
+    } catch {
+      /* ignore malformed */
+    }
+  };
+  source.onerror = close;
+  return close;
+}
diff --git a/web/src/index.css b/web/src/index.css
index 46ea0a1..61b9090 100644
--- a/web/src/index.css
+++ b/web/src/index.css
@@ -1166,3 +1166,651 @@ body {
   font-size: 0.75rem;
   color: var(--muted);
 }
+
+/* —— Layout nav —— */
+.brand-row {
+  display: flex;
+  align-items: center;
+  gap: 24px;
+  flex-wrap: wrap;
+}
+
+.main-nav {
+  display: flex;
+  gap: 4px;
+  background: var(--bg);
+  border: 1px solid var(--border);
+  border-radius: 999px;
+  padding: 3px;
+}
+
+.nav-link {
+  display: inline-block;
+  padding: 6px 14px;
+  border-radius: 999px;
+  font-size: 0.85rem;
+  font-weight: 500;
+  color: var(--muted);
+  text-decoration: none;
+  transition: background 0.15s, color 0.15s;
+}
+
+.nav-link:hover {
+  color: var(--text);
+}
+
+.nav-link.active {
+  background: var(--surface);
+  color: var(--text);
+  box-shadow: 0 1px 3px rgba(0, 0, 0, 0.06);
+}
+
+.page-toolbar {
+  display: flex;
+  justify-content: flex-end;
+  margin-bottom: 12px;
+}
+
+/* —— Train page —— */
+.train-page {
+  display: flex;
+  flex-direction: column;
+  gap: 16px;
+}
+
+.train-hero {
+  display: flex;
+  align-items: flex-start;
+  justify-content: space-between;
+  gap: 16px;
+  flex-wrap: wrap;
+}
+
+.train-title {
+  margin: 0 0 6px;
+  font-size: 1.5rem;
+  font-weight: 600;
+  letter-spacing: -0.02em;
+}
+
+.train-subtitle {
+  margin: 0;
+  color: var(--muted);
+  font-size: 0.95rem;
+  max-width: 52ch;
+  line-height: 1.5;
+}
+
+.train-health {
+  display: flex;
+  align-items: center;
+  gap: 8px;
+  font-size: 0.85rem;
+  color: var(--muted);
+  padding: 8px 12px;
+  background: var(--surface);
+  border: 1px solid var(--border);
+  border-radius: var(--radius);
+}
+
+.train-badge {
+  font-size: 0.7rem;
+  font-weight: 600;
+  text-transform: uppercase;
+  letter-spacing: 0.05em;
+  padding: 2px 8px;
+  border-radius: 999px;
+  background: var(--accent-soft);
+  color: var(--accent);
+}
+
+.train-badge.warn {
+  background: #fef3c7;
+  color: #b45309;
+}
+
+.train-callout {
+  display: flex;
+  flex-direction: column;
+  gap: 8px;
+  padding: 14px 16px;
+  background: #fffbeb;
+  border: 1px solid #fde68a;
+  border-radius: var(--radius);
+  font-size: 0.9rem;
+}
+
+.train-install-hint {
+  display: block;
+  padding: 10px 12px;
+  background: #18181b;
+  color: #f4f4f5;
+  border-radius: 8px;
+  font-size: 0.75rem;
+  overflow-x: auto;
+  white-space: pre-wrap;
+  word-break: break-all;
+}
+
+.train-error {
+  padding: 12px 14px;
+  background: #fef2f2;
+  border: 1px solid #fecaca;
+  color: var(--danger);
+  border-radius: var(--radius);
+  font-size: 0.9rem;
+}
+
+.train-wizard {
+  display: grid;
+  grid-template-columns: 220px minmax(0, 1fr);
+  gap: 16px;
+  align-items: start;
+}
+
+@media (max-width: 800px) {
+  .train-wizard {
+    grid-template-columns: 1fr;
+  }
+}
+
+.train-steps {
+  display: flex;
+  flex-direction: column;
+  gap: 6px;
+  position: sticky;
+  top: 16px;
+}
+
+@media (max-width: 800px) {
+  .train-steps {
+    flex-direction: row;
+    overflow-x: auto;
+    position: static;
+    padding-bottom: 4px;
+  }
+}
+
+.train-step-tab {
+  display: flex;
+  align-items: flex-start;
+  gap: 10px;
+  text-align: left;
+  padding: 10px 12px;
+  border: 1px solid var(--border);
+  border-radius: 12px;
+  background: var(--surface);
+  cursor: pointer;
+  transition: border-color 0.15s, box-shadow 0.15s;
+}
+
+.train-step-tab:hover {
+  border-color: #cbd5e1;
+}
+
+.train-step-tab.active {
+  border-color: var(--accent);
+  box-shadow: 0 0 0 1px var(--accent-soft);
+  background: var(--accent-soft);
+}
+
+.train-step-num {
+  width: 22px;
+  height: 22px;
+  border-radius: 50%;
+  background: var(--bg);
+  font-size: 0.75rem;
+  font-weight: 600;
+  display: flex;
+  align-items: center;
+  justify-content: center;
+  flex-shrink: 0;
+}
+
+.train-step-tab.active .train-step-num {
+  background: var(--accent);
+  color: white;
+}
+
+.train-step-text {
+  display: flex;
+  flex-direction: column;
+  gap: 2px;
+  min-width: 0;
+}
+
+.train-step-label {
+  font-size: 0.85rem;
+  font-weight: 600;
+}
+
+.train-step-hint {
+  font-size: 0.72rem;
+  color: var(--muted);
+}
+
+.train-panel {
+  background: var(--surface);
+  border: 1px solid var(--border);
+  border-radius: var(--radius);
+  padding: 20px 22px;
+  box-shadow: var(--shadow);
+}
+
+.train-section h2 {
+  margin: 0 0 8px;
+  font-size: 1.15rem;
+}
+
+.train-section-lead {
+  margin: 0 0 18px;
+  color: var(--muted);
+  font-size: 0.9rem;
+  line-height: 1.5;
+}
+
+.preset-grid {
+  display: grid;
+  grid-template-columns: repeat(auto-fit, minmax(200px, 1fr));
+  gap: 10px;
+  margin-bottom: 16px;
+}
+
+.preset-card {
+  display: flex;
+  flex-direction: column;
+  gap: 6px;
+  text-align: left;
+  padding: 14px;
+  border: 1px solid var(--border);
+  border-radius: 12px;
+  background: var(--bg);
+  cursor: pointer;
+  transition: border-color 0.15s, background 0.15s;
+}
+
+.preset-card:hover {
+  border-color: #cbd5e1;
+}
+
+.preset-card.selected {
+  border-color: var(--accent);
+  background: var(--accent-soft);
+}
+
+.preset-card-title {
+  font-weight: 600;
+  font-size: 0.9rem;
+}
+
+.preset-card-desc {
+  font-size: 0.8rem;
+  color: var(--muted);
+  line-height: 1.4;
+}
+
+.preset-card-meta {
+  font-size: 0.72rem;
+  color: var(--muted);
+  margin-top: 4px;
+}
+
+.drop-zone {
+  border: 2px dashed var(--border);
+  border-radius: var(--radius);
+  padding: 32px 20px;
+  text-align: center;
+  cursor: pointer;
+  transition: border-color 0.15s, background 0.15s;
+  margin-bottom: 12px;
+}
+
+.drop-zone:hover,
+.drop-zone.drag-over {
+  border-color: var(--accent);
+  background: var(--accent-soft);
+}
+
+.drop-zone-icon {
+  font-size: 1.5rem;
+  margin-bottom: 8px;
+  color: var(--accent);
+}
+
+.drop-zone-title {
+  font-weight: 600;
+  margin-bottom: 4px;
+}
+
+.drop-zone-meta {
+  font-size: 0.8rem;
+  color: var(--muted);
+}
+
+.file-list {
+  list-style: none;
+  margin: 0 0 16px;
+  padding: 0;
+  border: 1px solid var(--border);
+  border-radius: 10px;
+  overflow: hidden;
+}
+
+.file-list li {
+  display: flex;
+  align-items: center;
+  gap: 10px;
+  padding: 8px 12px;
+  border-bottom: 1px solid var(--border);
+  font-size: 0.85rem;
+}
+
+.file-list li:last-child {
+  border-bottom: none;
+}
+
+.file-size {
+  margin-left: auto;
+  color: var(--muted);
+  font-size: 0.75rem;
+}
+
+.file-remove {
+  border: none;
+  background: transparent;
+  color: var(--muted);
+  cursor: pointer;
+  font-size: 1.1rem;
+  line-height: 1;
+  padding: 0 4px;
+}
+
+.file-remove:hover {
+  color: var(--danger);
+}
+
+.train-advanced {
+  margin-bottom: 16px;
+  border: 1px solid var(--border);
+  border-radius: 10px;
+  padding: 0 12px;
+  background: var(--bg);
+}
+
+.train-advanced summary {
+  cursor: pointer;
+  padding: 10px 0;
+  font-size: 0.85rem;
+  font-weight: 500;
+}
+
+.train-advanced-body {
+  padding-bottom: 12px;
+}
+
+.field-grid {
+  display: grid;
+  grid-template-columns: 1fr 1fr;
+  gap: 12px;
+  margin-bottom: 16px;
+}
+
+.field-grid .span-2 {
+  grid-column: span 2;
+}
+
+@media (max-width: 600px) {
+  .field-grid {
+    grid-template-columns: 1fr;
+  }
+  .field-grid .span-2 {
+    grid-column: span 1;
+  }
+}
+
+.field {
+  display: flex;
+  flex-direction: column;
+  gap: 6px;
+  font-size: 0.8rem;
+}
+
+.field span {
+  color: var(--muted);
+  font-weight: 500;
+}
+
+.field input,
+.field select,
+.field textarea {
+  padding: 8px 10px;
+  border: 1px solid var(--border);
+  border-radius: 8px;
+  font: inherit;
+  background: var(--surface);
+}
+
+.field-note {
+  color: var(--muted);
+  font-size: 0.75rem;
+}
+
+.checkbox-row {
+  display: flex;
+  align-items: center;
+  gap: 8px;
+  font-size: 0.85rem;
+  padding: 8px 0;
+}
+
+.train-summary {
+  padding: 12px 14px;
+  background: var(--bg);
+  border-radius: 10px;
+  border: 1px solid var(--border);
+  margin-bottom: 16px;
+  font-size: 0.85rem;
+}
+
+.train-summary-meta {
+  color: var(--muted);
+  margin-top: 4px;
+  font-size: 0.8rem;
+}
+
+.train-actions {
+  display: flex;
+  gap: 10px;
+  justify-content: flex-end;
+  flex-wrap: wrap;
+  margin-top: 8px;
+}
+
+.btn-danger {
+  padding: 8px 14px;
+  border-radius: 8px;
+  border: 1px solid #fecaca;
+  background: #fef2f2;
+  color: var(--danger);
+  font: inherit;
+  font-size: 0.85rem;
+  font-weight: 500;
+  cursor: pointer;
+}
+
+.btn-danger:hover {
+  background: #fee2e2;
+}
+
+.job-list {
+  display: flex;
+  flex-direction: column;
+  gap: 8px;
+  margin-bottom: 16px;
+}
+
+.job-card {
+  display: grid;
+  grid-template-columns: 1fr auto;
+  grid-template-rows: auto auto;
+  gap: 2px 12px;
+  text-align: left;
+  padding: 10px 12px;
+  border: 1px solid var(--border);
+  border-radius: 10px;
+  background: var(--bg);
+  cursor: pointer;
+}
+
+.job-card.active {
+  border-color: var(--accent);
+  background: var(--accent-soft);
+}
+
+.job-card-name {
+  font-weight: 600;
+  font-size: 0.9rem;
+}
+
+.job-status {
+  font-size: 0.7rem;
+  font-weight: 600;
+  text-transform: uppercase;
+  letter-spacing: 0.04em;
+  padding: 2px 8px;
+  border-radius: 999px;
+  background: var(--border);
+  color: var(--muted);
+}
+
+.job-status.status-running,
+.job-status.status-queued {
+  background: var(--accent-soft);
+  color: var(--accent);
+}
+
+.job-status.status-done {
+  background: #dcfce7;
+  color: #15803d;
+}
+
+.job-status.status-failed,
+.job-status.status-cancelled {
+  background: #fef2f2;
+  color: var(--danger);
+}
+
+.job-card-meta {
+  grid-column: 1 / -1;
+  font-size: 0.75rem;
+  color: var(--muted);
+}
+
+.job-detail {
+  border-top: 1px solid var(--border);
+  padding-top: 16px;
+}
+
+.job-detail-header {
+  display: flex;
+  justify-content: space-between;
+  align-items: flex-start;
+  gap: 12px;
+  margin-bottom: 14px;
+}
+
+.job-detail-header h3 {
+  margin: 0 0 4px;
+}
+
+.job-phase {
+  margin: 0;
+  color: var(--muted);
+  font-size: 0.85rem;
+}
+
+.progress-block {
+  margin-bottom: 16px;
+}
+
+.progress-bar {
+  height: 8px;
+  background: var(--bg);
+  border-radius: 999px;
+  overflow: hidden;
+  border: 1px solid var(--border);
+}
+
+.progress-fill {
+  height: 100%;
+  background: linear-gradient(90deg, var(--accent), #60a5fa);
+  border-radius: 999px;
+  transition: width 0.3s ease;
+}
+
+.progress-stats {
+  display: flex;
+  gap: 16px;
+  flex-wrap: wrap;
+  margin-top: 8px;
+  font-size: 0.8rem;
+  color: var(--muted);
+}
+
+.artifact-card {
+  display: flex;
+  align-items: center;
+  justify-content: space-between;
+  gap: 12px;
+  flex-wrap: wrap;
+  padding: 14px;
+  background: #f0fdf4;
+  border: 1px solid #bbf7d0;
+  border-radius: 12px;
+  margin-bottom: 16px;
+}
+
+.artifact-name {
+  font-size: 0.8rem;
+  color: var(--muted);
+  margin-top: 4px;
+}
+
+.artifact-actions {
+  display: flex;
+  gap: 8px;
+  flex-wrap: wrap;
+}
+
+.validation-gallery h4 {
+  margin: 0 0 10px;
+  font-size: 0.9rem;
+}
+
+.validation-grid {
+  display: grid;
+  grid-template-columns: repeat(auto-fill, minmax(180px, 1fr));
+  gap: 10px;
+}
+
+.validation-card {
+  border: 1px solid var(--border);
+  border-radius: 10px;
+  overflow: hidden;
+  background: var(--bg);
+}
+
+.validation-card video {
+  width: 100%;
+  display: block;
+  aspect-ratio: 16 / 9;
+  object-fit: cover;
+  background: #000;
+}
+
+.validation-step {
+  display: block;
+  padding: 6px 8px;
+  font-size: 0.72rem;
+  color: var(--muted);
+}
diff --git a/web/src/main.tsx b/web/src/main.tsx
index c2a145c..dee4866 100644
--- a/web/src/main.tsx
+++ b/web/src/main.tsx
@@ -1,10 +1,20 @@
 import { StrictMode } from "react";
 import { createRoot } from "react-dom/client";
+import { BrowserRouter, Route, Routes } from "react-router-dom";
 import App from "./App";
+import Layout from "./Layout";
+import TrainPage from "./TrainPage";
 import "./index.css";
 
 createRoot(document.getElementById("root")!).render(
   <StrictMode>
-    <App />
+    <BrowserRouter>
+      <Routes>
+        <Route element={<Layout />}>
+          <Route path="/" element={<App />} />
+          <Route path="/train" element={<TrainPage />} />
+        </Route>
+      </Routes>
+    </BrowserRouter>
   </StrictMode>,
 );
diff --git a/web/src/types.ts b/web/src/types.ts
index c73a59d..2e50038 100644
--- a/web/src/types.ts
+++ b/web/src/types.ts
@@ -111,3 +111,47 @@ export interface ProgressState {
   pct?: number;
   eta_s?: number;
 }
+
+export interface TrainPreset { // one entry of the `presets` list returned by the train health/presets endpoints
+  id: string;
+  label: string;
+  description: string;
+  ram_hint: string;
+  with_audio: boolean; // presumably the server-side default for preprocess.with_audio — confirm against TRAIN_PRESETS
+  low_ram_default: boolean; // presumably the server-side default for train.low_ram — confirm against TRAIN_PRESETS
+}
+
+export interface TrainHealth { // payload of GET /api/train/health
+  ok: boolean;
+  trainer_installed: boolean;
+  ffmpeg_available: boolean;
+  install_hint?: string | null;
+  presets: TrainPreset[];
+  training_active?: boolean; // added by the health route from is_training_active(state)
+  generation_active?: boolean; // added by the health route from state.is_generation_active()
+}
+
+export interface TrainValidationClip { // element of TrainJob.validation_clips
+  step: number; // presumably the training step the clip was rendered at — confirm against the backend
+  filename: string;
+  url: string;
+}
+
+export interface TrainJob { // job shape served by the /api/train/jobs endpoints
+  id: string;
+  name: string;
+  preset: string;
+  status: string; // server TrainJobStatus value: queued | running | done | failed | cancelled
+  phase?: string;
+  created_at: string; // ISO-8601 text produced by the server
+  step?: number;
+  total_steps?: number;
+  loss?: number; // only present when the backend status file supplies it
+  lr?: number; // only present when the backend status file supplies it
+  eta_s?: number; // only present when the backend status file supplies it
+  error?: string | null;
+  artifact_url?: string | null;
+  artifact_name?: string | null;
+  registered_lora_id?: string | null; // set once the register-lora endpoint has run for this job
+  validation_clips?: TrainValidationClip[];
+}
diff --git a/web_train.py b/web_train.py
new file mode 100644
index 0000000..a49cdaf
--- /dev/null
+++ b/web_train.py
@@ -0,0 +1,488 @@
+"""Training job API, worker queue, and SSE for the ltx-ws Web UI."""
+
+from __future__ import annotations
+
+import asyncio
+import json
+import logging
+import shutil
+import uuid
+from dataclasses import asdict, dataclass, field
+from datetime import datetime
+from enum import Enum
+from pathlib import Path
+from typing import TYPE_CHECKING, Any, AsyncIterator
+
+if TYPE_CHECKING:
+    from web_ui import AppState
+
+log = logging.getLogger("web_train")
+
+_TRAIN_BODIES: dict[str, dict[str, Any]] = {}
+
+
+class TrainJobStatus(str, Enum):  # lifecycle states; stored on TrainJobRecord.status via .value
+    QUEUED = "queued"  # set by create_train_job when the job is enqueued
+    RUNNING = "running"  # set by _execute_train_job when the worker picks the job up
+    DONE = "done"  # set after run_train_job returns
+    FAILED = "failed"  # set by _train_worker_loop when the job raises
+    CANCELLED = "cancelled"  # set by the cancel route or on TrainingCancelledError
+
+
+@dataclass
+class TrainJobRecord:  # in-memory record of one train job; converted with asdict() for the API
+    id: str
+    name: str
+    preset: str
+    status: str  # a TrainJobStatus value, kept as a plain string
+    created_at: str  # text from datetime.now().isoformat()
+    phase: str = "queued"
+    step: int = 0
+    total_steps: int = 0
+    error: str | None = None
+    artifact_url: str | None = None
+    artifact_name: str | None = None
+    registered_lora_id: str | None = None  # id of the custom LoRA entry once registered
+    validation_clips: list[dict[str, Any]] = field(default_factory=list)
+
+
+def _init_train_state(state: AppState) -> None:  # lazily attaches the train attributes to `state`, once
+    if getattr(state, "_train_initialized", False):
+        return  # already set up; existing jobs, queues and lock are kept
+    state.train_jobs: dict[str, TrainJobRecord] = {}
+    state.train_event_queues: dict[str, asyncio.Queue[dict[str, Any]]] = {}  # one event queue per job id
+    state._pending_train: asyncio.Queue[str] = asyncio.Queue()  # job ids waiting for the worker loop
+    state._cancelled_train_jobs: set[str] = set()
+    state._active_train_job_id: str | None = None
+    state._train_worker_started = False
+    state._mlx_lock = asyncio.Lock()  # held while a train job runs
+    state._train_initialized = True
+
+
+def is_training_active(state: AppState) -> bool:  # True while a job id occupies the active slot
+    _init_train_state(state)
+    return state._active_train_job_id is not None
+
+
+async def emit_train(state: AppState, job_id: str, event: dict[str, Any]) -> None:  # enqueue one event
+    _init_train_state(state)
+    q = state.train_event_queues.get(job_id)
+    if q:  # events for a job without a registered queue are dropped
+        await q.put(event)
+
+
+def _job_for_api(state: AppState, job: TrainJobRecord) -> dict[str, Any]:  # record -> response dict
+    data = asdict(job)
+    status = None
+    try:
+        from ltx_train_backend import load_status
+
+        status = load_status(state.output_dir, job.id)
+    except Exception:
+        pass  # best effort: on any failure only the in-memory record is returned
+    if status:
+        for key in (  # fields that load_status may override or add on top of the record
+            "phase",
+            "step",
+            "total_steps",
+            "loss",
+            "lr",
+            "eta_s",
+            "artifact_url",
+            "artifact_lora",
+            "validation_clips",
+            "error",
+            "stats",
+        ):
+            if key in status and status[key] is not None:  # a None never overwrites a record value
+                data[key] = status[key]
+    return data
+
+
+def _parse_train_request(body: dict[str, Any]) -> Any:  # manifest dict -> TrainJobRequest, defaults filled in
+    from ltx_train_backend import (
+        PreprocessOptions,
+        SliceOptions,
+        TrainHyperparams,
+        TrainJobRequest,
+        TRAIN_PRESETS,
+    )
+
+    preset = str(body.get("preset") or "t2v").strip().lower()
+    if preset not in TRAIN_PRESETS:
+        preset = "t2v"  # unknown preset names fall back to the default instead of failing
+    preset_info = TRAIN_PRESETS[preset]
+
+    slice_raw = body.get("slice") or {}
+    preprocess_raw = body.get("preprocess") or {}
+    train_raw = body.get("train") or {}
+
+    slice_opts = SliceOptions(
+        enabled=bool(slice_raw.get("enabled", False)),
+        interval=float(slice_raw.get("interval", 4.0)),
+        res=str(slice_raw.get("res") or "384x384"),
+        fps=float(slice_raw.get("fps", 24.0)),
+        fit=str(slice_raw.get("fit") or "crop"),
+        caption_template=slice_raw.get("caption_template"),
+        max_clips=slice_raw.get("max_clips"),
+    )
+    preprocess = PreprocessOptions(
+        width=int(preprocess_raw.get("width") or 704),
+        height=int(preprocess_raw.get("height") or 480),
+        max_frames=int(preprocess_raw.get("max_frames") or 97),
+        with_audio=bool(preprocess_raw.get("with_audio", preset_info.with_audio)),
+        frame_rate=float(preprocess_raw.get("frame_rate") or 24.0),
+    )
+    prompts_raw = train_raw.get("validation_prompts")
+    if isinstance(prompts_raw, str):
+        prompts = [p.strip() for p in prompts_raw.split("\n") if p.strip()]  # one prompt per line
+    elif isinstance(prompts_raw, list):
+        prompts = [str(p).strip() for p in prompts_raw if str(p).strip()]
+    else:
+        prompts = ["a cinematic landscape at sunset"]
+
+    train = TrainHyperparams(
+        steps=int(train_raw.get("steps") or 2000),
+        rank=int(train_raw.get("rank") or 64),
+        learning_rate=float(train_raw.get("learning_rate") or 5e-4),
+        validation_prompts=prompts,
+        validation_interval=int(train_raw.get("validation_interval") or 500),
+        checkpoint_interval=int(train_raw.get("checkpoint_interval") or 500),
+        low_ram=bool(train_raw.get("low_ram", preset_info.low_ram_default)),
+        seed=int(42 if train_raw.get("seed") in (None, "") else train_raw["seed"]),  # 0 is a valid seed
+    )
+
+    return TrainJobRequest(
+        preset=preset,
+        name=str(body.get("name") or "My LoRA").strip() or "My LoRA",
+        model_id=str(body.get("model_id") or body.get("preferred_model") or "auto"),
+        model_dir=body.get("model_dir"),
+        slice=slice_opts,
+        preprocess=preprocess,
+        train=train,
+    )
+
+
+async def _execute_train_job(state: AppState, job_id: str) -> None:
+    """Run one queued job in a worker thread and mirror its events onto the job record."""
+    from ltx_train_backend import TrainingCancelledError, run_train_job
+
+    _init_train_state(state)
+    job = state.train_jobs.get(job_id)
+    if not job or job_id in state._cancelled_train_jobs:
+        return  # unknown job, or cancelled while queued (the cancel route already set its status)
+
+    body = _TRAIN_BODIES.get(job_id, {})
+    req = _parse_train_request(body)
+
+    job.status = TrainJobStatus.RUNNING.value
+    job.phase = "starting"
+    job.total_steps = int(req.train.steps)
+    state._active_train_job_id = job_id
+
+    loop = asyncio.get_running_loop()
+
+    def on_event(event: dict[str, Any]) -> None:
+        etype = event.get("type")
+        if etype == "train_step":
+            job.step = int(event.get("step") or job.step)
+            job.total_steps = int(event.get("total_steps") or job.total_steps)
+        elif etype == "phase_started":
+            job.phase = str(event.get("phase") or job.phase)
+        elif etype == "train_validation":
+            videos = event.get("videos") or []
+            job.validation_clips.extend(videos)
+        elif etype == "job_done":
+            job.artifact_url = event.get("artifact_url")
+            job.artifact_name = event.get("artifact_name")
+
+        def _schedule() -> None:
+            loop.create_task(emit_train(state, job_id, event))
+
+        loop.call_soon_threadsafe(_schedule)  # hand the emit over to the event loop thread-safely
+
+    def should_cancel() -> bool:
+        return job_id in state._cancelled_train_jobs
+
+    async with state._mlx_lock:
+        # The try spans the precondition so that a refusal still releases the active-job slot;
+        # raising before it would leave _active_train_job_id set and block all later work.
+        try:
+            if state.is_generation_active():
+                raise RuntimeError("Cannot train while video generation is running")
+            result = await asyncio.to_thread(
+                run_train_job,
+                req,
+                output_dir=state.output_dir,
+                job_id=job_id,
+                on_event=on_event,
+                should_cancel=should_cancel,
+            )
+            job.status = TrainJobStatus.DONE.value
+            job.phase = str(result.get("phase") or "done")
+            job.artifact_url = result.get("artifact_url") or job.artifact_url
+            if result.get("artifact_lora"):
+                job.artifact_name = Path(str(result["artifact_lora"])).name
+            await emit_train(
+                state,
+                job_id,
+                {
+                    "type": "job_done",
+                    "artifact_url": job.artifact_url,
+                    "artifact_name": job.artifact_name,
+                },
+            )
+        except TrainingCancelledError:
+            job.status = TrainJobStatus.CANCELLED.value
+            job.phase = "cancelled"
+            job.error = "Cancelled"
+            await emit_train(state, job_id, {"type": "error", "message": "Cancelled"})
+        finally:
+            state._active_train_job_id = None
+
+
+async def _train_worker_loop(state: AppState) -> None:
+    """Run queued train jobs one at a time, forever; every job ends with a job_complete event."""
+    while True:
+        job_id = await state._pending_train.get()
+        try:
+            await _execute_train_job(state, job_id)
+        except Exception as exc:
+            # Worker boundary: the loop must survive a failed job, but the traceback is
+            # logged so the failure is diagnosable beyond the one-line message on the job.
+            log.exception("Train job %s failed", job_id)
+            job = state.train_jobs.get(job_id)
+            if job and job.status != TrainJobStatus.CANCELLED.value:
+                job.status = TrainJobStatus.FAILED.value
+                job.error = str(exc)
+                await emit_train(state, job_id, {"type": "error", "message": str(exc)})
+        finally:
+            await emit_train(state, job_id, {"type": "job_complete", "job_id": job_id})
+            _TRAIN_BODIES.pop(job_id, None)
+
+
+def ensure_train_worker(state: AppState) -> None:
+    """Start the train worker once, keeping a strong task reference (the loop's are weak)."""
+    _init_train_state(state)
+    if not state._train_worker_started:
+        state._train_worker_task = asyncio.create_task(_train_worker_loop(state))
+        state._train_worker_started = True
+
+
+def register_train_routes(app: Any, state: AppState) -> None:
+    from fastapi import File, Form, HTTPException, UploadFile
+    from fastapi.responses import FileResponse, StreamingResponse
+    from ltx_train_backend import (
+        job_root,
+        load_status,
+        register_trained_lora,
+        trainer_health,
+        training_job_paths,
+    )
+
+    _init_train_state(state)
+
+    @app.get("/api/train/health")
+    async def train_health():  # trainer_health() report plus the two live activity flags
+        health = trainer_health(ffmpeg_required=False)
+        health["training_active"] = is_training_active(state)
+        health["generation_active"] = state.is_generation_active()
+        return health
+
+    @app.get("/api/train/presets")
+    async def train_presets():  # just the "presets" part of the trainer_health() report
+        health = trainer_health(ffmpeg_required=False)
+        return {"presets": health.get("presets") or []}  # missing or empty presets become []
+
+    @app.get("/api/train/jobs")
+    async def list_train_jobs():  # in-memory jobs only, newest first
+        jobs = sorted(
+            state.train_jobs.values(),
+            key=lambda j: j.created_at,  # created_at is ISO text, compared as a string
+            reverse=True,
+        )
+        return {"jobs": [_job_for_api(state, j) for j in jobs]}
+
+    @app.get("/api/train/jobs/{job_id}")
+    async def get_train_job(job_id: str):  # one job; falls back to load_status for jobs not in memory
+        job = state.train_jobs.get(job_id)
+        if not job:
+            status = load_status(state.output_dir, job_id)
+            if not status:
+                raise HTTPException(404, "Job not found")
+            return status  # raw load_status result, not passed through _job_for_api
+        return _job_for_api(state, job)
+
+    @app.post("/api/train/jobs")
+    async def create_train_job(
+        manifest: str = Form(...),
+        videos: list[UploadFile] = File(...),
+    ):
+        """Validate the manifest and uploads, store the videos, then queue a train job."""
+        ensure_train_worker(state)
+        if is_training_active(state):
+            raise HTTPException(409, "A training job is already running")
+        if state.is_generation_active():
+            raise HTTPException(409, "Cannot start training while generation is active")
+
+        # Parse the whole manifest before touching disk: a bad value is then a 400 with no
+        # orphaned uploads, instead of an unhandled error after the files were written.
+        try:
+            body = json.loads(manifest)
+            req = _parse_train_request(body)
+        except json.JSONDecodeError as exc:
+            raise HTTPException(400, f"Invalid manifest JSON: {exc}") from exc
+        except (AttributeError, OverflowError, TypeError, ValueError) as exc:
+            raise HTTPException(400, f"Invalid manifest: {exc}") from exc
+
+        video_exts = {".mp4", ".mov", ".avi", ".mkv", ".webm"}
+        has_video = any(Path(u.filename or "").suffix.lower() in video_exts for u in videos)
+        if not has_video:
+            raise HTTPException(400, "At least one video file is required")
+
+        job_id = f"train_{uuid.uuid4().hex[:10]}"
+        paths = training_job_paths(state.output_dir, job_id)
+        paths.ensure_dirs()
+
+        saved = 0
+        for upload in videos:
+            name = Path(upload.filename or f"upload_{saved}").name  # basename only: no client dirs
+            dest = paths.raw / name
+            with dest.open("wb") as fh:
+                shutil.copyfileobj(upload.file, fh)
+            saved += 1
+
+        job = TrainJobRecord(
+            id=job_id,
+            name=req.name,
+            preset=req.preset,
+            status=TrainJobStatus.QUEUED.value,
+            created_at=datetime.now().isoformat(),
+            total_steps=int(req.train.steps),
+        )
+        state.train_jobs[job_id] = job
+        _TRAIN_BODIES[job_id] = body
+        state.train_event_queues[job_id] = asyncio.Queue()
+        await state._pending_train.put(job_id)
+        log.info("Queued train job %s  preset=%s  videos=%d", job_id, req.preset, saved)
+        return {"job_id": job_id, "name": job.name, "preset": job.preset}
+
+    @app.post("/api/train/jobs/{job_id}/cancel")
+    async def cancel_train_job(job_id: str):  # flags the job as cancelled; a no-op for finished jobs
+        job = state.train_jobs.get(job_id)
+        if not job:
+            raise HTTPException(404, "Job not found")
+        if job.status in (
+            TrainJobStatus.DONE.value,
+            TrainJobStatus.FAILED.value,
+            TrainJobStatus.CANCELLED.value,
+        ):
+            return {"ok": True, "status": job.status}  # already terminal: report it unchanged
+        state._cancelled_train_jobs.add(job_id)  # read by should_cancel() in _execute_train_job
+        job.status = TrainJobStatus.CANCELLED.value  # set right away, before the worker reacts
+        return {"ok": True, "status": job.status}
+
+    @app.get("/api/train/jobs/{job_id}/artifacts/{artifact_path:path}")
+    async def train_artifact(job_id: str, artifact_path: str):  # serves files under <job>/outputs only
+        root = (job_root(state.output_dir, job_id) / "outputs").resolve()
+        target = (root / artifact_path).resolve()
+        if not target.is_relative_to(root):  # component-wise; a str prefix would admit "outputs_x/"
+            raise HTTPException(400, "Invalid path")
+        if not target.is_file():
+            raise HTTPException(404, "Artifact not found")
+        media = "video/mp4" if target.suffix.lower() == ".mp4" else "application/octet-stream"
+        return FileResponse(target, media_type=media, filename=target.name)
+
+    @app.post("/api/train/jobs/{job_id}/register-lora")
+    async def register_lora(job_id: str, body: dict[str, Any] | None = None):  # adds the job's LoRA to the custom list
+        from web_ui import _label_for_lora_spec, _lora_catalog, _read_custom_loras, _write_custom_loras
+
+        job = state.train_jobs.get(job_id)
+        status = load_status(state.output_dir, job_id)
+        artifact = None
+        if job and job.artifact_url:  # first choice: the in-memory record's artifact name
+            artifact_name = job.artifact_name
+            if artifact_name:
+                artifact = job_root(state.output_dir, job_id) / "outputs" / artifact_name
+        if artifact is None and status:  # otherwise the path recorded by load_status
+            al = status.get("artifact_lora")
+            if al:
+                artifact = Path(str(al))
+        if artifact is None or not artifact.is_file():
+            raise HTTPException(404, "Trained LoRA artifact not found")
+
+        body = body or {}
+        label = str(body.get("label") or (job.name if job else "") or "Trained LoRA").strip()
+        try:
+            scale = float(body.get("scale", 1.0))
+        except (TypeError, ValueError):
+            raise HTTPException(400, "scale must be a number")
+
+        dest = register_trained_lora(artifact, name=label)
+        spec = str(dest)
+        lid = f"custom_{uuid.uuid4().hex[:8]}"
+        entries = [  # existing entries, reduced to the four persisted keys
+            {
+                "id": e["id"],
+                "label": e["label"],
+                "spec": e["spec"],
+                "scale": e["scale"],
+            }
+            for e in _read_custom_loras(state.output_dir)
+        ]
+        entries.append({"id": lid, "label": label or _label_for_lora_spec(spec), "spec": spec, "scale": scale})
+        _write_custom_loras(state.output_dir, entries)
+        if job:
+            job.registered_lora_id = lid
+        lora_presets, default_lora_preset_id = _lora_catalog(state.output_dir)
+        preferred = state.preferred_lora_preset_ids()
+        if lid not in preferred:
+            preferred.append(lid)
+            state.persist_preferred_loras(preferred)
+        return {
+            "ok": True,
+            "id": lid,
+            "spec": spec,
+            "label": label,
+            "lora_presets": lora_presets,
+            "default_lora_preset_id": default_lora_preset_id,
+            "preferred_lora_preset_ids": state.preferred_lora_preset_ids(),
+        }
+
+    @app.get("/api/train/jobs/{job_id}/events")
+    async def train_events(job_id: str):
+        """Stream a job's events as SSE; finished jobs get one snapshot and job_complete."""
+        if job_id not in state.train_jobs:
+            status = load_status(state.output_dir, job_id)
+            if not status:
+                raise HTTPException(404, "Job not found")
+        if job_id not in state.train_event_queues:
+            state.train_event_queues[job_id] = asyncio.Queue()
+
+        async def stream() -> AsyncIterator[str]:
+            q = state.train_event_queues[job_id]
+            job = state.train_jobs.get(job_id)
+            if job and job.status in (
+                TrainJobStatus.DONE.value,
+                TrainJobStatus.FAILED.value,
+                TrainJobStatus.CANCELLED.value,
+            ):
+                snap = _job_for_api(state, job)  # job is known to be set inside this branch
+                yield f"data: {json.dumps({'type': 'snapshot', 'job': snap})}\n\n"
+                yield f"data: {json.dumps({'type': 'job_complete', 'job_id': job_id})}\n\n"
+                return
+            status = load_status(state.output_dir, job_id)
+            if status and status.get("phase") in ("done", "failed", "cancelled"):
+                yield f"data: {json.dumps({'type': 'snapshot', 'job': status})}\n\n"
+                yield f"data: {json.dumps({'type': 'job_complete', 'job_id': job_id})}\n\n"
+                return
+            while True:
+                try:
+                    event = await asyncio.wait_for(q.get(), timeout=120.0)
+                    yield f"data: {json.dumps(event)}\n\n"
+                    if event.get("type") in ("job_complete", "job_done"):  # "error" keeps the stream open
+                        break
+                except asyncio.TimeoutError:
+                    yield f"data: {json.dumps({'type': 'ping'})}\n\n"  # keep-alive after 120 s of silence
+
+        return StreamingResponse(stream(), media_type="text/event-stream")
diff --git a/web_ui.py b/web_ui.py
index 00ef8a1..d850bd6 100644
--- a/web_ui.py
+++ b/web_ui.py
@@ -1711,7 +1711,13 @@ async def _finish_autoconcat(
 
 async def _execute_run(state: AppState, run_id: str) -> None:
     if state.embedded and state.video_server is not None:
-        await _execute_run_embedded(state, run_id)
+        from web_train import _init_train_state, is_training_active
+
+        _init_train_state(state)
+        if is_training_active(state):
+            raise RuntimeError("Cannot generate while LoRA training is active")
+        async with state._mlx_lock:
+            await _execute_run_embedded(state, run_id)
         return
     await _execute_run_via_ws(state, run_id)
 
@@ -2158,6 +2164,9 @@ def create_app(
     @asynccontextmanager
     async def lifespan(app: FastAPI):
         state.ensure_worker()
+        from web_train import ensure_train_worker
+
+        ensure_train_worker(state)
         loop = asyncio.get_running_loop()
 
         def _on_interrupt() -> None:
@@ -2781,6 +2790,10 @@ async def websocket_inference(ws: WebSocket) -> None:
             await ws.accept()
             await ws_handler(ws)
 
+    from web_train import register_train_routes
+
+    register_train_routes(app, state)
+
     if mount_static and resolve_web_dist().is_dir():
         app.mount("/", StaticFiles(directory=str(resolve_web_dist()), html=True), name="static")
 

From 47f928078465810c188ac77a3fdc1acd96d090c3 Mon Sep 17 00:00:00 2001
From: Lorenzo Mangani <lorenzo.mangani@gmail.com>
Date: Thu, 18 Jun 2026 21:54:32 +0200
Subject: [PATCH 5/7] Add durable train job persistence, resume, and IC-LoRA
 preset.

Persist manifests and status under web_outputs/train/, reload jobs on startup with interrupted marking, expose resume API, encode reference latents for video_to_video training, and extend the /train UI with paired uploads and reference downscale controls.

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 ltx_train_backend.py        | 330 +++++++++++++++++++++++++++++++--
 train_configs/lora_v2v.yaml |  56 ++++++
 web/src/TrainPage.tsx       | 213 ++++++++++++++++++---
 web/src/api/train.ts        |  18 +-
 web/src/index.css           |  25 +++
 web_train.py                | 360 ++++++++++++++++++++++--------------
 6 files changed, 816 insertions(+), 186 deletions(-)
 create mode 100644 train_configs/lora_v2v.yaml

diff --git a/ltx_train_backend.py b/ltx_train_backend.py
index 04d1d81..41f99e3 100644
--- a/ltx_train_backend.py
+++ b/ltx_train_backend.py
@@ -13,6 +13,7 @@
 
 from __future__ import annotations
 
+import json
 import logging
 import re
 import shutil
@@ -74,8 +75,19 @@ class TrainPresetInfo:
         with_audio=True,
         low_ram_default=True,
     ),
+    "v2v": TrainPresetInfo(
+        id="v2v",
+        label="IC-LoRA (video-to-video)",
+        description="Learn a style transfer from paired reference → target clips.",
+        ram_hint="48 GB recommended",
+        with_audio=False,
+        low_ram_default=True,
+    ),
 }
 
+VIDEO_EXTENSIONS = {".mp4", ".mov", ".avi", ".mkv", ".webm"}
+ACTIVE_JOB_PHASES = frozenset({"queued", "slicing", "preprocessing", "training", "starting"})
+
 
 @dataclass
 class SliceOptions:
@@ -95,6 +107,7 @@ class PreprocessOptions:
     max_frames: int = 97
     with_audio: bool = False
     frame_rate: float | None = 24.0
+    reference_downscale_factor: int = 2
 
 
 @dataclass
@@ -156,6 +169,8 @@ class TrainJobPaths:
 
     root: Path
     raw: Path
+    references: Path
+    reference_clips: Path
     clips: Path
     captions: Path
     preprocessed: Path
@@ -163,7 +178,16 @@ class TrainJobPaths:
     config: Path
 
     def ensure_dirs(self) -> None:
-        for d in (self.root, self.raw, self.clips, self.captions, self.preprocessed, self.outputs):
+        for d in (
+            self.root,
+            self.raw,
+            self.references,
+            self.reference_clips,
+            self.clips,
+            self.captions,
+            self.preprocessed,
+            self.outputs,
+        ):
             d.mkdir(parents=True, exist_ok=True)
 
 
@@ -172,6 +196,8 @@ def training_job_paths(output_dir: Path, job_id: str) -> TrainJobPaths:
     return TrainJobPaths(
         root=root,
         raw=root / "raw",
+        references=root / "references",
+        reference_clips=root / "reference_clips",
         clips=root / "clips",
         captions=root / "captions",
         preprocessed=root / "preprocessed",
@@ -210,14 +236,226 @@ def load_status(output_dir: Path, job_id: str) -> dict[str, Any] | None:
 
 
 def save_status(output_dir: Path, job_id: str, payload: dict[str, Any]) -> None:
-    import json
-
     root = job_root(output_dir, job_id)
     root.mkdir(parents=True, exist_ok=True)
     path = status_path(output_dir, job_id)
     path.write_text(json.dumps(payload, indent=2), encoding="utf-8")
 
 
+def manifest_path(output_dir: Path, job_id: str) -> Path:
+    return job_root(output_dir, job_id) / "manifest.json"  # manifest.json sits directly in the job root
+
+
+def save_manifest(output_dir: Path, job_id: str, payload: dict[str, Any]) -> None:
+    """Write ``payload`` as indented JSON to the job's manifest file, creating the job root if needed."""
+    job_root(output_dir, job_id).mkdir(parents=True, exist_ok=True)
+    manifest_path(output_dir, job_id).write_text(json.dumps(payload, indent=2), encoding="utf-8")
+
+
+def load_manifest(output_dir: Path, job_id: str) -> dict[str, Any] | None:
+    path = manifest_path(output_dir, job_id)
+    if not path.is_file():
+        return None  # no manifest file exists for this job
+    try:
+        return json.loads(path.read_text(encoding="utf-8"))
+    except (OSError, ValueError):  # unreadable file, undecodable text, or malformed JSON
+        return None
+
+
+def discover_train_job_ids(output_dir: Path) -> list[str]:
+    """Return the names of job directories under ``<output_dir>/train``.
+
+    Only directories containing a ``status.json`` count; results follow sorted path order.
+    """
+    train_root = output_dir.resolve() / "train"
+    if not train_root.is_dir():
+        return []
+    return [d.name for d in sorted(train_root.iterdir()) if d.is_dir() and (d / "status.json").is_file()]
+
+
+def _phase_to_status(phase: str | None) -> str:
+    """Collapse a job phase into the coarse status string stored alongside it.
+
+    A missing or empty phase is treated as ``"queued"``; matching ignores case and padding.
+    """
+    key = (phase or "queued").strip().lower()
+    # Terminal and interrupted phases are reported verbatim.
+    if key in ("done", "failed", "cancelled", "interrupted"):
+        return key
+    # Any phase listed in ACTIVE_JOB_PHASES reports as "running".
+    if key in ACTIVE_JOB_PHASES:
+        return "running"
+    return "queued"  # unrecognised phase
+
+
+def reconcile_interrupted_jobs(output_dir: Path) -> int:
+    """Rewrite jobs persisted in an active phase as interrupted; return how many were changed."""
+    interrupted = 0
+    for job_id in discover_train_job_ids(output_dir):
+        status = load_status(output_dir, job_id)
+        # Skip jobs with no usable status and jobs that are not in an active phase.
+        if not status or str(status.get("phase") or "").lower() not in ACTIVE_JOB_PHASES:
+            continue
+        status["phase"] = "interrupted"
+        status["status"] = "interrupted"
+        # An error message that is already recorded takes precedence over the generic one.
+        status["error"] = status.get("error") or "Interrupted by server restart"
+        save_status(output_dir, job_id, status)
+        interrupted += 1
+    return interrupted
+
+
+def job_api_payload(output_dir: Path, job_id: str) -> dict[str, Any] | None:
+    """Merge persisted status + manifest for API responses; ``None`` when neither exists."""
+    status = load_status(output_dir, job_id)
+    manifest = load_manifest(output_dir, job_id)
+    if not status and not manifest:
+        return None
+    data: dict[str, Any] = {"id": job_id}
+    if status:
+        data.update(status)
+        data["id"] = job_id  # the requested id wins over whatever the status file stored
+        data["status"] = status.get("status") or _phase_to_status(status.get("phase"))
+    if manifest:
+        data.setdefault("name", manifest.get("name"))
+        data.setdefault("preset", manifest.get("preset"))
+        if manifest.get("registered_lora_id"):
+            data["registered_lora_id"] = manifest["registered_lora_id"]
+        data["created_at"] = manifest.get("created_at") or data.get("created_at")
+    # The merge can leave created_at present but None, so normalise any falsy value to "".
+    data["created_at"] = data.get("created_at") or ""
+    return data
+
+
+def parse_train_request(body: dict[str, Any]) -> TrainJobRequest:
+    """Build a ``TrainJobRequest`` from a JSON-style request body, filling in defaults for absent fields."""
+    preset = str(body.get("preset") or "t2v").strip().lower()
+    if preset not in TRAIN_PRESETS:
+        preset = "t2v"  # unknown presets fall back to "t2v"
+    preset_info = TRAIN_PRESETS[preset]
+    slice_raw = body.get("slice") or {}
+    preprocess_raw = body.get("preprocess") or {}
+    train_raw = body.get("train") or {}
+
+    slice_opts = SliceOptions(
+        enabled=bool(slice_raw.get("enabled", False)),
+        interval=float(slice_raw.get("interval") or 4.0),  # ``or`` also covers an explicit null
+        res=str(slice_raw.get("res") or "384x384"),
+        fps=float(slice_raw.get("fps") or 24.0),
+        fit=str(slice_raw.get("fit") or "crop"),
+        caption_template=slice_raw.get("caption_template"),
+        max_clips=slice_raw.get("max_clips"),
+    )
+    preprocess = PreprocessOptions(
+        width=int(preprocess_raw.get("width") or 704),
+        height=int(preprocess_raw.get("height") or 480),
+        max_frames=int(preprocess_raw.get("max_frames") or 97),
+        with_audio=bool(preprocess_raw.get("with_audio", preset_info.with_audio)),
+        frame_rate=float(preprocess_raw.get("frame_rate") or 24.0),
+        reference_downscale_factor=int(preprocess_raw.get("reference_downscale_factor") or 2),
+    )
+    prompts_raw = train_raw.get("validation_prompts")
+    if isinstance(prompts_raw, str):
+        prompts = [p.strip() for p in prompts_raw.split("\n") if p.strip()]
+    elif isinstance(prompts_raw, list):
+        prompts = [str(p).strip() for p in prompts_raw if str(p).strip()]
+    else:
+        prompts = ["a cinematic landscape at sunset"]
+
+    train = TrainHyperparams(
+        steps=int(train_raw.get("steps") or 2000),
+        rank=int(train_raw.get("rank") or 64),
+        learning_rate=float(train_raw.get("learning_rate") or 5e-4),
+        validation_prompts=prompts,
+        validation_interval=int(train_raw.get("validation_interval") or 500),
+        checkpoint_interval=int(train_raw.get("checkpoint_interval") or 500),
+        low_ram=bool(train_raw.get("low_ram", preset_info.low_ram_default)),
+        seed=int(train_raw["seed"]) if train_raw.get("seed") not in (None, "") else 42,  # seed 0 is valid
+    )
+
+    return TrainJobRequest(
+        preset=preset,
+        name=str(body.get("name") or "My LoRA").strip() or "My LoRA",
+        model_id=str(body.get("model_id") or body.get("preferred_model") or "auto"),
+        model_dir=body.get("model_dir"),
+        slice=slice_opts,
+        preprocess=preprocess,
+        train=train,
+    )
+
+
+def _list_videos(directory: Path) -> list[Path]:
+    """Return every video file beneath ``directory`` (searched recursively), sorted by path."""
+    if not directory.is_dir():
+        return []
+    found = [p for p in directory.rglob("*") if p.is_file() and p.suffix.lower() in VIDEO_EXTENSIONS]
+    # rglob makes no ordering promise; sort so callers see a deterministic listing.
+    found.sort()
+    return found
+
+
+def _pair_reference_videos(targets: list[Path], references_dir: Path) -> list[Path]:
+    """Pick one reference video per target, in target order.
+
+    A same-named file wins, else the first video (by path) with the same stem; ValueError if none.
+    """
+    by_stem: dict[str, Path] = {}
+    for ref in _list_videos(references_dir):  # one tree walk instead of one per unmatched target
+        by_stem.setdefault(ref.stem, ref)  # sorted input, so the first hit per stem is kept
+    paired: list[Path] = []
+    for target in targets:
+        direct = references_dir / target.name
+        match = direct if direct.is_file() else by_stem.get(target.stem)
+        if match is None:
+            raise ValueError(f"No reference video paired with target {target.name}")
+        paired.append(match)
+    return paired
+
+
+def _validation_reference_paths(paths: TrainJobPaths, num_prompts: int) -> list[str]:
+    """Return ``num_prompts`` resolved reference paths; prompts beyond the last video reuse it."""
+    refs = _list_videos(paths.references)
+    if not refs:
+        raise ValueError("IC-LoRA requires reference videos in the job references folder")
+    last = len(refs) - 1
+    # Prompt i gets the i-th reference, clamped to the final one.
+    return [str(refs[min(i, last)].resolve()) for i in range(num_prompts)]
+
+
+def encode_reference_latents(
+    reference_videos: list[Path],
+    *,
+    preprocessed_root: Path,
+    model_dir: str,
+    target_height: int,
+    target_width: int,
+    max_frames: int,
+    frame_rate: float | None,
+    downscale_factor: int,
+) -> None:
+    """Encode paired reference clips into ``reference_latents/`` for IC-LoRA.
+
+    Each target dimension is divided by ``downscale_factor`` (at least 1), floored at 32
+    and rounded down to a multiple of 32 before being handed to ``_encode_all_videos``.
+    """
+    from ltx_trainer_mlx.preprocess import _encode_all_videos
+
+    factor = max(1, int(downscale_factor))
+    ref_h, ref_w = ((max(32, int(side) // factor) // 32) * 32 for side in (target_height, target_width))
+
+    latents_out = preprocessed_root / ".precomputed" / "reference_latents"
+    latents_out.mkdir(parents=True, exist_ok=True)
+    _encode_all_videos(
+        video_files=reference_videos,
+        latents_dir=latents_out,
+        model_dir=model_dir,
+        target_height=ref_h,
+        target_width=ref_w,
+        max_frames=max_frames,
+        frame_rate=frame_rate,
+    )
+
+
 def _preset_yaml_path(preset: str) -> Path:
     key = (preset or "t2v").strip().lower()
     path = TRAIN_CONFIGS_DIR / f"lora_{key}.yaml"
@@ -263,16 +501,22 @@ def build_trainer_config(req: TrainJobRequest, *, paths: TrainJobPaths) -> Any:
     nf = _nearest_valid_frames(int(req.preprocess.max_frames))
     val["video_dims"] = [w, h, nf]
     val["frame_rate"] = float(req.preprocess.frame_rate or 24.0)
-    val["generate_audio"] = bool(TRAIN_PRESETS.get(req.preset, TRAIN_PRESETS["t2v"]).with_audio)
+    preset_info = TRAIN_PRESETS.get(req.preset, TRAIN_PRESETS["t2v"])
+    if req.preset == "v2v":
+        val["reference_videos"] = _validation_reference_paths(paths, len(prompts))
+        val["reference_downscale_factor"] = max(1, int(req.preprocess.reference_downscale_factor))
+        val["generate_audio"] = False
+    else:
+        val["generate_audio"] = bool(preset_info.with_audio)
     raw["validation"] = val
 
     ckpt = raw.get("checkpoints") or {}
     ckpt["interval"] = int(req.train.checkpoint_interval)
     raw["checkpoints"] = ckpt
 
-    strat = raw.get("training_strategy") or {}
-    preset_info = TRAIN_PRESETS.get(req.preset, TRAIN_PRESETS["t2v"])
-    strat["generate_audio"] = preset_info.with_audio
+    strat = dict(raw.get("training_strategy") or {})
+    if req.preset != "v2v":
+        strat["generate_audio"] = preset_info.with_audio
     raw["training_strategy"] = strat
 
     return LtxTrainerConfig(**raw)
@@ -335,26 +579,37 @@ def run_train_job(
         "name": req.name,
         "preset": req.preset,
         "phase": "queued",
+        "status": "running",
         "created_at": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()),
         "step": 0,
         "total_steps": int(req.train.steps),
         "job_dir": str(paths.root),
         "model_path": None,
         "error": None,
+        "validation_clips": [],
     }
+    existing = load_status(output_dir, job_id)
+    if existing:
+        status["created_at"] = existing.get("created_at") or status["created_at"]
+        status["validation_clips"] = existing.get("validation_clips") or []
     save_status(output_dir, job_id, status)
 
     def emit(event: dict[str, Any]) -> None:
         nonlocal status
         status.update({k: v for k, v in event.items() if k != "type"})
+        status["status"] = _phase_to_status(status.get("phase"))
         save_status(output_dir, job_id, status)
         if on_event:
             on_event(event)
 
     videos_dir = paths.raw
+    references_dir = paths.references
     captions_dir: str | None = None
 
     try:
+        if req.preset == "v2v" and not _list_videos(paths.references):
+            raise ValueError("IC-LoRA requires paired reference videos (upload to references/)")
+
         if req.slice.enabled:
             _check_cancel(should_cancel)
             status["phase"] = "slicing"
@@ -363,11 +618,7 @@ def emit(event: dict[str, Any]) -> None:
                 raise RuntimeError("ffmpeg is required for slice")
             from ltx_trainer_mlx.slice_clips import slice_videos
 
-            sources = sorted(
-                p
-                for p in paths.raw.iterdir()
-                if p.suffix.lower() in {".mp4", ".mov", ".avi", ".mkv", ".webm"}
-            )
+            sources = _list_videos(paths.raw)
             if not sources:
                 raise ValueError("No video files found in upload")
             count = slice_videos(
@@ -380,9 +631,32 @@ def emit(event: dict[str, Any]) -> None:
                 caption_template=req.slice.caption_template,
                 max_clips=req.slice.max_clips,
             )
-            emit({"type": "phase_progress", "phase": "slicing", "message": f"Created {count} clips"})
+            emit({"type": "phase_progress", "phase": "slicing", "message": f"Created {count} target clips"})
             videos_dir = paths.clips
             captions_dir = None
+
+            if req.preset == "v2v":
+                ref_sources = _list_videos(paths.references)
+                if not ref_sources:
+                    raise ValueError("IC-LoRA requires reference videos when slicing")
+                ref_count = slice_videos(
+                    [str(p) for p in ref_sources],
+                    str(paths.reference_clips),
+                    interval=float(req.slice.interval),
+                    res=str(req.slice.res),
+                    fps=float(req.slice.fps),
+                    fit=str(req.slice.fit),
+                    caption_template=req.slice.caption_template,
+                    max_clips=req.slice.max_clips,
+                )
+                emit(
+                    {
+                        "type": "phase_progress",
+                        "phase": "slicing",
+                        "message": f"Created {ref_count} reference clips",
+                    }
+                )
+                references_dir = paths.reference_clips
         else:
             txts = list(paths.raw.glob("*.txt"))
             if txts:
@@ -393,7 +667,7 @@ def emit(event: dict[str, Any]) -> None:
 
         _check_cancel(should_cancel)
         status["phase"] = "preprocessing"
-        emit({"type": "phase_started", "phase": "preprocessing", "message": "Encoding latents…"})
+        emit({"type": "phase_started", "phase": "preprocessing", "message": "Encoding target latents…"})
         from ltx_trainer_mlx.preprocess import preprocess_dataset
 
         model_path = resolve_mlx_weights_directory(req.model_id, req.model_dir)
@@ -413,6 +687,31 @@ def emit(event: dict[str, Any]) -> None:
             with_audio=with_audio,
             frame_rate=float(req.preprocess.frame_rate) if req.preprocess.frame_rate else None,
         )
+
+        if req.preset == "v2v":
+            _check_cancel(should_cancel)
+            emit(
+                {
+                    "type": "phase_progress",
+                    "phase": "preprocessing",
+                    "message": "Encoding reference latents for IC-LoRA…",
+                }
+            )
+            targets = _list_videos(videos_dir)
+            ref_paths = _pair_reference_videos(targets, references_dir)
+            if len(ref_paths) != len(targets):
+                raise ValueError("Reference video count must match target clip count")
+            encode_reference_latents(
+                ref_paths,
+                preprocessed_root=paths.preprocessed,
+                model_dir=model_path,
+                target_height=int(req.preprocess.height or 480),  # fallbacks mirror parse_train_request:
+                target_width=int(req.preprocess.width or 704),  # width 704, height 480
+                max_frames=nf,
+                frame_rate=float(req.preprocess.frame_rate) if req.preprocess.frame_rate else None,
+                downscale_factor=int(req.preprocess.reference_downscale_factor),
+            )
+
         emit({"type": "phase_progress", "phase": "preprocessing", "message": "Preprocess complete"})
 
         _check_cancel(should_cancel)
@@ -473,6 +772,7 @@ def step_callback(step: int, total: int, validation_paths: list) -> None:
         lora_path = Path(saved_path)
         artifact_url = f"/api/train/jobs/{job_id}/artifacts/{lora_path.name}"
         status["phase"] = "done"
+        status["status"] = "done"
         status["artifact_lora"] = str(lora_path)
         status["artifact_url"] = artifact_url
         status["stats"] = stats.model_dump() if hasattr(stats, "model_dump") else dict(stats)
@@ -489,6 +789,7 @@ def step_callback(step: int, total: int, validation_paths: list) -> None:
 
     except TrainingCancelledError:
         status["phase"] = "cancelled"
+        status["status"] = "cancelled"
         status["error"] = "Cancelled"
         save_status(output_dir, job_id, status)
         emit({"type": "error", "phase": status.get("phase"), "message": "Cancelled"})
@@ -496,6 +797,7 @@ def step_callback(step: int, total: int, validation_paths: list) -> None:
     except Exception as exc:
         log.exception("Train job %s failed", job_id)
         status["phase"] = "failed"
+        status["status"] = "failed"
         status["error"] = str(exc)
         save_status(output_dir, job_id, status)
         emit({"type": "error", "phase": status.get("phase"), "message": str(exc)})
diff --git a/train_configs/lora_v2v.yaml b/train_configs/lora_v2v.yaml
new file mode 100644
index 0000000..5521ad3
--- /dev/null
+++ b/train_configs/lora_v2v.yaml
@@ -0,0 +1,56 @@
+# Hyperparameters only — paths injected at runtime (see lora_t2v.yaml header).
+model:
+  text_encoder_path: mlx-community/gemma-3-12b-it-4bit
+  training_mode: lora
+
+lora:
+  rank: 64
+  alpha: 64
+  dropout: 0.0
+  target_modules:
+    - to_k
+    - to_q
+    - to_v
+    - to_out.0
+
+optimization:
+  learning_rate: 5.0e-4
+  steps: 3000
+  batch_size: 1
+  gradient_accumulation_steps: 1
+  max_grad_norm: 1.0
+  weight_decay: 0.0
+  enable_gradient_checkpointing: true
+  scheduler_type: linear
+  scheduler_params:
+    start_factor: 1.0
+    end_factor: 0.1
+
+training_strategy:
+  name: video_to_video
+  first_frame_conditioning_p: 0.1
+  reference_latents_dir: reference_latents
+
+flow_matching:
+  timestep_sampling_mode: shifted_logit_normal
+
+validation:
+  prompts:
+    - "a person walking in a park"
+  reference_downscale_factor: 2
+  video_dims: [704, 480, 25]
+  frame_rate: 24.0
+  inference_steps: 8
+  interval: 500
+  guidance_scale: 4.0
+  stg_scale: 0.0
+  seed: 42
+  generate_audio: false
+  skip_initial_validation: true
+  include_reference_in_output: true
+
+checkpoints:
+  interval: 500
+  keep_last_n: 3
+
+seed: 42
diff --git a/web/src/TrainPage.tsx b/web/src/TrainPage.tsx
index 7595a22..327963f 100644
--- a/web/src/TrainPage.tsx
+++ b/web/src/TrainPage.tsx
@@ -7,6 +7,7 @@ import {
   fetchTrainJobs,
   fetchTrainPresets,
   registerTrainedLora,
+  resumeTrainJob,
   subscribeTrainJob,
   type TrainManifest,
 } from "./api/train";
@@ -45,6 +46,8 @@ function phaseLabel(phase?: string): string {
       return "Failed";
     case "cancelled":
       return "Cancelled";
+    case "interrupted":
+      return "Interrupted";
     default:
       return phase || "Queued";
   }
@@ -60,12 +63,16 @@ export default function TrainPage() {
   const [error, setError] = useState<string | null>(null);
   const [submitting, setSubmitting] = useState(false);
   const [registering, setRegistering] = useState(false);
+  const [resuming, setResuming] = useState(false);
 
   const [name, setName] = useState("My LoRA");
   const [preset, setPreset] = useState("t2v");
   const [videos, setVideos] = useState<File[]>([]);
+  const [references, setReferences] = useState<File[]>([]);
   const [dragOver, setDragOver] = useState(false);
+  const [refDragOver, setRefDragOver] = useState(false);
   const fileInputRef = useRef<HTMLInputElement>(null);
+  const refInputRef = useRef<HTMLInputElement>(null);
 
   const [sliceEnabled, setSliceEnabled] = useState(false);
   const [sliceInterval, setSliceInterval] = useState(4);
@@ -78,6 +85,7 @@ export default function TrainPage() {
   const [height, setHeight] = useState(480);
   const [maxFrames, setMaxFrames] = useState(97);
   const [withAudio, setWithAudio] = useState(false);
+  const [referenceDownscale, setReferenceDownscale] = useState(2);
 
   const [steps, setSteps] = useState(2000);
   const [rank, setRank] = useState(64);
@@ -119,10 +127,16 @@ export default function TrainPage() {
     refreshJobs();
   }, [refreshJobs]);
 
+  const isV2v = preset === "v2v";
+
   useEffect(() => {
     if (!selectedPreset) return;
     setWithAudio(selectedPreset.with_audio);
     setLowRam(selectedPreset.low_ram_default);
+    if (selectedPreset.id === "v2v") {
+      setSteps(3000);
+      setValidationPrompts("a person walking in a park");
+    }
   }, [selectedPreset?.id]);
 
   const manifest = useMemo((): TrainManifest => {
@@ -148,6 +162,7 @@ export default function TrainPage() {
         max_frames: maxFrames,
         with_audio: withAudio,
         frame_rate: 24,
+        reference_downscale_factor: referenceDownscale,
       },
       train: {
         steps,
@@ -174,6 +189,7 @@ export default function TrainPage() {
     height,
     maxFrames,
     withAudio,
+    referenceDownscale,
     steps,
     rank,
     learningRate,
@@ -239,7 +255,7 @@ export default function TrainPage() {
     return unsub;
   }, [activeJobId, updateJob, refreshJobs]);
 
-  function addFiles(fileList: FileList | File[]) {
+  function addTargetFiles(fileList: FileList | File[]) {
     const incoming = Array.from(fileList).filter((f) =>
       /\.(mp4|mov|avi|mkv|webm|txt)$/i.test(f.name),
     );
@@ -254,11 +270,31 @@ export default function TrainPage() {
     });
   }
 
+  function addReferenceFiles(fileList: FileList | File[]) {
+    const incoming = Array.from(fileList).filter((f) =>
+      /\.(mp4|mov|avi|mkv|webm)$/i.test(f.name),
+    );
+    if (!incoming.length) return;
+    setReferences((prev) => {
+      const names = new Set(prev.map((f) => f.name));
+      const merged = [...prev];
+      for (const f of incoming) {
+        if (!names.has(f.name)) merged.push(f);
+      }
+      return merged;
+    });
+  }
+
   async function startTraining() {
     setError(null);
     const videoFiles = videos.filter((f) => !f.name.toLowerCase().endsWith(".txt"));
     if (!videoFiles.length) {
-      setError("Add at least one video file (.mp4, .mov, …).");
+      setError("Add at least one target video file (.mp4, .mov, …).");
+      setStep("dataset");
+      return;
+    }
+    if (isV2v && !references.length) {
+      setError("IC-LoRA requires reference videos paired by matching filename.");
       setStep("dataset");
       return;
     }
@@ -272,7 +308,8 @@ export default function TrainPage() {
     }
     setSubmitting(true);
     try {
-      const result = await createTrainJob(manifest, videos);
+      const captionFiles = videos.filter((f) => f.name.toLowerCase().endsWith(".txt"));
+      const result = await createTrainJob(manifest, [...videoFiles, ...captionFiles], references);
       const job: TrainJob = {
         id: result.job_id,
         name: result.name,
@@ -293,6 +330,23 @@ export default function TrainPage() {
     }
   }
 
+  async function handleResume(jobId?: string) {
+    const id = jobId || activeJobId;
+    if (!id) return;
+    setResuming(true);
+    setError(null);
+    try {
+      await resumeTrainJob(id);
+      setActiveJobId(id);
+      setStep("runs");
+      await refreshJobs();
+    } catch (exc) {
+      setError(exc instanceof Error ? exc.message : String(exc));
+    } finally {
+      setResuming(false);
+    }
+  }
+
   async function handleCancel() {
     if (!activeJobId) return;
     try {
@@ -324,6 +378,7 @@ export default function TrainPage() {
 
   const videoCount = videos.filter((f) => !f.name.toLowerCase().endsWith(".txt")).length;
   const captionCount = videos.filter((f) => f.name.toLowerCase().endsWith(".txt")).length;
+  const referenceCount = references.length;
 
   return (
     <div className="train-page">
@@ -381,8 +436,9 @@ export default function TrainPage() {
             <section className="train-section">
               <h2>Dataset</h2>
               <p className="train-section-lead">
-                Drop training videos here. Optional <code>.txt</code> caption files with matching names are used when
-                slicing is off.
+                {isV2v
+                  ? "Upload target clips (desired output) and reference clips (conditioning). Pair by matching filename — e.g. scene01.mp4 + scene01.mp4."
+                  : "Drop training videos here. Optional .txt caption files with matching names are used when slicing is off."}
               </p>
 
               <label className="field">
@@ -415,7 +471,7 @@ export default function TrainPage() {
                 onDrop={(e) => {
                   e.preventDefault();
                   setDragOver(false);
-                  if (e.dataTransfer.files.length) addFiles(e.dataTransfer.files);
+                  if (e.dataTransfer.files.length) addTargetFiles(e.dataTransfer.files);
                 }}
                 onClick={() => fileInputRef.current?.click()}
                 role="button"
@@ -430,20 +486,64 @@ export default function TrainPage() {
                   multiple
                   accept="video/*,.txt"
                   hidden
-                  onChange={(e) => e.target.files && addFiles(e.target.files)}
+                  onChange={(e) => e.target.files && addTargetFiles(e.target.files)}
                 />
                 <div className="drop-zone-icon">↑</div>
-                <div className="drop-zone-title">Drop videos or click to browse</div>
+                <div className="drop-zone-title">
+                  {isV2v ? "Drop target videos" : "Drop videos or click to browse"}
+                </div>
                 <div className="drop-zone-meta">
-                  {videoCount} video{videoCount !== 1 ? "s" : ""}
+                  {videoCount} target video{videoCount !== 1 ? "s" : ""}
                   {captionCount > 0 ? ` · ${captionCount} caption file${captionCount !== 1 ? "s" : ""}` : ""}
                 </div>
               </div>
 
+              {isV2v && (
+                <>
+                  <p className="pairing-note">
+                    Reference clips drive IC-LoRA conditioning (e.g. depth maps, edges, or a source take). Use the same
+                    filenames as targets so pairing survives slicing.
+                  </p>
+                  <div
+                    className={`drop-zone reference-zone ${refDragOver ? "drag-over" : ""}`}
+                    onDragOver={(e) => {
+                      e.preventDefault();
+                      setRefDragOver(true);
+                    }}
+                    onDragLeave={() => setRefDragOver(false)}
+                    onDrop={(e) => {
+                      e.preventDefault();
+                      setRefDragOver(false);
+                      if (e.dataTransfer.files.length) addReferenceFiles(e.dataTransfer.files);
+                    }}
+                    onClick={() => refInputRef.current?.click()}
+                    role="button"
+                    tabIndex={0}
+                    onKeyDown={(e) => {
+                      if (e.key === "Enter" || e.key === " ") refInputRef.current?.click();
+                    }}
+                  >
+                    <input
+                      ref={refInputRef}
+                      type="file"
+                      multiple
+                      accept="video/*"
+                      hidden
+                      onChange={(e) => e.target.files && addReferenceFiles(e.target.files)}
+                    />
+                    <div className="drop-zone-icon">◎</div>
+                    <div className="drop-zone-title">Drop reference videos</div>
+                    <div className="drop-zone-meta">
+                      {referenceCount} reference video{referenceCount !== 1 ? "s" : ""}
+                    </div>
+                  </div>
+                </>
+              )}
+
               {videos.length > 0 && (
                 <ul className="file-list">
                   {videos.map((f) => (
-                    <li key={f.name}>
+                    <li key={`t-${f.name}`}>
                       <span>{f.name}</span>
                       <span className="file-size">{(f.size / 1024 / 1024).toFixed(1)} MB</span>
                       <button
@@ -459,6 +559,25 @@ export default function TrainPage() {
                 </ul>
               )}
 
+              {isV2v && references.length > 0 && (
+                <ul className="file-list">
+                  {references.map((f) => (
+                    <li key={`r-${f.name}`}>
+                      <span>ref: {f.name}</span>
+                      <span className="file-size">{(f.size / 1024 / 1024).toFixed(1)} MB</span>
+                      <button
+                        type="button"
+                        className="file-remove"
+                        aria-label={`Remove ${f.name}`}
+                        onClick={() => setReferences((prev) => prev.filter((x) => x !== f))}
+                      >
+                        ×
+                      </button>
+                    </li>
+                  ))}
+                </ul>
+              )}
+
               <details className="train-advanced">
                 <summary>Auto-slice long videos</summary>
                 <div className="train-advanced-body">
@@ -529,6 +648,7 @@ export default function TrainPage() {
               <h2>Preprocess</h2>
               <p className="train-section-lead">
                 Latents are encoded at this resolution. Frames are rounded to valid LTX lengths (8k+1).
+                {isV2v && " Reference latents are encoded at a lower resolution for IC-LoRA conditioning."}
               </p>
               <div className="field-grid">
                 <label className="field">
@@ -552,12 +672,24 @@ export default function TrainPage() {
                   <input
                     type="checkbox"
                     checked={withAudio}
-                    disabled={selectedPreset?.with_audio}
+                    disabled={selectedPreset?.with_audio || isV2v}
                     onChange={(e) => setWithAudio(e.target.checked)}
                   />
                   Encode audio latents
                   {selectedPreset?.with_audio && <span className="field-note"> (required for AV preset)</span>}
                 </label>
+                {isV2v && (
+                  <label className="field">
+                    <span>Reference downscale factor</span>
+                    <input
+                      type="number"
+                      min={1}
+                      max={8}
+                      value={referenceDownscale}
+                      onChange={(e) => setReferenceDownscale(Number(e.target.value))}
+                    />
+                  </label>
+                )}
               </div>
               <div className="train-actions">
                 <button type="button" className="btn-secondary" onClick={() => setStep("dataset")}>
@@ -637,7 +769,10 @@ export default function TrainPage() {
                   <strong>{name}</strong> · {selectedPreset?.label || preset} · {steps} steps · rank {rank}
                 </div>
                 <div className="train-summary-meta">
-                  {videoCount} video{videoCount !== 1 ? "s" : ""} · {width}×{height} · {maxFrames} frames
+                  {videoCount} target video{videoCount !== 1 ? "s" : ""}
+                  {isV2v ? ` · ${referenceCount} reference${referenceCount !== 1 ? "s" : ""}` : ""}
+                  {" · "}
+                  {width}×{height} · {maxFrames} frames
                 </div>
               </div>
 
@@ -686,27 +821,51 @@ export default function TrainPage() {
                   <div className="job-detail-header">
                     <div>
                       <h3>{activeJob.name}</h3>
-                      <p className="job-phase">{phaseLabel(activeJob.phase)}</p>
+                      <p className="job-phase">
+                        {phaseLabel(activeJob.phase)} · {activeJob.preset}
+                      </p>
+                    </div>
+                    <div className="artifact-actions">
+                      {["interrupted", "failed"].includes(activeJob.status) && (
+                        <button
+                          type="button"
+                          className="btn-primary"
+                          disabled={resuming || !!health?.generation_active}
+                          onClick={() => handleResume()}
+                        >
+                          {resuming ? "Resuming…" : "Resume"}
+                        </button>
+                      )}
+                      {["queued", "running"].includes(activeJob.status) && (
+                        <button type="button" className="btn-danger" onClick={handleCancel}>
+                          Cancel
+                        </button>
+                      )}
                     </div>
-                    {["queued", "running"].includes(activeJob.status) && (
-                      <button type="button" className="btn-danger" onClick={handleCancel}>
-                        Cancel
-                      </button>
-                    )}
                   </div>
 
-                  {activeJob.phase === "training" && (
+                  {["queued", "running", "slicing", "preprocessing", "training", "starting"].includes(
+                    activeJob.phase || "",
+                  ) &&
+                    activeJob.status !== "interrupted" && (
                     <div className="progress-block">
                       <div className="progress-bar">
-                        <div className="progress-fill" style={{ width: `${trainProgress}%` }} />
-                      </div>
-                      <div className="progress-stats">
-                        <span>
-                          Step {activeJob.step || 0} / {activeJob.total_steps || steps}
-                        </span>
-                        {activeJob.loss != null && <span>Loss {activeJob.loss.toFixed(4)}</span>}
-                        <span>ETA {formatEta(activeJob.eta_s)}</span>
+                        <div
+                          className="progress-fill"
+                          style={{
+                            width: activeJob.phase === "training" ? `${trainProgress}%` : "12%",
+                          }}
+                        />
                       </div>
+                      {activeJob.phase === "training" && (
+                        <div className="progress-stats">
+                          <span>
+                            Step {activeJob.step || 0} / {activeJob.total_steps || steps}
+                          </span>
+                          {activeJob.loss != null && <span>Loss {activeJob.loss.toFixed(4)}</span>}
+                          <span>ETA {formatEta(activeJob.eta_s)}</span>
+                        </div>
+                      )}
                     </div>
                   )}
 
diff --git a/web/src/api/train.ts b/web/src/api/train.ts
index b9df8cc..3c535bd 100644
--- a/web/src/api/train.ts
+++ b/web/src/api/train.ts
@@ -39,6 +39,7 @@ export interface TrainManifest {
     max_frames: number;
     with_audio: boolean;
     frame_rate: number;
+    reference_downscale_factor?: number;
   };
   train: {
     steps: number;
@@ -54,13 +55,17 @@ export interface TrainManifest {
 
 export async function createTrainJob(
   manifest: TrainManifest,
-  files: File[],
+  targetFiles: File[],
+  referenceFiles: File[] = [],
 ): Promise<{ job_id: string; name: string; preset: string }> {
   const form = new FormData();
   form.append("manifest", JSON.stringify(manifest));
-  for (const file of files) {
+  for (const file of targetFiles) {
     form.append("videos", file, file.name);
   }
+  for (const file of referenceFiles) {
+    form.append("references", file, file.name);
+  }
   const res = await fetch("/api/train/jobs", { method: "POST", body: form });
   if (!res.ok) {
     const text = await res.text();
@@ -69,6 +74,15 @@ export async function createTrainJob(
   return res.json();
 }
 
+/** POST the resume endpoint for `jobId`; rejects with the response text when the call fails. */
+export async function resumeTrainJob(jobId: string): Promise<{ job_id: string; status: string }> {
+  const response = await fetch(`/api/train/jobs/${jobId}/resume`, { method: "POST" });
+  if (response.ok) return response.json();
+  // Prefer the server-provided body; fall back to a generic message with the HTTP status.
+  const detail = await response.text();
+  throw new Error(detail || `Resume failed (${response.status})`);
+}
+
 export async function cancelTrainJob(jobId: string): Promise<void> {
   const res = await fetch(`/api/train/jobs/${jobId}/cancel`, { method: "POST" });
   if (!res.ok) throw new Error(`Cancel failed (${res.status})`);
diff --git a/web/src/index.css b/web/src/index.css
index 61b9090..39a74ea 100644
--- a/web/src/index.css
+++ b/web/src/index.css
@@ -1700,6 +1700,31 @@ body {
   color: var(--danger);
 }
 
+.job-status.status-interrupted { /* amber badge colours for the "interrupted" job status */
+  background: #fef3c7;
+  color: #b45309;
+}
+
+.pairing-note {
+  margin: 0 0 12px;
+  padding: 10px 12px;
+  background: var(--accent-soft);
+  border-radius: 10px;
+  font-size: 0.85rem;
+  line-height: 1.45;
+}
+
+.reference-zone { /* modifier used together with .drop-zone on the reference-video drop target */
+  margin-top: 12px;
+  border-color: #c4b5fd;
+}
+
+.reference-zone:hover,
+.reference-zone.drag-over { /* .drag-over is toggled by the component while a drag is over the zone */
+  border-color: #7c3aed;
+  background: #f5f3ff;
+}
+
 .job-card-meta {
   grid-column: 1 / -1;
   font-size: 0.75rem;
diff --git a/web_train.py b/web_train.py
index a49cdaf..73315c4 100644
--- a/web_train.py
+++ b/web_train.py
@@ -18,8 +18,6 @@
 
 log = logging.getLogger("web_train")
 
-_TRAIN_BODIES: dict[str, dict[str, Any]] = {}
-
 
 class TrainJobStatus(str, Enum):
     QUEUED = "queued"
@@ -27,6 +25,7 @@ class TrainJobStatus(str, Enum):
     DONE = "done"
     FAILED = "failed"
     CANCELLED = "cancelled"
+    INTERRUPTED = "interrupted"
 
 
 @dataclass
@@ -57,6 +56,47 @@ def _init_train_state(state: AppState) -> None:
     state._train_worker_started = False
     state._mlx_lock = asyncio.Lock()
     state._train_initialized = True
+    _load_train_jobs_from_disk(state)
+
+
+def _record_from_payload(job_id: str, payload: dict[str, Any]) -> TrainJobRecord:
+    """Build an in-memory ``TrainJobRecord`` from a job payload dict, defaulting missing fields."""
+    get = payload.get
+    clips = get("validation_clips")
+    # Prefer an explicit artifact name; otherwise derive it from the LoRA path when one is present.
+    artifact_name = get("artifact_name")
+    if get("artifact_lora") and not artifact_name:
+        artifact_name = Path(str(payload["artifact_lora"])).name
+    return TrainJobRecord(
+        id=job_id,
+        name=str(get("name") or job_id),
+        preset=str(get("preset") or "t2v"),
+        status=str(get("status") or "queued"),
+        created_at=str(get("created_at") or ""),
+        phase=str(get("phase") or get("status") or "queued"),
+        step=int(get("step") or 0),
+        total_steps=int(get("total_steps") or 0),
+        error=get("error"),
+        artifact_url=get("artifact_url"),
+        artifact_name=artifact_name,
+        registered_lora_id=get("registered_lora_id"),
+        validation_clips=list(clips) if isinstance(clips, list) else [],
+    )
+
+
+def _load_train_jobs_from_disk(state: AppState) -> None:
+    """Run the backend's interrupted-job reconciliation, then load a record per discovered job id."""
+    from ltx_train_backend import (
+        discover_train_job_ids,
+        job_api_payload,
+        reconcile_interrupted_jobs,
+    )
+
+    reconcile_interrupted_jobs(state.output_dir)
+    for found_id in discover_train_job_ids(state.output_dir):
+        # Ids whose payload comes back empty are skipped (existing entries are not touched).
+        if payload := job_api_payload(state.output_dir, found_id):
+            state.train_jobs[found_id] = _record_from_payload(found_id, payload)
 
 
 def is_training_active(state: AppState) -> bool:
@@ -71,108 +111,45 @@ async def emit_train(state: AppState, job_id: str, event: dict[str, Any]) -> Non
         await q.put(event)
 
 
-def _job_for_api(state: AppState, job: TrainJobRecord) -> dict[str, Any]:
-    data = asdict(job)
-    status = None
-    try:
-        from ltx_train_backend import load_status
-
-        status = load_status(state.output_dir, job.id)
-    except Exception:
-        pass
-    if status:
-        for key in (
-            "phase",
-            "step",
-            "total_steps",
-            "loss",
-            "lr",
-            "eta_s",
-            "artifact_url",
-            "artifact_lora",
-            "validation_clips",
-            "error",
-            "stats",
-        ):
-            if key in status and status[key] is not None:
-                data[key] = status[key]
-    return data
+def _job_for_api(state: AppState, job: TrainJobRecord | None = None, *, job_id: str | None = None) -> dict[str, Any]:
+    from ltx_train_backend import job_api_payload
 
+    jid = job_id or (job.id if job else "")
+    payload = job_api_payload(state.output_dir, jid)
+    if payload:
+        return payload
+    if job:
+        return asdict(job)
+    return {}
 
-def _parse_train_request(body: dict[str, Any]) -> Any:
-    from ltx_train_backend import (
-        PreprocessOptions,
-        SliceOptions,
-        TrainHyperparams,
-        TrainJobRequest,
-        TRAIN_PRESETS,
-    )
 
-    preset = str(body.get("preset") or "t2v").strip().lower()
-    if preset not in TRAIN_PRESETS:
-        preset = "t2v"
-    preset_info = TRAIN_PRESETS[preset]
-
-    slice_raw = body.get("slice") or {}
-    preprocess_raw = body.get("preprocess") or {}
-    train_raw = body.get("train") or {}
-
-    slice_opts = SliceOptions(
-        enabled=bool(slice_raw.get("enabled", False)),
-        interval=float(slice_raw.get("interval", 4.0)),
-        res=str(slice_raw.get("res") or "384x384"),
-        fps=float(slice_raw.get("fps", 24.0)),
-        fit=str(slice_raw.get("fit") or "crop"),
-        caption_template=slice_raw.get("caption_template"),
-        max_clips=slice_raw.get("max_clips"),
-    )
-    preprocess = PreprocessOptions(
-        width=int(preprocess_raw.get("width") or 704),
-        height=int(preprocess_raw.get("height") or 480),
-        max_frames=int(preprocess_raw.get("max_frames") or 97),
-        with_audio=bool(preprocess_raw.get("with_audio", preset_info.with_audio)),
-        frame_rate=float(preprocess_raw.get("frame_rate") or 24.0),
-    )
-    prompts_raw = train_raw.get("validation_prompts")
-    if isinstance(prompts_raw, str):
-        prompts = [p.strip() for p in prompts_raw.split("\n") if p.strip()]
-    elif isinstance(prompts_raw, list):
-        prompts = [str(p).strip() for p in prompts_raw if str(p).strip()]
-    else:
-        prompts = ["a cinematic landscape at sunset"]
-
-    train = TrainHyperparams(
-        steps=int(train_raw.get("steps") or 2000),
-        rank=int(train_raw.get("rank") or 64),
-        learning_rate=float(train_raw.get("learning_rate") or 5e-4),
-        validation_prompts=prompts,
-        validation_interval=int(train_raw.get("validation_interval") or 500),
-        checkpoint_interval=int(train_raw.get("checkpoint_interval") or 500),
-        low_ram=bool(train_raw.get("low_ram", preset_info.low_ram_default)),
-        seed=int(train_raw.get("seed") or 42),
-    )
+def _sync_job_record(state: AppState, job_id: str) -> None:
+    from ltx_train_backend import job_api_payload
 
-    return TrainJobRequest(
-        preset=preset,
-        name=str(body.get("name") or "My LoRA").strip() or "My LoRA",
-        model_id=str(body.get("model_id") or body.get("preferred_model") or "auto"),
-        model_dir=body.get("model_dir"),
-        slice=slice_opts,
-        preprocess=preprocess,
-        train=train,
-    )
+    payload = job_api_payload(state.output_dir, job_id)
+    if not payload:
+        return
+    state.train_jobs[job_id] = _record_from_payload(job_id, payload)
 
 
 async def _execute_train_job(state: AppState, job_id: str) -> None:
-    from ltx_train_backend import TrainingCancelledError, run_train_job
+    from ltx_train_backend import (
+        TrainingCancelledError,
+        load_manifest,
+        parse_train_request,
+        run_train_job,
+    )
 
     _init_train_state(state)
     job = state.train_jobs.get(job_id)
+    if not job:
+        _sync_job_record(state, job_id)
+        job = state.train_jobs.get(job_id)
     if not job:
         return
 
-    body = _TRAIN_BODIES.get(job_id, {})
-    req = _parse_train_request(body)
+    manifest = load_manifest(state.output_dir, job_id) or {}
+    req = parse_train_request(manifest)
 
     job.status = TrainJobStatus.RUNNING.value
     job.phase = "starting"
@@ -197,6 +174,7 @@ def on_event(event: dict[str, Any]) -> None:
 
         def _schedule() -> None:
             loop.create_task(emit_train(state, job_id, event))
+            _sync_job_record(state, job_id)
 
         loop.call_soon_threadsafe(_schedule)
 
@@ -223,6 +201,7 @@ def should_cancel() -> bool:
             job.artifact_url = result.get("artifact_url") or job.artifact_url
             if result.get("artifact_lora"):
                 job.artifact_name = Path(str(result["artifact_lora"])).name
+            _sync_job_record(state, job_id)
             await emit_train(
                 state,
                 job_id,
@@ -236,6 +215,7 @@ def should_cancel() -> bool:
             job.status = TrainJobStatus.CANCELLED.value
             job.phase = "cancelled"
             job.error = "Cancelled"
+            _sync_job_record(state, job_id)
             await emit_train(state, job_id, {"type": "error", "message": "Cancelled"})
         finally:
             state._active_train_job_id = None
@@ -251,6 +231,14 @@ async def _train_worker_loop(state: AppState) -> None:
             if job and job.status != TrainJobStatus.CANCELLED.value:
                 job.status = TrainJobStatus.FAILED.value
                 job.error = str(exc)
+                from ltx_train_backend import load_status, save_status
+
+                status = load_status(state.output_dir, job_id) or {}
+                status["phase"] = "failed"
+                status["status"] = "failed"
+                status["error"] = str(exc)
+                save_status(state.output_dir, job_id, status)
+                _sync_job_record(state, job_id)
                 await emit_train(state, job_id, {"type": "error", "message": str(exc)})
         finally:
             await emit_train(
@@ -258,7 +246,6 @@ async def _train_worker_loop(state: AppState) -> None:
                 job_id,
                 {"type": "job_complete", "job_id": job_id},
             )
-            _TRAIN_BODIES.pop(job_id, None)
 
 
 def ensure_train_worker(state: AppState) -> None:
@@ -269,13 +256,31 @@ def ensure_train_worker(state: AppState) -> None:
     state._train_worker_started = True
 
 
+def _save_uploads(uploads: list[Any], dest_dir: Path) -> int:
+    """Write every upload into ``dest_dir`` (created on demand) and return how many were saved."""
+    dest_dir.mkdir(parents=True, exist_ok=True)
+    for index, upload in enumerate(uploads):
+        # Keep only the final path component; "" and ".." cannot name a file, so fall back.
+        name = Path(upload.filename or "").name
+        name = name if name not in ("", "..") else f"upload_{index}"
+        with (dest_dir / name).open("wb") as fh:
+            shutil.copyfileobj(upload.file, fh)
+    return len(uploads)
+
+
 def register_train_routes(app: Any, state: AppState) -> None:
     from fastapi import File, Form, HTTPException, UploadFile
     from fastapi.responses import FileResponse, StreamingResponse
     from ltx_train_backend import (
+        VIDEO_EXTENSIONS,
+        job_api_payload,
         job_root,
+        load_manifest,
         load_status,
+        parse_train_request,
         register_trained_lora,
+        save_manifest,
+        save_status,
         trainer_health,
         training_job_paths,
     )
@@ -296,6 +301,7 @@ async def train_presets():
 
     @app.get("/api/train/jobs")
     async def list_train_jobs():
+        _load_train_jobs_from_disk(state)
         jobs = sorted(
             state.train_jobs.values(),
             key=lambda j: j.created_at,
@@ -305,18 +311,18 @@ async def list_train_jobs():
 
     @app.get("/api/train/jobs/{job_id}")
     async def get_train_job(job_id: str):
-        job = state.train_jobs.get(job_id)
-        if not job:
-            status = load_status(state.output_dir, job_id)
-            if not status:
-                raise HTTPException(404, "Job not found")
-            return status
-        return _job_for_api(state, job)
+        payload = job_api_payload(state.output_dir, job_id)
+        if not payload:
+            raise HTTPException(404, "Job not found")
+        if job_id in state.train_jobs:
+            state.train_jobs[job_id] = _record_from_payload(job_id, payload)
+        return payload
 
     @app.post("/api/train/jobs")
     async def create_train_job(
         manifest: str = Form(...),
         videos: list[UploadFile] = File(...),
+        references: list[UploadFile] | None = File(None),
     ):
         ensure_train_worker(state)
         if is_training_active(state):
@@ -330,48 +336,112 @@ async def create_train_job(
             raise HTTPException(400, f"Invalid manifest JSON: {exc}") from exc
 
         if not videos:
-            raise HTTPException(400, "At least one video file is required")
+            raise HTTPException(400, "At least one target video file is required")
 
-        video_exts = {".mp4", ".mov", ".avi", ".mkv", ".webm"}
-        has_video = any(
-            Path(u.filename or "").suffix.lower() in video_exts for u in videos
-        )
+        has_video = any(Path(u.filename or "").suffix.lower() in VIDEO_EXTENSIONS for u in videos)
         if not has_video:
-            raise HTTPException(400, "At least one video file is required")
+            raise HTTPException(400, "At least one target video file is required")
+
+        req = parse_train_request(body)
+        ref_uploads = references or []
+        if req.preset == "v2v":
+            if not ref_uploads:
+                raise HTTPException(400, "IC-LoRA requires reference videos")
+            ref_video = any(Path(u.filename or "").suffix.lower() in VIDEO_EXTENSIONS for u in ref_uploads)
+            if not ref_video:
+                raise HTTPException(400, "IC-LoRA requires at least one reference video file")
 
         job_id = f"train_{uuid.uuid4().hex[:10]}"
         paths = training_job_paths(state.output_dir, job_id)
         paths.ensure_dirs()
 
-        saved = 0
-        for upload in videos:
-            name = Path(upload.filename or f"upload_{saved}").name
-            dest = paths.raw / name
-            with dest.open("wb") as fh:
-                shutil.copyfileobj(upload.file, fh)
-            saved += 1
+        saved = _save_uploads(videos, paths.raw)
+        if ref_uploads:
+            _save_uploads(ref_uploads, paths.references)
+
+        created_at = datetime.now().isoformat()
+        body["created_at"] = created_at
+        save_manifest(state.output_dir, job_id, body)
 
-        req = _parse_train_request(body)
         job = TrainJobRecord(
             id=job_id,
             name=req.name,
             preset=req.preset,
             status=TrainJobStatus.QUEUED.value,
-            created_at=datetime.now().isoformat(),
+            created_at=created_at,
             total_steps=int(req.train.steps),
         )
         state.train_jobs[job_id] = job
-        _TRAIN_BODIES[job_id] = body
+        save_status(
+            state.output_dir,
+            job_id,
+            {
+                "job_id": job_id,
+                "name": req.name,
+                "preset": req.preset,
+                "phase": "queued",
+                "status": "queued",
+                "created_at": created_at,
+                "step": 0,
+                "total_steps": int(req.train.steps),
+                "job_dir": str(paths.root),
+                "validation_clips": [],
+            },
+        )
         state.train_event_queues[job_id] = asyncio.Queue()
         await state._pending_train.put(job_id)
-        log.info("Queued train job %s  preset=%s  videos=%d", job_id, req.preset, saved)
+        log.info(
+            "Queued train job %s  preset=%s  targets=%d  references=%d",
+            job_id,
+            req.preset,
+            saved,
+            len(ref_uploads),
+        )
         return {"job_id": job_id, "name": job.name, "preset": job.preset}
 
+    @app.post("/api/train/jobs/{job_id}/resume")
+    async def resume_train_job(job_id: str):
+        # Re-queue an interrupted or failed job using the manifest saved on disk.
+        #
+        # Responds 404 for an unknown job, 400 for a non-resumable status or a missing
+        # manifest, and 409 while another training job or a generation is active.
+        # (Plain comments, not a docstring, so the generated API description is unchanged.)
+        ensure_train_worker(state)
+        out_dir = state.output_dir
+        payload = job_api_payload(out_dir, job_id)
+        if not payload:
+            raise HTTPException(404, "Job not found")
+        status = str(payload.get("status") or "")
+        resumable = {TrainJobStatus.INTERRUPTED.value, TrainJobStatus.FAILED.value}
+        if status not in resumable:
+            raise HTTPException(400, f"Job cannot be resumed from status {status!r}")
+        if is_training_active(state):
+            raise HTTPException(409, "A training job is already running")
+        if state.is_generation_active():
+            raise HTTPException(409, "Cannot start training while generation is active")
+        if not load_manifest(out_dir, job_id):
+            raise HTTPException(400, "Job manifest missing; cannot resume")
+
+        # Drop any stale cancel flag, then rewrite the on-disk status as queued with the error cleared.
+        state._cancelled_train_jobs.discard(job_id)
+        requeued = dict(load_status(out_dir, job_id) or {})
+        requeued.update({"phase": "queued", "status": "queued", "error": None})
+        save_status(out_dir, job_id, requeued)
+        _sync_job_record(state, job_id)
+        state.train_event_queues[job_id] = asyncio.Queue()
+        await state._pending_train.put(job_id)
+        log.info("Resumed train job %s", job_id)
+        return {"ok": True, "job_id": job_id, "status": "queued"}
+
     @app.post("/api/train/jobs/{job_id}/cancel")
     async def cancel_train_job(job_id: str):
         job = state.train_jobs.get(job_id)
         if not job:
-            raise HTTPException(404, "Job not found")
+            payload = job_api_payload(state.output_dir, job_id)
+            if not payload:
+                raise HTTPException(404, "Job not found")
+            job = _record_from_payload(job_id, payload)
+            state.train_jobs[job_id] = job
         if job.status in (
             TrainJobStatus.DONE.value,
             TrainJobStatus.FAILED.value,
@@ -380,6 +450,10 @@ async def cancel_train_job(job_id: str):
             return {"ok": True, "status": job.status}
         state._cancelled_train_jobs.add(job_id)
         job.status = TrainJobStatus.CANCELLED.value
+        status = load_status(state.output_dir, job_id) or {}
+        status["phase"] = "cancelled"
+        status["status"] = "cancelled"
+        save_status(state.output_dir, job_id, status)
         return {"ok": True, "status": job.status}
 
     @app.get("/api/train/jobs/{job_id}/artifacts/{artifact_path:path}")
@@ -397,22 +471,22 @@ async def train_artifact(job_id: str, artifact_path: str):
     async def register_lora(job_id: str, body: dict[str, Any] | None = None):
         from web_ui import _label_for_lora_spec, _lora_catalog, _read_custom_loras, _write_custom_loras
 
-        job = state.train_jobs.get(job_id)
-        status = load_status(state.output_dir, job_id)
+        payload = job_api_payload(state.output_dir, job_id)
+        if not payload:
+            raise HTTPException(404, "Job not found")
+        job = state.train_jobs.get(job_id) or _record_from_payload(job_id, payload)
+
         artifact = None
-        if job and job.artifact_url:
-            artifact_name = job.artifact_name
-            if artifact_name:
-                artifact = job_root(state.output_dir, job_id) / "outputs" / artifact_name
-        if artifact is None and status:
-            al = status.get("artifact_lora")
-            if al:
-                artifact = Path(str(al))
+        artifact_name = job.artifact_name or payload.get("artifact_name")
+        if artifact_name:
+            artifact = job_root(state.output_dir, job_id) / "outputs" / artifact_name
+        if artifact is None and payload.get("artifact_lora"):
+            artifact = Path(str(payload["artifact_lora"]))
         if artifact is None or not artifact.is_file():
             raise HTTPException(404, "Trained LoRA artifact not found")
 
         body = body or {}
-        label = str(body.get("label") or (job.name if job else "") or "Trained LoRA").strip()
+        label = str(body.get("label") or job.name or "Trained LoRA").strip()
         try:
             scale = float(body.get("scale", 1.0))
         except (TypeError, ValueError):
@@ -432,8 +506,13 @@ async def register_lora(job_id: str, body: dict[str, Any] | None = None):
         ]
         entries.append({"id": lid, "label": label or _label_for_lora_spec(spec), "spec": spec, "scale": scale})
         _write_custom_loras(state.output_dir, entries)
-        if job:
-            job.registered_lora_id = lid
+
+        manifest = load_manifest(state.output_dir, job_id) or {}
+        manifest["registered_lora_id"] = lid
+        save_manifest(state.output_dir, job_id, manifest)
+        job.registered_lora_id = lid
+        state.train_jobs[job_id] = job
+
         lora_presets, default_lora_preset_id = _lora_catalog(state.output_dir)
         preferred = state.preferred_lora_preset_ids()
         if lid not in preferred:
@@ -451,30 +530,25 @@ async def register_lora(job_id: str, body: dict[str, Any] | None = None):
 
     @app.get("/api/train/jobs/{job_id}/events")
     async def train_events(job_id: str):
-        if job_id not in state.train_jobs:
-            status = load_status(state.output_dir, job_id)
-            if not status:
-                raise HTTPException(404, "Job not found")
+        payload = job_api_payload(state.output_dir, job_id)
+        if not payload:
+            raise HTTPException(404, "Job not found")
         if job_id not in state.train_event_queues:
             state.train_event_queues[job_id] = asyncio.Queue()
 
         async def stream() -> AsyncIterator[str]:
             q = state.train_event_queues[job_id]
-            job = state.train_jobs.get(job_id)
-            if job and job.status in (
+            snap = job_api_payload(state.output_dir, job_id) or {}
+            terminal = snap.get("status") in (
                 TrainJobStatus.DONE.value,
                 TrainJobStatus.FAILED.value,
                 TrainJobStatus.CANCELLED.value,
-            ):
-                snap = _job_for_api(state, job) if job else load_status(state.output_dir, job_id) or {}
+                TrainJobStatus.INTERRUPTED.value,
+            )
+            if terminal and state._active_train_job_id != job_id:
                 yield f"data: {json.dumps({'type': 'snapshot', 'job': snap})}\n\n"
                 yield f"data: {json.dumps({'type': 'job_complete', 'job_id': job_id})}\n\n"
                 return
-            status = load_status(state.output_dir, job_id)
-            if status and status.get("phase") in ("done", "failed", "cancelled"):
-                yield f"data: {json.dumps({'type': 'snapshot', 'job': status})}\n\n"
-                yield f"data: {json.dumps({'type': 'job_complete', 'job_id': job_id})}\n\n"
-                return
             while True:
                 try:
                     event = await asyncio.wait_for(q.get(), timeout=120.0)

From e059ea4dbb7f6c541d37c5b08d6f2a99b2eedc1f Mon Sep 17 00:00:00 2001
From: Lorenzo Mangani <lorenzo.mangani@gmail.com>
Date: Thu, 18 Jun 2026 22:07:50 +0200
Subject: [PATCH 6/7] Document training feature status and optional trainer
 install.

Mark completed TRAIN.md phases, add README /train setup notes, and align risk mitigations with persisted jobs and IC-LoRA support.

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 AGENTS.md |  1 +
 README.md | 12 ++++++++++++
 TRAIN.md  | 46 ++++++++++++++++++++++++----------------------
 3 files changed, 37 insertions(+), 22 deletions(-)

diff --git a/AGENTS.md b/AGENTS.md
index a239244..e0deebd 100644
--- a/AGENTS.md
+++ b/AGENTS.md
@@ -399,6 +399,7 @@ Three ~5s segments, chained and merged:
 - **RAM:** longer clips / higher resolution need more unified memory; q8/q4 weights help on smaller Macs.
 - **No GPT rewrite** on this MLX server—optimize prompts yourself.
 - **ltx-2-mlx** version pinned in repo (see `requirements.txt` / `ltx_mlx_backend.py`); install matching MLX packages after pulls.
+- **Training (optional):** Web UI `/train` — install `ltx-trainer-mlx` (see `README.md` / `TRAIN.md`). Do not run training and generation concurrently (MLX lock). Prefer `ltx_generate_*` for inference after registering a trained LoRA.
 - **Weights:** **MLX only** — `dgrauet/ltx-2.3-mlx*` repos or local MLX snapshots; **never** `Lightricks/LTX-2.3` or other standard LTX checkpoints (see [MLX model weights only](#mlx-model-weights-only-mandatory)).
 
 ---
diff --git a/README.md b/README.md
index c412506..9c76d7d 100644
--- a/README.md
+++ b/README.md
@@ -19,6 +19,7 @@ Generate text-to-video, image-to-video, audio-to-video, retake, extend, and mult
 | **CLI client** | Scriptable batch runs, autocontinue, optional ffmpeg merge |
 | **MCP tools** | Drive generation from Cursor, Claude, or other MCP clients |
 | **LoRA** | Per-request or server-wide style adapters (optional) |
+| **LoRA training** | `/train` wizard — T2V, AV, and IC-LoRA presets (optional `ltx-trainer-mlx`; see below) |
 
 **Agent docs:** [`DIRECTOR.md`](DIRECTOR.md) (prompting & shot planning), [`AGENTS.md`](AGENTS.md) (MCP & pipelines), [`CLAUDE.md`](CLAUDE.md) (pointer to agent guides).
 
@@ -57,6 +58,17 @@ For gated Hugging Face models, set [`HF_TOKEN`](https://huggingface.co/docs/hugg
 
 Classic **venv + pip**: use `python3.12 -m venv .venv`, `pip install -r requirements.txt`, and the same two `ltx-2-mlx` package lines above.
 
+### Optional: LoRA training (`/train`)
+
+Training is **not** required for generation. To use the **Train LoRA** tab in the Web UI:
+
+```bash
+uv pip install \
+  "ltx-trainer-mlx @ git+https://github.com/dgrauet/ltx-2-mlx.git@v0.14.12#subdirectory=packages/ltx-trainer"
+```
+
+Then rebuild the Web UI if needed (`cd web && npm run build`), start `python server.py --web-ui` (or embedded `--web-ui`), and open **Train LoRA** in the header. Jobs and artifacts live under `web_outputs/train/<job_id>/`. See [`TRAIN.md`](TRAIN.md) for presets, IC-LoRA pairing, and the Phase 5 backlog (checkpoint resume, W&B, MCP).
+
 ---
 
 ## Quick start
diff --git a/TRAIN.md b/TRAIN.md
index a54e2a0..c7869fc 100644
--- a/TRAIN.md
+++ b/TRAIN.md
@@ -185,40 +185,42 @@ Pin tag to `LTX2_MLX_GIT_TAG` in `ltx_mlx_backend.py`.
 
 ### Phase 1 — Foundation (MVP)
 
-- [ ] `training` branch: router + empty `/train` shell + nav
-- [ ] `ltx_train_backend.py`: healthcheck, config templates, subprocess/thread wrapper for **preprocess only**
-- [ ] Dataset upload API + folder layout
-- [ ] Train page: upload videos + captions → preprocess button → log/progress panel
-- [ ] Docs in README: optional trainer install
+- [x] `training` branch: router + `/train` wizard + nav
+- [x] `ltx_train_backend.py`: healthcheck, config templates, full job runner (slice → preprocess → train)
+- [x] Dataset upload API + folder layout (`web_outputs/train/<job_id>/`)
+- [x] Train page: upload videos + captions, wizard steps, live progress panel
+- [x] Docs in README: optional trainer install
 
-**Exit criteria:** User can preprocess clips from Web UI; no training yet.
+**Exit criteria:** User can preprocess clips from the Web UI (training itself was out of scope for this phase). **Done** — training shipped with the Phase 2 delivery.
 
 ### Phase 2 — T2V LoRA training
 
-- [ ] `POST /api/train/runs` wrapping `LtxvTrainer.train(step_callback=...)`
-- [ ] SSE progress (step / total / ETA from `TrainingStats`)
-- [ ] Cancel flag checked between steps
-- [ ] Validation MP4 serving from `output_dir`
-- [ ] **Register LoRA** → existing `/api/loras/custom` flow
+- [x] `POST /api/train/jobs` wrapping `LtxvTrainer.train(step_callback=...)`
+- [x] SSE progress (step / total / ETA / loss)
+- [x] Cancel flag checked between steps
+- [x] Validation MP4 serving from `output_dir`
+- [x] **Register LoRA** → existing `/api/loras/custom` flow
 
-**Exit criteria:** End-to-end T2V LoRA on a toy dataset (≥2 clips); use in generator.
+**Exit criteria:** End-to-end T2V LoRA on a toy dataset (≥2 clips); use in generator. **Ready for manual QA.**
 
 ### Phase 3 — Slice + AV presets
 
-- [ ] Slice API (ffmpeg dependency check)
-- [ ] `with_audio` preprocess toggle
-- [ ] Presets: `lora_av_whisper` simplified form (audio-only target modules hidden behind preset)
-- [ ] RAM warning banners (`--low-ram` → `enable_gradient_checkpointing`)
+- [x] Slice in wizard (ffmpeg dependency check via `/api/train/health`)
+- [x] `with_audio` preprocess toggle (AV preset)
+- [x] AV preset (`lora_av.yaml`) with dev transformer + checkpointing defaults
+- [x] Low RAM toggle → `enable_gradient_checkpointing`
+- [ ] RAM warning banners when preset + free-memory estimate disagree (nice-to-have)
 
 ### Phase 4 — V2V / IC-LoRA training
 
-- [ ] Reference video upload + reference latent preprocess path
-- [ ] `video_to_video` strategy UI
-- [ ] Validation with `reference_videos`
+- [x] Reference video upload + reference latent preprocess path
+- [x] `video_to_video` strategy UI (`v2v` preset + `lora_v2v.yaml`)
+- [x] Validation with `reference_videos` (paths from job `references/`)
 
 ### Phase 5 — Polish
 
-- [ ] Resume from checkpoint (`model.load_checkpoint`)
+- [x] Job persistence (`manifest.json` + `status.json`); resume interrupted/failed jobs (`POST …/resume`)
+- [ ] Resume from training checkpoint (`model.load_checkpoint`)
 - [ ] W&B optional (`wandb` extra)
 - [ ] MCP tool `ltx_train_lora` for agents (optional)
 
@@ -231,8 +233,8 @@ Pin tag to `LTX2_MLX_GIT_TAG` in `ltx_mlx_backend.py`.
 | OOM during train | Default to q8 model path; expose checkpointing; block train if free RAM estimate low |
 | Train + generate concurrent | Global `mlx_busy` lock shared with `LocalVideoGenerator` |
 | Preprocess partial HF download | Use same resolved `model_path` as inference (full snapshot already cached) |
-| V2V reference latents | Defer to Phase 4; document manual preprocess steps until automated |
-| Long jobs lost on server restart | Persist `status.json`; optional resume; warn user |
+| V2V reference latents | Automated in `v2v` preset (`references/` → `reference_latents/`) |
+| Long jobs lost on server restart | `manifest.json` + `status.json`; jobs reloaded on startup; **Resume** for interrupted/failed |
 | Alpha trainer API | Pin v0.14.12; thin adapter layer in `ltx_train_backend.py` |
 
 ---

From 4846973f02a767a17a921df83c8c24e902fd6ac0 Mon Sep 17 00:00:00 2001
From: Lorenzo Mangani <lorenzo.mangani@gmail.com>
Date: Thu, 18 Jun 2026 23:21:05 +0200
Subject: [PATCH 7/7] Resume LoRA training from saved checkpoints via
 model.load_checkpoint.

Load the latest intermediate weights on job resume, train only remaining steps, skip slice/preprocess when artifacts exist, and surface checkpoint info in the train UI.

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 TRAIN.md              |   4 +-
 ltx_train_backend.py  | 268 ++++++++++++++++++++++++++++++++++--------
 web/src/TrainPage.tsx |   9 +-
 web/src/api/train.ts  |   7 +-
 web/src/types.ts      |   4 +
 web_train.py          |  23 +++-
 6 files changed, 254 insertions(+), 61 deletions(-)

diff --git a/TRAIN.md b/TRAIN.md
index c7869fc..4970bbd 100644
--- a/TRAIN.md
+++ b/TRAIN.md
@@ -220,7 +220,7 @@ Pin tag to `LTX2_MLX_GIT_TAG` in `ltx_mlx_backend.py`.
 ### Phase 5 — Polish
 
 - [x] Job persistence (`manifest.json` + `status.json`); resume interrupted/failed jobs (`POST …/resume`)
-- [ ] Resume from training checkpoint (`model.load_checkpoint`)
+- [x] Resume from training checkpoint (`model.load_checkpoint`) — loads latest `outputs/checkpoints/*step_*.safetensors`, trains remaining steps, skips slice/preprocess when artifacts exist
 - [ ] W&B optional (`wandb` extra)
 - [ ] MCP tool `ltx_train_lora` for agents (optional)
 
@@ -234,7 +234,7 @@ Pin tag to `LTX2_MLX_GIT_TAG` in `ltx_mlx_backend.py`.
 | Train + generate concurrent | Global `mlx_busy` lock shared with `LocalVideoGenerator` |
 | Preprocess partial HF download | Use same resolved `model_path` as inference (full snapshot already cached) |
 | V2V reference latents | Automated in `v2v` preset (`references/` → `reference_latents/`) |
-| Long jobs lost on server restart | `manifest.json` + `status.json`; jobs reloaded on startup; **Resume** for interrupted/failed |
+| Long jobs lost on server restart | `manifest.json` + `status.json`; jobs reloaded on startup; **Resume** reloads LoRA weights from checkpoint when available |
 | Alpha trainer API | Pin v0.14.12; thin adapter layer in `ltx_train_backend.py` |
 
 ---
diff --git a/ltx_train_backend.py b/ltx_train_backend.py
index 41f99e3..c6d0a71 100644
--- a/ltx_train_backend.py
+++ b/ltx_train_backend.py
@@ -131,6 +131,10 @@ class TrainJobRequest:
     slice: SliceOptions = field(default_factory=SliceOptions)
     preprocess: PreprocessOptions = field(default_factory=PreprocessOptions)
     train: TrainHyperparams = field(default_factory=TrainHyperparams)
+    resume_from_checkpoint: bool = False
+
+
+CHECKPOINT_STEP_RE = re.compile(r"step_(\d+)", re.IGNORECASE)
 
 
 EventCallback = Callable[[dict[str, Any]], None]
@@ -322,11 +326,94 @@ def job_api_payload(output_dir: Path, job_id: str) -> dict[str, Any] | None:
         if manifest.get("registered_lora_id"):
             data["registered_lora_id"] = manifest["registered_lora_id"]
         data["created_at"] = manifest.get("created_at") or data.get("created_at")
+    enrich_job_checkpoint_info(output_dir, job_id, data)
     if "created_at" not in data:
         data["created_at"] = ""
     return data
 
 
+def _checkpoint_candidates(outputs_dir: Path) -> list[Path]:
+    """List step-numbered checkpoints: ``checkpoints/`` (recursive) plus top-level files."""
+    ckpt_dir = outputs_dir / "checkpoints"
+    found: list[Path] = []
+    if ckpt_dir.is_dir():
+        found.extend(ckpt_dir.rglob("*step_*.safetensors"))
+        found.extend(ckpt_dir.rglob("*step_*.npz"))
+    found.extend(outputs_dir.glob("*step_*.safetensors"))  # covers lora_weights_step_* too
+    return found
+
+
+def _checkpoint_step_number(path: Path) -> int:
+    """Return the step parsed from the first ``step_<digits>`` in ``path.stem``.
+
+    Returns -1 when the stem has no such marker.
+    """
+    found = CHECKPOINT_STEP_RE.search(path.stem)
+    # The capture group is ``\d+``, which ``int`` always parses; no ValueError guard needed.
+    return int(found.group(1)) if found else -1
+
+
+def find_latest_training_checkpoint(paths: TrainJobPaths) -> tuple[Path | None, int]:
+    """Return ``(path, step)`` of the highest-step checkpoint under ``paths.outputs``.
+
+    Returns ``(None, 0)`` when no file is found or none has a parsable step number.
+    """
+    scored = [(_checkpoint_step_number(p), p) for p in _checkpoint_candidates(paths.outputs)]
+    step, best = max(scored, key=lambda pair: pair[0], default=(-1, None))
+    if step < 0:
+        return None, 0
+    return best, step
+
+
+def enrich_job_checkpoint_info(output_dir: Path, job_id: str, data: dict[str, Any]) -> None:
+    """Add latest-checkpoint path, step and resumable flag to ``data`` in place."""
+    ckpt, step = find_latest_training_checkpoint(training_job_paths(output_dir, job_id))
+    total = int(data.get("total_steps") or 0)
+    has_ckpt = bool(ckpt)
+    data["latest_checkpoint"] = str(ckpt) if has_ckpt else None
+    data["latest_checkpoint_step"] = step if has_ckpt else None
+    # Resumable only past step 0 and, when a target is known, short of that target.
+    data["can_resume_from_checkpoint"] = has_ckpt and step > 0 and (total <= 0 or step < total)
+
+
+def _preprocess_ready(paths: TrainJobPaths, *, v2v: bool) -> bool:
+    """Report whether preprocess output already exists under ``.precomputed``.
+
+    ``latents`` and ``conditions`` (plus ``reference_latents`` when ``v2v``) must
+    each be a directory holding at least one ``*.safetensors`` file.
+    """
+    root = paths.preprocessed / ".precomputed"
+    required = ["latents", "conditions"]
+    if v2v:
+        required.append("reference_latents")
+    for name in required:
+        folder = root / name
+        if not folder.is_dir() or not any(folder.glob("*.safetensors")):
+            return False
+    return True
+
+
+def _slice_artifacts_ready(paths: TrainJobPaths, req: TrainJobRequest) -> bool:
+    """Return True when the folder this job reads videos from already holds videos."""
+    source_dir = paths.clips if req.slice.enabled else paths.raw
+    return bool(_list_videos(source_dir))
+
+
+def _resolve_video_dirs(
+    paths: TrainJobPaths,
+    req: TrainJobRequest,
+) -> tuple[Path, Path, str | None]:
+    """Return ``(videos_dir, references_dir, captions_dir)`` for this job.
+
+    Sliced jobs use the clip folders and no captions dir; unsliced jobs use the
+    raw/reference uploads, with ``paths.captions`` when ``raw`` has ``*.txt`` files.
+    """
+    if req.slice.enabled:
+        return paths.clips, paths.reference_clips, None
+    has_captions = any(paths.raw.glob("*.txt"))
+    return paths.raw, paths.references, (str(paths.captions) if has_captions else None)
+
+
 def parse_train_request(body: dict[str, Any]) -> TrainJobRequest:
     preset = str(body.get("preset") or "t2v").strip().lower()
     if preset not in TRAIN_PRESETS:
@@ -381,6 +468,7 @@ def parse_train_request(body: dict[str, Any]) -> TrainJobRequest:
         slice=slice_opts,
         preprocess=preprocess,
         train=train,
+        resume_from_checkpoint=bool(body.get("resume_from_checkpoint", False)),
     )
 
 
@@ -466,7 +554,13 @@ def _preset_yaml_path(preset: str) -> Path:
     return path
 
 
-def build_trainer_config(req: TrainJobRequest, *, paths: TrainJobPaths) -> Any:
+def build_trainer_config(
+    req: TrainJobRequest,
+    *,
+    paths: TrainJobPaths,
+    load_checkpoint: str | Path | None = None,
+    training_steps: int | None = None,
+) -> Any:
     from ltx_trainer_mlx.config import LtxTrainerConfig
 
     raw = yaml.safe_load(_preset_yaml_path(req.preset).read_text(encoding="utf-8"))
@@ -474,12 +568,14 @@ def build_trainer_config(req: TrainJobRequest, *, paths: TrainJobPaths) -> Any:
 
     model_block = dict(raw.get("model") or {})
     model_block["model_path"] = model_path
+    if load_checkpoint:
+        model_block["load_checkpoint"] = str(load_checkpoint)
     raw["model"] = model_block
     raw["data"] = {"preprocessed_data_root": str(paths.preprocessed.resolve())}
     raw["output_dir"] = str(paths.outputs.resolve())
     raw["seed"] = int(req.train.seed)
 
-    raw["optimization"]["steps"] = int(req.train.steps)
+    raw["optimization"]["steps"] = int(training_steps if training_steps is not None else req.train.steps)
     raw["optimization"]["learning_rate"] = float(req.train.learning_rate)
     if req.train.low_ram:
         raw["optimization"]["enable_gradient_checkpointing"] = True
@@ -605,12 +701,43 @@ def emit(event: dict[str, Any]) -> None:
     videos_dir = paths.raw
     references_dir = paths.references
     captions_dir: str | None = None
+    resume_step = 0
+    load_checkpoint: str | Path | None = None
+    training_steps = int(req.train.steps)
+    total_steps_target = int(req.train.steps)
+
+    if req.resume_from_checkpoint:
+        ckpt_path, resume_step = find_latest_training_checkpoint(paths)
+        if ckpt_path and resume_step > 0:
+            if resume_step >= total_steps_target:
+                raise ValueError(
+                    f"Checkpoint is already at step {resume_step} "
+                    f"(target {total_steps_target}); nothing left to train"
+                )
+            load_checkpoint = paths.outputs / "checkpoints"
+            if not load_checkpoint.is_dir():
+                load_checkpoint = ckpt_path.parent
+            training_steps = total_steps_target - resume_step
+            status["resume_from_step"] = resume_step
+            status["load_checkpoint"] = str(ckpt_path)
+            status["step"] = resume_step
+            save_status(output_dir, job_id, status)
+            emit(
+                {
+                    "type": "phase_progress",
+                    "phase": "training",
+                    "message": f"Resuming from checkpoint step {resume_step} "
+                    f"({training_steps} steps remaining)",
+                }
+            )
 
     try:
         if req.preset == "v2v" and not _list_videos(paths.references):
-            raise ValueError("IC-LoRA requires paired reference videos (upload to references/)")
+            if not (req.resume_from_checkpoint and _preprocess_ready(paths, v2v=True)):
+                raise ValueError("IC-LoRA requires paired reference videos (upload to references/)")
 
-        if req.slice.enabled:
+        skip_slice = req.resume_from_checkpoint and _slice_artifacts_ready(paths, req)
+        if req.slice.enabled and not skip_slice:
             _check_cancel(should_cancel)
             status["phase"] = "slicing"
             emit({"type": "phase_started", "phase": "slicing", "message": "Slicing source videos…"})
@@ -657,71 +784,103 @@ def emit(event: dict[str, Any]) -> None:
                     }
                 )
                 references_dir = paths.reference_clips
+        elif req.slice.enabled:
+            videos_dir, references_dir, captions_dir = _resolve_video_dirs(paths, req)
+            emit(
+                {
+                    "type": "phase_progress",
+                    "phase": "slicing",
+                    "message": "Using existing sliced clips",
+                }
+            )
         else:
             txts = list(paths.raw.glob("*.txt"))
             if txts:
                 paths.captions.mkdir(parents=True, exist_ok=True)
                 for t in txts:
-                    shutil.copy2(t, paths.captions / t.name)
+                    dest = paths.captions / t.name
+                    if not dest.is_file():
+                        shutil.copy2(t, dest)
                 captions_dir = str(paths.captions)
 
-        _check_cancel(should_cancel)
-        status["phase"] = "preprocessing"
-        emit({"type": "phase_started", "phase": "preprocessing", "message": "Encoding target latents…"})
-        from ltx_trainer_mlx.preprocess import preprocess_dataset
-
-        model_path = resolve_mlx_weights_directory(req.model_id, req.model_dir)
-        status["model_path"] = model_path
-        nf = _nearest_valid_frames(int(req.preprocess.max_frames))
-        preset_info = TRAIN_PRESETS.get(req.preset, TRAIN_PRESETS["t2v"])
-        with_audio = req.preprocess.with_audio or preset_info.with_audio
-        preprocess_dataset(
-            videos_dir=str(videos_dir),
-            output_dir=str(paths.preprocessed),
-            model_dir=model_path,
-            gemma_model_id=DEFAULT_GEMMA,
-            target_height=int(req.preprocess.height) if req.preprocess.height else None,
-            target_width=int(req.preprocess.width) if req.preprocess.width else None,
-            max_frames=nf,
-            captions_dir=captions_dir,
-            with_audio=with_audio,
-            frame_rate=float(req.preprocess.frame_rate) if req.preprocess.frame_rate else None,
+        skip_preprocess = req.resume_from_checkpoint and _preprocess_ready(
+            paths, v2v=req.preset == "v2v"
         )
-
-        if req.preset == "v2v":
+        if not skip_preprocess:
             _check_cancel(should_cancel)
+            status["phase"] = "preprocessing"
+            emit({"type": "phase_started", "phase": "preprocessing", "message": "Encoding target latents…"})
+            from ltx_trainer_mlx.preprocess import preprocess_dataset
+
+            videos_dir, references_dir, captions_dir = _resolve_video_dirs(paths, req)
+            model_path = resolve_mlx_weights_directory(req.model_id, req.model_dir)
+            status["model_path"] = model_path
+            nf = _nearest_valid_frames(int(req.preprocess.max_frames))
+            preset_info = TRAIN_PRESETS.get(req.preset, TRAIN_PRESETS["t2v"])
+            with_audio = req.preprocess.with_audio or preset_info.with_audio
+            preprocess_dataset(
+                videos_dir=str(videos_dir),
+                output_dir=str(paths.preprocessed),
+                model_dir=model_path,
+                gemma_model_id=DEFAULT_GEMMA,
+                target_height=int(req.preprocess.height) if req.preprocess.height else None,
+                target_width=int(req.preprocess.width) if req.preprocess.width else None,
+                max_frames=nf,
+                captions_dir=captions_dir,
+                with_audio=with_audio,
+                frame_rate=float(req.preprocess.frame_rate) if req.preprocess.frame_rate else None,
+            )
+
+            if req.preset == "v2v":
+                _check_cancel(should_cancel)
+                emit(
+                    {
+                        "type": "phase_progress",
+                        "phase": "preprocessing",
+                        "message": "Encoding reference latents for IC-LoRA…",
+                    }
+                )
+                targets = _list_videos(videos_dir)
+                ref_paths = _pair_reference_videos(targets, references_dir)
+                if len(ref_paths) != len(targets):
+                    raise ValueError("Reference video count must match target clip count")
+                encode_reference_latents(
+                    ref_paths,
+                    preprocessed_root=paths.preprocessed,
+                    model_dir=model_path,
+                    target_height=int(req.preprocess.height or 704),
+                    target_width=int(req.preprocess.width or 480),
+                    max_frames=nf,
+                    frame_rate=float(req.preprocess.frame_rate) if req.preprocess.frame_rate else None,
+                    downscale_factor=int(req.preprocess.reference_downscale_factor),
+                )
+
+            emit({"type": "phase_progress", "phase": "preprocessing", "message": "Preprocess complete"})
+        else:
             emit(
                 {
                     "type": "phase_progress",
                     "phase": "preprocessing",
-                    "message": "Encoding reference latents for IC-LoRA…",
+                    "message": "Using existing preprocessed latents",
                 }
             )
-            targets = _list_videos(videos_dir)
-            ref_paths = _pair_reference_videos(targets, references_dir)
-            if len(ref_paths) != len(targets):
-                raise ValueError("Reference video count must match target clip count")
-            encode_reference_latents(
-                ref_paths,
-                preprocessed_root=paths.preprocessed,
-                model_dir=model_path,
-                target_height=int(req.preprocess.height or 704),
-                target_width=int(req.preprocess.width or 480),
-                max_frames=nf,
-                frame_rate=float(req.preprocess.frame_rate) if req.preprocess.frame_rate else None,
-                downscale_factor=int(req.preprocess.reference_downscale_factor),
-            )
-
-        emit({"type": "phase_progress", "phase": "preprocessing", "message": "Preprocess complete"})
 
         _check_cancel(should_cancel)
         status["phase"] = "training"
-        status["total_steps"] = int(req.train.steps)
-        emit({"type": "phase_started", "phase": "training", "message": "Training LoRA…"})
+        status["total_steps"] = total_steps_target
+        train_message = "Training LoRA…"
+        if resume_step > 0:
+            train_message = f"Resuming training from step {resume_step}…"
+        emit({"type": "phase_started", "phase": "training", "message": train_message})
 
         from ltx_trainer_mlx.trainer import LtxvTrainer
 
-        config = build_trainer_config(req, paths=paths)
+        config = build_trainer_config(
+            req,
+            paths=paths,
+            load_checkpoint=load_checkpoint,
+            training_steps=training_steps,
+        )
         paths.config.write_text(yaml.safe_dump(config.model_dump(mode="json")), encoding="utf-8")
 
         train_t0 = time.time()
@@ -732,13 +891,14 @@ def on_metrics(m: dict[str, float]) -> None:
 
         def step_callback(step: int, total: int, validation_paths: list) -> None:
             _check_cancel(should_cancel)
+            global_step = int(resume_step) + int(step)
             elapsed = max(time.time() - train_t0, 1e-6)
             eta_s = (elapsed / max(step, 1)) * max(total - step, 0)
             payload: dict[str, Any] = {
                 "type": "train_step",
                 "phase": "training",
-                "step": int(step),
-                "total_steps": int(total),
+                "step": global_step,
+                "total_steps": total_steps_target,
                 "eta_s": round(eta_s, 1),
             }
             if last_metrics:
@@ -754,13 +914,13 @@ def step_callback(step: int, total: int, validation_paths: list) -> None:
                         rel = p.name
                     rels.append(
                         {
-                            "step": int(step),
+                            "step": global_step,
                             "filename": str(rel),
                             "url": f"/api/train/jobs/{job_id}/artifacts/{rel.as_posix()}",
                         }
                     )
                 status.setdefault("validation_clips", []).extend(rels)
-                emit({"type": "train_validation", "step": int(step), "videos": rels})
+                emit({"type": "train_validation", "step": global_step, "videos": rels})
 
         with _metrics_hook(on_metrics):
             trainer = LtxvTrainer(config)
@@ -802,3 +962,7 @@ def step_callback(step: int, total: int, validation_paths: list) -> None:
         save_status(output_dir, job_id, status)
         emit({"type": "error", "phase": status.get("phase"), "message": str(exc)})
         raise
+    finally:
+        manifest = load_manifest(output_dir, job_id)
+        if manifest and manifest.pop("resume_from_checkpoint", None):
+            save_manifest(output_dir, job_id, manifest)
diff --git a/web/src/TrainPage.tsx b/web/src/TrainPage.tsx
index 327963f..fcd8a09 100644
--- a/web/src/TrainPage.tsx
+++ b/web/src/TrainPage.tsx
@@ -823,6 +823,9 @@ export default function TrainPage() {
                       <h3>{activeJob.name}</h3>
                       <p className="job-phase">
                         {phaseLabel(activeJob.phase)} · {activeJob.preset}
+                        {activeJob.latest_checkpoint_step != null && activeJob.latest_checkpoint_step > 0 && (
+                          <> · checkpoint step {activeJob.latest_checkpoint_step}</>
+                        )}
                       </p>
                     </div>
                     <div className="artifact-actions">
@@ -833,7 +836,11 @@ export default function TrainPage() {
                           disabled={resuming || !!health?.generation_active}
                           onClick={() => handleResume()}
                         >
-                          {resuming ? "Resuming…" : "Resume"}
+                          {resuming
+                            ? "Resuming…"
+                            : activeJob.can_resume_from_checkpoint
+                              ? `Resume from checkpoint (step ${activeJob.latest_checkpoint_step})`
+                              : "Resume job"}
                         </button>
                       )}
                       {["queued", "running"].includes(activeJob.status) && (
diff --git a/web/src/api/train.ts b/web/src/api/train.ts
index 3c535bd..e6f785d 100644
--- a/web/src/api/train.ts
+++ b/web/src/api/train.ts
@@ -74,7 +74,12 @@ export async function createTrainJob(
   return res.json();
 }
 
-export async function resumeTrainJob(jobId: string): Promise<{ job_id: string; status: string }> {
+export async function resumeTrainJob(jobId: string): Promise<{
+  job_id: string;
+  status: string;
+  resume_from_checkpoint?: boolean;
+  latest_checkpoint_step?: number | null;
+}> {
   const res = await fetch(`/api/train/jobs/${jobId}/resume`, { method: "POST" });
   if (!res.ok) {
     const text = await res.text();
diff --git a/web/src/types.ts b/web/src/types.ts
index 2e50038..4e32712 100644
--- a/web/src/types.ts
+++ b/web/src/types.ts
@@ -154,4 +154,8 @@ export interface TrainJob {
   artifact_name?: string | null;
   registered_lora_id?: string | null;
   validation_clips?: TrainValidationClip[];
+  latest_checkpoint?: string | null;
+  latest_checkpoint_step?: number | null;
+  can_resume_from_checkpoint?: boolean;
+  resume_from_step?: number;
 }
diff --git a/web_train.py b/web_train.py
index 73315c4..cbfd8f6 100644
--- a/web_train.py
+++ b/web_train.py
@@ -283,6 +283,7 @@ def register_train_routes(app: Any, state: AppState) -> None:
         save_status,
         trainer_health,
         training_job_paths,
+        find_latest_training_checkpoint,
     )
 
     _init_train_state(state)
@@ -412,9 +413,10 @@ async def resume_train_job(job_id: str):
             raise HTTPException(409, "A training job is already running")
         if state.is_generation_active():
             raise HTTPException(409, "Cannot start training while generation is active")
-        manifest = load_manifest(state.output_dir, job_id)
-        if not manifest:
-            raise HTTPException(400, "Job manifest missing; cannot resume")
+        manifest = load_manifest(state.output_dir, job_id) or {}
+        ckpt_path, ckpt_step = find_latest_training_checkpoint(training_job_paths(state.output_dir, job_id))
+        manifest["resume_from_checkpoint"] = True
+        save_manifest(state.output_dir, job_id, manifest)
 
         state._cancelled_train_jobs.discard(job_id)
         save_status(
@@ -430,8 +432,19 @@ async def resume_train_job(job_id: str):
         _sync_job_record(state, job_id)
         state.train_event_queues[job_id] = asyncio.Queue()
         await state._pending_train.put(job_id)
-        log.info("Resumed train job %s", job_id)
-        return {"ok": True, "job_id": job_id, "status": "queued"}
+        log.info(
+            "Resumed train job %s (checkpoint_step=%s path=%s)",
+            job_id,
+            ckpt_step if ckpt_path else None,
+            ckpt_path,
+        )
+        return {
+            "ok": True,
+            "job_id": job_id,
+            "status": "queued",
+            "resume_from_checkpoint": bool(ckpt_path and ckpt_step > 0),
+            "latest_checkpoint_step": ckpt_step if ckpt_path else None,
+        }
 
     @app.post("/api/train/jobs/{job_id}/cancel")
     async def cancel_train_job(job_id: str):