diff --git a/docs/cookbook/notifications.mdx b/docs/cookbook/notifications.mdx index 1eacd6e7..9e467e0e 100644 --- a/docs/cookbook/notifications.mdx +++ b/docs/cookbook/notifications.mdx @@ -5,10 +5,14 @@ description: "Post to a webhook when a training run finishes or fails, from insi # Slack / Discord notifications -Training runs take long enough that nobody actually watches Studio the whole time. The terminal `onCompleted` and `onFailed` callbacks are perfect places to fan a status message out to wherever your team already lives. +A training job is two waits back-to-back: GPU allocation at the start (variable, can be several minutes when a worker starts from cold; see the [Quickstart](/quickstart) for why) and then the training run itself (about 7 to 12 minutes for the templates). Nobody actually watches a browser tab for that long. The terminal `onCompleted` and `onFailed` callbacks are the natural spot to fan a status message out to wherever your team already lives so you only look back when the run is genuinely done. This recipe uses Slack incoming webhooks; Discord, Microsoft Teams, and arbitrary HTTP endpoints work the same way. Anything you can `fetch`, you can notify. + +If you only want a desktop ping for your own runs, Studio already shows a browser notification, an in-page toast, and a tab-title indicator on `training.completed` / `training.failed` once you click **Run training** (and accept the permission prompt). This recipe is for fanning the same signal into a shared channel. + + ## The pattern ```ts diff --git a/docs/introduction.mdx b/docs/introduction.mdx index 600fffc9..9c005980 100644 --- a/docs/introduction.mdx +++ b/docs/introduction.mdx @@ -77,6 +77,6 @@ bun dev -The first run finishes in roughly 7 to 12 minutes depending on the template. +The training run itself takes about 7 to 12 minutes depending on the template. 
Arkor runs training on dedicated training GPUs that are not kept warm between jobs (inference uses a separate, always-warm pool, which is why the Playground stays snappy). When you start a training job, Arkor allocates a training GPU on demand: if one is still warm from a recent job it is ready in seconds, otherwise the worker has to start from cold, which can take several minutes. The [Quickstart](/quickstart) walks through what this looks like in Studio. Continue with the [Quickstart](/quickstart) for a step-by-step walkthrough. diff --git a/docs/ja/cookbook/notifications.mdx b/docs/ja/cookbook/notifications.mdx index 89261eeb..9f7833e3 100644 --- a/docs/ja/cookbook/notifications.mdx +++ b/docs/ja/cookbook/notifications.mdx @@ -5,10 +5,14 @@ description: "学習が完了 or 失敗したときに、trainer.ts の中から # Slack / Discord 通知 -学習は長いので、ずっと Studio を見続ける人はいません。終端の `onCompleted` と `onFailed` コールバックは、チームが普段いる場所にステータスメッセージを流すのにうってつけです。 +学習ジョブは実質 2 つの待ちが続きます。最初に GPU の確保 (warm な GPU が残っていない場合は数分かかることもある cold start。理由は [Quickstart](/ja/quickstart) 参照)、次に学習本体 (テンプレートで約 7〜12 分)。ブラウザータブをそんなに長くは見ていられません。終端の `onCompleted` と `onFailed` コールバックは、チームが普段いる場所にステータスメッセージを流すのに自然な場所です。本当に完了したときだけ振り返ればよくなります。 このレシピは Slack incoming Webhook を使います。Discord、Microsoft Teams、任意の HTTP エンドポイントも同じやり方で動きます。`fetch` できるものなら何でも通知先になります。 + +自分のためのデスクトップ通知だけで十分なら、Studio が `training.completed` / `training.failed` 時にブラウザー通知・ページ内 toast・タブタイトルのインジケーターを出します (**Run training** をクリックして通知許可を受け入れた後で有効)。このレシピはそれをチームの共有チャンネルに飛ばすためのものです。 + + ## パターン ```ts diff --git a/docs/ja/introduction.mdx b/docs/ja/introduction.mdx index f3049d54..8e970499 100644 --- a/docs/ja/introduction.mdx +++ b/docs/ja/introduction.mdx @@ -77,6 +77,6 @@ bun dev -最初の実行は、テンプレートにもよりますがおおよそ 7〜12 分で終わります。 +学習本体はテンプレートにもよりますが約 7〜12 分です。Arkor は推論 (Inference) と学習 (Training) で別の GPU プールを使い分けており、推論側は常に warm な active worker 上で動くので Playground は即応します。一方の学習側はコストを安定させるため warm 維持しません。学習ジョブを開始すると、Arkor はそのときに学習用 GPU を確保しに行きます。直前のジョブで使われた直後など GPU がまだ warm 
な状態であれば確保は数秒で終わりますが、warm な GPU がなければ cold な状態から (プロビジョニング・コンテナイメージ取得・ベースモデルのロード) 立ち上げる必要があり、これに数分かかります。Studio 上で何が見えるかは [Quickstart](/ja/quickstart) で確認できます。 ステップごとの手順は [Quickstart](/ja/quickstart) に続きます。 diff --git a/docs/ja/quickstart.mdx b/docs/ja/quickstart.mdx index c5aa03e0..08a73763 100644 --- a/docs/ja/quickstart.mdx +++ b/docs/ja/quickstart.mdx @@ -3,7 +3,16 @@ title: "クイックスタート" description: "Arkor プロジェクトを生成し、小さなオープンウェイト LLM をファインチューニングして、ローカル Studio からチャットしてみる。" --- -数分でゼロからスタートし、ローカル Playground でチャットできるファインチューン済みモデルが得られます。学習自体は 7〜12 分。セットアップ時間は回線と既存のインストール状況に左右されます。 +数分でゼロからスタートし、ローカル Playground でチャットできるファインチューン済みモデルが得られます。学習本体は 7〜12 分。最初に流すジョブは、学習本体が始まる前に GPU の確保 (立ち上がり) を待つ時間が入ります。これは数分かかることがあり、ときどきそれ以上かかります。間が空いた後の最初のジョブだけに発生する待ちです。セットアップ時間は回線と既存のインストール状況に左右されます。 + + +**Arkor が GPU をどう使い分けているか。** Arkor は GPU を 2 つのプールに分けて運用しています。 + +- **推論 (Inference) 用 GPU。** 学習が終わったモデルで推論するための GPU です (Playground や本番の推論)。active worker 上に常駐させているので、いつでも warm な状態で即座に応答します。 +- **学習 (Training) 用 GPU。** ファインチューニング用の GPU です。コストを安定させるため warm 維持はしません。**Run training** を押すと、その時点で GPU を確保しに行きます。直前に他のジョブで使われた直後など、GPU がまだ warm な状態であれば確保は数秒で終わります。warm な GPU がなければ cold な状態 (プロビジョニング・コンテナイメージ取得・ベースモデルのロード。いわゆる cold start) から立ち上げる必要があり、これに数分かかります。 + +GPU が確保できた後、学習本体自体はテンプレートにより約 7〜12 分です。Studio の Jobs 画面では両フェーズが可視化されます。 + ## 前提条件 @@ -41,11 +50,11 @@ cd my-arkor-app プロジェクト生成ツールがどのテンプレートを使うか尋ねてきます。最終的に作りたいものに最も近いものを選んでください: -| テンプレート | タスク | 出力の形 | 推定学習時間 | -| ------------ | ----------------------- | ----------------------------------------------------- | ------------ | -| `triage` | サポートのトリアージ | `{ category, urgency, summary, nextAction }` | 約 7 分 | -| `translate` | 9 言語の翻訳 | `{ translation, detectedLanguage }` | 約 7 分 | -| `redaction` | 個人情報のマスク | `{ redactedText, redactedCount, tags }` | 約 12 分 | +| テンプレート | タスク | 出力の形 | 学習本体 (GPU 確保時間を除く) | +| ------------ | ----------------------- | ----------------------------------------------------- | ----------------------------- | +| `triage` 
| サポートのトリアージ | `{ category, urgency, summary, nextAction }` | 約 7 分 | +| `translate` | 9 言語の翻訳 | `{ translation, detectedLanguage }` | 約 7 分 | +| `redaction` | 個人情報のマスク | `{ redactedText, redactedCount, tags }` | 約 12 分 | いずれも同じ小型のオープンウェイトベース(`gemma-4-E4B-it`)と、HuggingFace 上の厳選された公開データセットを組み合わせたものです。学習は本物で、数分で終わるので、ループ全体をエンドツーエンドで体験できます。 @@ -119,7 +128,7 @@ bun dev 学習が進行中に重要なビューは 3 つあります: - **Jobs。** 学習ジョブの一覧。クリックするとライブステータスが見られます。 -- **Loss チャートとイベントログ。** マネージド GPU から進捗がストリームされるにつれて、Loss(学習中のモデルの誤差を示す数値で、低いほどモデルが正解に近づいているサイン)の曲線が更新され、ログのテールに学習イベントが表示されます。最初の学習はテンプレートにより 7〜12 分かかります。 +- **Loss チャートとイベントログ。** マネージド GPU から進捗がストリームされるにつれて、Loss(学習中のモデルの誤差を示す数値で、低いほどモデルが正解に近づいているサイン)の曲線が更新され、ログのテールに学習イベントが表示されます。最初のジョブの最初のフェーズは GPU の確保で、その間ステータスは `Warming up GPU` と表示され、Loss チャートは `Waiting for GPU` のプレースホルダー、イベントログは空のままです。`training.started` イベントが届くとステータスが `Running` に変わり、曲線が更新を始めます。学習本体自体はテンプレートにより 7〜12 分かかります。 - **Playground。** ジョブが完了したら、最終アダプタをセレクタから選んでチャット。モード切替でベースモデルとアダプタを行き来できます。学習中に中間チェックポイントで推論を走らせたい場合は Studio ではなく `onCheckpoint` コールバックを使ってください。 学習の合間に `src/arkor/` を編集した場合は、Run training ページをリロード(または `arkor build` を実行)してから次のクリックをすると、新しいコードが動きます。 diff --git a/docs/ja/studio/jobs.mdx b/docs/ja/studio/jobs.mdx index 245a2a8d..60f3582a 100644 --- a/docs/ja/studio/jobs.mdx +++ b/docs/ja/studio/jobs.mdx @@ -21,13 +21,24 @@ Run training パネルはページ読み込み時に `/api/manifest` を 1 度 トレーナー選択やフラグを渡す入力フォームはありません: Studio は常に `createArkor` で登録されたトレーナーを走らせ、`arkor start` は `.arkor/build/index.mjs` があれば再利用します。同じページで複数回クリックする間に `src/arkor/` の編集は自動では拾われません。編集の合間に Run training ページをリロード(あるいはターミナルから `arkor build`)してから次のクリックをしてください。具体的なリビルドルールは [CLI § build / start](/ja/cli/build-and-start) を参照。 +### 「最初のジョブ」で起きていること + +**Run training** をクリックすると、2 つのフェーズが続きます: + +1. 
**GPU の確保。** ジョブは `Warming up GPU` と表示されます。Loss チャートは `Waiting for GPU` のプレースホルダー、イベントリストは空のままで、Metadata サイドバーの **Phase** 行は `Warming up GPU`、**GPU warm-up** タイマーが動きます。このフェーズの時間は状況次第です。直前のジョブで warm な状態の GPU が残っていれば普通は 1 分以内ですが、cold な状態から立ち上げる必要がある場合は数分かかることがあります。なぜそうなるかは [Quickstart](/ja/quickstart) を参照。 +2. **学習本体。** `training.started` が届くとステータスが `Running` に変わり、Loss チャートが `training.log` フレームから更新を始め、**Phase** 行が `Training run` になります。これが Quickstart のテンプレート表にある 7〜12 分のウィンドウです。 + + +**Run training** はブラウザータブをフォアグラウンドにしてからクリックしてください。Studio が通知許可をブラウザーに求め、ジョブが完了または失敗したときにデスクトップ通知とタブタイトルのインジケーターを表示するので、タブを見張る必要がなくなります。 + + ## Jobs リスト Jobs リストはマウント時に 1 度、その後 5 秒ごとに `GET /api/jobs` をポーリングします。手動リフレッシュボタンはなく、間隔は固定です。 | 列 | ソース | | --- | --- | -| Status | `Job.status`(`queued` / `running` / `completed` / `failed` / `cancelled`)。セルは色付け用の CSS クラスを持つ。 | +| Status | `Job.status`(`queued` / `running` / `completed` / `failed` / `cancelled`)に、Studio 側のリスト表示で 1 つの表示ステートを足したもの。`createdAt` が直近 90 秒以内の `queued` ジョブは `Warming up GPU` として表示される。リスト画面は `/api/jobs` のポーリングだけで SSE は開かないため、ここでは createdAt の窓だけで判定する。Job 詳細では同じルールに加えて SSE ストリームの接続状態も加味される。セルは色付け用の CSS クラスを持つ。 | | Name | `Job.name`。`#/jobs/` にリンク。 | | Created | `new Date(Job.createdAt).toLocaleString()`。 | | ID | `Job.id`、等幅フォント。 | diff --git a/docs/quickstart.mdx b/docs/quickstart.mdx index af2ac19e..2f1e397c 100644 --- a/docs/quickstart.mdx +++ b/docs/quickstart.mdx @@ -3,7 +3,16 @@ title: "Quickstart" description: "Scaffold an Arkor project, fine-tune a small open-weight LLM, and chat with it from a local Studio." --- -In a few minutes you will go from zero to a fine-tuned model you can chat with in a local Playground. The training itself takes 7 to 12 minutes; setup time depends on your connection and what is already installed. +In a few minutes you will go from zero to a fine-tuned model you can chat with in a local Playground. The training run itself takes 7 to 12 minutes. 
The first job you submit also waits for a GPU to be allocated before the training run starts; that wait can be a few minutes (occasionally longer) and only happens for the first run after a quiet period. + + +**How Arkor uses GPUs.** Arkor splits GPUs into two pools: + +- **Inference GPUs** run the model after training (the Playground and any production inference). These stay on an active worker, so they are always warm and respond immediately. +- **Training GPUs** run fine-tuning jobs. To keep costs predictable, these are not kept warm. When you click **Run training**, Arkor allocates a GPU on demand. If a worker is still warm from a recent job, the allocation finishes in seconds. Otherwise it starts from cold (the so-called cold start: provisioning, pulling the container image, loading the base model), which can take several minutes. + +After GPU allocation, the training run itself takes about 7 to 12 minutes for the templates. The Jobs view in Studio shows both phases. + ## Prerequisites @@ -41,11 +50,11 @@ cd my-arkor-app The scaffolder asks which template you want. 
Pick the one closest to what you eventually want to build: -| Template | Task | Output shape | Estimated training | -| ----------- | ----------------------- | ----------------------------------------------------- | ------------------ | -| `triage` | Support triage | `{ category, urgency, summary, nextAction }` | ~7 min | -| `translate` | 9-language translation | `{ translation, detectedLanguage }` | ~7 min | -| `redaction` | PII redaction | `{ redactedText, redactedCount, tags }` | ~12 min | +| Template | Task | Output shape | Training run (excludes GPU allocation) | +| ----------- | ----------------------- | ----------------------------------------------------- | -------------------------------------- | +| `triage` | Support triage | `{ category, urgency, summary, nextAction }` | ~7 min | +| `translate` | 9-language translation | `{ translation, detectedLanguage }` | ~7 min | +| `redaction` | PII redaction | `{ redactedText, redactedCount, tags }` | ~12 min | Each template pairs the same small open-weight base (`gemma-4-E4B-it`) with a curated public dataset on HuggingFace. The training is real and finishes in minutes, so you get to see the whole loop end to end. @@ -119,7 +128,7 @@ In the browser, click **Run training**. Studio submits the job to the managed ba Once a run is in flight, three views matter: - **Jobs.** A list of training runs. Click into one to see live status. -- **Loss chart and event log.** As the run streams from the managed GPU, the loss curve updates and the log tail shows training events. The first run takes 7 to 12 minutes depending on the template. +- **Loss chart and event log.** As the run streams from the managed GPU, the loss curve updates and the log tail shows training events. The first job's first phase is the GPU allocation; during that time the status reads `Warming up GPU`, the loss chart shows a `Waiting for GPU` placeholder, and the events list is empty. 
Once the `training.started` event arrives, the status flips to `Running` and the curve begins to update. The training run itself takes about 7 to 12 minutes depending on the template. - **Playground.** After a job completes, pick the final adapter from the selector and chat with it. Use the mode toggle to switch between the base model and the adapter. To run inference on intermediate checkpoints while a run is still in flight, use `onCheckpoint` callbacks instead of Studio. If you edit `src/arkor/` between runs, refresh the Run training page (or run `arkor build`) before the next click so the new code is what runs. diff --git a/docs/studio/jobs.mdx b/docs/studio/jobs.mdx index 4937531b..16ae54c8 100644 --- a/docs/studio/jobs.mdx +++ b/docs/studio/jobs.mdx @@ -21,13 +21,24 @@ When you click it, Studio sends `POST /api/train`. The backend spawns `arkor sta There is no input form for picking the trainer or passing flags: Studio always runs the trainer registered through `createArkor`, and `arkor start` reuses `.arkor/build/index.mjs` if it already exists. Edits to `src/arkor/` are not picked up automatically across multiple clicks on the same page; reload the Run training page (or run `arkor build` from a terminal) between edits and the next click. See [CLI § build / start](/cli/build-and-start) for the precise rebuild rules. +### What "first run" looks like + +Click **Run training**. Two phases follow: + +1. **GPU allocation.** The job appears as `Warming up GPU`. The loss chart shows a `Waiting for GPU` placeholder, the events list is empty, and the **Phase** row in the metadata sidebar reads `Warming up GPU` while the **GPU warm-up** timer ticks. This phase varies in length: typically under a minute when a worker is still warm from a recent job, occasionally several minutes when one has to start from cold. See the [Quickstart](/quickstart) for why this happens. +2. 
**The training run.** When `training.started` arrives, the status flips to `Running`, the loss chart starts updating from `training.log` frames, and the **Phase** row reads `Training run`. This is the 7 to 12 minute window in the template table on the Quickstart. + + +Click **Run training** with the browser tab focused. Studio asks the browser for notification permission and surfaces a desktop notification plus a tab-title indicator when the job completes or fails, so you do not have to watch the tab. + + ## Jobs list The Jobs list polls `GET /api/jobs` once at mount, then every 5 seconds. There is no manual refresh button; the interval is fixed. | Column | Source | | --- | --- | -| Status | `Job.status` (`queued` / `running` / `completed` / `failed` / `cancelled`). The cell carries a CSS class for colouring. | +| Status | `Job.status` (`queued` / `running` / `completed` / `failed` / `cancelled`) plus one Studio-derived display state on the list: a `queued` job whose `createdAt` is within the last 90 seconds renders as `Warming up GPU`. The list page only polls `/api/jobs`; Job detail uses the same rule and additionally consults the live SSE stream. The cell carries a CSS class for colouring. | | Name | `Job.name`. Links to `#/jobs/`. | | Created | `new Date(Job.createdAt).toLocaleString()`. | | ID | `Job.id`, monospaced. | diff --git a/e2e/studio/src/specs/cold-start.spec.ts b/e2e/studio/src/specs/cold-start.spec.ts new file mode 100644 index 00000000..3b8657d4 --- /dev/null +++ b/e2e/studio/src/specs/cold-start.spec.ts @@ -0,0 +1,222 @@ +import type { IncomingMessage, ServerResponse } from "node:http"; +import { expect, test } from "../harness/fixture"; + +const QUEUED_JOB = { + id: "job-cold-start", + name: "cold-start-trainer", + // The job stays `queued` on the wire until our deferred SSE fires + // `training.started`. The display-status derivation in the SPA + // promotes that to `Warming up GPU` while the EventSource is open + // without a `training.started` frame. 
+ status: "queued", + createdAt: new Date(Date.now() - 5_000).toISOString(), + startedAt: null, + completedAt: null, + error: null, + config: { model: "unsloth/gemma-4-E4B-it" }, +}; + +function jobsBody(status: "queued" | "running" | "completed" = "queued") { + const job = { ...QUEUED_JOB, status }; + if (status === "running") { + return { jobs: [{ ...job, startedAt: new Date().toISOString() }] }; + } + if (status === "completed") { + const completedAt = new Date().toISOString(); + return { + jobs: [ + { + ...job, + status, + startedAt: completedAt, + completedAt, + }, + ], + }; + } + return { jobs: [job] }; +} + +interface DeferredSseControls { + /** Send `training.started` over the still-open SSE socket. */ + emitStarted: () => Promise; + /** Send `training.completed` and end the stream. */ + emitCompleted: () => Promise; + /** Resolves once the SSE socket has connected. */ + connected: Promise; +} + +/** + * Install a `/v1/jobs/{id}/events/stream` route that opens the SSE + * socket and keeps it open with no frames until the caller invokes + * `emitStarted()` / `emitCompleted()`. Mirrors the "cold start, then + * training begins" wire trace. + */ +function installDeferredSseRoute(cloudApi: { + setRoute: ( + method: string, + path: string, + handler: (req: IncomingMessage, res: ServerResponse) => void, + ) => void; +}): DeferredSseControls { + let resolveConnected: () => void; + const connected = new Promise((resolve) => { + resolveConnected = resolve; + }); + + // Held outside the handler so test bodies can drive frames from the + // outside. The handler captures the latest response and resolves the + // `connected` gate when the socket actually attaches. + let active: ServerResponse | null = null; + + cloudApi.setRoute( + "GET", + `/v1/jobs/${QUEUED_JOB.id}/events/stream`, + (req, res) => { + const url = new URL(req.url ?? 
"", "http://x"); + if ( + url.searchParams.get("orgSlug") !== "studio-e2e-org" || + url.searchParams.get("projectSlug") !== "studio-e2e-project" + ) { + res.statusCode = 400; + res.end(JSON.stringify({ error: "scope mismatch" })); + return; + } + res.statusCode = 200; + res.setHeader("content-type", "text/event-stream"); + res.setHeader("cache-control", "no-cache, no-transform"); + // Empty comment frame keeps the SSE socket alive for browsers + // that wait for a first chunk before reporting "open" but + // crucially does not carry a named event, so the SPA stays in + // `provisioning` (it is waiting for `training.started`). + res.write(":\n\n"); + active = res; + resolveConnected(); + }, + ); + + return { + connected, + emitStarted: async () => { + await connected; + if (!active) throw new Error("SSE socket was not opened"); + active.write( + `event: training.started\ndata: ${JSON.stringify({ + timestamp: new Date().toISOString(), + })}\n\n`, + ); + }, + emitCompleted: async () => { + await connected; + if (!active) throw new Error("SSE socket was not opened"); + active.write( + `event: training.completed\ndata: ${JSON.stringify({ + timestamp: new Date().toISOString(), + artifacts: [{ id: "artifact-1" }], + })}\n\n`, + ); + active.end(); + }, + }; +} + +test.describe("Cold-start training UX", () => { + test("shows the Warming up GPU phase before training.started arrives", async ({ + page, + studio, + cloudApi, + }) => { + cloudApi.setRoute("GET", "/v1/jobs", (_req, res) => { + res.statusCode = 200; + res.setHeader("content-type", "application/json"); + res.end(JSON.stringify(jobsBody("queued"))); + }); + const sse = installDeferredSseRoute(cloudApi); + + await page.goto(`${studio.url}/#/jobs/${QUEUED_JOB.id}`); + await sse.connected; + + await expect( + page.getByText("Warming up the GPU for this job."), + ).toBeVisible(); + await expect(page.getByText("Waiting for GPU").first()).toBeVisible(); + + // Flip the wire state to running and emit the SSE frame: the + 
// display should switch to Running with the loss-chart placeholder + // gone. + cloudApi.setRoute("GET", "/v1/jobs", (_req, res) => { + res.statusCode = 200; + res.setHeader("content-type", "application/json"); + res.end(JSON.stringify(jobsBody("running"))); + }); + await sse.emitStarted(); + + await expect( + page.getByText("Warming up the GPU for this job."), + ).toBeHidden(); + }); + + test("fires a desktop toast on training.completed with notification permission granted", async ({ + page, + studio, + cloudApi, + context, + }) => { + await context.grantPermissions(["notifications"]); + cloudApi.setRoute("GET", "/v1/jobs", (_req, res) => { + res.statusCode = 200; + res.setHeader("content-type", "application/json"); + res.end(JSON.stringify(jobsBody("queued"))); + }); + const sse = installDeferredSseRoute(cloudApi); + + await page.goto(`${studio.url}/#/jobs/${QUEUED_JOB.id}`); + await sse.connected; + await sse.emitStarted(); + + cloudApi.setRoute("GET", "/v1/jobs", (_req, res) => { + res.statusCode = 200; + res.setHeader("content-type", "application/json"); + res.end(JSON.stringify(jobsBody("completed"))); + }); + await sse.emitCompleted(); + + // Toast carries the run name and the completed label. 
+ await expect( + page.getByText("Training run completed").first(), + ).toBeVisible(); + await expect( + page.getByText(/cold-start-trainer \(1 artifact\)/).first(), + ).toBeVisible(); + }); + + test("falls back to in-page toast only when notification permission is denied", async ({ + page, + studio, + cloudApi, + context, + }) => { + await context.clearPermissions(); + cloudApi.setRoute("GET", "/v1/jobs", (_req, res) => { + res.statusCode = 200; + res.setHeader("content-type", "application/json"); + res.end(JSON.stringify(jobsBody("queued"))); + }); + const sse = installDeferredSseRoute(cloudApi); + + await page.goto(`${studio.url}/#/jobs/${QUEUED_JOB.id}`); + await sse.connected; + await sse.emitStarted(); + + cloudApi.setRoute("GET", "/v1/jobs", (_req, res) => { + res.statusCode = 200; + res.setHeader("content-type", "application/json"); + res.end(JSON.stringify(jobsBody("completed"))); + }); + await sse.emitCompleted(); + + await expect( + page.getByText("Training run completed").first(), + ).toBeVisible(); + }); +}); diff --git a/packages/studio-app/src/App.tsx b/packages/studio-app/src/App.tsx index 5ce48720..b63d0a4a 100644 --- a/packages/studio-app/src/App.tsx +++ b/packages/studio-app/src/App.tsx @@ -4,9 +4,12 @@ import { Overview } from "./pages/Overview"; import { JobsList } from "./pages/JobsList"; import { JobDetail } from "./pages/JobDetail"; import { Playground } from "./pages/Playground"; +import { ToastProvider } from "./components/ui/Toast"; import { fetchCredentials, type Credentials } from "./lib/api"; import { useHashRoute } from "./route"; +const DEFAULT_TITLE = "Arkor"; + export function App() { const [creds, setCreds] = useState(null); const [error, setError] = useState(null); @@ -28,14 +31,30 @@ export function App() { }; }, []); + // Drop any `✓` / `⚠` prefix that a prior `notifyJobTerminal` left on the + // tab title once the user navigates. 
Without this the indicator persists + // across job pages and prefixes from different jobs accumulate. + const routeKey = + route.kind === "job" + ? `job:${route.id}` + : route.kind === "playground" + ? `playground:${route.adapterJobId ?? ""}` + : route.kind; + useEffect(() => { + document.title = DEFAULT_TITLE; + }, [routeKey]); + return ( - - {route.kind === "home" && } - {route.kind === "jobs" && } - {route.kind === "job" && } - {route.kind === "playground" && ( - - )} - + <> + + {route.kind === "home" && } + {route.kind === "jobs" && } + {route.kind === "job" && } + {route.kind === "playground" && ( + + )} + + + ); } diff --git a/packages/studio-app/src/components/RunTraining.tsx b/packages/studio-app/src/components/RunTraining.tsx index eee5094c..b4f9ba05 100644 --- a/packages/studio-app/src/components/RunTraining.tsx +++ b/packages/studio-app/src/components/RunTraining.tsx @@ -4,6 +4,7 @@ import { streamTraining, type ManifestResult, } from "../lib/api"; +import { ensurePermissionOnGesture } from "../lib/notify"; import { Play, StopCircle } from "./icons"; import { Button } from "./ui/Button"; @@ -65,6 +66,10 @@ export function RunTraining() { }, [log]); async function run() { + // Tied to a user gesture so browsers (Chrome / Firefox) accept the + // permission prompt. The helper no-ops if permission is already + // granted or denied. 
+ ensurePermissionOnGesture(); setRunning(true); setLog(""); const ac = new AbortController(); diff --git a/packages/studio-app/src/components/jobs/EventsStream.tsx b/packages/studio-app/src/components/jobs/EventsStream.tsx index 01271363..220f3a0d 100644 --- a/packages/studio-app/src/components/jobs/EventsStream.tsx +++ b/packages/studio-app/src/components/jobs/EventsStream.tsx @@ -24,7 +24,13 @@ function formatTime(ts: number): string { return `${hh}:${mm}:${ss}`; } -export function EventsStream({ events }: { events: EventEntry[] }) { +export function EventsStream({ + events, + provisioning = false, +}: { + events: EventEntry[]; + provisioning?: boolean; +}) { const wrapRef = useRef(null); const stickRef = useRef(true); @@ -51,7 +57,9 @@ export function EventsStream({ events }: { events: EventEntry[] }) { > {events.length === 0 ? (
- Listening for events… + {provisioning + ? "Warming up the GPU. The first event lands as soon as the trainer begins." + : "Listening for events…"}
) : (
    diff --git a/packages/studio-app/src/components/jobs/JobsTable.tsx b/packages/studio-app/src/components/jobs/JobsTable.tsx index ecae21f0..5e128c77 100644 --- a/packages/studio-app/src/components/jobs/JobsTable.tsx +++ b/packages/studio-app/src/components/jobs/JobsTable.tsx @@ -1,5 +1,6 @@ import type { Job } from "../../lib/api"; import { formatDuration, truncateMiddle } from "../../lib/format"; +import { computeDisplayStatus } from "../../lib/jobStatus"; import { RelativeTime } from "../ui/RelativeTime"; import { StatusBadge } from "../ui/StatusBadge"; @@ -48,7 +49,13 @@ export function JobsTable({ className="group relative border-t border-zinc-100 transition-colors hover:bg-zinc-50 focus-within:bg-zinc-50 dark:border-zinc-900 dark:hover:bg-zinc-900/60 dark:focus-within:bg-zinc-900/60" > - + {/* diff --git a/packages/studio-app/src/components/ui/StatusBadge.tsx b/packages/studio-app/src/components/ui/StatusBadge.tsx index d3b6c0ec..6302c00f 100644 --- a/packages/studio-app/src/components/ui/StatusBadge.tsx +++ b/packages/studio-app/src/components/ui/StatusBadge.tsx @@ -9,16 +9,21 @@ interface StatusBadgeProps { className?: string; } -const VARIANT: Record< - Status, - { label: string; pill: string; dot: string; pulse: boolean } -> = { +type Variant = { label: string; pill: string; dot: string; pulse: boolean }; + +const VARIANT: Record = { queued: { label: "Queued", pill: "border-zinc-200 bg-zinc-50 text-zinc-600 dark:border-zinc-800 dark:bg-zinc-900 dark:text-zinc-400", dot: "bg-zinc-400 dark:bg-zinc-500", pulse: false, }, + provisioning: { + label: "Warming up GPU", + pill: "border-amber-200 bg-amber-50 text-amber-700 dark:border-amber-400/30 dark:bg-amber-400/10 dark:text-amber-300", + dot: "bg-amber-500", + pulse: true, + }, running: { label: "Running", pill: "border-teal-200 bg-teal-50 text-teal-700 dark:border-teal-400/30 dark:bg-teal-400/10 dark:text-teal-300", diff --git a/packages/studio-app/src/components/ui/Toast.tsx 
b/packages/studio-app/src/components/ui/Toast.tsx new file mode 100644 index 00000000..9004c129 --- /dev/null +++ b/packages/studio-app/src/components/ui/Toast.tsx @@ -0,0 +1,138 @@ +import { useEffect, useRef, useState } from "react"; +import { TOAST_EVENT_NAME, type ToastEventDetail } from "../../lib/notify"; +import { cn } from "./cn"; + +interface ToastItem { + id: number; + status: "completed" | "failed"; + jobName: string; + jobId: string; + message: string; +} + +const AUTO_DISMISS_MS = 6000; +const MAX_VISIBLE = 3; + +export function ToastProvider() { + const [toasts, setToasts] = useState([]); + const counterRef = useRef(0); + + useEffect(() => { + const onToast = (ev: Event) => { + const detail = (ev as CustomEvent).detail; + if (!detail) return; + const id = counterRef.current++; + setToasts((prev) => { + const next = [ + ...prev, + { + id, + status: detail.status, + jobName: detail.jobName, + jobId: detail.jobId, + message: detail.message, + }, + ]; + return next.length > MAX_VISIBLE ? next.slice(-MAX_VISIBLE) : next; + }); + window.setTimeout(() => { + setToasts((prev) => prev.filter((t) => t.id !== id)); + }, AUTO_DISMISS_MS); + }; + window.addEventListener(TOAST_EVENT_NAME, onToast); + return () => window.removeEventListener(TOAST_EVENT_NAME, onToast); + }, []); + + if (toasts.length === 0) return null; + + return ( +
    + {toasts.map((t) => ( + + setToasts((prev) => prev.filter((x) => x.id !== t.id)) + } + /> + ))} +
    + ); +} + +function ToastCard({ + toast, + onDismiss, +}: { + toast: ToastItem; + onDismiss: () => void; +}) { + const tone = + toast.status === "completed" + ? "border-emerald-200 bg-white dark:border-emerald-400/30 dark:bg-zinc-950" + : "border-red-200 bg-white dark:border-red-400/30 dark:bg-zinc-950"; + const accent = + toast.status === "completed" + ? "text-emerald-700 dark:text-emerald-300" + : "text-red-700 dark:text-red-300"; + + function openJob() { + // `parseRoute()` does not decode path segments, so leave the id raw + // here to match the unencoded links emitted from JobsTable. + window.location.hash = `#/jobs/${toast.jobId}`; + onDismiss(); + } + + return ( +
    +
    +
    +
    + {toast.status === "completed" ? "Training run completed" : "Training run failed"} +
    +
    + {toast.message} +
    + +
    + +
    +
    + ); +} diff --git a/packages/studio-app/src/lib/jobStatus.test.ts b/packages/studio-app/src/lib/jobStatus.test.ts new file mode 100644 index 00000000..7bac1772 --- /dev/null +++ b/packages/studio-app/src/lib/jobStatus.test.ts @@ -0,0 +1,116 @@ +import { describe, expect, it } from "vitest"; +import { computeDisplayStatus } from "./jobStatus"; + +describe("computeDisplayStatus", () => { + it("returns the SSE terminal status when present", () => { + expect( + computeDisplayStatus({ + job: { status: "running" }, + liveStatus: "running", + terminalStatus: "completed", + }), + ).toBe("completed"); + expect( + computeDisplayStatus({ + job: { status: "running" }, + liveStatus: "running", + terminalStatus: "failed", + }), + ).toBe("failed"); + }); + + it("preempts liveStatus when polled status is already terminal", () => { + expect( + computeDisplayStatus({ + job: { status: "completed" }, + liveStatus: "running", + }), + ).toBe("completed"); + expect( + computeDisplayStatus({ + job: { status: "cancelled" }, + liveStatus: "running", + }), + ).toBe("cancelled"); + }); + + it("flips to running when SSE training.started is seen", () => { + expect( + computeDisplayStatus({ + job: { status: "queued" }, + liveStatus: "running", + eventStreamConnected: true, + }), + ).toBe("running"); + }); + + it("returns provisioning while queued and the event stream is open", () => { + expect( + computeDisplayStatus({ + job: { status: "queued" }, + eventStreamConnected: true, + }), + ).toBe("provisioning"); + }); + + it("returns provisioning while queued and createdAt is recent", () => { + const now = Date.UTC(2026, 4, 11, 12, 0, 0); + const created = new Date(now - 30_000).toISOString(); + expect( + computeDisplayStatus({ + job: { status: "queued", createdAt: created }, + now, + }), + ).toBe("provisioning"); + }); + + it("returns queued when not connected and createdAt is past the recent window", () => { + const now = Date.UTC(2026, 4, 11, 12, 0, 0); + const created = new Date(now - 5 * 
60_000).toISOString(); + expect( + computeDisplayStatus({ + job: { status: "queued", createdAt: created }, + now, + }), + ).toBe("queued"); + }); + + it("treats job = null as queued (no createdAt to anchor)", () => { + expect(computeDisplayStatus({ job: null })).toBe("queued"); + }); + + it("treats job = null as provisioning when the event stream is open", () => { + expect( + computeDisplayStatus({ job: null, eventStreamConnected: true }), + ).toBe("provisioning"); + }); + + it("respects a custom recentMs window", () => { + const now = Date.UTC(2026, 4, 11, 12, 0, 0); + const created = new Date(now - 30_000).toISOString(); + expect( + computeDisplayStatus({ + job: { status: "queued", createdAt: created }, + now, + recentMs: 10_000, + }), + ).toBe("queued"); + }); + + it("falls through to polled non-terminal status when no live signals", () => { + expect( + computeDisplayStatus({ + job: { status: "running" }, + }), + ).toBe("running"); + }); + + it("ignores invalid createdAt", () => { + expect( + computeDisplayStatus({ + job: { status: "queued", createdAt: "not-a-date" }, + now: Date.now(), + }), + ).toBe("queued"); + }); +}); diff --git a/packages/studio-app/src/lib/jobStatus.ts b/packages/studio-app/src/lib/jobStatus.ts new file mode 100644 index 00000000..7dcdfbfe --- /dev/null +++ b/packages/studio-app/src/lib/jobStatus.ts @@ -0,0 +1,69 @@ +import type { Job } from "./api"; + +export type DisplayStatus = Job["status"] | "provisioning"; + +export interface ComputeDisplayStatusInput { + job: { status: Job["status"]; createdAt?: string } | null | undefined; + liveStatus?: Job["status"] | null; + terminalStatus?: "completed" | "failed" | null; + eventStreamConnected?: boolean; + now?: number; + recentMs?: number; +} + +const DEFAULT_RECENT_MS = 90_000; + +/** + * Resolve the status to show in the UI. 
Wire status (`Job["status"]`) + * only carries queued / running / completed / failed / cancelled; this + * synthesises `provisioning` for a queued job whose SSE stream is open + * (or whose createdAt is within `recentMs`) so the UI can tell "GPU + * warming up" apart from "sitting in a backlog". The wire shape is + * unchanged. + * + * Precedence: + * 1. SSE terminal frame observed in this session + * 2. Polled terminal status from /api/jobs (so a stale liveStatus = + * "running" cannot mask a terminal that arrived after the SSE + * connection dropped) + * 3. SSE-derived liveStatus = "running" (i.e. training.started seen) + * 4. Synthetic `provisioning` when the job is queued (or not yet loaded) AND either the + * event stream is open without a training.started yet, or + * createdAt is within `recentMs` of `now` (check skipped when `now` is omitted) + * 5. Polled non-terminal status, default queued + */ +export function computeDisplayStatus(input: ComputeDisplayStatusInput): DisplayStatus { + const { + job, + liveStatus, + terminalStatus, + eventStreamConnected, + now, + recentMs = DEFAULT_RECENT_MS, + } = input; + + if (terminalStatus) return terminalStatus; + + if ( + job?.status === "completed" || + job?.status === "failed" || + job?.status === "cancelled" + ) { + return job.status; + } + + if (liveStatus === "running") return "running"; + + const isQueued = !job || job.status === "queued"; + if (isQueued) { + if (eventStreamConnected) return "provisioning"; + if (job?.createdAt && typeof now === "number") { + const created = Date.parse(job.createdAt); + if (Number.isFinite(created) && now - created < recentMs) { + return "provisioning"; + } + } + } + + return job?.status ?? 
"queued"; +} diff --git a/packages/studio-app/src/lib/notify.test.ts b/packages/studio-app/src/lib/notify.test.ts new file mode 100644 index 00000000..d8429017 --- /dev/null +++ b/packages/studio-app/src/lib/notify.test.ts @@ -0,0 +1,279 @@ +// @vitest-environment jsdom +import { afterEach, beforeEach, describe, expect, it, vi } from "vitest"; +import { + TOAST_EVENT_NAME, + _resetNotifyForTest, + ensurePermissionOnGesture, + notifyJobTerminal, + type ToastEventDetail, +} from "./notify"; + +interface FakeNotificationInstance { + title: string; + options: NotificationOptions | undefined; + onclick: ((this: Notification, ev: Event) => unknown) | null; + close: ReturnType; +} + +function installFakeNotification( + permission: NotificationPermission, + requestPermissionImpl?: () => Promise, +): { + instances: FakeNotificationInstance[]; + requestPermission: ReturnType; +} { + const instances: FakeNotificationInstance[] = []; + const requestPermission = vi.fn( + requestPermissionImpl ?? (() => Promise.resolve(permission)), + ); + class FakeNotification { + static permission: NotificationPermission = permission; + static requestPermission = requestPermission; + title: string; + options: NotificationOptions | undefined; + onclick: FakeNotificationInstance["onclick"] = null; + close = vi.fn(); + constructor(title: string, options?: NotificationOptions) { + this.title = title; + this.options = options; + instances.push(this); + } + } + (globalThis as unknown as { Notification: typeof FakeNotification }).Notification = + FakeNotification; + return { instances, requestPermission }; +} + +function uninstallNotification() { + delete (globalThis as unknown as { Notification?: unknown }).Notification; +} + +function setVisibility( + state: "visible" | "hidden", + hasFocus: boolean = state === "visible", +) { + Object.defineProperty(document, "visibilityState", { + value: state, + configurable: true, + }); + Object.defineProperty(document, "hasFocus", { + value: () => hasFocus, + 
configurable: true, + }); +} + +beforeEach(() => { + _resetNotifyForTest(); + document.title = "Arkor"; +}); + +afterEach(() => { + uninstallNotification(); + vi.restoreAllMocks(); +}); + +describe("ensurePermissionOnGesture", () => { + it("requests permission only when current state is default", () => { + const { requestPermission } = installFakeNotification("default"); + ensurePermissionOnGesture(); + expect(requestPermission).toHaveBeenCalledTimes(1); + }); + + it("does nothing when permission is already granted", () => { + const { requestPermission } = installFakeNotification("granted"); + ensurePermissionOnGesture(); + expect(requestPermission).not.toHaveBeenCalled(); + }); + + it("does nothing when permission is denied", () => { + const { requestPermission } = installFakeNotification("denied"); + ensurePermissionOnGesture(); + expect(requestPermission).not.toHaveBeenCalled(); + }); + + it("does nothing when Notification API is absent", () => { + uninstallNotification(); + expect(() => ensurePermissionOnGesture()).not.toThrow(); + }); + + it("swallows synchronous throws from requestPermission", () => { + installFakeNotification("default", () => { + throw new Error("blocked"); + }); + expect(() => ensurePermissionOnGesture()).not.toThrow(); + }); +}); + +describe("notifyJobTerminal", () => { + it("fires an OS Notification when granted and tab is hidden", () => { + setVisibility("hidden"); + const { instances } = installFakeNotification("granted"); + const detail: ToastEventDetail[] = []; + window.addEventListener(TOAST_EVENT_NAME, ((ev: Event) => { + detail.push((ev as CustomEvent).detail); + }) as EventListener); + + notifyJobTerminal({ + status: "completed", + jobName: "demo", + jobId: "job-1", + artifacts: 3, + }); + + expect(instances).toHaveLength(1); + expect(instances[0]?.title).toBe("Training run completed"); + expect(instances[0]?.options?.body).toBe("demo (3 artifacts)"); + expect(instances[0]?.options?.tag).toBe("arkor-job-job-1"); + 
expect(detail).toHaveLength(1); + expect(detail[0]?.message).toBe("demo (3 artifacts)"); + expect(document.title.startsWith("✓ ")).toBe(true); + }); + + it("only emits a toast when the tab is focused (no OS notification, no title prefix)", () => { + setVisibility("visible"); + const { instances } = installFakeNotification("granted"); + const detail: ToastEventDetail[] = []; + window.addEventListener(TOAST_EVENT_NAME, ((ev: Event) => { + detail.push((ev as CustomEvent).detail); + }) as EventListener); + + notifyJobTerminal({ + status: "completed", + jobName: "demo", + jobId: "job-2", + artifacts: 1, + }); + + expect(instances).toHaveLength(0); + expect(detail).toHaveLength(1); + expect(document.title).toBe("Arkor"); + }); + + it("uses the warning prefix and constructs no Notification when permission is denied", () => { + setVisibility("hidden"); + const { instances } = installFakeNotification("denied"); + const detail: ToastEventDetail[] = []; + window.addEventListener(TOAST_EVENT_NAME, ((ev: Event) => { + detail.push((ev as CustomEvent).detail); + }) as EventListener); + + notifyJobTerminal({ + status: "failed", + jobName: "demo", + jobId: "job-3", + error: "boom", + }); + + expect(instances).toHaveLength(0); + expect(detail).toHaveLength(1); + expect(detail[0]?.message).toBe("demo failed: boom"); + expect(document.title.startsWith("⚠ ")).toBe(true); + }); + + it("deduplicates repeat calls for the same (jobId, status)", () => { + setVisibility("hidden"); + const { instances } = installFakeNotification("granted"); + const detail: ToastEventDetail[] = []; + window.addEventListener(TOAST_EVENT_NAME, ((ev: Event) => { + detail.push((ev as CustomEvent).detail); + }) as EventListener); + + notifyJobTerminal({ + status: "completed", + jobName: "demo", + jobId: "job-4", + artifacts: 2, + }); + notifyJobTerminal({ + status: "completed", + jobName: "demo", + jobId: "job-4", + artifacts: 2, + }); + + expect(instances).toHaveLength(1); + expect(detail).toHaveLength(1); + }); + 
+ it("allows the same job to notify on a different terminal status (failed after completed)", () => { + setVisibility("hidden"); + installFakeNotification("granted"); + + notifyJobTerminal({ + status: "completed", + jobName: "demo", + jobId: "job-5", + }); + notifyJobTerminal({ + status: "failed", + jobName: "demo", + jobId: "job-5", + error: "x", + }); + + // The new prefix replaces the old one rather than stacking. + expect(document.title).toBe("⚠ Arkor"); + }); + + it("replaces an existing prefix instead of stacking when the status differs", () => { + setVisibility("hidden"); + installFakeNotification("granted"); + + notifyJobTerminal({ status: "completed", jobName: "A", jobId: "job-A" }); + notifyJobTerminal({ status: "failed", jobName: "B", jobId: "job-B" }); + + expect(document.title).toBe("⚠ Arkor"); + }); + + it("treats a visible tab that is not focused as backgrounded (fires OS notification)", () => { + setVisibility("visible", false); + const { instances } = installFakeNotification("granted"); + + notifyJobTerminal({ + status: "completed", + jobName: "demo", + jobId: "job-bg", + artifacts: 1, + }); + + expect(instances).toHaveLength(1); + expect(document.title.startsWith("✓ ")).toBe(true); + }); + + it("does not stack the same title prefix repeatedly", () => { + setVisibility("hidden"); + installFakeNotification("granted"); + + notifyJobTerminal({ + status: "completed", + jobName: "demo", + jobId: "job-6", + }); + notifyJobTerminal({ + status: "completed", + jobName: "demo", + jobId: "job-7", + }); + + const matches = document.title.match(/✓ /g); + expect(matches?.length).toBe(1); + }); + + it("survives without the Notification API at all (toast still fires)", () => { + setVisibility("hidden"); + uninstallNotification(); + const detail: ToastEventDetail[] = []; + window.addEventListener(TOAST_EVENT_NAME, ((ev: Event) => { + detail.push((ev as CustomEvent).detail); + }) as EventListener); + + expect(() => + notifyJobTerminal({ + status: "completed", + 
jobName: "demo", + jobId: "job-8", + }), + ).not.toThrow(); + expect(detail).toHaveLength(1); + }); +}); diff --git a/packages/studio-app/src/lib/notify.ts b/packages/studio-app/src/lib/notify.ts new file mode 100644 index 00000000..90ac3d9a --- /dev/null +++ b/packages/studio-app/src/lib/notify.ts @@ -0,0 +1,152 @@ +export interface NotifyJobTerminalInput { + status: "completed" | "failed"; + jobName: string; + jobId: string; + artifacts?: number; + error?: string; +} + +export interface ToastEventDetail { + status: "completed" | "failed"; + jobName: string; + jobId: string; + message: string; +} + +const TOAST_EVENT = "arkor:toast"; + +const notifiedTerminals = new Set(); + +type NotificationCtor = typeof globalThis.Notification | undefined; + +function getNotificationCtor(): NotificationCtor { + if (typeof globalThis === "undefined") return undefined; + return (globalThis as { Notification?: NotificationCtor }).Notification; +} + +/** + * Ask for OS notification permission, but only when the current state is + * `default` so we don't re-prompt users who already chose. Safe to call + * from a user-gesture handler (the click on Run training). No-ops in + * environments without the Notification API. + */ +export function ensurePermissionOnGesture(): void { + const Ctor = getNotificationCtor(); + if (!Ctor) return; + if (Ctor.permission !== "default") return; + try { + const result = Ctor.requestPermission(); + if (result && typeof (result as Promise).then === "function") { + (result as Promise).catch(() => undefined); + } + } catch { + // Synchronous throws happen on a few older browsers; fall through. + } +} + +function summaryFor(input: NotifyJobTerminalInput): string { + if (input.status === "completed") { + const n = input.artifacts ?? 0; + return `${input.jobName} (${n} artifact${n === 1 ? "" : "s"})`; + } + return input.error + ? 
`${input.jobName} failed: ${input.error}` + : `${input.jobName} failed`; +} + +// Match either of the prefixes we set so a second terminal event of a +// different kind (e.g. ✓ then ⚠) replaces the existing marker instead +// of stacking as "⚠ ✓ Arkor". +const TITLE_PREFIX_RE = /^[✓⚠] /; + +function ensureTitlePrefix(prefix: string): void { + if (typeof document === "undefined") return; + const stripped = document.title.replace(TITLE_PREFIX_RE, ""); + document.title = `${prefix}${stripped}`; +} + +function isTabFocused(): boolean { + if (typeof document === "undefined") return false; + // `visibilityState === "visible"` is necessary but not sufficient: a + // tab that is the selected one in a browser window sitting behind + // another app still reports `visible`, yet the user clearly cannot + // see in-page toasts. Check `hasFocus()` whenever it is available. + if (document.visibilityState !== "visible") return false; + if (typeof document.hasFocus === "function") return document.hasFocus(); + return true; +} + +/** + * Emit a terminal-event notification through three layers: + * + * 1. Toast: always, via a `CustomEvent("arkor:toast")` that the + * ToastProvider listens for. Works regardless of focus / permission. + * 2. Title prefix (`✓` / `⚠`): only when the tab is not focused. + * Cleared on the next route change by an effect in `App` that + * resets `document.title` whenever the route key changes. + * 3. OS Notification: only when permission was granted AND the tab is + * not focused. Tagged with the jobId so a duplicate SSE frame won't + * buzz the user twice. + * + * Deduplicated by `${jobId}:${status}` so reconnect-driven re-deliveries + * of `training.completed` (or polled status arriving after the SSE + * terminal frame already triggered us) only notify once. 
+ */ +export function notifyJobTerminal(input: NotifyJobTerminalInput): void { + const dedupKey = `${input.jobId}:${input.status}`; + if (notifiedTerminals.has(dedupKey)) return; + notifiedTerminals.add(dedupKey); + + const message = summaryFor(input); + + if (typeof window !== "undefined") { + window.dispatchEvent( + new CustomEvent(TOAST_EVENT, { + detail: { + status: input.status, + jobName: input.jobName, + jobId: input.jobId, + message, + }, + }), + ); + } + + if (isTabFocused()) return; + + ensureTitlePrefix(input.status === "completed" ? "✓ " : "⚠ "); + + const Ctor = getNotificationCtor(); + if (!Ctor || Ctor.permission !== "granted") return; + try { + // Safari and a few WebViews still throw synchronously here when + // permission was granted in a prior session but the construction + // happens outside an active gesture (which is exactly our case: + // the terminal SSE frame can arrive minutes after the click). + // Toast + title prefix above already covered the user. + const n = new Ctor( + input.status === "completed" + ? "Training run completed" + : "Training run failed", + { + body: message, + tag: `arkor-job-${input.jobId}`, + }, + ); + n.onclick = () => { + window.focus(); + // `parseRoute()` does not decode path segments, so leave the id + // raw here to match the unencoded links emitted from JobsTable. 
+ window.location.hash = `#/jobs/${input.jobId}`; + n.close(); + }; + } catch { + // intentional: see comment above + } +} + +export const TOAST_EVENT_NAME = TOAST_EVENT; + +export function _resetNotifyForTest(): void { + notifiedTerminals.clear(); +} diff --git a/packages/studio-app/src/pages/JobDetail.tsx b/packages/studio-app/src/pages/JobDetail.tsx index cd46ce88..b128de65 100644 --- a/packages/studio-app/src/pages/JobDetail.tsx +++ b/packages/studio-app/src/pages/JobDetail.tsx @@ -1,4 +1,4 @@ -import { useEffect, useState } from "react"; +import { useEffect, useRef, useState } from "react"; import { fetchJobs, openJobEvents, type Job } from "../lib/api"; import { ArrowLeft, Sparkles } from "../components/icons"; import { @@ -21,6 +21,8 @@ import { } from "../components/ui/Card"; import { StatusBadge } from "../components/ui/StatusBadge"; import { formatDuration, truncateMiddle } from "../lib/format"; +import { computeDisplayStatus } from "../lib/jobStatus"; +import { notifyJobTerminal } from "../lib/notify"; const MAX_LOSS_POINTS = 2000; @@ -42,20 +44,62 @@ export function JobDetail({ jobId }: { jobId: string }) { // still drive the visible status. const [liveStatus, setLiveStatus] = useState(null); const [liveStartedAt, setLiveStartedAt] = useState(null); + // True once the SSE stream opens or delivers any frame. Lets us tell + // "queued, GPU is warming up" apart from "queued, sitting in a backlog". + const [eventStreamConnected, setEventStreamConnected] = useState(false); + // Latest job.name held in a ref so the SSE listener closures (whose + // useEffect depends only on jobId) can surface a meaningful name in + // the terminal-event notification even when the polled `job` resolves + // after the closures were registered. + const jobNameRef = useRef(null); + // Wall-clock time the user landed on this job's page. Used to skip + // notifications for terminal frames whose timestamp predates the + // mount — i.e. 
SSE history replay or a poll observing a job that + // had already finished before the page was opened. + const mountedAtRef = useRef(Date.now()); + // Previously observed wire status; lets the polling tick recognise a + // queued/running -> completed/failed transition and fire a + // notification when the SSE stream missed the terminal frame. + const previousPolledStatusRef = useRef(null); useEffect(() => { setJob(null); + mountedAtRef.current = Date.now(); + previousPolledStatusRef.current = null; let cancelled = false; let timer: ReturnType | undefined; // Chained setTimeout instead of setInterval so a slow /api/jobs // request can't pile up overlapping calls. SSE remains the source // of truth for live status; polling is just for completedAt / - // config / etc that the SSE stream doesn't carry. + // config / etc that the SSE stream doesn't carry — plus a + // poll-driven terminal notification below as a safety net when + // the SSE stream dropped before the terminal frame arrived + // (machine sleep, proxy hiccup, etc). async function tick() { try { const { jobs } = await fetchJobs(); if (!cancelled) { - setJob(jobs.find((j) => j.id === jobId) ?? null); + const next = jobs.find((j) => j.id === jobId) ?? null; + const prevStatus = previousPolledStatusRef.current; + const wasTerminal = + prevStatus === "completed" || + prevStatus === "failed" || + prevStatus === "cancelled"; + if ( + next && + (next.status === "completed" || next.status === "failed") && + !wasTerminal && + shouldNotifyTerminal(next.completedAt, mountedAtRef.current) + ) { + notifyJobTerminal({ + status: next.status, + jobName: next.name ?? jobId, + jobId, + error: next.error ?? undefined, + }); + } + previousPolledStatusRef.current = next?.status ?? null; + setJob(next); } } catch { // ignore — events stream is the source of truth for live status @@ -70,6 +114,10 @@ export function JobDetail({ jobId }: { jobId: string }) { }; }, [jobId]); + useEffect(() => { + jobNameRef.current = job?.name ?? 
null; + }, [job?.name]); + useEffect(() => { // Clear per-job state when navigating between jobs so events, loss // points, terminal status, advanced toggle, and event-id counter @@ -83,6 +131,7 @@ export function JobDetail({ jobId }: { jobId: string }) { setEventErr(null); setLiveStatus(null); setLiveStartedAt(null); + setEventStreamConnected(false); let counter = 0; @@ -143,9 +192,16 @@ export function JobDetail({ jobId }: { jobId: string }) { // Any received frame means the EventSource is alive again — drop // any stale "stream interrupted" banner from the prior disconnect. setEventErr(null); + setEventStreamConnected(true); } const es = openJobEvents(jobId); + // Treat the EventSource as connected from the moment we attach so + // the display flips into `provisioning` immediately even when the + // backend takes a while to send the first frame. Only the `end` event + // and the effect cleanup walk this back; `onerror` deliberately does not. + setEventStreamConnected(true); + es.onopen = () => setEventStreamConnected(true); es.addEventListener("training.started", (ev: MessageEvent) => { const parsed = safeParse(ev.data); pushEvent("training.started", ev.data, parsed); @@ -224,73 +280,108 @@ export function JobDetail({ jobId }: { jobId: string }) { // SSE payload carries the trainer-side completion timestamp; use // it so duration / "Completed" stay correct without depending on // the next /api/jobs poll. + let artifacts = 0; + let completedAt: string | undefined; if (parsed && typeof parsed === "object") { const d = parsed as { artifacts?: unknown[]; timestamp?: string }; - setTerminal({ + artifacts = Array.isArray(d.artifacts) ? d.artifacts.length : 0; + completedAt = d.timestamp; + } + setTerminal({ status: "completed", artifacts, completedAt }); + if (shouldNotifyTerminal(completedAt, mountedAtRef.current)) { + notifyJobTerminal({ status: "completed", - artifacts: Array.isArray(d.artifacts) ? 
d.artifacts.length : 0, - completedAt: d.timestamp, + jobName: jobNameRef.current ?? jobId, + jobId, + artifacts, }); - } else { - setTerminal({ status: "completed", artifacts: 0 }); } }); es.addEventListener("training.failed", (ev: MessageEvent) => { const parsed = safeParse(ev.data); pushEvent("training.failed", ev.data, parsed); + let error: string | undefined; + let completedAt: string | undefined; if (parsed && typeof parsed === "object") { const d = parsed as { error?: string; timestamp?: string }; - setTerminal({ + error = d.error; + completedAt = d.timestamp; + } + setTerminal({ status: "failed", error, artifacts: 0, completedAt }); + if (shouldNotifyTerminal(completedAt, mountedAtRef.current)) { + notifyJobTerminal({ status: "failed", - error: d.error, - artifacts: 0, - completedAt: d.timestamp, + jobName: jobNameRef.current ?? jobId, + jobId, + error, }); - } else { - setTerminal({ status: "failed", artifacts: 0 }); } }); - es.addEventListener("end", () => es.close()); + es.addEventListener("end", () => { + es.close(); + setEventStreamConnected(false); + }); + // Deliberately do NOT drop `eventStreamConnected` on `onerror`: the + // EventSource fires `onerror` during its built-in reconnect loop + // (and, under the Hono `/api/jobs/:id/events` proxy on Linux CI, + // even on benign keep-alive transitions). Treating that as a real + // disconnect would yank the user out of the warm-up display while + // training is still going. The "stream interrupted" banner alone + // is enough; once a frame lands again, `pushEvent` clears it and + // reaffirms the connection. es.onerror = () => setEventErr("Event stream interrupted."); - return () => es.close(); + return () => { + es.close(); + setEventStreamConnected(false); + }; }, [jobId]); - // Status precedence: - // 1. SSE terminal frame (training.completed / training.failed) we - // observed in this session — most authoritative. - // 2. 
Polled terminal status from /api/jobs — also authoritative, - // and crucially it preempts a stale `liveStatus = "running"` - // that can linger if the SSE stream dropped before the - // terminal frame arrived. - // 3. SSE-derived `liveStatus` for the running phase, which lets - // us flip the UI to "running" before /api/jobs catches up. - // 4. The polled non-terminal status, if anything. - // 5. Default "queued". - const polledIsTerminal = - job?.status === "completed" || - job?.status === "failed" || - job?.status === "cancelled"; - const status: Job["status"] = - terminal?.status ?? - (polledIsTerminal ? job!.status : (liveStatus ?? job?.status ?? "queued")); + // Status precedence is centralised in `computeDisplayStatus` so the + // same rule drives the badge here, the badge in the page header, the + // sidebar meta row, and the JobsTable list-level heuristic. + const status = computeDisplayStatus({ + job, + liveStatus, + terminalStatus: terminal?.status ?? null, + eventStreamConnected, + now, + }); - // Live duration ticker while the job is running. - const isRunning = status === "running" && !terminal; + // Tick `now` while either the warm-up timer or the run timer is moving. + const isTicking = + !terminal && (status === "provisioning" || status === "running"); useEffect(() => { - if (!isRunning) return; + if (!isTicking) return; const t = setInterval(() => setNow(Date.now()), 1000); return () => clearInterval(t); - }, [isRunning]); + }, [isTicking]); const duration = computeDuration(job, liveStartedAt, terminal, now); + const warmupMs = + status === "provisioning" && job?.createdAt + ? (() => { + const created = Date.parse(job.createdAt); + return Number.isNaN(created) ? null : Math.max(0, now - created); + })() + : null; const meta: JobMetaItem[] = [ { label: "Status", value: }, { - label: "Duration", - value: duration === null ? "—" : formatDuration(duration), - mono: true, + label: "Phase", + value: phaseLabel(status), }, + status === "provisioning" + ? 
{ + label: "GPU warm-up", + value: warmupMs === null ? "—" : formatDuration(warmupMs), + mono: true, + } + : { + label: "Duration", + value: duration === null ? "—" : formatDuration(duration), + mono: true, + }, { label: "Created", value: job?.createdAt ? formatAbsoluteTime(job.createdAt) : "—", @@ -392,6 +483,16 @@ export function JobDetail({ jobId }: { jobId: string }) { {eventErr} ) : null} + {status === "provisioning" ? ( +
    +
    Warming up the GPU for this job.
    +
    + Training GPUs are not kept warm between jobs. Allocation is fast + when one is still warm from a recent run; otherwise the worker + starts from cold, which can take several minutes. +
    +
    + ) : null}
    @@ -412,7 +513,14 @@ export function JobDetail({ jobId }: { jobId: string }) {
    - + {status === "provisioning" && points.length === 0 ? ( +
    + Waiting for GPU. The loss curve starts the moment the + trainer reports its first step. +
    + ) : ( + + )}
    @@ -425,7 +533,10 @@ export function JobDetail({ jobId }: { jobId: string }) { - +
    @@ -477,6 +588,28 @@ function AdvancedToggle({ ); } +// Tolerance (ms) between the trainer-side terminal timestamp and the +// page mount, so a few seconds of clock skew or polling lag does not +// suppress a notification for a run that genuinely finished after the +// user landed on the page. +const TERMINAL_NOTIFY_GRACE_MS = 5_000; + +function shouldNotifyTerminal( + timestamp: string | undefined | null, + mountedAt: number, +): boolean { + if (!timestamp) return true; + const t = Date.parse(timestamp); + if (!Number.isFinite(t)) return true; + return t >= mountedAt - TERMINAL_NOTIFY_GRACE_MS; +} + +function phaseLabel(status: string): string { + if (status === "provisioning") return "Warming up GPU"; + if (status === "running") return "Training run"; + return "—"; +} + function computeDuration( job: Job | null, liveStartedAt: string | null,