From e1a8e5ced2a5a15891b9dc71f7d7bc55a4b391b4 Mon Sep 17 00:00:00 2001 From: Hina Date: Tue, 12 May 2026 00:05:45 +0900 Subject: [PATCH 01/15] feat(studio-app): add jobStatus + notify libs for cold-start UX MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Pure helpers used by the cold-start UX work in follow-up commits. - computeDisplayStatus synthesises a `provisioning` display state from a queued job whose SSE stream is open (or whose createdAt is within a recent window), without changing the wire shape of `Job["status"]`. - notifyJobTerminal always dispatches an in-page toast event and, only when the tab is not focused, also sets a tab title prefix (✓ / ⚠) and fires an OS Notification if permission was granted; calls are deduplicated by (jobId, status) so reconnect-driven re-deliveries do not buzz the user twice. --- packages/studio-app/src/lib/jobStatus.test.ts | 116 ++++++++ packages/studio-app/src/lib/jobStatus.ts | 69 +++++ packages/studio-app/src/lib/notify.test.ts | 250 ++++++++++++++++++ packages/studio-app/src/lib/notify.ts | 140 ++++++++++ 4 files changed, 575 insertions(+) create mode 100644 packages/studio-app/src/lib/jobStatus.test.ts create mode 100644 packages/studio-app/src/lib/jobStatus.ts create mode 100644 packages/studio-app/src/lib/notify.test.ts create mode 100644 packages/studio-app/src/lib/notify.ts diff --git a/packages/studio-app/src/lib/jobStatus.test.ts b/packages/studio-app/src/lib/jobStatus.test.ts new file mode 100644 index 00000000..7bac1772 --- /dev/null +++ b/packages/studio-app/src/lib/jobStatus.test.ts @@ -0,0 +1,116 @@ +import { describe, expect, it } from "vitest"; +import { computeDisplayStatus } from "./jobStatus"; + +describe("computeDisplayStatus", () => { + it("returns the SSE terminal status when present", () => { + expect( + computeDisplayStatus({ + job: { status: "running" }, + liveStatus: "running", + terminalStatus: "completed", + }), + ).toBe("completed"); + expect( + computeDisplayStatus({ + job: { status: "running" }, + 
liveStatus: "running", + terminalStatus: "failed", + }), + ).toBe("failed"); + }); + + it("preempts liveStatus when polled status is already terminal", () => { + expect( + computeDisplayStatus({ + job: { status: "completed" }, + liveStatus: "running", + }), + ).toBe("completed"); + expect( + computeDisplayStatus({ + job: { status: "cancelled" }, + liveStatus: "running", + }), + ).toBe("cancelled"); + }); + + it("flips to running when SSE training.started is seen", () => { + expect( + computeDisplayStatus({ + job: { status: "queued" }, + liveStatus: "running", + eventStreamConnected: true, + }), + ).toBe("running"); + }); + + it("returns provisioning while queued and the event stream is open", () => { + expect( + computeDisplayStatus({ + job: { status: "queued" }, + eventStreamConnected: true, + }), + ).toBe("provisioning"); + }); + + it("returns provisioning while queued and createdAt is recent", () => { + const now = Date.UTC(2026, 4, 11, 12, 0, 0); + const created = new Date(now - 30_000).toISOString(); + expect( + computeDisplayStatus({ + job: { status: "queued", createdAt: created }, + now, + }), + ).toBe("provisioning"); + }); + + it("returns queued when not connected and createdAt is past the recent window", () => { + const now = Date.UTC(2026, 4, 11, 12, 0, 0); + const created = new Date(now - 5 * 60_000).toISOString(); + expect( + computeDisplayStatus({ + job: { status: "queued", createdAt: created }, + now, + }), + ).toBe("queued"); + }); + + it("treats job = null as queued (no createdAt to anchor)", () => { + expect(computeDisplayStatus({ job: null })).toBe("queued"); + }); + + it("treats job = null as provisioning when the event stream is open", () => { + expect( + computeDisplayStatus({ job: null, eventStreamConnected: true }), + ).toBe("provisioning"); + }); + + it("respects a custom recentMs window", () => { + const now = Date.UTC(2026, 4, 11, 12, 0, 0); + const created = new Date(now - 30_000).toISOString(); + expect( + computeDisplayStatus({ + job: 
{ status: "queued", createdAt: created }, + now, + recentMs: 10_000, + }), + ).toBe("queued"); + }); + + it("falls through to polled non-terminal status when no live signals", () => { + expect( + computeDisplayStatus({ + job: { status: "running" }, + }), + ).toBe("running"); + }); + + it("ignores invalid createdAt", () => { + expect( + computeDisplayStatus({ + job: { status: "queued", createdAt: "not-a-date" }, + now: Date.now(), + }), + ).toBe("queued"); + }); +}); diff --git a/packages/studio-app/src/lib/jobStatus.ts b/packages/studio-app/src/lib/jobStatus.ts new file mode 100644 index 00000000..7dcdfbfe --- /dev/null +++ b/packages/studio-app/src/lib/jobStatus.ts @@ -0,0 +1,69 @@ +import type { Job } from "./api"; + +export type DisplayStatus = Job["status"] | "provisioning"; + +export interface ComputeDisplayStatusInput { + job: { status: Job["status"]; createdAt?: string } | null | undefined; + liveStatus?: Job["status"] | null; + terminalStatus?: "completed" | "failed" | null; + eventStreamConnected?: boolean; + now?: number; + recentMs?: number; +} + +const DEFAULT_RECENT_MS = 90_000; + +/** + * Resolve the status to show in the UI. Wire status (`Job["status"]`) + * only carries queued / running / completed / failed / cancelled; this + * synthesises `provisioning` for a queued job whose SSE stream is open + * (or whose createdAt is within `recentMs`) so the UI can tell "GPU + * warming up" apart from "sitting in a backlog". The wire shape is + * unchanged. + * + * Precedence: + * 1. SSE terminal frame observed in this session + * 2. Polled terminal status from /api/jobs (so a stale liveStatus = + * "running" cannot mask a terminal that arrived after the SSE + * connection dropped) + * 3. SSE-derived liveStatus = "running" (i.e. training.started seen) + * 4. Synthetic `provisioning` when the job is queued AND either the + * event stream is open without a training.started yet, or + * createdAt is within `recentMs` + * 5. 
Polled non-terminal status, default queued + */ +export function computeDisplayStatus(input: ComputeDisplayStatusInput): DisplayStatus { + const { + job, + liveStatus, + terminalStatus, + eventStreamConnected, + now, + recentMs = DEFAULT_RECENT_MS, + } = input; + + if (terminalStatus) return terminalStatus; + + if ( + job?.status === "completed" || + job?.status === "failed" || + job?.status === "cancelled" + ) { + return job.status; + } + + if (liveStatus === "running") return "running"; + + const isQueued = !job || job.status === "queued"; + if (isQueued) { + if (eventStreamConnected) return "provisioning"; + if (job?.createdAt && typeof now === "number") { + const created = Date.parse(job.createdAt); + if (Number.isFinite(created) && now - created < recentMs) { + return "provisioning"; + } + } + } + + return job?.status ?? "queued"; +} diff --git a/packages/studio-app/src/lib/notify.test.ts b/packages/studio-app/src/lib/notify.test.ts new file mode 100644 index 00000000..0fc09457 --- /dev/null +++ b/packages/studio-app/src/lib/notify.test.ts @@ -0,0 +1,250 @@ +// @vitest-environment jsdom +import { afterEach, beforeEach, describe, expect, it, vi } from "vitest"; +import { + TOAST_EVENT_NAME, + _resetNotifyForTest, + ensurePermissionOnGesture, + notifyJobTerminal, + type ToastEventDetail, +} from "./notify"; + +interface FakeNotificationInstance { + title: string; + options: NotificationOptions | undefined; + onclick: ((this: Notification, ev: Event) => unknown) | null; + close: ReturnType; +} + +function installFakeNotification( + permission: NotificationPermission, + requestPermissionImpl?: () => Promise, +): { + instances: FakeNotificationInstance[]; + requestPermission: ReturnType; +} { + const instances: FakeNotificationInstance[] = []; + const requestPermission = vi.fn( + requestPermissionImpl ?? 
(() => Promise.resolve(permission)), + ); + class FakeNotification { + static permission: NotificationPermission = permission; + static requestPermission = requestPermission; + title: string; + options: NotificationOptions | undefined; + onclick: FakeNotificationInstance["onclick"] = null; + close = vi.fn(); + constructor(title: string, options?: NotificationOptions) { + this.title = title; + this.options = options; + instances.push(this); + } + } + (globalThis as unknown as { Notification: typeof FakeNotification }).Notification = + FakeNotification; + return { instances, requestPermission }; +} + +function uninstallNotification() { + delete (globalThis as unknown as { Notification?: unknown }).Notification; +} + +function setVisibility(state: "visible" | "hidden") { + Object.defineProperty(document, "visibilityState", { + value: state, + configurable: true, + }); + Object.defineProperty(document, "hasFocus", { + value: () => state === "visible", + configurable: true, + }); +} + +beforeEach(() => { + _resetNotifyForTest(); + document.title = "Arkor"; +}); + +afterEach(() => { + uninstallNotification(); + vi.restoreAllMocks(); +}); + +describe("ensurePermissionOnGesture", () => { + it("requests permission only when current state is default", () => { + const { requestPermission } = installFakeNotification("default"); + ensurePermissionOnGesture(); + expect(requestPermission).toHaveBeenCalledTimes(1); + }); + + it("does nothing when permission is already granted", () => { + const { requestPermission } = installFakeNotification("granted"); + ensurePermissionOnGesture(); + expect(requestPermission).not.toHaveBeenCalled(); + }); + + it("does nothing when permission is denied", () => { + const { requestPermission } = installFakeNotification("denied"); + ensurePermissionOnGesture(); + expect(requestPermission).not.toHaveBeenCalled(); + }); + + it("does nothing when Notification API is absent", () => { + uninstallNotification(); + expect(() => 
ensurePermissionOnGesture()).not.toThrow(); + }); + + it("swallows synchronous throws from requestPermission", () => { + installFakeNotification("default", () => { + throw new Error("blocked"); + }); + expect(() => ensurePermissionOnGesture()).not.toThrow(); + }); +}); + +describe("notifyJobTerminal", () => { + it("fires an OS Notification when granted and tab is hidden", () => { + setVisibility("hidden"); + const { instances } = installFakeNotification("granted"); + const detail: ToastEventDetail[] = []; + window.addEventListener(TOAST_EVENT_NAME, ((ev: Event) => { + detail.push((ev as CustomEvent).detail); + }) as EventListener); + + notifyJobTerminal({ + status: "completed", + jobName: "demo", + jobId: "job-1", + artifacts: 3, + }); + + expect(instances).toHaveLength(1); + expect(instances[0]?.title).toBe("Training run completed"); + expect(instances[0]?.options?.body).toBe("demo (3 artifacts)"); + expect(instances[0]?.options?.tag).toBe("arkor-job-job-1"); + expect(detail).toHaveLength(1); + expect(detail[0]?.message).toBe("demo (3 artifacts)"); + expect(document.title.startsWith("✓ ")).toBe(true); + }); + + it("only emits a toast when the tab is focused (no OS notification, no title prefix)", () => { + setVisibility("visible"); + const { instances } = installFakeNotification("granted"); + const detail: ToastEventDetail[] = []; + window.addEventListener(TOAST_EVENT_NAME, ((ev: Event) => { + detail.push((ev as CustomEvent).detail); + }) as EventListener); + + notifyJobTerminal({ + status: "completed", + jobName: "demo", + jobId: "job-2", + artifacts: 1, + }); + + expect(instances).toHaveLength(0); + expect(detail).toHaveLength(1); + expect(document.title).toBe("Arkor"); + }); + + it("uses the warning prefix and constructs no Notification when permission is denied", () => { + setVisibility("hidden"); + const { instances } = installFakeNotification("denied"); + const detail: ToastEventDetail[] = []; + window.addEventListener(TOAST_EVENT_NAME, ((ev: Event) => { + 
detail.push((ev as CustomEvent).detail); + }) as EventListener); + + notifyJobTerminal({ + status: "failed", + jobName: "demo", + jobId: "job-3", + error: "boom", + }); + + expect(instances).toHaveLength(0); + expect(detail).toHaveLength(1); + expect(detail[0]?.message).toBe("demo failed: boom"); + expect(document.title.startsWith("⚠ ")).toBe(true); + }); + + it("deduplicates repeat calls for the same (jobId, status)", () => { + setVisibility("hidden"); + const { instances } = installFakeNotification("granted"); + const detail: ToastEventDetail[] = []; + window.addEventListener(TOAST_EVENT_NAME, ((ev: Event) => { + detail.push((ev as CustomEvent).detail); + }) as EventListener); + + notifyJobTerminal({ + status: "completed", + jobName: "demo", + jobId: "job-4", + artifacts: 2, + }); + notifyJobTerminal({ + status: "completed", + jobName: "demo", + jobId: "job-4", + artifacts: 2, + }); + + expect(instances).toHaveLength(1); + expect(detail).toHaveLength(1); + }); + + it("allows the same job to notify on a different terminal status (failed after completed)", () => { + setVisibility("hidden"); + installFakeNotification("granted"); + + notifyJobTerminal({ + status: "completed", + jobName: "demo", + jobId: "job-5", + }); + notifyJobTerminal({ + status: "failed", + jobName: "demo", + jobId: "job-5", + error: "x", + }); + + expect(document.title.startsWith("⚠ ")).toBe(true); + }); + + it("does not stack the same title prefix repeatedly", () => { + setVisibility("hidden"); + installFakeNotification("granted"); + + notifyJobTerminal({ + status: "completed", + jobName: "demo", + jobId: "job-6", + }); + notifyJobTerminal({ + status: "completed", + jobName: "demo", + jobId: "job-7", + }); + + const matches = document.title.match(/✓ /g); + expect(matches?.length).toBe(1); + }); + + it("survives without the Notification API at all (toast still fires)", () => { + setVisibility("hidden"); + uninstallNotification(); + const detail: ToastEventDetail[] = []; + 
window.addEventListener(TOAST_EVENT_NAME, ((ev: Event) => { + detail.push((ev as CustomEvent).detail); + }) as EventListener); + + expect(() => + notifyJobTerminal({ + status: "completed", + jobName: "demo", + jobId: "job-8", + }), + ).not.toThrow(); + expect(detail).toHaveLength(1); + }); +}); diff --git a/packages/studio-app/src/lib/notify.ts b/packages/studio-app/src/lib/notify.ts new file mode 100644 index 00000000..9ffb54d2 --- /dev/null +++ b/packages/studio-app/src/lib/notify.ts @@ -0,0 +1,140 @@ +export interface NotifyJobTerminalInput { + status: "completed" | "failed"; + jobName: string; + jobId: string; + artifacts?: number; + error?: string; +} + +export interface ToastEventDetail { + status: "completed" | "failed"; + jobName: string; + jobId: string; + message: string; +} + +const TOAST_EVENT = "arkor:toast"; + +const notifiedTerminals = new Set(); + +type NotificationCtor = typeof globalThis.Notification | undefined; + +function getNotificationCtor(): NotificationCtor { + if (typeof globalThis === "undefined") return undefined; + return (globalThis as { Notification?: NotificationCtor }).Notification; +} + +/** + * Ask for OS notification permission, but only when the current state is + * `default` so we don't re-prompt users who already chose. Safe to call + * from a user-gesture handler (the click on Run training). No-ops in + * environments without the Notification API. + */ +export function ensurePermissionOnGesture(): void { + const Ctor = getNotificationCtor(); + if (!Ctor) return; + if (Ctor.permission !== "default") return; + try { + const result = Ctor.requestPermission(); + if (result && typeof (result as Promise).then === "function") { + (result as Promise).catch(() => undefined); + } + } catch { + // Synchronous throws happen on a few older browsers; fall through. + } +} + +function summaryFor(input: NotifyJobTerminalInput): string { + if (input.status === "completed") { + const n = input.artifacts ?? 
0; + return `${input.jobName} (${n} artifact${n === 1 ? "" : "s"})`; + } + return input.error + ? `${input.jobName} failed: ${input.error}` + : `${input.jobName} failed`; +} + +function ensureTitlePrefix(prefix: string): void { + if (typeof document === "undefined") return; + if (!document.title.startsWith(prefix)) { + document.title = `${prefix}${document.title}`; + } +} + +function isTabFocused(): boolean { + if (typeof document === "undefined") return false; + if (document.visibilityState === "visible") return true; + return typeof document.hasFocus === "function" && document.hasFocus(); +} + +/** + * Emit a terminal-event notification through three layers: + * + * 1. Toast: always, via a `CustomEvent("arkor:toast")` that the + * ToastProvider listens for. Works regardless of focus / permission. + * 2. Title prefix (`✓` / `⚠`): only when the tab is not focused. Cleared + * on the next route change by the title controller in App. + * 3. OS Notification: only when permission was granted AND the tab is + * not focused. Tagged with the jobId so a duplicate SSE frame won't + * buzz the user twice. + * + * Deduplicated by `${jobId}:${status}` so reconnect-driven re-deliveries + * of `training.completed` (or polled status arriving after the SSE + * terminal frame already triggered us) only notify once. + */ +export function notifyJobTerminal(input: NotifyJobTerminalInput): void { + const dedupKey = `${input.jobId}:${input.status}`; + if (notifiedTerminals.has(dedupKey)) return; + notifiedTerminals.add(dedupKey); + + const message = summaryFor(input); + + if (typeof window !== "undefined") { + window.dispatchEvent( + new CustomEvent(TOAST_EVENT, { + detail: { + status: input.status, + jobName: input.jobName, + jobId: input.jobId, + message, + }, + }), + ); + } + + if (isTabFocused()) return; + + ensureTitlePrefix(input.status === "completed" ? 
"✓ " : "⚠ "); + + const Ctor = getNotificationCtor(); + if (!Ctor || Ctor.permission !== "granted") return; + try { + // Safari and a few WebViews still throw synchronously here when + // permission was granted in a prior session but the construction + // happens outside an active gesture (which is exactly our case: + // the terminal SSE frame can arrive minutes after the click). + // Toast + title prefix above already covered the user. + const n = new Ctor( + input.status === "completed" + ? "Training run completed" + : "Training run failed", + { + body: message, + tag: `arkor-job-${input.jobId}`, + }, + ); + n.onclick = () => { + window.focus(); + window.location.hash = `#/jobs/${encodeURIComponent(input.jobId)}`; + n.close(); + }; + } catch { + // intentional: see comment above + } +} + +export const TOAST_EVENT_NAME = TOAST_EVENT; + +export function _resetNotifyForTest(): void { + notifiedTerminals.clear(); +} From 572016da398669a3a3f23c9faf45a48adcaf8e34 Mon Sep 17 00:00:00 2001 From: Hina Date: Tue, 12 May 2026 00:05:57 +0900 Subject: [PATCH 02/15] feat(studio-app): add ToastProvider for job completion notifications Mounts a top-right toast stack at the App root that listens for the `arkor:toast` CustomEvent dispatched by notifyJobTerminal. Self- contained so the SPA does not take on a third-party toast dep under the workspace's minimumReleaseAge / no-downgrade pnpm policy. 
--- packages/studio-app/src/App.tsx | 20 +-- .../studio-app/src/components/ui/Toast.tsx | 136 ++++++++++++++++++ 2 files changed, 148 insertions(+), 8 deletions(-) create mode 100644 packages/studio-app/src/components/ui/Toast.tsx diff --git a/packages/studio-app/src/App.tsx b/packages/studio-app/src/App.tsx index 5ce48720..db470280 100644 --- a/packages/studio-app/src/App.tsx +++ b/packages/studio-app/src/App.tsx @@ -4,6 +4,7 @@ import { Overview } from "./pages/Overview"; import { JobsList } from "./pages/JobsList"; import { JobDetail } from "./pages/JobDetail"; import { Playground } from "./pages/Playground"; +import { ToastProvider } from "./components/ui/Toast"; import { fetchCredentials, type Credentials } from "./lib/api"; import { useHashRoute } from "./route"; @@ -29,13 +30,16 @@ export function App() { }, []); return ( - - {route.kind === "home" && } - {route.kind === "jobs" && } - {route.kind === "job" && } - {route.kind === "playground" && ( - - )} - + <> + + {route.kind === "home" && } + {route.kind === "jobs" && } + {route.kind === "job" && } + {route.kind === "playground" && ( + + )} + + + ); } diff --git a/packages/studio-app/src/components/ui/Toast.tsx b/packages/studio-app/src/components/ui/Toast.tsx new file mode 100644 index 00000000..2240d7d7 --- /dev/null +++ b/packages/studio-app/src/components/ui/Toast.tsx @@ -0,0 +1,136 @@ +import { useEffect, useRef, useState } from "react"; +import { TOAST_EVENT_NAME, type ToastEventDetail } from "../../lib/notify"; +import { cn } from "./cn"; + +interface ToastItem { + id: number; + status: "completed" | "failed"; + jobName: string; + jobId: string; + message: string; +} + +const AUTO_DISMISS_MS = 6000; +const MAX_VISIBLE = 3; + +export function ToastProvider() { + const [toasts, setToasts] = useState([]); + const counterRef = useRef(0); + + useEffect(() => { + const onToast = (ev: Event) => { + const detail = (ev as CustomEvent).detail; + if (!detail) return; + const id = counterRef.current++; + 
setToasts((prev) => { + const next = [ + ...prev, + { + id, + status: detail.status, + jobName: detail.jobName, + jobId: detail.jobId, + message: detail.message, + }, + ]; + return next.length > MAX_VISIBLE ? next.slice(-MAX_VISIBLE) : next; + }); + window.setTimeout(() => { + setToasts((prev) => prev.filter((t) => t.id !== id)); + }, AUTO_DISMISS_MS); + }; + window.addEventListener(TOAST_EVENT_NAME, onToast); + return () => window.removeEventListener(TOAST_EVENT_NAME, onToast); + }, []); + + if (toasts.length === 0) return null; + + return ( +
+ {toasts.map((t) => ( + + setToasts((prev) => prev.filter((x) => x.id !== t.id)) + } + /> + ))} +
+ ); +} + +function ToastCard({ + toast, + onDismiss, +}: { + toast: ToastItem; + onDismiss: () => void; +}) { + const tone = + toast.status === "completed" + ? "border-emerald-200 bg-white dark:border-emerald-400/30 dark:bg-zinc-950" + : "border-red-200 bg-white dark:border-red-400/30 dark:bg-zinc-950"; + const accent = + toast.status === "completed" + ? "text-emerald-700 dark:text-emerald-300" + : "text-red-700 dark:text-red-300"; + + function openJob() { + window.location.hash = `#/jobs/${encodeURIComponent(toast.jobId)}`; + onDismiss(); + } + + return ( +
+
+
+
+ {toast.status === "completed" ? "Training run completed" : "Training run failed"} +
+
+ {toast.message} +
+ +
+ +
+
+ ); +} From a87a2e363498167872a91d5b4ffd40e5eecb43ea Mon Sep 17 00:00:00 2001 From: Hina Date: Tue, 12 May 2026 00:06:04 +0900 Subject: [PATCH 03/15] feat(studio-app): surface provisioning state in shared UI elements - StatusBadge: add the amber pulsing `provisioning` variant labelled "Warming up GPU". - EventsStream: take a `provisioning` prop and switch the empty-state copy when the SSE stream is open but no event has arrived yet. - JobsTable: route Job.status through computeDisplayStatus so a recently-created queued job in the list also renders as warming up, matching the JobDetail page. --- .../studio-app/src/components/jobs/EventsStream.tsx | 12 ++++++++++-- .../studio-app/src/components/jobs/JobsTable.tsx | 9 ++++++++- .../studio-app/src/components/ui/StatusBadge.tsx | 13 +++++++++---- 3 files changed, 27 insertions(+), 7 deletions(-) diff --git a/packages/studio-app/src/components/jobs/EventsStream.tsx b/packages/studio-app/src/components/jobs/EventsStream.tsx index 01271363..220f3a0d 100644 --- a/packages/studio-app/src/components/jobs/EventsStream.tsx +++ b/packages/studio-app/src/components/jobs/EventsStream.tsx @@ -24,7 +24,13 @@ function formatTime(ts: number): string { return `${hh}:${mm}:${ss}`; } -export function EventsStream({ events }: { events: EventEntry[] }) { +export function EventsStream({ + events, + provisioning = false, +}: { + events: EventEntry[]; + provisioning?: boolean; +}) { const wrapRef = useRef(null); const stickRef = useRef(true); @@ -51,7 +57,9 @@ export function EventsStream({ events }: { events: EventEntry[] }) { > {events.length === 0 ? (
- Listening for events… + {provisioning + ? "Warming up the GPU. The first event lands as soon as the trainer begins." + : "Listening for events…"}
) : (
    diff --git a/packages/studio-app/src/components/jobs/JobsTable.tsx b/packages/studio-app/src/components/jobs/JobsTable.tsx index ecae21f0..5e128c77 100644 --- a/packages/studio-app/src/components/jobs/JobsTable.tsx +++ b/packages/studio-app/src/components/jobs/JobsTable.tsx @@ -1,5 +1,6 @@ import type { Job } from "../../lib/api"; import { formatDuration, truncateMiddle } from "../../lib/format"; +import { computeDisplayStatus } from "../../lib/jobStatus"; import { RelativeTime } from "../ui/RelativeTime"; import { StatusBadge } from "../ui/StatusBadge"; @@ -48,7 +49,13 @@ export function JobsTable({ className="group relative border-t border-zinc-100 transition-colors hover:bg-zinc-50 focus-within:bg-zinc-50 dark:border-zinc-900 dark:hover:bg-zinc-900/60 dark:focus-within:bg-zinc-900/60" > - + {/* diff --git a/packages/studio-app/src/components/ui/StatusBadge.tsx b/packages/studio-app/src/components/ui/StatusBadge.tsx index d3b6c0ec..6302c00f 100644 --- a/packages/studio-app/src/components/ui/StatusBadge.tsx +++ b/packages/studio-app/src/components/ui/StatusBadge.tsx @@ -9,16 +9,21 @@ interface StatusBadgeProps { className?: string; } -const VARIANT: Record< - Status, - { label: string; pill: string; dot: string; pulse: boolean } -> = { +type Variant = { label: string; pill: string; dot: string; pulse: boolean }; + +const VARIANT: Record = { queued: { label: "Queued", pill: "border-zinc-200 bg-zinc-50 text-zinc-600 dark:border-zinc-800 dark:bg-zinc-900 dark:text-zinc-400", dot: "bg-zinc-400 dark:bg-zinc-500", pulse: false, }, + provisioning: { + label: "Warming up GPU", + pill: "border-amber-200 bg-amber-50 text-amber-700 dark:border-amber-400/30 dark:bg-amber-400/10 dark:text-amber-300", + dot: "bg-amber-500", + pulse: true, + }, running: { label: "Running", pill: "border-teal-200 bg-teal-50 text-teal-700 dark:border-teal-400/30 dark:bg-teal-400/10 dark:text-teal-300", From 6ce0bbff1dc55d6d85d784c8058f963c979627b6 Mon Sep 17 00:00:00 2001 From: Hina Date: Tue, 
12 May 2026 00:06:20 +0900 Subject: [PATCH 04/15] feat(studio-app): show GPU warm-up phase and notify on terminal events JobDetail flips into a `provisioning` view while the SSE stream is open but no `training.started` has arrived: an amber banner explains the wait, the loss chart shows a "Waiting for GPU" placeholder, the Phase / GPU warm-up rows in the meta sidebar tick a live timer, and EventsStream's empty-state copy switches. On training.completed / training.failed the page calls notifyJobTerminal so the user gets a toast, a tab-title indicator, and (when permission was granted) an OS notification even with the tab in the background. The job name is held in a ref so the SSE listener closures can surface it even if the polled `job` resolves after the listeners were registered. --- packages/studio-app/src/pages/JobDetail.tsx | 146 ++++++++++++++------ 1 file changed, 103 insertions(+), 43 deletions(-) diff --git a/packages/studio-app/src/pages/JobDetail.tsx b/packages/studio-app/src/pages/JobDetail.tsx index cd46ce88..84a4f8ba 100644 --- a/packages/studio-app/src/pages/JobDetail.tsx +++ b/packages/studio-app/src/pages/JobDetail.tsx @@ -1,4 +1,4 @@ -import { useEffect, useState } from "react"; +import { useEffect, useRef, useState } from "react"; import { fetchJobs, openJobEvents, type Job } from "../lib/api"; import { ArrowLeft, Sparkles } from "../components/icons"; import { @@ -21,6 +21,8 @@ import { } from "../components/ui/Card"; import { StatusBadge } from "../components/ui/StatusBadge"; import { formatDuration, truncateMiddle } from "../lib/format"; +import { computeDisplayStatus } from "../lib/jobStatus"; +import { notifyJobTerminal } from "../lib/notify"; const MAX_LOSS_POINTS = 2000; @@ -42,6 +44,14 @@ export function JobDetail({ jobId }: { jobId: string }) { // still drive the visible status. 
const [liveStatus, setLiveStatus] = useState(null); const [liveStartedAt, setLiveStartedAt] = useState(null); + // True once the SSE stream opens or delivers any frame. Lets us tell + // "queued, GPU is warming up" apart from "queued, sitting in a backlog". + const [eventStreamConnected, setEventStreamConnected] = useState(false); + // Latest job.name held in a ref so the SSE listener closures (whose + // useEffect depends only on jobId) can surface a meaningful name in + // the terminal-event notification even when the polled `job` resolves + // after the closures were registered. + const jobNameRef = useRef(null); useEffect(() => { setJob(null); @@ -70,6 +80,10 @@ export function JobDetail({ jobId }: { jobId: string }) { }; }, [jobId]); + useEffect(() => { + jobNameRef.current = job?.name ?? null; + }, [job?.name]); + useEffect(() => { // Clear per-job state when navigating between jobs so events, loss // points, terminal status, advanced toggle, and event-id counter @@ -83,6 +97,7 @@ export function JobDetail({ jobId }: { jobId: string }) { setEventErr(null); setLiveStatus(null); setLiveStartedAt(null); + setEventStreamConnected(false); let counter = 0; @@ -143,9 +158,11 @@ export function JobDetail({ jobId }: { jobId: string }) { // Any received frame means the EventSource is alive again — drop // any stale "stream interrupted" banner from the prior disconnect. setEventErr(null); + setEventStreamConnected(true); } const es = openJobEvents(jobId); + es.onopen = () => setEventStreamConnected(true); es.addEventListener("training.started", (ev: MessageEvent) => { const parsed = safeParse(ev.data); pushEvent("training.started", ev.data, parsed); @@ -224,73 +241,90 @@ export function JobDetail({ jobId }: { jobId: string }) { // SSE payload carries the trainer-side completion timestamp; use // it so duration / "Completed" stay correct without depending on // the next /api/jobs poll. 
+ let artifacts = 0; + let completedAt: string | undefined; if (parsed && typeof parsed === "object") { const d = parsed as { artifacts?: unknown[]; timestamp?: string }; - setTerminal({ - status: "completed", - artifacts: Array.isArray(d.artifacts) ? d.artifacts.length : 0, - completedAt: d.timestamp, - }); - } else { - setTerminal({ status: "completed", artifacts: 0 }); + artifacts = Array.isArray(d.artifacts) ? d.artifacts.length : 0; + completedAt = d.timestamp; } + setTerminal({ status: "completed", artifacts, completedAt }); + notifyJobTerminal({ + status: "completed", + jobName: jobNameRef.current ?? jobId, + jobId, + artifacts, + }); }); es.addEventListener("training.failed", (ev: MessageEvent) => { const parsed = safeParse(ev.data); pushEvent("training.failed", ev.data, parsed); + let error: string | undefined; + let completedAt: string | undefined; if (parsed && typeof parsed === "object") { const d = parsed as { error?: string; timestamp?: string }; - setTerminal({ - status: "failed", - error: d.error, - artifacts: 0, - completedAt: d.timestamp, - }); - } else { - setTerminal({ status: "failed", artifacts: 0 }); + error = d.error; + completedAt = d.timestamp; } + setTerminal({ status: "failed", error, artifacts: 0, completedAt }); + notifyJobTerminal({ + status: "failed", + jobName: jobNameRef.current ?? jobId, + jobId, + error, + }); }); es.addEventListener("end", () => es.close()); es.onerror = () => setEventErr("Event stream interrupted."); return () => es.close(); }, [jobId]); - // Status precedence: - // 1. SSE terminal frame (training.completed / training.failed) we - // observed in this session — most authoritative. - // 2. Polled terminal status from /api/jobs — also authoritative, - // and crucially it preempts a stale `liveStatus = "running"` - // that can linger if the SSE stream dropped before the - // terminal frame arrived. - // 3. 
SSE-derived `liveStatus` for the running phase, which lets - // us flip the UI to "running" before /api/jobs catches up. - // 4. The polled non-terminal status, if anything. - // 5. Default "queued". - const polledIsTerminal = - job?.status === "completed" || - job?.status === "failed" || - job?.status === "cancelled"; - const status: Job["status"] = - terminal?.status ?? - (polledIsTerminal ? job!.status : (liveStatus ?? job?.status ?? "queued")); + // Status precedence is centralised in `computeDisplayStatus` so the + // same rule drives the badge here, the badge in the page header, the + // sidebar meta row, and the JobsTable list-level heuristic. + const status = computeDisplayStatus({ + job, + liveStatus, + terminalStatus: terminal?.status ?? null, + eventStreamConnected, + now, + }); - // Live duration ticker while the job is running. - const isRunning = status === "running" && !terminal; + // Tick `now` while either the warm-up timer or the run timer is moving. + const isTicking = + !terminal && (status === "provisioning" || status === "running"); useEffect(() => { - if (!isRunning) return; + if (!isTicking) return; const t = setInterval(() => setNow(Date.now()), 1000); return () => clearInterval(t); - }, [isRunning]); + }, [isTicking]); const duration = computeDuration(job, liveStartedAt, terminal, now); + const warmupMs = + status === "provisioning" && job?.createdAt + ? (() => { + const created = Date.parse(job.createdAt); + return Number.isNaN(created) ? null : Math.max(0, now - created); + })() + : null; const meta: JobMetaItem[] = [ { label: "Status", value: }, { - label: "Duration", - value: duration === null ? "—" : formatDuration(duration), - mono: true, + label: "Phase", + value: phaseLabel(status), }, + status === "provisioning" + ? { + label: "GPU warm-up", + value: warmupMs === null ? "—" : formatDuration(warmupMs), + mono: true, + } + : { + label: "Duration", + value: duration === null ? 
"—" : formatDuration(duration), + mono: true, + }, { label: "Created", value: job?.createdAt ? formatAbsoluteTime(job.createdAt) : "—", @@ -392,6 +426,16 @@ export function JobDetail({ jobId }: { jobId: string }) { {eventErr} ) : null} + {status === "provisioning" ? ( +
    +
    Warming up the GPU for this job.
    +
    + Training GPUs are not kept warm between jobs. Allocation is fast + when one is still warm from a recent run; otherwise the worker + starts from cold, which can take several minutes. +
    +
    + ) : null}
    @@ -412,7 +456,14 @@ export function JobDetail({ jobId }: { jobId: string }) {
    - + {status === "provisioning" && points.length === 0 ? ( +
    + Waiting for GPU. The loss curve starts the moment the + trainer reports its first step. +
    + ) : ( + + )}
    @@ -425,7 +476,10 @@ export function JobDetail({ jobId }: { jobId: string }) { - +
    @@ -477,6 +531,12 @@ function AdvancedToggle({ ); } +function phaseLabel(status: string): string { + if (status === "provisioning") return "Warming up GPU"; + if (status === "running") return "Training run"; + return "—"; +} + function computeDuration( job: Job | null, liveStartedAt: string | null, From 312de32888faa51cdc3daeaf6f9c9c309f321006 Mon Sep 17 00:00:00 2001 From: Hina Date: Tue, 12 May 2026 00:06:59 +0900 Subject: [PATCH 05/15] feat(studio-app): request notification permission on Run training click ensurePermissionOnGesture runs inside the click handler so browsers accept the permission prompt (page-load requests are penalised in Chrome). The helper no-ops when permission is already granted or denied, so re-clicks do not re-prompt. --- packages/studio-app/src/components/RunTraining.tsx | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/packages/studio-app/src/components/RunTraining.tsx b/packages/studio-app/src/components/RunTraining.tsx index eee5094c..b4f9ba05 100644 --- a/packages/studio-app/src/components/RunTraining.tsx +++ b/packages/studio-app/src/components/RunTraining.tsx @@ -4,6 +4,7 @@ import { streamTraining, type ManifestResult, } from "../lib/api"; +import { ensurePermissionOnGesture } from "../lib/notify"; import { Play, StopCircle } from "./icons"; import { Button } from "./ui/Button"; @@ -65,6 +66,10 @@ export function RunTraining() { }, [log]); async function run() { + // Tied to a user gesture so browsers (Chrome / Firefox) accept the + // permission prompt. The helper no-ops if permission is already + // granted or denied. 
+ ensurePermissionOnGesture(); setRunning(true); setLog(""); const ac = new AbortController(); From 36016481e7d66e6be1915a991c7d286416235d01 Mon Sep 17 00:00:00 2001 From: Hina Date: Tue, 12 May 2026 00:07:01 +0900 Subject: [PATCH 06/15] docs: cold-start training UX Introduce Mintlify callout components (first use in the repo) to explain that Arkor splits GPUs into a warm inference pool and an on-demand training pool. The "7 to 12 minutes" figure now refers explicitly to the training run only; the first job in a quiet period also waits for a GPU to warm up before that window starts. studio/jobs.mdx adds a "What 'first run' looks like" section documenting the provisioning -> running transition; the cookbook notifications page is reframed against the cold-start wait and points to the new in-Studio browser notification. --- docs/cookbook/notifications.mdx | 6 +++++- docs/introduction.mdx | 2 +- docs/quickstart.mdx | 23 ++++++++++++++++------- docs/studio/jobs.mdx | 13 ++++++++++++- 4 files changed, 34 insertions(+), 10 deletions(-) diff --git a/docs/cookbook/notifications.mdx b/docs/cookbook/notifications.mdx index 1eacd6e7..9e467e0e 100644 --- a/docs/cookbook/notifications.mdx +++ b/docs/cookbook/notifications.mdx @@ -5,10 +5,14 @@ description: "Post to a webhook when a training run finishes or fails, from insi # Slack / Discord notifications -Training runs take long enough that nobody actually watches Studio the whole time. The terminal `onCompleted` and `onFailed` callbacks are perfect places to fan a status message out to wherever your team already lives. +A training job is two waits back-to-back: GPU allocation at the start (variable, can be several minutes when a worker starts from cold; see the [Quickstart](/quickstart) for why) and then the training run itself (about 7 to 12 minutes for the templates). Nobody actually watches a browser tab for that long.
The terminal `onCompleted` and `onFailed` callbacks are the natural spot to fan a status message out to wherever your team already lives so you only look back when the run is genuinely done. This recipe uses Slack incoming webhooks; Discord, Microsoft Teams, and arbitrary HTTP endpoints work the same way. Anything you can `fetch`, you can notify. + +If you only want a desktop ping for your own runs, Studio already shows a browser notification, an in-page toast, and a tab-title indicator on `training.completed` / `training.failed` once you click **Run training** (and accept the permission prompt). This recipe is for fanning the same signal into a shared channel. + + ## The pattern ```ts diff --git a/docs/introduction.mdx b/docs/introduction.mdx index 600fffc9..9c005980 100644 --- a/docs/introduction.mdx +++ b/docs/introduction.mdx @@ -77,6 +77,6 @@ bun dev -The first run finishes in roughly 7 to 12 minutes depending on the template. +The training run itself takes about 7 to 12 minutes depending on the template. Arkor runs training on dedicated training GPUs that are not kept warm between jobs (inference uses a separate, always-warm pool, which is why the Playground stays snappy). When you start a training job, Arkor allocates a training GPU on demand: if one is still warm from a recent job it is ready in seconds, otherwise the worker has to start from cold, which can take several minutes. The [Quickstart](/quickstart) walks through what this looks like in Studio. Continue with the [Quickstart](/quickstart) for a step-by-step walkthrough. diff --git a/docs/quickstart.mdx b/docs/quickstart.mdx index af2ac19e..2f1e397c 100644 --- a/docs/quickstart.mdx +++ b/docs/quickstart.mdx @@ -3,7 +3,16 @@ title: "Quickstart" description: "Scaffold an Arkor project, fine-tune a small open-weight LLM, and chat with it from a local Studio." --- -In a few minutes you will go from zero to a fine-tuned model you can chat with in a local Playground. 
The training itself takes 7 to 12 minutes; setup time depends on your connection and what is already installed. +In a few minutes you will go from zero to a fine-tuned model you can chat with in a local Playground. The training run itself takes 7 to 12 minutes. The first job you submit also waits for a GPU to be allocated before the training run starts; that wait can be a few minutes (occasionally longer) and only happens for the first run after a quiet period. + + +**How Arkor uses GPUs.** Arkor splits GPUs into two pools: + +- **Inference GPUs** run the model after training (the Playground and any production inference). These stay on an active worker, so they are always warm and respond immediately. +- **Training GPUs** run fine-tuning jobs. To keep costs predictable, these are not kept warm. When you click **Run training**, Arkor allocates a GPU on demand. If a worker is still warm from a recent job, the allocation finishes in seconds. Otherwise it starts from cold (the so-called cold start: provisioning, pulling the container image, loading the base model), which can take several minutes. + +After GPU allocation, the training run itself takes about 7 to 12 minutes for the templates. The Jobs view in Studio shows both phases. + ## Prerequisites @@ -41,11 +50,11 @@ cd my-arkor-app The scaffolder asks which template you want. 
Pick the one closest to what you eventually want to build: -| Template | Task | Output shape | Estimated training | -| ----------- | ----------------------- | ----------------------------------------------------- | ------------------ | -| `triage` | Support triage | `{ category, urgency, summary, nextAction }` | ~7 min | -| `translate` | 9-language translation | `{ translation, detectedLanguage }` | ~7 min | -| `redaction` | PII redaction | `{ redactedText, redactedCount, tags }` | ~12 min | +| Template | Task | Output shape | Training run (excludes GPU allocation) | +| ----------- | ----------------------- | ----------------------------------------------------- | -------------------------------------- | +| `triage` | Support triage | `{ category, urgency, summary, nextAction }` | ~7 min | +| `translate` | 9-language translation | `{ translation, detectedLanguage }` | ~7 min | +| `redaction` | PII redaction | `{ redactedText, redactedCount, tags }` | ~12 min | Each template pairs the same small open-weight base (`gemma-4-E4B-it`) with a curated public dataset on HuggingFace. The training is real and finishes in minutes, so you get to see the whole loop end to end. @@ -119,7 +128,7 @@ In the browser, click **Run training**. Studio submits the job to the managed ba Once a run is in flight, three views matter: - **Jobs.** A list of training runs. Click into one to see live status. -- **Loss chart and event log.** As the run streams from the managed GPU, the loss curve updates and the log tail shows training events. The first run takes 7 to 12 minutes depending on the template. +- **Loss chart and event log.** As the run streams from the managed GPU, the loss curve updates and the log tail shows training events. The first job's first phase is the GPU allocation; during that time the status reads `Warming up GPU`, the loss chart shows a `Waiting for GPU` placeholder, and the events list is empty. 
Once the `training.started` event arrives, the status flips to `Running` and the curve begins to update. The training run itself takes about 7 to 12 minutes depending on the template. - **Playground.** After a job completes, pick the final adapter from the selector and chat with it. Use the mode toggle to switch between the base model and the adapter. To run inference on intermediate checkpoints while a run is still in flight, use `onCheckpoint` callbacks instead of Studio. If you edit `src/arkor/` between runs, refresh the Run training page (or run `arkor build`) before the next click so the new code is what runs. diff --git a/docs/studio/jobs.mdx b/docs/studio/jobs.mdx index 4937531b..1200b38a 100644 --- a/docs/studio/jobs.mdx +++ b/docs/studio/jobs.mdx @@ -21,13 +21,24 @@ When you click it, Studio sends `POST /api/train`. The backend spawns `arkor sta There is no input form for picking the trainer or passing flags: Studio always runs the trainer registered through `createArkor`, and `arkor start` reuses `.arkor/build/index.mjs` if it already exists. Edits to `src/arkor/` are not picked up automatically across multiple clicks on the same page; reload the Run training page (or run `arkor build` from a terminal) between edits and the next click. See [CLI § build / start](/cli/build-and-start) for the precise rebuild rules. +### What "first run" looks like + +Click **Run training**. Two phases follow: + +1. **GPU allocation.** The job appears as `Warming up GPU`. The loss chart shows a `Waiting for GPU` placeholder, the events list is empty, and the **Phase** row in the metadata sidebar reads `Warming up GPU` while the **GPU warm-up** timer ticks. This phase varies in length: typically under a minute when a worker is still warm from a recent job, occasionally several minutes when one has to start from cold. See the [Quickstart](/quickstart) for why this happens. +2. 
**The training run.** When `training.started` arrives, the status flips to `Running`, the loss chart starts updating from `training.log` frames, and the **Phase** row reads `Training run`. This is the 7 to 12 minute window in the template table on the Quickstart. + +Click **Run training** with the browser tab focused. Studio asks the browser for notification permission and surfaces a desktop notification plus a tab-title indicator when the job completes or fails, so you do not have to watch the tab. + + ## Jobs list The Jobs list polls `GET /api/jobs` once at mount, then every 5 seconds. There is no manual refresh button; the interval is fixed. | Column | Source | | --- | --- | -| Status | `Job.status` (`queued` / `running` / `completed` / `failed` / `cancelled`). The cell carries a CSS class for colouring. | +| Status | `Job.status` (`queued` / `running` / `completed` / `failed` / `cancelled`) with one Studio-derived display state: a `queued` job whose stream has connected (or whose `createdAt` is within the last 90 s) renders as `Warming up GPU`. The cell carries a CSS class for colouring. | | Name | `Job.name`. Links to `#/jobs/`. | | Created | `new Date(Job.createdAt).toLocaleString()`. | | ID | `Job.id`, monospaced. | From 2f1bb71c0c6a6533c3316b1fcbf8375cec20daa1 Mon Sep 17 00:00:00 2001 From: Hina Date: Tue, 12 May 2026 00:07:10 +0900 Subject: [PATCH 07/15] docs(ja): cold-start training UX Mirror the English explanation of split GPU pools (warm inference, on-demand training) and the provisioning -> running transition.
--- docs/ja/cookbook/notifications.mdx | 6 +++++- docs/ja/introduction.mdx | 2 +- docs/ja/quickstart.mdx | 23 ++++++++++++++++------- docs/ja/studio/jobs.mdx | 13 ++++++++++++- 4 files changed, 34 insertions(+), 10 deletions(-) diff --git a/docs/ja/cookbook/notifications.mdx b/docs/ja/cookbook/notifications.mdx index 89261eeb..9f7833e3 100644 --- a/docs/ja/cookbook/notifications.mdx +++ b/docs/ja/cookbook/notifications.mdx @@ -5,10 +5,14 @@ description: "学習が完了 or 失敗したときに、trainer.ts の中から # Slack / Discord 通知 -学習は長いので、ずっと Studio を見続ける人はいません。終端の `onCompleted` と `onFailed` コールバックは、チームが普段いる場所にステータスメッセージを流すのにうってつけです。 +学習ジョブは実質 2 つの待ちが続きます。最初に GPU の確保 (warm な GPU が残っていない場合は数分かかることもある cold start。理由は [Quickstart](/ja/quickstart) 参照)、次に学習本体 (テンプレートで約 7〜12 分)。ブラウザータブをそんなに長くは見ていられません。終端の `onCompleted` と `onFailed` コールバックは、チームが普段いる場所にステータスメッセージを流すのに自然な場所です。本当に完了したときだけ振り返ればよくなります。 このレシピは Slack incoming Webhook を使います。Discord、Microsoft Teams、任意の HTTP エンドポイントも同じやり方で動きます。`fetch` できるものなら何でも通知先になります。 + +自分のためのデスクトップ通知だけで十分なら、Studio が `training.completed` / `training.failed` 時にブラウザー通知・ページ内 toast・タブタイトルのインジケーターを出します (**Run training** をクリックして通知許可を受け入れた後で有効)。このレシピはそれをチームの共有チャンネルに飛ばすためのものです。 + + ## パターン ```ts diff --git a/docs/ja/introduction.mdx b/docs/ja/introduction.mdx index f3049d54..8e970499 100644 --- a/docs/ja/introduction.mdx +++ b/docs/ja/introduction.mdx @@ -77,6 +77,6 @@ bun dev -最初の実行は、テンプレートにもよりますがおおよそ 7〜12 分で終わります。 +学習本体はテンプレートにもよりますが約 7〜12 分です。Arkor は推論 (Inference) と学習 (Training) で別の GPU プールを使い分けており、推論側は常に warm な active worker 上で動くので Playground は即応します。一方の学習側はコストを安定させるため warm 維持しません。学習ジョブを開始すると、Arkor はそのときに学習用 GPU を確保しに行きます。直前のジョブで使われた直後など GPU がまだ warm な状態であれば確保は数秒で終わりますが、warm な GPU がなければ cold な状態から (プロビジョニング・コンテナイメージ取得・ベースモデルのロード) 立ち上げる必要があり、これに数分かかります。Studio 上で何が見えるかは [Quickstart](/ja/quickstart) で確認できます。 ステップごとの手順は [Quickstart](/ja/quickstart) に続きます。 diff --git a/docs/ja/quickstart.mdx b/docs/ja/quickstart.mdx index c5aa03e0..08a73763 100644 --- a/docs/ja/quickstart.mdx 
+++ b/docs/ja/quickstart.mdx @@ -3,7 +3,16 @@ title: "クイックスタート" description: "Arkor プロジェクトを生成し、小さなオープンウェイト LLM をファインチューニングして、ローカル Studio からチャットしてみる。" --- -数分でゼロからスタートし、ローカル Playground でチャットできるファインチューン済みモデルが得られます。学習自体は 7〜12 分。セットアップ時間は回線と既存のインストール状況に左右されます。 +数分でゼロからスタートし、ローカル Playground でチャットできるファインチューン済みモデルが得られます。学習本体は 7〜12 分。最初に流すジョブは、学習本体が始まる前に GPU の確保 (立ち上がり) を待つ時間が入ります。これは数分かかることがあり、ときどきそれ以上かかります。間が空いた後の最初のジョブだけに発生する待ちです。セットアップ時間は回線と既存のインストール状況に左右されます。 + + +**Arkor が GPU をどう使い分けているか。** Arkor は GPU を 2 つのプールに分けて運用しています。 + +- **推論 (Inference) 用 GPU。** 学習が終わったモデルで推論するための GPU です (Playground や本番の推論)。active worker 上に常駐させているので、いつでも warm な状態で即座に応答します。 +- **学習 (Training) 用 GPU。** ファインチューニング用の GPU です。コストを安定させるため warm 維持はしません。**Run training** を押すと、その時点で GPU を確保しに行きます。直前に他のジョブで使われた直後など、GPU がまだ warm な状態であれば確保は数秒で終わります。warm な GPU がなければ cold な状態 (プロビジョニング・コンテナイメージ取得・ベースモデルのロード。いわゆる cold start) から立ち上げる必要があり、これに数分かかります。 + +GPU が確保できた後、学習本体自体はテンプレートにより約 7〜12 分です。Studio の Jobs 画面では両フェーズが可視化されます。 + ## 前提条件 @@ -41,11 +50,11 @@ cd my-arkor-app プロジェクト生成ツールがどのテンプレートを使うか尋ねてきます。最終的に作りたいものに最も近いものを選んでください: -| テンプレート | タスク | 出力の形 | 推定学習時間 | -| ------------ | ----------------------- | ----------------------------------------------------- | ------------ | -| `triage` | サポートのトリアージ | `{ category, urgency, summary, nextAction }` | 約 7 分 | -| `translate` | 9 言語の翻訳 | `{ translation, detectedLanguage }` | 約 7 分 | -| `redaction` | 個人情報のマスク | `{ redactedText, redactedCount, tags }` | 約 12 分 | +| テンプレート | タスク | 出力の形 | 学習本体 (GPU 確保時間を除く) | +| ------------ | ----------------------- | ----------------------------------------------------- | ----------------------------- | +| `triage` | サポートのトリアージ | `{ category, urgency, summary, nextAction }` | 約 7 分 | +| `translate` | 9 言語の翻訳 | `{ translation, detectedLanguage }` | 約 7 分 | +| `redaction` | 個人情報のマスク | `{ redactedText, redactedCount, tags }` | 約 12 分 | いずれも同じ小型のオープンウェイトベース(`gemma-4-E4B-it`)と、HuggingFace 
上の厳選された公開データセットを組み合わせたものです。学習は本物で、数分で終わるので、ループ全体をエンドツーエンドで体験できます。 @@ -119,7 +128,7 @@ bun dev 学習が進行中に重要なビューは 3 つあります: - **Jobs。** 学習ジョブの一覧。クリックするとライブステータスが見られます。 -- **Loss チャートとイベントログ。** マネージド GPU から進捗がストリームされるにつれて、Loss(学習中のモデルの誤差を示す数値で、低いほどモデルが正解に近づいているサイン)の曲線が更新され、ログのテールに学習イベントが表示されます。最初の学習はテンプレートにより 7〜12 分かかります。 +- **Loss チャートとイベントログ。** マネージド GPU から進捗がストリームされるにつれて、Loss(学習中のモデルの誤差を示す数値で、低いほどモデルが正解に近づいているサイン)の曲線が更新され、ログのテールに学習イベントが表示されます。最初のジョブの最初のフェーズは GPU の確保で、その間ステータスは `Warming up GPU` と表示され、Loss チャートは `Waiting for GPU` のプレースホルダー、イベントログは空のままです。`training.started` イベントが届くとステータスが `Running` に変わり、曲線が更新を始めます。学習本体自体はテンプレートにより 7〜12 分かかります。 - **Playground。** ジョブが完了したら、最終アダプタをセレクタから選んでチャット。モード切替でベースモデルとアダプタを行き来できます。学習中に中間チェックポイントで推論を走らせたい場合は Studio ではなく `onCheckpoint` コールバックを使ってください。 学習の合間に `src/arkor/` を編集した場合は、Run training ページをリロード(または `arkor build` を実行)してから次のクリックをすると、新しいコードが動きます。 diff --git a/docs/ja/studio/jobs.mdx b/docs/ja/studio/jobs.mdx index 245a2a8d..92c868ec 100644 --- a/docs/ja/studio/jobs.mdx +++ b/docs/ja/studio/jobs.mdx @@ -21,13 +21,24 @@ Run training パネルはページ読み込み時に `/api/manifest` を 1 度 トレーナー選択やフラグを渡す入力フォームはありません: Studio は常に `createArkor` で登録されたトレーナーを走らせ、`arkor start` は `.arkor/build/index.mjs` があれば再利用します。同じページで複数回クリックする間に `src/arkor/` の編集は自動では拾われません。編集の合間に Run training ページをリロード(あるいはターミナルから `arkor build`)してから次のクリックをしてください。具体的なリビルドルールは [CLI § build / start](/ja/cli/build-and-start) を参照。 +### 「最初のジョブ」で起きていること + +**Run training** をクリックすると、2 つのフェーズが続きます: + +1. **GPU の確保。** ジョブは `Warming up GPU` と表示されます。Loss チャートは `Waiting for GPU` のプレースホルダー、イベントリストは空のままで、Metadata サイドバーの **Phase** 行は `Warming up GPU`、**GPU warm-up** タイマーが動きます。このフェーズの時間は状況次第です。直前のジョブで warm な状態の GPU が残っていれば普通は 1 分以内ですが、cold な状態から立ち上げる必要がある場合は数分かかることがあります。なぜそうなるかは [Quickstart](/ja/quickstart) を参照。 +2. 
**学習本体。** `training.started` が届くとステータスが `Running` に変わり、Loss チャートが `training.log` フレームから更新を始め、**Phase** 行が `Training run` になります。これが Quickstart のテンプレート表にある 7〜12 分のウィンドウです。 + + +**Run training** はブラウザータブをフォアグラウンドにしてからクリックしてください。Studio が通知許可をブラウザーに求め、ジョブが完了または失敗したときにデスクトップ通知とタブタイトルのインジケーターを表示するので、タブを見張る必要がなくなります。 + + ## Jobs リスト Jobs リストはマウント時に 1 度、その後 5 秒ごとに `GET /api/jobs` をポーリングします。手動リフレッシュボタンはなく、間隔は固定です。 | 列 | ソース | | --- | --- | -| Status | `Job.status`(`queued` / `running` / `completed` / `failed` / `cancelled`)。セルは色付け用の CSS クラスを持つ。 | +| Status | `Job.status`(`queued` / `running` / `completed` / `failed` / `cancelled`)に、Studio 側で 1 つの表示ステートを足したもの。SSE ストリームが接続済み(または `createdAt` が直近 90 秒以内)かつ `queued` のジョブは `Warming up GPU` として表示される。セルは色付け用の CSS クラスを持つ。 | | Name | `Job.name`。`#/jobs/` にリンク。 | | Created | `new Date(Job.createdAt).toLocaleString()`。 | | ID | `Job.id`、等幅フォント。 | From 7f58c9b6060a5375e2d338c0e2ce253a8b77e0d7 Mon Sep 17 00:00:00 2001 From: Hina Date: Tue, 12 May 2026 00:07:11 +0900 Subject: [PATCH 08/15] test(e2e-studio): cold-start training scenarios Drive the deferred-SSE harness: keep the events stream open without a training.started frame, assert the Warming up GPU banner is visible, then emit training.started + training.completed and assert the toast plus the run name appear. Cover both granted and cleared notification permission paths so the toast fallback is exercised. 
--- e2e/studio/src/specs/cold-start.spec.ts | 222 ++++++++++++++++++++++++ 1 file changed, 222 insertions(+) create mode 100644 e2e/studio/src/specs/cold-start.spec.ts diff --git a/e2e/studio/src/specs/cold-start.spec.ts b/e2e/studio/src/specs/cold-start.spec.ts new file mode 100644 index 00000000..3b8657d4 --- /dev/null +++ b/e2e/studio/src/specs/cold-start.spec.ts @@ -0,0 +1,222 @@ +import type { IncomingMessage, ServerResponse } from "node:http"; +import { expect, test } from "../harness/fixture"; + +const QUEUED_JOB = { + id: "job-cold-start", + name: "cold-start-trainer", + // The job stays `queued` on the wire until our deferred SSE fires + // `training.started`. The display-status derivation in the SPA + // promotes that to `Warming up GPU` while the EventSource is open + // without a `training.started` frame. + status: "queued", + createdAt: new Date(Date.now() - 5_000).toISOString(), + startedAt: null, + completedAt: null, + error: null, + config: { model: "unsloth/gemma-4-E4B-it" }, +}; + +function jobsBody(status: "queued" | "running" | "completed" = "queued") { + const job = { ...QUEUED_JOB, status }; + if (status === "running") { + return { jobs: [{ ...job, startedAt: new Date().toISOString() }] }; + } + if (status === "completed") { + const completedAt = new Date().toISOString(); + return { + jobs: [ + { + ...job, + status, + startedAt: completedAt, + completedAt, + }, + ], + }; + } + return { jobs: [job] }; +} + +interface DeferredSseControls { + /** Send `training.started` over the still-open SSE socket. */ + emitStarted: () => Promise; + /** Send `training.completed` and end the stream. */ + emitCompleted: () => Promise; + /** Resolves once the SSE socket has connected. */ + connected: Promise; +} + +/** + * Install a `/v1/jobs/{id}/events/stream` route that opens the SSE + * socket and keeps it open with no frames until the caller invokes + * `emitStarted()` / `emitCompleted()`. Mirrors the "cold start, then + * training begins" wire trace. 
+ */ +function installDeferredSseRoute(cloudApi: { + setRoute: ( + method: string, + path: string, + handler: (req: IncomingMessage, res: ServerResponse) => void, + ) => void; +}): DeferredSseControls { + let resolveConnected: () => void; + const connected = new Promise((resolve) => { + resolveConnected = resolve; + }); + + // Held outside the handler so test bodies can drive frames from the + // outside. The handler captures the latest response and resolves the + // `connected` gate when the socket actually attaches. + let active: ServerResponse | null = null; + + cloudApi.setRoute( + "GET", + `/v1/jobs/${QUEUED_JOB.id}/events/stream`, + (req, res) => { + const url = new URL(req.url ?? "", "http://x"); + if ( + url.searchParams.get("orgSlug") !== "studio-e2e-org" || + url.searchParams.get("projectSlug") !== "studio-e2e-project" + ) { + res.statusCode = 400; + res.end(JSON.stringify({ error: "scope mismatch" })); + return; + } + res.statusCode = 200; + res.setHeader("content-type", "text/event-stream"); + res.setHeader("cache-control", "no-cache, no-transform"); + // Empty comment frame keeps the SSE socket alive for browsers + // that wait for a first chunk before reporting "open" but + // crucially does not carry a named event, so the SPA stays in + // `provisioning` (it is waiting for `training.started`). 
+ res.write(":\n\n"); + active = res; + resolveConnected(); + }, + ); + + return { + connected, + emitStarted: async () => { + await connected; + if (!active) throw new Error("SSE socket was not opened"); + active.write( + `event: training.started\ndata: ${JSON.stringify({ + timestamp: new Date().toISOString(), + })}\n\n`, + ); + }, + emitCompleted: async () => { + await connected; + if (!active) throw new Error("SSE socket was not opened"); + active.write( + `event: training.completed\ndata: ${JSON.stringify({ + timestamp: new Date().toISOString(), + artifacts: [{ id: "artifact-1" }], + })}\n\n`, + ); + active.end(); + }, + }; +} + +test.describe("Cold-start training UX", () => { + test("shows the Warming up GPU phase before training.started arrives", async ({ + page, + studio, + cloudApi, + }) => { + cloudApi.setRoute("GET", "/v1/jobs", (_req, res) => { + res.statusCode = 200; + res.setHeader("content-type", "application/json"); + res.end(JSON.stringify(jobsBody("queued"))); + }); + const sse = installDeferredSseRoute(cloudApi); + + await page.goto(`${studio.url}/#/jobs/${QUEUED_JOB.id}`); + await sse.connected; + + await expect( + page.getByText("Warming up the GPU for this job."), + ).toBeVisible(); + await expect(page.getByText("Waiting for GPU").first()).toBeVisible(); + + // Flip the wire state to running and emit the SSE frame: the + // display should switch to Running with the loss-chart placeholder + // gone. 
+ cloudApi.setRoute("GET", "/v1/jobs", (_req, res) => { + res.statusCode = 200; + res.setHeader("content-type", "application/json"); + res.end(JSON.stringify(jobsBody("running"))); + }); + await sse.emitStarted(); + + await expect( + page.getByText("Warming up the GPU for this job."), + ).toBeHidden(); + }); + + test("fires a desktop toast on training.completed with notification permission granted", async ({ + page, + studio, + cloudApi, + context, + }) => { + await context.grantPermissions(["notifications"]); + cloudApi.setRoute("GET", "/v1/jobs", (_req, res) => { + res.statusCode = 200; + res.setHeader("content-type", "application/json"); + res.end(JSON.stringify(jobsBody("queued"))); + }); + const sse = installDeferredSseRoute(cloudApi); + + await page.goto(`${studio.url}/#/jobs/${QUEUED_JOB.id}`); + await sse.connected; + await sse.emitStarted(); + + cloudApi.setRoute("GET", "/v1/jobs", (_req, res) => { + res.statusCode = 200; + res.setHeader("content-type", "application/json"); + res.end(JSON.stringify(jobsBody("completed"))); + }); + await sse.emitCompleted(); + + // Toast carries the run name and the completed label. 
+ await expect( + page.getByText("Training run completed").first(), + ).toBeVisible(); + await expect( + page.getByText(/cold-start-trainer \(1 artifact\)/).first(), + ).toBeVisible(); + }); + + test("falls back to in-page toast only when notification permission is denied", async ({ + page, + studio, + cloudApi, + context, + }) => { + await context.clearPermissions(); + cloudApi.setRoute("GET", "/v1/jobs", (_req, res) => { + res.statusCode = 200; + res.setHeader("content-type", "application/json"); + res.end(JSON.stringify(jobsBody("queued"))); + }); + const sse = installDeferredSseRoute(cloudApi); + + await page.goto(`${studio.url}/#/jobs/${QUEUED_JOB.id}`); + await sse.connected; + await sse.emitStarted(); + + cloudApi.setRoute("GET", "/v1/jobs", (_req, res) => { + res.statusCode = 200; + res.setHeader("content-type", "application/json"); + res.end(JSON.stringify(jobsBody("completed"))); + }); + await sse.emitCompleted(); + + await expect( + page.getByText("Training run completed").first(), + ).toBeVisible(); + }); +}); From 2004f3b09e2d6224b89ea3e9038bbb3a2d060c42 Mon Sep 17 00:00:00 2001 From: Hina Date: Tue, 12 May 2026 00:21:06 +0900 Subject: [PATCH 09/15] fix(studio-app): eventStreamConnected lifecycle for SSE attach + errors Two correctness issues addressed in one change: 1. Flip `eventStreamConnected` true the moment the EventSource is attached, not only on the first received frame. The previous code waited for `es.onopen` or any frame, which left a race on slower environments (CI Linux runners) where the Playwright assertion for the Warming up GPU banner timed out before the SSE socket reported open. The e2e cold-start specs were failing in CI as a result. 2. Set `eventStreamConnected` back to false on `es.onerror`, on the sentinel `end` event, and on the effect cleanup. Without this a transient SSE disconnect could leave the UI stuck in `Warming up GPU` and keep the 1s ticker running indefinitely. 
Local Playwright runs (Mac, chromium-headless-shell) cover both paths via the existing cold-start.spec.ts. --- packages/studio-app/src/pages/JobDetail.tsx | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/packages/studio-app/src/pages/JobDetail.tsx b/packages/studio-app/src/pages/JobDetail.tsx index 84a4f8ba..6e7a094f 100644 --- a/packages/studio-app/src/pages/JobDetail.tsx +++ b/packages/studio-app/src/pages/JobDetail.tsx @@ -162,6 +162,11 @@ export function JobDetail({ jobId }: { jobId: string }) { } const es = openJobEvents(jobId); + // Treat the EventSource as connected from the moment we attach so + // the display flips into `provisioning` immediately even when the + // backend takes a while to send the first frame; `onerror` below + // walks this back so a real disconnect drops us out of warm-up. + setEventStreamConnected(true); es.onopen = () => setEventStreamConnected(true); es.addEventListener("training.started", (ev: MessageEvent) => { const parsed = safeParse(ev.data); @@ -274,9 +279,18 @@ export function JobDetail({ jobId }: { jobId: string }) { error, }); }); - es.addEventListener("end", () => es.close()); - es.onerror = () => setEventErr("Event stream interrupted."); - return () => es.close(); + es.addEventListener("end", () => { + es.close(); + setEventStreamConnected(false); + }); + es.onerror = () => { + setEventErr("Event stream interrupted."); + setEventStreamConnected(false); + }; + return () => { + es.close(); + setEventStreamConnected(false); + }; }, [jobId]); // Status precedence is centralised in `computeDisplayStatus` so the From 31dbeab8b8775175a9a0111417071a3866913bb5 Mon Sep 17 00:00:00 2001 From: Hina Date: Tue, 12 May 2026 00:21:24 +0900 Subject: [PATCH 10/15] fix(studio-app): clear tab title prefix on route change MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit notifyJobTerminal prefixes the tab title with ✓ / ⚠ on a terminal event so a user with the tab 
in the background sees the result. Nothing in the SPA previously reset the title, so prefixes from different jobs accumulated across navigation. Add an effect in App keyed on the route that resets `document.title` to "Arkor" whenever the user moves to a different page (job, jobs list, home, or playground). Also corrects the notify.ts docstring which claimed this behaviour already existed. --- packages/studio-app/src/App.tsx | 15 +++++++++++++++ packages/studio-app/src/lib/notify.ts | 5 +++-- 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/packages/studio-app/src/App.tsx b/packages/studio-app/src/App.tsx index db470280..b63d0a4a 100644 --- a/packages/studio-app/src/App.tsx +++ b/packages/studio-app/src/App.tsx @@ -8,6 +8,8 @@ import { ToastProvider } from "./components/ui/Toast"; import { fetchCredentials, type Credentials } from "./lib/api"; import { useHashRoute } from "./route"; +const DEFAULT_TITLE = "Arkor"; + export function App() { const [creds, setCreds] = useState(null); const [error, setError] = useState(null); @@ -29,6 +31,19 @@ export function App() { }; }, []); + // Drop any `✓` / `⚠` prefix that a prior `notifyJobTerminal` left on the + // tab title once the user navigates. Without this the indicator persists + // across job pages and prefixes from different jobs accumulate. + const routeKey = + route.kind === "job" + ? `job:${route.id}` + : route.kind === "playground" + ? `playground:${route.adapterJobId ?? ""}` + : route.kind; + useEffect(() => { + document.title = DEFAULT_TITLE; + }, [routeKey]); + return ( <> diff --git a/packages/studio-app/src/lib/notify.ts b/packages/studio-app/src/lib/notify.ts index 9ffb54d2..39aae20d 100644 --- a/packages/studio-app/src/lib/notify.ts +++ b/packages/studio-app/src/lib/notify.ts @@ -72,8 +72,9 @@ function isTabFocused(): boolean { * * 1. Toast: always, via a `CustomEvent("arkor:toast")` that the * ToastProvider listens for. Works regardless of focus / permission. - * 2. 
Title prefix (`✓` / `⚠`): only when the tab is not focused. Cleared - * on the next route change by the title controller in App. + * 2. Title prefix (`✓` / `⚠`): only when the tab is not focused. + * Cleared on the next route change by an effect in `App` that + * resets `document.title` whenever the route key changes. * 3. OS Notification: only when permission was granted AND the tab is * not focused. Tagged with the jobId so a duplicate SSE frame won't * buzz the user twice. From 96ed6ea33aae7d511a88036050a65d99c395c9ac Mon Sep 17 00:00:00 2001 From: Hina Date: Tue, 12 May 2026 00:46:10 +0900 Subject: [PATCH 11/15] fix(studio-app): tighten focus check + clear stacked title prefixes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two related issues with the terminal-event notification layer: 1. `isTabFocused()` only consulted `document.visibilityState`. A tab that is the selected one in a browser window sitting behind another app reports `visible` but `hasFocus()` is false; the user clearly cannot see the in-page toast yet we were suppressing the OS notification and title-prefix paths anyway. Now require both. 2. `ensureTitlePrefix()` only checked `startsWith(prefix)`, so a completion (`✓ Arkor`) followed by a failure (`⚠ ...`) ended up as `⚠ ✓ Arkor`. Strip any prior `✓ ` / `⚠ ` marker before applying the new one so the title carries at most one indicator. Tests cover both: the visible-but-unfocused tab path now produces a notification + title prefix, and `replaces an existing prefix instead of stacking when the status differs` verifies (2). 
--- packages/studio-app/src/lib/notify.test.ts | 35 ++++++++++++++++++++-- packages/studio-app/src/lib/notify.ts | 23 ++++++++++---- 2 files changed, 49 insertions(+), 9 deletions(-) diff --git a/packages/studio-app/src/lib/notify.test.ts b/packages/studio-app/src/lib/notify.test.ts index 0fc09457..d8429017 100644 --- a/packages/studio-app/src/lib/notify.test.ts +++ b/packages/studio-app/src/lib/notify.test.ts @@ -48,13 +48,16 @@ function uninstallNotification() { delete (globalThis as unknown as { Notification?: unknown }).Notification; } -function setVisibility(state: "visible" | "hidden") { +function setVisibility( + state: "visible" | "hidden", + hasFocus: boolean = state === "visible", +) { Object.defineProperty(document, "visibilityState", { value: state, configurable: true, }); Object.defineProperty(document, "hasFocus", { - value: () => state === "visible", + value: () => hasFocus, configurable: true, }); } @@ -208,7 +211,33 @@ describe("notifyJobTerminal", () => { error: "x", }); - expect(document.title.startsWith("⚠ ")).toBe(true); + // The new prefix replaces the old one rather than stacking. 
+ expect(document.title).toBe("⚠ Arkor"); + }); + + it("replaces an existing prefix instead of stacking when the status differs", () => { + setVisibility("hidden"); + installFakeNotification("granted"); + + notifyJobTerminal({ status: "completed", jobName: "A", jobId: "job-A" }); + notifyJobTerminal({ status: "failed", jobName: "B", jobId: "job-B" }); + + expect(document.title).toBe("⚠ Arkor"); + }); + + it("treats a visible tab that is not focused as backgrounded (fires OS notification)", () => { + setVisibility("visible", false); + const { instances } = installFakeNotification("granted"); + + notifyJobTerminal({ + status: "completed", + jobName: "demo", + jobId: "job-bg", + artifacts: 1, + }); + + expect(instances).toHaveLength(1); + expect(document.title.startsWith("✓ ")).toBe(true); }); it("does not stack the same title prefix repeatedly", () => { diff --git a/packages/studio-app/src/lib/notify.ts b/packages/studio-app/src/lib/notify.ts index 39aae20d..90ac3d9a 100644 --- a/packages/studio-app/src/lib/notify.ts +++ b/packages/studio-app/src/lib/notify.ts @@ -54,17 +54,26 @@ function summaryFor(input: NotifyJobTerminalInput): string { : `${input.jobName} failed`; } +// Match either of the prefixes we set so a second terminal event of a +// different kind (e.g. ✓ then ⚠) replaces the existing marker instead +// of stacking as "⚠ ✓ Arkor". 
+const TITLE_PREFIX_RE = /^[✓⚠] /; + function ensureTitlePrefix(prefix: string): void { if (typeof document === "undefined") return; - if (!document.title.startsWith(prefix)) { - document.title = `${prefix}${document.title}`; - } + const stripped = document.title.replace(TITLE_PREFIX_RE, ""); + document.title = `${prefix}${stripped}`; } function isTabFocused(): boolean { if (typeof document === "undefined") return false; - if (document.visibilityState === "visible") return true; - return typeof document.hasFocus === "function" && document.hasFocus(); + // `visibilityState === "visible"` is necessary but not sufficient: a + // tab that is the selected one in a browser window sitting behind + // another app still reports `visible`, yet the user clearly cannot + // see in-page toasts. Check `hasFocus()` whenever it is available. + if (document.visibilityState !== "visible") return false; + if (typeof document.hasFocus === "function") return document.hasFocus(); + return true; } /** @@ -126,7 +135,9 @@ export function notifyJobTerminal(input: NotifyJobTerminalInput): void { ); n.onclick = () => { window.focus(); - window.location.hash = `#/jobs/${encodeURIComponent(input.jobId)}`; + // `parseRoute()` does not decode path segments, so leave the id + // raw here to match the unencoded links emitted from JobsTable. + window.location.hash = `#/jobs/${input.jobId}`; n.close(); }; } catch { From d83f6e1cafb6a71fcd699b007b20b06affc693dc Mon Sep 17 00:00:00 2001 From: Hina Date: Tue, 12 May 2026 00:46:38 +0900 Subject: [PATCH 12/15] fix(studio-app): use raw job id in toast navigation target Pair with the same change in notify.ts (previous commit): `parseRoute` does not decode path segments, so encoding the job id here when the unencoded `<a href>` links in JobsTable don't would make the toast's "Open job" hop to a different URL than the row link for ids that contain URL-special characters.
--- packages/studio-app/src/components/ui/Toast.tsx | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/packages/studio-app/src/components/ui/Toast.tsx b/packages/studio-app/src/components/ui/Toast.tsx index 2240d7d7..9004c129 100644 --- a/packages/studio-app/src/components/ui/Toast.tsx +++ b/packages/studio-app/src/components/ui/Toast.tsx @@ -81,7 +81,9 @@ function ToastCard({ : "text-red-700 dark:text-red-300"; function openJob() { - window.location.hash = `#/jobs/${encodeURIComponent(toast.jobId)}`; + // `parseRoute()` does not decode path segments, so leave the id raw + // here to match the unencoded links emitted from JobsTable. + window.location.hash = `#/jobs/${toast.jobId}`; onDismiss(); } From 2fad362e7a03e0583c753bb0fb00ec5e421d64b5 Mon Sep 17 00:00:00 2001 From: Hina Date: Tue, 12 May 2026 00:48:13 +0900 Subject: [PATCH 13/15] fix(studio-app): poll-driven terminal notify + SSE replay guard Two correctness gaps in the terminal notification flow: 1. When the EventSource drops before the terminal frame arrives (PC sleep, proxy hiccup) but `/api/jobs` polling later observes `completed` / `failed`, the user previously got no toast / title marker / OS notification. The polling tick now detects a non-terminal -> terminal transition for the current job and calls notifyJobTerminal; the (jobId, status) dedupe in notify.ts absorbs the duplicate when the SSE frame arrives too. 2. Opening an already-completed job replayed the SSE history, triggering a fresh "Training run completed" notification for what was just a historical inspection. Compare the terminal frame's timestamp against the page mount time (5s grace for clock skew and polling lag) and skip notifications whose timestamp predates the mount, both on the SSE path and the new poll path. 
--- packages/studio-app/src/pages/JobDetail.tsx | 82 +++++++++++++++++---- 1 file changed, 68 insertions(+), 14 deletions(-) diff --git a/packages/studio-app/src/pages/JobDetail.tsx b/packages/studio-app/src/pages/JobDetail.tsx index 6e7a094f..f9af55b1 100644 --- a/packages/studio-app/src/pages/JobDetail.tsx +++ b/packages/studio-app/src/pages/JobDetail.tsx @@ -52,20 +52,54 @@ export function JobDetail({ jobId }: { jobId: string }) { // the terminal-event notification even when the polled `job` resolves // after the closures were registered. const jobNameRef = useRef(null); + // Wall-clock time the user landed on this job's page. Used to skip + // notifications for terminal frames whose timestamp predates the + // mount — i.e. SSE history replay or a poll observing a job that + // had already finished before the page was opened. + const mountedAtRef = useRef(Date.now()); + // Previously observed wire status; lets the polling tick recognise a + // queued/running -> completed/failed transition and fire a + // notification when the SSE stream missed the terminal frame. + const previousPolledStatusRef = useRef(null); useEffect(() => { setJob(null); + mountedAtRef.current = Date.now(); + previousPolledStatusRef.current = null; let cancelled = false; let timer: ReturnType | undefined; // Chained setTimeout instead of setInterval so a slow /api/jobs // request can't pile up overlapping calls. SSE remains the source // of truth for live status; polling is just for completedAt / - // config / etc that the SSE stream doesn't carry. + // config / etc that the SSE stream doesn't carry — plus a + // poll-driven terminal notification below as a safety net when + // the SSE stream dropped before the terminal frame arrived + // (machine sleep, proxy hiccup, etc). async function tick() { try { const { jobs } = await fetchJobs(); if (!cancelled) { - setJob(jobs.find((j) => j.id === jobId) ?? null); + const next = jobs.find((j) => j.id === jobId) ?? 
null; + const prevStatus = previousPolledStatusRef.current; + const wasTerminal = + prevStatus === "completed" || + prevStatus === "failed" || + prevStatus === "cancelled"; + if ( + next && + (next.status === "completed" || next.status === "failed") && + !wasTerminal && + shouldNotifyTerminal(next.completedAt, mountedAtRef.current) + ) { + notifyJobTerminal({ + status: next.status, + jobName: next.name ?? jobId, + jobId, + error: next.error ?? undefined, + }); + } + previousPolledStatusRef.current = next?.status ?? null; + setJob(next); } } catch { // ignore — events stream is the source of truth for live status @@ -254,12 +288,14 @@ export function JobDetail({ jobId }: { jobId: string }) { completedAt = d.timestamp; } setTerminal({ status: "completed", artifacts, completedAt }); - notifyJobTerminal({ - status: "completed", - jobName: jobNameRef.current ?? jobId, - jobId, - artifacts, - }); + if (shouldNotifyTerminal(completedAt, mountedAtRef.current)) { + notifyJobTerminal({ + status: "completed", + jobName: jobNameRef.current ?? jobId, + jobId, + artifacts, + }); + } }); es.addEventListener("training.failed", (ev: MessageEvent) => { const parsed = safeParse(ev.data); @@ -272,12 +308,14 @@ export function JobDetail({ jobId }: { jobId: string }) { completedAt = d.timestamp; } setTerminal({ status: "failed", error, artifacts: 0, completedAt }); - notifyJobTerminal({ - status: "failed", - jobName: jobNameRef.current ?? jobId, - jobId, - error, - }); + if (shouldNotifyTerminal(completedAt, mountedAtRef.current)) { + notifyJobTerminal({ + status: "failed", + jobName: jobNameRef.current ?? jobId, + jobId, + error, + }); + } }); es.addEventListener("end", () => { es.close(); @@ -545,6 +583,22 @@ function AdvancedToggle({ ); } +// Tolerance (ms) between the trainer-side terminal timestamp and the +// page mount, so a few seconds of clock skew or polling lag does not +// suppress a notification for a run that genuinely finished after the +// user landed on the page. 
+const TERMINAL_NOTIFY_GRACE_MS = 5_000; + +function shouldNotifyTerminal( + timestamp: string | undefined | null, + mountedAt: number, +): boolean { + if (!timestamp) return true; + const t = Date.parse(timestamp); + if (!Number.isFinite(t)) return true; + return t >= mountedAt - TERMINAL_NOTIFY_GRACE_MS; +} + function phaseLabel(status: string): string { if (status === "provisioning") return "Warming up GPU"; if (status === "running") return "Training run"; From 40497b85d8b123f554700067ef690d8c3164c0f0 Mon Sep 17 00:00:00 2001 From: Hina Date: Tue, 12 May 2026 00:48:14 +0900 Subject: [PATCH 14/15] docs(studio): correct jobs-list provisioning rule wording The Jobs list page polls `/api/jobs` and does not open an SSE stream, so the "stream has connected" half of the previous description was wrong for that view: only the createdAt-recency window applies on the list. Reword EN and JA to say so, and split the rule so the list-page behaviour and the Job detail behaviour are explicit. --- docs/ja/studio/jobs.mdx | 2 +- docs/studio/jobs.mdx | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/ja/studio/jobs.mdx b/docs/ja/studio/jobs.mdx index 92c868ec..60f3582a 100644 --- a/docs/ja/studio/jobs.mdx +++ b/docs/ja/studio/jobs.mdx @@ -38,7 +38,7 @@ Jobs リストはマウント時に 1 度、その後 5 秒ごとに `GET /api/j | 列 | ソース | | --- | --- | -| Status | `Job.status`(`queued` / `running` / `completed` / `failed` / `cancelled`)に、Studio 側で 1 つの表示ステートを足したもの。SSE ストリームが接続済み(または `createdAt` が直近 90 秒以内)かつ `queued` のジョブは `Warming up GPU` として表示される。セルは色付け用の CSS クラスを持つ。 | +| Status | `Job.status`(`queued` / `running` / `completed` / `failed` / `cancelled`)に、Studio 側のリスト表示で 1 つの表示ステートを足したもの。`createdAt` が直近 90 秒以内の `queued` ジョブは `Warming up GPU` として表示される。リスト画面は `/api/jobs` のポーリングだけで SSE は開かないため、ここでは createdAt の窓だけで判定する。Job 詳細では同じルールに加えて SSE ストリームの接続状態も加味される。セルは色付け用の CSS クラスを持つ。 | | Name | `Job.name`。`#/jobs/` にリンク。 | | Created | `new Date(Job.createdAt).toLocaleString()`。 | | ID | 
`Job.id`、等幅フォント。 | diff --git a/docs/studio/jobs.mdx b/docs/studio/jobs.mdx index 1200b38a..16ae54c8 100644 --- a/docs/studio/jobs.mdx +++ b/docs/studio/jobs.mdx @@ -38,7 +38,7 @@ The Jobs list polls `GET /api/jobs` once at mount, then every 5 seconds. There i | Column | Source | | --- | --- | -| Status | `Job.status` (`queued` / `running` / `completed` / `failed` / `cancelled`) with one Studio-derived display state: a `queued` job whose stream has connected (or whose `createdAt` is within the last 90 s) renders as `Warming up GPU`. The cell carries a CSS class for colouring. | +| Status | `Job.status` (`queued` / `running` / `completed` / `failed` / `cancelled`) plus one Studio-derived display state on the list: a `queued` job whose `createdAt` is within the last 90 seconds renders as `Warming up GPU`. The list page only polls `/api/jobs`; Job detail uses the same rule and additionally consults the live SSE stream. The cell carries a CSS class for colouring. | | Name | `Job.name`. Links to `#/jobs/`. | | Created | `new Date(Job.createdAt).toLocaleString()`. | | ID | `Job.id`, monospaced. | From 97a669d5bbc78f187cb4cdb70cd4a9f6c68cde02 Mon Sep 17 00:00:00 2001 From: Hina Date: Wed, 13 May 2026 19:07:45 +0900 Subject: [PATCH 15/15] fix(studio-app): stop dropping eventStreamConnected on SSE onerror CI artifact analysis: every cold-start e2e spec on Linux landed with "Event stream interrupted." showing on the Job detail page and the status badge stuck on Queued. The Hono `/api/jobs/:id/events` proxy appears to surface benign keep-alive transitions / reconnect signals to the browser as `EventSource.onerror`, and the previous commit treated that as a real disconnect by flipping `eventStreamConnected` to false. That instantly pulled the user out of the Warming up GPU display while training was still going - the exact regression e2e was protecting against. 
Walk that part back: `onerror` only surfaces the banner (the existing behaviour); the connected flag only drops on the explicit `end` event and on the effect cleanup. Once any frame reaches `pushEvent`, the banner is cleared and the connection is reaffirmed. This preserves the structural fix (early `setEventStreamConnected(true)` on attach) that the CI race was about, and keeps Copilot's original concern covered for the deterministic close paths. --- packages/studio-app/src/pages/JobDetail.tsx | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/packages/studio-app/src/pages/JobDetail.tsx b/packages/studio-app/src/pages/JobDetail.tsx index f9af55b1..b128de65 100644 --- a/packages/studio-app/src/pages/JobDetail.tsx +++ b/packages/studio-app/src/pages/JobDetail.tsx @@ -321,10 +321,15 @@ export function JobDetail({ jobId }: { jobId: string }) { es.close(); setEventStreamConnected(false); }); - es.onerror = () => { - setEventErr("Event stream interrupted."); - setEventStreamConnected(false); - }; + // Deliberately do NOT drop `eventStreamConnected` on `onerror`: the + // EventSource fires `onerror` during its built-in reconnect loop + // (and, under the Hono `/api/jobs/:id/events` proxy on Linux CI, + // even on benign keep-alive transitions). Treating that as a real + // disconnect would yank the user out of the warm-up display while + // training is still going. The "stream interrupted" banner alone + // is enough; once a frame lands again, `pushEvent` clears it and + // reaffirms the connection. + es.onerror = () => setEventErr("Event stream interrupted."); return () => { es.close(); setEventStreamConnected(false);