From 6cc19d9abc14e264a99e98d32ff26a177e4bb541 Mon Sep 17 00:00:00 2001 From: Dan Tasse Date: Mon, 9 Mar 2026 12:49:38 -0400 Subject: [PATCH 1/2] chore: move Job Metrics docs from Geneva --- docs/docs.json | 1 + docs/geneva/jobs/job_metrics.mdx | 66 ++++++++++++++++++++++++++++++++ 2 files changed, 67 insertions(+) create mode 100644 docs/geneva/jobs/job_metrics.mdx diff --git a/docs/docs.json b/docs/docs.json index f4e0cce..f09d85d 100644 --- a/docs/docs.json +++ b/docs/docs.json @@ -184,6 +184,7 @@ "geneva/jobs/conflicts", "geneva/jobs/materialized-views", "geneva/jobs/performance", + "geneva/jobs/job_metrics", "geneva/jobs/troubleshooting" ] }, diff --git a/docs/geneva/jobs/job_metrics.mdx b/docs/geneva/jobs/job_metrics.mdx new file mode 100644 index 0000000..efc6b22 --- /dev/null +++ b/docs/geneva/jobs/job_metrics.mdx @@ -0,0 +1,66 @@ +--- +title: Job Metrics (Diagnostics) +sidebarTitle: Job Metrics +description: Use metrics from Geneva to diagnose why a backfill/refresh job is slow. +icon: chart-simple +--- + +## Where metrics come from + +Job metrics are attached to each job record in the Console/API response under +`metrics`. + +- Job list: + - `GET /api/v1/jobs?table_name=&db_uri_encoded=<...>` +- Job detail: + - `GET /api/v1/jobs/?db_uri_encoded=<...>` + +## Core diagnostic metrics + +| Metric | What it means | Common signal | +| --- | --- | --- | +| `rows_checkpointed` | Rows finished by read/UDF/checkpoint stage. | High value means upstream compute is progressing. | +| `rows_ready_for_commit` | Rows ready for atomic commit (becoming visible to other DB connections). | If much lower than `rows_checkpointed`, writer path is likely bottlenecked. | +| `rows_committed` | Rows already visible to other DB connections. | If lagging far behind `rows_ready_for_commit`, commit stage may be bottlenecked. | +| `cnt_geneva_workers_active` | Current parallel UDF executors. | Lower than expected means reduced effective parallelism. 
| +| `cnt_geneva_workers_pending` | Deficit from desired parallelism. | Persistently high value usually means scheduling/resource pressure. | +| `read_io_time_ms` | Cumulative read IO time. | Dominant value suggests storage/read bottleneck. | +| `udf_processing_time` | Cumulative UDF execution time. | Dominant value suggests compute/UDF bottleneck. | +| `batch_checkpointing_time` | Cumulative batch checkpoint overhead. | High value suggests checkpoint overhead is expensive. | +| `writer_write_time` | Cumulative writer output time. | High value often points to object storage throughput/throttling issues. | +| `writer_queue_wait_time_ms` | Cumulative writer queue wait time. | High value can indicate writer starvation/backpressure. | +| `commit_time_ms` | Cumulative commit time. | High value means commit itself is expensive. | +| `commit_conflict_retries` | Commit retries due to version conflicts. | Non-trivial counts indicate commit contention. | +| `commit_backoff_time_ms` | Time spent backing off during commit retries. | High value indicates contention/retry pressure. | +| `commit_concurrent_writer_retries` | Retries from "Too many concurrent writers". | High value indicates writer concurrency contention. | + +## Quick diagnosis workflow + +1. Check `rows_checkpointed` vs `rows_ready_for_commit`. + - If `rows_checkpointed` is high but `rows_ready_for_commit` is low, the fragment + writer is usually the bottleneck. + - This often indicates object storage read/write pressure (for example, S3). +2. Compare read, UDF, and checkpoint timing. + - High `read_io_time_ms`: storage or scan bottleneck. + - High `udf_processing_time`: UDF compute bottleneck. + - High `batch_checkpointing_time`: checkpoint overhead bottleneck. + - Typical mitigations: increase `checkpoint_size`, increase + `max_checkpoint_size`, or compact the table to produce larger fragments. +3. Check writer timing. + - High `writer_write_time` commonly indicates an object storage throttling/throughput + limit. 
+ - Typical mitigations: use higher network-bandwidth node types, and keep + object storage and compute nodes in the same region. +4. Check commit pressure. + - High `commit_conflict_retries`, `commit_backoff_time_ms`, or + `commit_concurrent_writer_retries` indicates commit contention. +5. Check parallelism deficit. + - If `cnt_geneva_workers_pending` stays high while + `cnt_geneva_workers_active` stays low, the job is running below desired + parallelism due to cluster/resource constraints. + +## Notes + +- Timing metrics are cumulative and may overlap; do not sum them as exact wall + time. +- For completed jobs, row counters should settle to stable final values. From 0969229e1b83fb8cfcc1c2a6dbe761d89051f58f Mon Sep 17 00:00:00 2001 From: Dan Tasse Date: Mon, 9 Mar 2026 16:45:37 -0400 Subject: [PATCH 2/2] don't expose API --- docs/geneva/jobs/console.mdx | 5 +++-- docs/geneva/jobs/job_metrics.mdx | 10 ++-------- 2 files changed, 5 insertions(+), 10 deletions(-) diff --git a/docs/geneva/jobs/console.mdx b/docs/geneva/jobs/console.mdx index c2e9a6c..592dd12 100644 --- a/docs/geneva/jobs/console.mdx +++ b/docs/geneva/jobs/console.mdx @@ -23,8 +23,9 @@ The Geneva console is installed with the Geneva Helm chart; [contact LanceDB](ht 1. Install or upgrade the Geneva Helm chart (see [Helm Deployment](/geneva/deployment/helm/)). 2. Forward port 3000 from the geneva-console-ui service: ```bash -kubectl port-forward svc/geneva-console-ui 3000:3000 +kubectl port-forward -n geneva svc/geneva-console-ui 3000:3000 ``` +(Pass `-n` with the namespace you chose when you installed the Helm chart. We recommend `geneva`, so unless you picked a different namespace, use `-n geneva`.) 3. Open `http://localhost:3000` in your browser. When prompted, enter your bucket and database, like: ``` s3://my-bucket/my-db @@ -42,4 +43,4 @@ Click on a job's ID to get more details, especially events that have happened in See the Geneva clusters that you have defined to run jobs. 
Because clusters can be reused by name, this view can help you run a new job with the same resource constraints as a previous job. ### Manifests -See the Manifests you've defined and what packages/dependencies they contain. As with clusters, manifests are reusable, so it's easy to start a new job with the same dependencies as an old one by just specifying the manifest name. \ No newline at end of file +See the Manifests you've defined and what packages/dependencies they contain. As with clusters, manifests are reusable, so it's easy to start a new job with the same dependencies as an old one by just specifying the manifest name. diff --git a/docs/geneva/jobs/job_metrics.mdx b/docs/geneva/jobs/job_metrics.mdx index efc6b22..7102478 100644 --- a/docs/geneva/jobs/job_metrics.mdx +++ b/docs/geneva/jobs/job_metrics.mdx @@ -5,15 +5,9 @@ description: Use metrics from Geneva to diagnose why a backfill/refresh job is s icon: chart-simple --- -## Where metrics come from +## How to find metrics -Job metrics are attached to each job record in the Console/API response under -`metrics`. - -- Job list: - - `GET /api/v1/jobs?table_name=
&db_uri_encoded=<...>` -- Job detail: - - `GET /api/v1/jobs/?db_uri_encoded=<...>` +To find job metrics, open the [Geneva Console UI](/geneva/jobs/console) and click a job's ID to open its "Job details" page. ## Core diagnostic metrics