Skip to content

Commit cc423fd

Browse files
committed
Add data.table-backed index backend
1 parent 23856d1 commit cc423fd

File tree

9 files changed

+940
-89
lines changed

9 files changed

+940
-89
lines changed

.beads/issues.jsonl

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,13 @@
22
{"id":"bidser-g47","title":"PRD: Top 5 bidser additions","description":"Track and deliver the top 5 priority additions defined in bidser-top5-prd.json.","acceptance_criteria":"All 5 child features are closed with linked tests and docs updates.","status":"open","priority":1,"issue_type":"epic","owner":"brad.buchsbaum@gmail.com","created_at":"2026-03-05T18:23:36.729948Z","created_by":"bbuchsbaum","updated_at":"2026-03-05T18:23:36.729948Z","labels":["bidser","prd","top5"]}
33
{"id":"bidser-g47.1","title":"Implement inheritance-correct metadata retrieval","description":"Add inheritance-aware metadata resolution with nearest-file precedence and deterministic conflict handling.","acceptance_criteria":"Fixture tests prove root/sub/run JSON merge with nearest override; read_sidecar supports inherited mode; no regressions in sidecar/TR tests.","status":"closed","priority":1,"issue_type":"feature","owner":"brad.buchsbaum@gmail.com","created_at":"2026-03-05T18:23:36.895401Z","created_by":"bbuchsbaum","updated_at":"2026-03-05T15:40:00-05:00","closed_at":"2026-03-05T15:40:00-05:00","labels":["bidser","metadata","prd","top5"],"dependencies":[{"issue_id":"bidser-g47.1","depends_on_id":"bidser-g47","type":"parent-child","created_at":"2026-03-05T18:23:36.896992Z","created_by":"bbuchsbaum"}]}
44
{"id":"bidser-g47.2","title":"Introduce explicit query API v2","description":"Add query_files with explicit exact/regex matching, entity existence requirements, and raw/derivatives scope controls.","acceptance_criteria":"query_files supports exact+regex modes, require_entity semantics, and scope filtering; unknown entities return informative errors; legacy search_files remains backward compatible.","notes":"Unblocked after closing bidser-g47.1; query_files remains the recommended API while search_files stays supported.","status":"in_progress","priority":1,"issue_type":"feature","owner":"brad.buchsbaum@gmail.com","created_at":"2026-03-05T18:23:37.090515Z","created_by":"bbuchsbaum","updated_at":"2026-03-05T15:40:00-05:00","labels":["bidser","prd","query","top5"],"dependencies":[{"issue_id":"bidser-g47.2","depends_on_id":"bidser-g47","type":"parent-child","created_at":"2026-03-05T18:23:37.091509Z","created_by":"bbuchsbaum"}]}
5-
{"id":"bidser-g47.3","title":"Add persistent index backend","description":"Introduce optional persistent indexing (SQLite/DuckDB) with load/build/refresh behavior for large datasets.","acceptance_criteria":"Persistent index file is created and reused; refresh handles modified files; indexed and non-indexed query results are identical; stale index falls back safely.","status":"open","priority":2,"issue_type":"feature","owner":"brad.buchsbaum@gmail.com","created_at":"2026-03-05T18:23:37.29082Z","created_by":"bbuchsbaum","updated_at":"2026-03-05T18:23:37.29082Z","labels":["bidser","performance","prd","top5"],"dependencies":[{"issue_id":"bidser-g47.3","depends_on_id":"bidser-g47","type":"parent-child","created_at":"2026-03-05T18:23:37.291769Z","created_by":"bbuchsbaum"},{"issue_id":"bidser-g47.3","depends_on_id":"bidser-g47.2","type":"blocks","created_at":"2026-03-05T18:23:37.923634Z","created_by":"bbuchsbaum"}]}
5+
{"id":"bidser-g47.3","title":"Add persistent index backend","description":"Introduce an optional persistent indexing backend using a versioned data.table-based manifest plus resolved metadata cache. The backend must preserve current user-facing query semantics while improving repeated file/entity and metadata lookups for large BIDS datasets.","acceptance_criteria":"Persistent index file is created and reused; incremental refresh handles added, changed, and deleted files; indexed and non-indexed query results are identical; resolved inherited metadata can be served from cache; stale or version-mismatched indexes fall back safely; existing tests continue to pass and new regression tests are added. Benchmark harness tracked separately in bidser-g47.3.7.","status":"in_progress","priority":2,"issue_type":"feature","owner":"brad.buchsbaum@gmail.com","created_at":"2026-03-05T18:23:37.29082Z","created_by":"bbuchsbaum","updated_at":"2026-03-30T22:24:00-04:00","labels":["bidser","performance","prd","top5"],"dependencies":[{"issue_id":"bidser-g47.3","depends_on_id":"bidser-g47","type":"parent-child","created_at":"2026-03-05T18:23:37.291769Z","created_by":"bbuchsbaum"},{"issue_id":"bidser-g47.3","depends_on_id":"bidser-g47.2","type":"blocks","created_at":"2026-03-05T18:23:37.923634Z","created_by":"bbuchsbaum"}]}
6+
{"id":"bidser-g47.3.1","title":"Define data.table index schema and backend seam","description":"Define the internal versioned index object for the data.table backend, including manifest schema, metadata cache schema, serialization contract, and backend boundary functions. Keep public APIs stable while isolating backend-specific logic.","acceptance_criteria":"A versioned internal index contract is documented in code/tests; backend-specific helpers exist behind a stable seam; no user-facing behavior changes yet.","status":"closed","priority":1,"issue_type":"task","owner":"brad.buchsbaum@gmail.com","created_at":"2026-03-30T21:55:14.445693-04:00","created_by":"bbuchsbaum","updated_at":"2026-03-30T22:24:00-04:00","closed_at":"2026-03-30T22:24:00-04:00"}
7+
{"id":"bidser-g47.3.2","title":"Implement file manifest build and versioned persistence","description":"Build a data.table-backed file manifest with parsed BIDS entities, scope, pipeline, extension, datatype, and file stats; persist and reload it from the project index file with schema-version checks.","acceptance_criteria":"bids_index() produces a versioned persisted manifest backed by data.table; reload works across sessions; schema mismatches trigger safe rebuild behavior.","status":"closed","priority":1,"issue_type":"task","owner":"brad.buchsbaum@gmail.com","created_at":"2026-03-30T21:55:14.456706-04:00","created_by":"bbuchsbaum","updated_at":"2026-03-30T22:24:00-04:00","closed_at":"2026-03-30T22:24:00-04:00"}
8+
{"id":"bidser-g47.3.3","title":"Add incremental refresh and file-level invalidation","description":"Replace coarse project-root mtime invalidation with file-level add/change/delete detection using path, size, and mtime, reparsing only affected rows unless a full rebuild is required.","acceptance_criteria":"Incremental refresh correctly handles added, changed, and deleted files; unchanged rows are reused; full rebuilds occur only for version/parser incompatibilities.","status":"closed","priority":1,"issue_type":"task","owner":"brad.buchsbaum@gmail.com","created_at":"2026-03-30T21:55:14.557132-04:00","created_by":"bbuchsbaum","updated_at":"2026-03-30T22:24:00-04:00","closed_at":"2026-03-30T22:24:00-04:00"}
9+
{"id":"bidser-g47.3.4","title":"Add resolved metadata cache for inherited sidecars","description":"Cache inheritance-resolved metadata for target files using the indexed sidecar inventory so repeated metadata access does not reread and merge JSON sidecars on every call.","acceptance_criteria":"Repeated get_metadata() calls can be served from cache; inheritance behavior remains correct; sidecar changes invalidate affected cached metadata rows.","status":"closed","priority":1,"issue_type":"task","owner":"brad.buchsbaum@gmail.com","created_at":"2026-03-30T21:55:14.559604-04:00","created_by":"bbuchsbaum","updated_at":"2026-03-30T22:24:00-04:00","closed_at":"2026-03-30T22:24:00-04:00"}
10+
{"id":"bidser-g47.3.5","title":"Add parity and regression tests for the data.table backend","description":"Add regression coverage that proves indexed and non-indexed query parity, cached metadata reuse, and cache invalidation for the data.table-backed index.","acceptance_criteria":"Representative raw and derivative fixtures show indexed and non-indexed query parity; repeated get_metadata() calls reuse cached merged metadata; sidecar edits invalidate cached metadata correctly.","status":"closed","priority":1,"issue_type":"task","owner":"brad.buchsbaum@gmail.com","created_at":"2026-03-30T21:55:14.562292-04:00","created_by":"bbuchsbaum","updated_at":"2026-03-30T22:24:00-04:00","closed_at":"2026-03-30T22:24:00-04:00"}
11+
{"id":"bidser-g47.3.6","title":"Move query_files() to the data.table fast path","description":"Use keyed/indexed data.table subsets for indexed file queries while preserving current regex, glob, scope, pipeline, and return semantics. Keep the tree-walk path available for use_index = never and debugging.","acceptance_criteria":"Indexed and non-indexed query_files() results are identical across representative raw and derivative fixtures; query_files() uses the data.table fast path when an index is available.","status":"closed","priority":1,"issue_type":"task","owner":"brad.buchsbaum@gmail.com","created_at":"2026-03-30T21:55:14.567624-04:00","created_by":"bbuchsbaum","updated_at":"2026-03-30T22:24:00-04:00","closed_at":"2026-03-30T22:24:00-04:00"}
12+
{"id":"bidser-g47.3.7","title":"Add benchmark harness for data.table index backend","description":"Add a repeatable benchmark harness that measures cold build, warm query_files(), and repeated get_metadata() performance for the data.table-backed index on representative synthetic or fixture datasets.","acceptance_criteria":"Benchmark script or target exists in-repo; it reports cold index build, warm indexed query, and warm metadata-cache timings; benchmark can be run locally without altering package semantics.","status":"open","priority":2,"issue_type":"task","owner":"brad.buchsbaum@gmail.com","created_at":"2026-03-30T22:19:54.120725-04:00","created_by":"bbuchsbaum","updated_at":"2026-03-30T22:19:54.120725-04:00"}
613
{"id":"bidser-g47.4","title":"Generalize derivatives support across pipelines","description":"Support discovery and selection of multiple derivative pipelines, not just fMRIPrep assumptions.","acceptance_criteria":"Pipelines are discoverable from derivatives metadata; users can query by pipeline; fMRIPrep behavior remains intact; at least one non-fMRIPrep fixture is supported.","status":"open","priority":2,"issue_type":"feature","owner":"brad.buchsbaum@gmail.com","created_at":"2026-03-05T18:23:37.459708Z","created_by":"bbuchsbaum","updated_at":"2026-03-05T18:23:37.459708Z","labels":["bidser","derivatives","prd","top5"],"dependencies":[{"issue_id":"bidser-g47.4","depends_on_id":"bidser-g47","type":"parent-child","created_at":"2026-03-05T18:23:37.460803Z","created_by":"bbuchsbaum"},{"issue_id":"bidser-g47.4","depends_on_id":"bidser-g47.3","type":"blocks","created_at":"2026-03-05T18:23:38.066283Z","created_by":"bbuchsbaum"}]}
714
{"id":"bidser-g47.5","title":"Add variables and stats-model workflow layer","description":"Add higher-level variables/model APIs that produce model-ready run/session tables from events and confounds.","acceptance_criteria":"Variable loader returns tidy run-level tables; model spec ingestion produces expected schema; end-to-end example runs in CI with deterministic output.","status":"open","priority":2,"issue_type":"feature","owner":"brad.buchsbaum@gmail.com","created_at":"2026-03-05T18:23:37.646099Z","created_by":"bbuchsbaum","updated_at":"2026-03-05T18:23:37.646099Z","labels":["bidser","modeling","prd","top5"],"dependencies":[{"issue_id":"bidser-g47.5","depends_on_id":"bidser-g47","type":"parent-child","created_at":"2026-03-05T18:23:37.647053Z","created_by":"bbuchsbaum"},{"issue_id":"bidser-g47.5","depends_on_id":"bidser-g47.4","type":"blocks","created_at":"2026-03-05T18:23:38.192062Z","created_by":"bbuchsbaum"}]}

.beads/metadata.json

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
{
2+
"database": "beads.db",
3+
"jsonl_export": "issues.jsonl"
4+
}

DESCRIPTION

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ ByteCompile: true
1616
Roxygen: list(markdown = TRUE)
1717
RoxygenNote: 7.3.3
1818
Imports:
19+
data.table,
1920
stringr,
2021
data.tree,
2122
neuroim2,

R/bids.R

Lines changed: 9 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -772,39 +772,27 @@ bids_project <- function(path=".", fmriprep=FALSE, prep_dir="derivatives/fmripre
772772
requested_fmriprep = isTRUE(fmriprep),
773773
index_path = .bidser_project_index_path(path, index_path),
774774
index = NULL,
775+
index_state = NULL,
775776
has_index = FALSE,
776777
index_mode = index,
777778
has_sessions=has_sessions)
778779

780+
class(ret) <- "bids_project"
781+
779782
if (identical(index, "auto")) {
780-
idx_path <- ret$index_path
781-
if (file.exists(idx_path)) {
782-
obj <- tryCatch(readRDS(idx_path), error = function(e) NULL)
783-
# Support both old (bare tibble) and new (list with mtime) formats
784-
idx <- NULL
785-
stale <- FALSE
786-
if (is.data.frame(obj)) {
787-
idx <- obj
788-
} else if (is.list(obj) && is.data.frame(obj$index)) {
789-
if (!is.null(obj$mtime)) {
790-
current_mtime <- .bidser_dir_mtime(path)
791-
stale <- !identical(as.numeric(current_mtime), as.numeric(obj$mtime))
792-
}
793-
if (!stale) idx <- obj$index
794-
}
795-
if (!is.null(idx) && !stale) {
796-
ret$index <- tibble::as_tibble(idx)
797-
ret$has_index <- TRUE
798-
}
783+
state <- .bidser_load_cached_index_state(ret, refresh = TRUE, persist = TRUE)
784+
if (!is.null(state)) {
785+
ret$index_state <- state
786+
ret$index <- .bidser_index_state_manifest_tibble(state)
787+
ret$has_index <- TRUE
799788
}
800789
}
801790

802-
class(ret) <- "bids_project"
803-
804791
if (identical(index, "auto") && !isTRUE(ret$has_index)) {
805792
idx <- tryCatch(bids_index(ret, rebuild = TRUE, persist = TRUE), error = function(e) NULL)
806793
if (is.data.frame(idx)) {
807794
ret$index <- tibble::as_tibble(idx)
795+
ret$index_state <- .bidser_load_cached_index_state(ret, refresh = FALSE, persist = FALSE)
808796
ret$has_index <- TRUE
809797
}
810798
}

0 commit comments

Comments
 (0)