openalexPro · rkrug · Jun 3, 2026 · May 22, 2026
diff --git a/.Rbuildignore b/.Rbuildignore
@@ -2,6 +2,7 @@
 ^vignettes/.*_files$
 ^IMPLEMENTATION_NOTES\.md$
 ^DEVELOPMENT_CONTINUITY\.md$
+^NOTES\.md$
 ^doc$
 ^Meta$
 ^inst/ASR$

diff --git a/CLAUDE.md b/CLAUDE.md
@@ -0,0 +1,83 @@
+# CLAUDE.md
+
+This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository.
+
+## Project Overview
+
+`openalexVectorComp` is an R package for text embedding generation and distance/score-based corpus comparison. It is **R-first** with file-based artifacts (Parquet/YAML/JSON) as first-class outputs — functions are designed to compose in plain R workflows without hidden services.
+
+Current version: see `DESCRIPTION` (treat it as the source of truth, not docs).
+
+## Common Commands
+
+Run from the package root in R:
+
+```r
+# Install local source
+devtools::install_local(".")
+
+# Document (regenerate man/ and NAMESPACE from roxygen)
+devtools::document()
+
+# Run all tests
+devtools::test()
+
+# Run a single test file
+devtools::test(filter = "name-without-test-prefix-or-.R")
+# or: testthat::test_file("tests/testthat/test-<name>.R")
+
+# Full R CMD check (mirrors CI)
+devtools::check(args = c("--no-manual", "--no-multiarch", "--no-examples", "--ignore-vignettes"))
+
+# Build pkgdown site (output: _site/)
+pkgdown::build_site()
+```
+
+CI (`.github/workflows/pr-checks.yml`) runs `R CMD check` across Ubuntu (release/oldrel-1/devel), macOS, and Windows, **skipping vignette execution and examples**. Match this locally before committing.
+
+## Architecture
+
+### Backend abstraction
+`R/backend_core.R` exposes a provider-neutral interface (`backend_config()`, `backend_info()`, `backend_embed_texts()`, `backend_read()`, `backend_save()`). It dispatches on `provider` ∈ {`hf`, `openai`, `tei`} to internal `.embedding_*` functions in:
+
+- `R/backend_hf.R` — HuggingFace Inference router
+- `R/backend_openai.R` — OpenAI embeddings API
+- `R/backend_tei.R` — local Text Embeddings Inference server
+
+Auth for hosted backends uses the `OVC_API_TOKEN` env var (bearer token). Backend configs serialize to/from YAML (`embed_model.yaml`); `backend_read()` also accepts a legacy nested format — preserve that path when editing.
+
+### Pipeline (sync)
+1. `embed_corpus()` — prepare/clean input texts (supports `dry_run = TRUE`).
+2. `embed_texts()` / `backend_embed_texts()` — generate embeddings.
+3. Distances:
+   - `distance_reference_cosine()` — full pairwise cosine matrix with centroid axes, written as a single parquet under `distance_reference_cosine/model_id=.../corpus_label=.../reference_label=.../pairwise-cosine.parquet`. First column `id` includes a `"centroid"` row; one `"centroid"` column on the reference side.
+   - `distance_ridge()` — reference-area distance.
+4. Scores: `score_reference_cosine()` (methods: `"linear"`, `"exponential"`), `score_ridge()`.
+5. Optional: `calibrate_threshold()`.
+
+### OpenAI Batch (async, explicit three-phase)
+`batch_submit_openai()` → `batch_status_openai()` → `batch_collect_openai()`. **Pending state is an expected, non-fatal outcome.** Jobs are auto-split by size/count; a single oversized request line is a hard error. Helpers live in `R/batch_openai_helpers.R` and `R/batch_openai_http.R`.
+
+The demo wrapper `demo_finalize_openai_batch()` = status + collect + direct-vs-batch comparison, with persistent outputs at `project/openai_batch_comparison/label=<label>/`.
+
+### Demos
+`R/demo_run.R` provides `run_demo_openalex()` and `run_demo_openai()`. Default demo directories are `demos/openalex` and `demos/openai`. Quarto and YAML live in `demo_dir`; pipeline artifacts live under `demo_dir/project/`. `run_demo_openai(..., render = TRUE)` may return before the batch completes — this is intentional; finalize separately.
+
+## Continuity Documents (Required Workflow)
+
+Three docs have non-overlapping roles. Update the appropriate one(s) **before every commit that changes behavior**:
+
+- `DEVELOPMENT_CONTINUITY.md` — design principles, decision log, pre-commit checklist. Read this first when resuming work. Add a Decision Log entry for non-trivial changes.
+- `IMPLEMENTATION_NOTES.md` — what was shipped per release (engineering log).
+- `NEWS.md` — user-facing release notes.
+
+Keep README and vignette commands aligned with actual defaults (paths, async behavior, API names).
+
+## Conventions / Gotchas
+
+- **Don't hard-fail on pending OpenAI batch jobs** — async pending is a normal state in submit/status/collect and demo flows.
+- **Backend YAML compatibility**: support both the current flat format and the legacy nested (`meta$backend$...`, `meta$model$requested_id`) format in any reader changes.
+- **File-based artifacts are the contract.** Parquet layout includes Hive-style partition dirs (`model_id=.../corpus_label=.../reference_label=...`). Don't change these paths casually — downstream demos/tests assume them.
+- Tiny numeric drift between direct and batch OpenAI embeddings is expected (serialization/floating-point). Cosine ≈ 1.0 is not a regression.
+- pkgdown site builds to `_site/` (configured in `_pkgdown.yml`).
+- `.Rbuildignore` excludes `demos/`, `_site/`, continuity docs, `NOTES.md`, and `inst/qdrant functions/` — don't expect those in the built package.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,7 +1,7 @@
 Package: openalexVectorComp
 Type: Package
 Title: Embedding Vectorization and Distance-Based Scoring Workflows
-Version: 0.3.2
+Version: 0.3.3
 Authors@R: c(
     person(given = "Rainer M", family = "Krug", role = c("aut", "cre"), email = "Rainer@krugs.de"),
     person(given = "ChatGPT", family = "Assistant", role = "ctb")
@@ -43,3 +43,4 @@ URL: https://github.com/openalexPro/openalexVectorComp, https://openalexpro.gith
 BugReports: https://github.com/openalexPro/openalexVectorComp/issues
 RoxygenNote: 7.3.3
 Roxygen: list(markdown = TRUE)
+Config/roxygen2/version: 8.0.0
diff --git a/DEVELOPMENT_CONTINUITY.md b/DEVELOPMENT_CONTINUITY.md
@@ -86,6 +86,27 @@ Template:
   outputs.
 - Follow-up tasks: keep tutorial narrative and command output examples aligned.
 
+- Date: 2026-05-22
+- Scope: SPECTER2 support for academic paper topic comparison
+- Decision: Add SPECTER2 proximity model support via a documented one-time
+  merge step (Python script) + local TEI serving, exposed through a thin
+  R helper `backend_specter2_tei()`. Setup material lives in
+  `inst/scripts/` and `vignettes/specter2-setup.qmd`.
+- Why: Domain-specific model gives better topic separation than general
+  OpenAI embeddings for academic corpora, at 768 dim and zero per-token cost.
+  TEI cannot load adapter-transformers adapters, so a one-time merge is
+  required. Keeping the merge out of the R API surface avoids forcing a
+  Python dependency on this R-first package.
+- Alternatives considered: bundling the merge as an R function (rejected:
+  Python toolchain dependency, unstable `adapters` library API, conflicts
+  with the principle that the package does not manage external services);
+  shipping the merged model itself (rejected: ~500 MB).
+- Impact: Users can opt into SPECTER2 with one merge + one TEI launch; the
+  rest of the pipeline is unchanged.
+- Follow-up tasks: revisit if the `adapters` library API shifts again; add a
+  benchmark comparing SPECTER2 vs `text-embedding-3-small` once we have a
+  reference corpus.
+
 - Date: 2026-04-01
 - Scope: Demo organization and docs
 - Decision: Default demo directories changed to `demos/<provider>`.

diff --git a/NAMESPACE b/NAMESPACE
@@ -5,6 +5,7 @@ export(backend_embed_texts)
 export(backend_info)
 export(backend_read)
 export(backend_save)
+export(backend_specter2_tei)
 export(batch_collect_openai)
 export(batch_status_openai)
 export(batch_submit_openai)

diff --git a/NEWS.md b/NEWS.md
@@ -1,5 +1,15 @@
 # openalexVectorComp news
 
+## 0.3.3
+
+### New Features
+
+- Added `backend_specter2_tei()` convenience helper for serving the merged
+  SPECTER2 proximity model via a local TEI server.
+- Added `specter2-setup` vignette, plus `inst/scripts/prepare_specter2_merged.py`
+  and `inst/scripts/start_tei_specter2.sh` for one-time model preparation and
+  serving.
+
 ## 0.3.0
 
 ### New Features

diff --git a/NOTES.md b/NOTES.md
@@ -0,0 +1,59 @@
+# Notes
+
+Scratchpad for ideas and follow-ups that are not yet decisions. Excluded from
+the built package via `.Rbuildignore`.
+
+## Publishing the merged SPECTER2 model to HuggingFace Hub
+
+Currently, users must run `inst/scripts/prepare_specter2_merged.py` themselves
+to produce the merged `specter2_base` + proximity-adapter model directory that
+TEI can serve. This requires a Python environment with `transformers` +
+`adapters`, ~500 MB on disk, and a few minutes of compute.
+
+If SPECTER2 becomes a first-class supported backend, the cleaner path is to
+publish the merged model to HuggingFace Hub once and let users pull it
+directly.
+
+### Why not bundle the weights in the package
+
+- CRAN size limit is 5 MB; merged model is ~500 MB. Hard no.
+- Off-CRAN bundling still bloats git history, slows `install_local()`, and
+  couples model release cadence to code release cadence. R packages are source
+  code, not weight stores.
+
+### Proposed workflow
+
+1. Run the merge once locally.
+2. Publish:
+   ```bash
+   huggingface-cli login
+   huggingface-cli upload <org>/specter2_proximity_merged \
+     ~/Library/Caches/org.R-project.R/R/openalexVectorComp/specter2_proximity_merged
+   ```
+3. Users skip the merge entirely:
+   ```bash
+   text-embeddings-router --model-id <org>/specter2_proximity_merged --port 8080
+   ```
+4. `inst/scripts/prepare_specter2_merged.py` stays as a reproducibility
+   artifact (how we built it) but is no longer required for setup.
+5. `inst/scripts/start_tei_specter2.sh` defaults `--model-id` to the published
+   repo when no local merged model is found.
+6. `backend_specter2_tei()` default `model` arg updates to the published id.
+7. Vignette `specter2-setup.qmd` collapses to: install TEI, run one command,
+   use from R.
+
+### Open questions
+
+- Which HF org/account hosts the published model? Personal vs project org.
+- License compatibility — SPECTER2 base + adapter licenses (verify Apache-2.0
+  on both before redistributing the merged weights).
+- Versioning: tag the HF repo on each re-merge; pin the tag in
+  `backend_specter2_tei()` for reproducibility.
+
+### Middle-ground alternative
+
+If publishing to HF Hub is undesirable, host the merged tarball on GitHub
+Releases (2 GB per-asset limit) or S3, and have an R function lazy-download
+into `tools::R_user_dir("openalexVectorComp", "cache")` on first use.
+Packages like `piggyback` automate the GitHub Releases case. Keeps weights
+out of the package source either way.
diff --git a/R/backend_core.R b/R/backend_core.R
@@ -70,6 +70,43 @@ backend_config <- function(
   cfg
 }
 
+#' Backend preset for a local TEI server serving the merged SPECTER2 proximity model
+#'
+#' Convenience wrapper around [backend_config()] for a SPECTER2 setup served by
+#' a local TEI (text-embeddings-inference) server. The model itself must be
+#' prepared and started externally (see `inst/scripts/prepare_specter2_merged.py`
+#' and `inst/scripts/start_tei_specter2.sh`, and the `specter2-setup` vignette).
+#'
+#' The `model` argument is metadata only and is recorded in `embed_model.yaml`
+#' and parquet partition paths; TEI itself loads whichever model it was started
+#' with. The endpoint is always built as `http://<host>:<port>/embed`.
+#'
+#' @param port Port TEI listens on: one whole number in 1..65535. Defaults to `8080`.
+#' @param host Host for TEI: one non-empty, non-`NA` string. Defaults to `"localhost"`.
+#' @param model Provenance label for the served model. Defaults to
+#'   `"allenai/specter2_proximity_merged"`.
+#'
+#' @return A backend configuration list compatible with [backend_config()].
+#' @export
+backend_specter2_tei <- function(
+  port = 8080L,
+  host = "localhost",
+  model = "allenai/specter2_proximity_merged"
+) {
+  # Check as double: non-numeric -> NA (error, not warning); no silent truncation.
+  port_num <- suppressWarnings(as.numeric(port))
+  if (length(port_num) != 1L || !is.finite(port_num) ||
+      port_num != trunc(port_num) || port_num < 1 || port_num > 65535) {
+    stop("`port` must be a positive integer between 1 and 65535.")
+  }
+  # nzchar(NA) is TRUE, so NA needs its own check to avoid "http://NA:<port>".
+  if (!is.character(host) || length(host) != 1L || is.na(host) || !nzchar(host)) {
+    stop("`host` must be a non-empty string.")
+  }
+  url <- sprintf("http://%s:%d/embed", host, as.integer(port_num))
+  backend_config(provider = "tei", tei_url = url, model = model)
+}
+
 #' Get embedding backend model/service information
 #'
 #' Returns normalized backend metadata used by the pipeline.

diff --git a/README.md b/README.md
@@ -60,6 +60,7 @@ Start with `vignettes/simplestart.qmd`, then see:
 - `vignettes/package-overview.qmd`
 - `vignettes/openai-batch-async.qmd`
 - `vignettes/abstract-cleaning.qmd`
+- `vignettes/specter2-setup.qmd` (academic-paper embeddings via TEI)
 
 ## Run a Local Demo Project
 

diff --git a/inst/scripts/prepare_specter2_merged.py b/inst/scripts/prepare_specter2_merged.py
@@ -0,0 +1,90 @@
+#!/usr/bin/env python3
+"""
+Merge the SPECTER2 proximity adapter into the base encoder and save a standalone
+HuggingFace-format model directory that TEI (text-embeddings-inference) can serve.
+
+One-time setup. Run once per machine. Output directory is then passed to
+text-embeddings-router via --model-id.
+
+Usage:
+    python prepare_specter2_merged.py [--out-dir PATH]
+
+Environment overrides:
+    OVC_SPECTER2_PATH   Output directory (takes precedence over --out-dir default).
+
+Dependencies:
+    pip install transformers adapters torch
+"""
+
+import argparse
+import os
+import sys
+from pathlib import Path
+
+
+DEFAULT_SUBDIR = "specter2_proximity_merged"  # leaf dir appended to the per-user cache path in default_out_dir()
+
+
+def default_out_dir() -> Path:
+    env = os.environ.get("OVC_SPECTER2_PATH")
+    if env:
+        return Path(env).expanduser()
+    if sys.platform == "darwin":
+        base = Path.home() / "Library" / "Caches" / "org.R-project.R" / "R" / "openalexVectorComp"
+    elif sys.platform.startswith("linux"):
+        xdg = os.environ.get("XDG_CACHE_HOME")
+        base = Path(xdg).expanduser() if xdg else Path.home() / ".cache"
+        base = base / "R" / "openalexVectorComp"
+    else:
+        base = Path.home() / "AppData" / "Local" / "R" / "cache" / "R" / "openalexVectorComp"
+    return base / DEFAULT_SUBDIR
+
+
+def main() -> int:
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument(
+        "--out-dir",
+        default=None,
+        help="Destination for the merged model. Defaults to a per-user cache path.",
+    )
+    args = parser.parse_args()
+
+    out_dir = Path(args.out_dir).expanduser() if args.out_dir else default_out_dir()
+    out_dir.mkdir(parents=True, exist_ok=True)
+
+    try:
+        from transformers import AutoTokenizer
+        from adapters import AutoAdapterModel
+    except ImportError as e:
+        print(
+            "Missing Python dependency. Install with:\n"
+            "    pip install transformers adapters torch\n"
+            f"Original error: {e}",
+            file=sys.stderr,
+        )
+        return 2
+
+    base_id = "allenai/specter2_base"
+    adapter_id = "allenai/specter2"
+    adapter_name = "proximity"
+
+    print(f"Loading base model: {base_id}")
+    tokenizer = AutoTokenizer.from_pretrained(base_id)
+    model = AutoAdapterModel.from_pretrained(base_id)
+
+    print(f"Loading adapter: {adapter_id} ({adapter_name})")
+    model.load_adapter(adapter_id, source="hf", load_as=adapter_name, set_active=True)
+
+    print(f"Merging adapter '{adapter_name}' into base weights")
+    model.merge_adapter(adapter_name)
+
+    print(f"Saving merged model + tokenizer to: {out_dir}")
+    model.save_pretrained(str(out_dir))
+    tokenizer.save_pretrained(str(out_dir))
+
+    print(str(out_dir))
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/inst/scripts/start_tei_specter2.sh b/inst/scripts/start_tei_specter2.sh
@@ -0,0 +1,35 @@
+#!/usr/bin/env bash
+# Start a local TEI server against the merged SPECTER2 proximity model.
+#
+# Run prepare_specter2_merged.py first to produce the merged model directory.
+#
+# Environment overrides:
+#   OVC_SPECTER2_PATH   Path to merged model dir (default: per-user cache).
+#   OVC_TEI_PORT        Port for text-embeddings-router (default: 8080).
+
+set -euo pipefail
+
+default_cache_dir() {  # print the OS-specific default merged-model directory
+  local leaf="R/openalexVectorComp/specter2_proximity_merged"
+  case "$(uname -s)" in
+    Darwin) printf '%s\n' "${HOME}/Library/Caches/org.R-project.R/${leaf}" ;;
+    Linux)  printf '%s\n' "${XDG_CACHE_HOME:-${HOME}/.cache}/${leaf}" ;;
+    *)      printf '%s\n' "${HOME}/.cache/${leaf}" ;;
+  esac
+}
+
+# Environment overrides win; defaults apply only when a variable is unset or empty.
+MODEL_PATH="${OVC_SPECTER2_PATH:-$(default_cache_dir)}"
+PORT="${OVC_TEI_PORT:-8080}"
+
+# config.json is used as the marker that the merge step produced a model directory.
+if [[ ! -f "${MODEL_PATH}/config.json" ]]; then
+  printf '%s\n' "Merged SPECTER2 model not found at: ${MODEL_PATH}" \
+    "Run inst/scripts/prepare_specter2_merged.py first." >&2
+  exit 1
+elif ! command -v text-embeddings-router >/dev/null 2>&1; then
+  printf '%s\n' "text-embeddings-router not on PATH. See vignettes/tei-server-operations.qmd." >&2
+  exit 1
+fi
+printf '%s\n' "Serving ${MODEL_PATH} on port ${PORT}"
+exec text-embeddings-router --model-id "${MODEL_PATH}" --port "${PORT}"