diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml index d6d306a..60ddd05 100644 --- a/.github/workflows/docker.yml +++ b/.github/workflows/docker.yml @@ -10,7 +10,6 @@ on: env: REGISTRY: ghcr.io IMAGE_NAME: ${{ github.repository }} - FORCE_JAVASCRIPT_ACTIONS_TO_NODE24: true jobs: build-backend: @@ -20,7 +19,7 @@ jobs: packages: write steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v6 - name: Set up Docker Buildx uses: docker/setup-buildx-action@v3 @@ -59,7 +58,7 @@ jobs: packages: write steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v6 - name: Set up Docker Buildx uses: docker/setup-buildx-action@v3 diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml index 2990e78..9b3eca3 100644 --- a/.github/workflows/docs.yml +++ b/.github/workflows/docs.yml @@ -4,9 +4,6 @@ on: push: pull_request: -env: - FORCE_JAVASCRIPT_ACTIONS_TO_NODE24: true - jobs: docs: runs-on: ubuntu-22.04 @@ -17,9 +14,9 @@ jobs: poetry-version: ["2.1.0"] steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v6 - - uses: actions/setup-python@v5 + - uses: actions/setup-python@v6 with: python-version: ${{ matrix.python-version }} diff --git a/.github/workflows/integration.yml b/.github/workflows/integration.yml index 61d5b39..142365c 100644 --- a/.github/workflows/integration.yml +++ b/.github/workflows/integration.yml @@ -5,9 +5,6 @@ on: branches: [main] pull_request: -env: - FORCE_JAVASCRIPT_ACTIONS_TO_NODE24: true - jobs: integration: runs-on: ubuntu-22.04 @@ -33,9 +30,9 @@ jobs: --health-retries 5 steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v6 - - uses: actions/setup-python@v5 + - uses: actions/setup-python@v6 with: python-version: ${{ matrix.python-version }} diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index a757709..ae0079d 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -4,9 +4,6 @@ on: push: pull_request: -env: - 
FORCE_JAVASCRIPT_ACTIONS_TO_NODE24: true - jobs: lint: runs-on: ubuntu-22.04 @@ -17,9 +14,9 @@ jobs: poetry-version: ["2.1.0"] steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v6 - - uses: actions/setup-python@v5 + - uses: actions/setup-python@v6 with: python-version: ${{ matrix.python-version }} diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index b1ed5d4..db75c06 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -4,9 +4,6 @@ on: push: pull_request: -env: - FORCE_JAVASCRIPT_ACTIONS_TO_NODE24: true - jobs: test: runs-on: ubuntu-22.04 @@ -17,9 +14,9 @@ jobs: poetry-version: ["2.1.0"] steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v6 - - uses: actions/setup-python@v5 + - uses: actions/setup-python@v6 with: python-version: ${{ matrix.python-version }} diff --git a/Dockerfile b/Dockerfile index 0df0128..17c1ecc 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,4 +1,5 @@ -FROM python:3.12-slim +# ── Stage 1: build dependencies ────────────────────────────────────────────── +FROM python:3.12-slim AS builder RUN apt-get update && apt-get install -y \ build-essential \ @@ -8,16 +9,30 @@ RUN apt-get update && apt-get install -y \ WORKDIR /app -# Install Poetry and dependencies first (layer cache) RUN pip install --no-cache-dir poetry==2.1.0 COPY pyproject.toml poetry.lock ./ RUN poetry config virtualenvs.create false \ && poetry install --without dev --no-root --no-interaction --no-ansi -# Copy source COPY protea/ ./protea/ RUN poetry install --without dev --no-interaction --no-ansi + +# ── Stage 2: runtime ──────────────────────────────────────────────────────── +FROM python:3.12-slim + +RUN apt-get update && apt-get install -y \ + libpq5 \ + && rm -rf /var/lib/apt/lists/* + +WORKDIR /app + +# Copy installed packages from builder +COPY --from=builder /usr/local/lib/python3.12/site-packages /usr/local/lib/python3.12/site-packages +COPY --from=builder /usr/local/bin /usr/local/bin + +# Copy 
application code +COPY protea/ ./protea/ COPY scripts/ ./scripts/ COPY alembic/ ./alembic/ COPY alembic.ini ./ @@ -25,7 +40,10 @@ COPY alembic.ini ./ ENV PYTHONUNBUFFERED=1 EXPOSE 8000 +HEALTHCHECK --interval=30s --timeout=5s --retries=3 \ + CMD python -c "import urllib.request; urllib.request.urlopen('http://localhost:8000/health')" || exit 1 + # Default: API server # Override CMD to run a worker: # docker run protea python scripts/worker.py --queue protea.jobs -CMD ["uvicorn", "protea.api.app:app", "--host", "0.0.0.0", "--port", "8000"] +CMD ["uvicorn", "protea.api.app:create_app", "--factory", "--host", "0.0.0.0", "--port", "8000"] diff --git a/RERANKER.md b/RERANKER.md new file mode 100644 index 0000000..2301546 --- /dev/null +++ b/RERANKER.md @@ -0,0 +1,188 @@ +# Temporal Holdout Re-Ranker for GO Term Prediction + +## Motivación + +El pipeline actual de PROTEA transfiere anotaciones GO mediante KNN sobre embeddings ESM, usando un scoring heurístico que combina distancia de embedding y pesos de evidencia. Este scoring no está optimizado para la métrica objetivo (Fmax) ni para el comportamiento real de las anotaciones GO a lo largo del tiempo. + +La hipótesis central es que existe una señal aprendible: **dado el contexto de una predicción KNN, ¿acabará este GO term apareciendo en el siguiente release de GOA para esta proteína?** Esta señal puede extraerse directamente del mecanismo de holdout temporal que ya implementa PROTEA. + +--- + +## Formulación del Problema + +Sea $\mathcal{G}_N$ el conjunto de anotaciones GO en el release $N$ de GOA (Swiss-Prot reviewed). 
Para cada par consecutivo $(G_N, G_{N+1})$, el delta temporal es: + +$$\Delta_{N \to N+1} = \{(p, t) \mid (p, t) \in \mathcal{G}_{N+1} \setminus \mathcal{G}_N\}$$ + +El re-ranker aprende una función: + +$$f(q, t, \mathcal{N}_K(q)) \to \hat{y} \in [0, 1]$$ + +donde: +- $q$ es la proteína query (representada por su embedding ESM) +- $t$ es el GO term candidato +- $\mathcal{N}_K(q)$ es el conjunto de $K$ vecinos más cercanos en el espacio de embeddings con referencia $\mathcal{G}_N$ +- $\hat{y}$ es la probabilidad de que $(q, t) \in \Delta_{N \to N+1}$ + +--- + +## Protocolo de Entrenamiento + +Se utiliza validación cruzada temporal con múltiples splits históricos de GOA: + +``` +Training splits: + GOA_190 → GOA_195 + GOA_195 → GOA_200 + GOA_200 → GOA_205 + GOA_205 → GOA_211 + GOA_211 → GOA_215 + GOA_215 → GOA_220 + +Test split (holdout estricto, nunca visto durante training): + GOA_220 → GOA_229 +``` + +Para cada split se generan ejemplos etiquetados: positivos $(y=1)$ si el par (proteína, GO term) aparece en el delta, negativos $(y=0)$ en caso contrario. El desbalanceo esperado es aproximadamente 1:10, manejable con técnicas estándar. + +--- + +## Arquitectura: Cross-Attention Re-Ranker + +El modelo procesa cada par (query, GO term) usando el contexto completo de los vecinos KNN que contribuyeron a esa predicción. + +``` +Inputs por predicción (query_protein, go_term): + query_embedding float32[D] ESM embedding del query (D=480 para esmc_300m) + neighbor_embeddings float32[K × D] ESM embeddings de los K vecinos contribuyentes + tabular_features float32[K × F] distancia, evidencia, alineamiento, taxonomía... + go_term_embedding float32[G] embedding semántico del GO term (G=64) + +Arquitectura: + 1. query_proj(query_embedding) → q [H=256] + 2. ref_proj(neighbor_embeddings) → tokens [K × H] + 3. feature_encoder(tabular_features) → (sumado a tokens) + 4. CrossAttention(q, tokens, tokens) → context [H] + 5. 
MLP([q ‖ context ‖ go_emb ‖ agg_features]) → score [1] +``` + +La atención cruzada permite al modelo aprender **qué vecinos son más informativos para este query concreto**, en lugar de agregar los scores de forma heurística. + +### GO Term Embeddings + +Los embeddings de los GO terms se aprenden a partir de la estructura del DAG de GO (relaciones `is_a` / `part_of`) mediante Node2Vec o TransE, de forma que términos semánticamente relacionados (padre-hijo) tengan representaciones similares. El DAG ya está disponible en PROTEA a través de los modelos `GOTerm` y `GOTermRelationship`. + +--- + +## Feature Vector + +Cada predicción (query, GO term) se caracteriza por las siguientes features tabulares, computadas por vecino que contribuyó a la predicción: + +| Feature | Descripción | Estado | +|---|---|---| +| `distance` | Distancia coseno en espacio de embeddings | Existente | +| `evidence_weight` | Peso del código de evidencia (IDA > IEA) | Existente | +| `identity_nw / sw` | Identidad de secuencia (alineamiento NW/SW) | Existente (opcional) | +| `similarity_nw / sw` | Similaridad de secuencia | Existente (opcional) | +| `taxonomic_distance` | Distancia taxonómica entre query y referencia | Existente (opcional) | +| `vote_count` | Número de vecinos que coinciden en este GO term | **Nuevo** | +| `k_position` | Posición del vecino más cercano que predijo este término | **Nuevo** | +| `go_term_frequency` | Frecuencia del término en el annotation set de referencia | **Nuevo** | +| `ref_annotation_density` | Número de GO terms de la proteína de referencia | **Nuevo** | +| `neighbor_distance_std` | Varianza de distancias a los K vecinos | **Nuevo** | + +--- + +## Función de Pérdida + +Se utiliza **LambdaRank** en lugar de binary cross-entropy, ya que optimiza directamente el orden de las predicciones (proxy de NDCG / Fmax) en lugar de la calibración de probabilidades. 
+ +Para cada proteína query, las predicciones GO se rankean conjuntamente: +- Positivos: GO terms en $\Delta_{N \to N+1}$ +- Negativos: GO terms predichos pero no en el delta + +--- + +## Pipeline de Datos: WebDataset + +El volumen de datos (múltiples splits × ~1.35M predicciones por split × embeddings de 480 dim) requiere un pipeline de datos eficiente. Se propone almacenar los ejemplos de entrenamiento en formato **WebDataset** (shards tar), con un shard por split GOA: + +``` +reranker_data/ + splits/ + goa190_to_195.tar # ~2GB por shard + goa195_to_200.tar + ... + goa220_to_229.tar # test split — no tocar durante training + models/ + reranker_v1.pt + reranker_v1_config.json +``` + +Cada muestra en el WebDataset es **una proteína query** con todas sus predicciones GO para ese split: + +```python +{ + "query_accession": "P12345", + "query_embedding": float32[480], + "go_term_ids": ["GO:0006915", "GO:0005737", ...], # N_preds + "neighbor_embeddings": float32[N_preds, K, 480], + "tabular_features": float32[N_preds, K, F], + "labels": int8[N_preds], # 1 si en delta, 0 si no +} +``` + +El streaming de WebDataset permite entrenar sin cargar todo en RAM. + +--- + +## Stack Tecnológico + +| Componente | Tecnología | +|---|---| +| Modelo | PyTorch | +| Data pipeline | WebDataset + torch.utils.data | +| Baseline comparación | LightGBM (binary + LambdaRank) | +| GO embeddings | Node2Vec / PyTorch Geometric | +| Seguimiento experimentos | wandb | +| Embeddings proteína | ESM2 / ESMC (ya en PROTEA) | + +--- + +## Integración en PROTEA + +Una vez entrenado, el re-ranker se integra en el pipeline existente: + +1. Nuevo modelo ORM `RerankingModel`: almacena pesos serializados y metadata de entrenamiento +2. Campo `reranker_id` (nullable) en `PredictionSet` +3. Si `reranker_id` presente: `store_predictions` aplica el modelo y sobreescribe `score` con $\hat{y}$ +4. El threshold de Fmax se calcula igual que ahora sobre los nuevos scores +5. 
UI: selector de re-ranker en la pantalla de predicción + +--- + +## Experimentos y Ablaciones + +El diseño permite comparar directamente: + +| Configuración | Descripción | +|---|---| +| **Baseline** | KNN + scoring heurístico actual | +| **LightGBM tabular** | Re-ranker con features tabulares sin embeddings | +| **LightGBM + derived** | Features tabulares + features derivadas del embedding (density, std) | +| **MLP cross-encoder** | Arquitectura completa sin cross-attention | +| **Cross-attention (propuesto)** | Arquitectura completa | +| **+ GO DAG embeddings** | Ablación: ¿aportan los go_term_emb? | +| **+ temporal CV** | Ablación: ¿mejora añadir más splits históricos? | + +La métrica principal es **Fmax promedio sobre los 9 settings** (NK/LK/PK × BPO/MFO/CCO) en el test split GOA220→229. + +--- + +## Valor para la Tesis + +1. **Científicamente honesto**: el mismo mecanismo temporal que se usa para evaluar se usa para entrenar. No hay data leakage. +2. **Comprobable y cuantificable**: Fmax(baseline KNN) vs Fmax(re-ranker) en benchmark idéntico. +3. **Interpretable**: las feature importances (LightGBM) o los pesos de atención (cross-attention) revelan qué aspectos de una predicción KNN son más predictivos de anotaciones futuras. +4. **Generalizable**: el re-ranker aprende sobre distribuciones temporales de anotaciones GO, no sobre una proteína concreta — debería generalizar a proteínas no vistas. +5. **Extensible**: la arquitectura admite incorporar embeddings de secuencia de mayor calidad (ESM3, ProstT5) sin cambiar el pipeline. diff --git a/alembic/env.py b/alembic/env.py index 6cb72b8..ba6ce44 100644 --- a/alembic/env.py +++ b/alembic/env.py @@ -1,8 +1,7 @@ from logging.config import fileConfig from pathlib import Path -from sqlalchemy import engine_from_config -from sqlalchemy import pool +from sqlalchemy import engine_from_config, pool from alembic import context @@ -17,14 +16,14 @@ # Wire PROTEA's ORM metadata so autogenerate works. 
# All model modules must be imported before Base.metadata is used. -from protea.infrastructure.orm.base import Base -import protea.infrastructure.orm.models # noqa: F401 — registers all mappers +import protea.infrastructure.orm.models # noqa: E402, F401 — registers all mappers +from protea.infrastructure.orm.base import Base # noqa: E402 target_metadata = Base.metadata # Override the DB URL from PROTEA's settings rather than relying on the # placeholder value in alembic.ini. -from protea.infrastructure.settings import load_settings +from protea.infrastructure.settings import load_settings # noqa: E402 _project_root = Path(__file__).resolve().parents[1] _settings = load_settings(_project_root) diff --git a/alembic/versions/110a5b8cfbb9_add_reranker_model_id_to_evaluation_.py b/alembic/versions/110a5b8cfbb9_add_reranker_model_id_to_evaluation_.py new file mode 100644 index 0000000..4164b2d --- /dev/null +++ b/alembic/versions/110a5b8cfbb9_add_reranker_model_id_to_evaluation_.py @@ -0,0 +1,36 @@ +"""add reranker_model_id to evaluation_result + +Revision ID: 110a5b8cfbb9 +Revises: ba9966bd453e +Create Date: 2026-03-19 10:52:11.951459 + +""" +from collections.abc import Sequence + +import sqlalchemy as sa + +from alembic import op + +# revision identifiers, used by Alembic. +revision: str = '110a5b8cfbb9' +down_revision: str | Sequence[str] | None = 'ba9966bd453e' +branch_labels: str | Sequence[str] | None = None +depends_on: str | Sequence[str] | None = None + + +def upgrade() -> None: + """Upgrade schema.""" + # ### commands auto generated by Alembic - please adjust! 
### + op.add_column('evaluation_result', sa.Column('reranker_model_id', sa.UUID(), nullable=True)) + op.create_index(op.f('ix_evaluation_result_reranker_model_id'), 'evaluation_result', ['reranker_model_id'], unique=False) + op.create_foreign_key('fk_evaluation_result_reranker_model_id', 'evaluation_result', 'reranker_model', ['reranker_model_id'], ['id'], ondelete='SET NULL') + # ### end Alembic commands ### + + +def downgrade() -> None: + """Downgrade schema.""" + # ### commands auto generated by Alembic - please adjust! ### + op.drop_constraint('fk_evaluation_result_reranker_model_id', 'evaluation_result', type_='foreignkey') + op.drop_index(op.f('ix_evaluation_result_reranker_model_id'), table_name='evaluation_result') + op.drop_column('evaluation_result', 'reranker_model_id') + # ### end Alembic commands ### diff --git a/alembic/versions/1f0ac8aa38a4_add_evaluation_set.py b/alembic/versions/1f0ac8aa38a4_add_evaluation_set.py index d816d02..9602ee8 100644 --- a/alembic/versions/1f0ac8aa38a4_add_evaluation_set.py +++ b/alembic/versions/1f0ac8aa38a4_add_evaluation_set.py @@ -5,17 +5,18 @@ Create Date: 2026-03-12 22:13:05.918342 """ -from typing import Sequence, Union +from collections.abc import Sequence -from alembic import op import sqlalchemy as sa from sqlalchemy.dialects import postgresql +from alembic import op + # revision identifiers, used by Alembic. 
revision: str = '1f0ac8aa38a4' -down_revision: Union[str, Sequence[str], None] = 'a7b8c9d0e1f2' -branch_labels: Union[str, Sequence[str], None] = None -depends_on: Union[str, Sequence[str], None] = None +down_revision: str | Sequence[str] | None = 'a7b8c9d0e1f2' +branch_labels: str | Sequence[str] | None = None +depends_on: str | Sequence[str] | None = None def upgrade() -> None: diff --git a/alembic/versions/3505bfa74df6_add_aspect_to_reranker_model_and_.py b/alembic/versions/3505bfa74df6_add_aspect_to_reranker_model_and_.py new file mode 100644 index 0000000..98f21bd --- /dev/null +++ b/alembic/versions/3505bfa74df6_add_aspect_to_reranker_model_and_.py @@ -0,0 +1,35 @@ +"""add aspect to reranker_model and reranker_config to evaluation_result + +Revision ID: 3505bfa74df6 +Revises: 110a5b8cfbb9 +Create Date: 2026-03-19 15:16:18.474851 + +""" +from collections.abc import Sequence + +import sqlalchemy as sa +from sqlalchemy.dialects import postgresql + +from alembic import op + +# revision identifiers, used by Alembic. +revision: str = '3505bfa74df6' +down_revision: str | Sequence[str] | None = '110a5b8cfbb9' +branch_labels: str | Sequence[str] | None = None +depends_on: str | Sequence[str] | None = None + + +def upgrade() -> None: + """Upgrade schema.""" + # ### commands auto generated by Alembic - please adjust! ### + op.add_column('evaluation_result', sa.Column('reranker_config', postgresql.JSONB(astext_type=sa.Text()), nullable=True)) + op.add_column('reranker_model', sa.Column('aspect', sa.String(length=3), nullable=True)) + # ### end Alembic commands ### + + +def downgrade() -> None: + """Downgrade schema.""" + # ### commands auto generated by Alembic - please adjust! 
### + op.drop_column('reranker_model', 'aspect') + op.drop_column('evaluation_result', 'reranker_config') + # ### end Alembic commands ### diff --git a/alembic/versions/3884c47fe946_add_reranker_feature_columns_to_go_.py b/alembic/versions/3884c47fe946_add_reranker_feature_columns_to_go_.py new file mode 100644 index 0000000..fa60e23 --- /dev/null +++ b/alembic/versions/3884c47fe946_add_reranker_feature_columns_to_go_.py @@ -0,0 +1,40 @@ +"""add reranker feature columns to go_prediction + +Revision ID: 3884c47fe946 +Revises: 5fc2eb0f986d +Create Date: 2026-03-18 13:40:17.716092 + +""" +from collections.abc import Sequence + +import sqlalchemy as sa + +from alembic import op + +# revision identifiers, used by Alembic. +revision: str = '3884c47fe946' +down_revision: str | Sequence[str] | None = '5fc2eb0f986d' +branch_labels: str | Sequence[str] | None = None +depends_on: str | Sequence[str] | None = None + + +def upgrade() -> None: + """Upgrade schema.""" + # ### commands auto generated by Alembic - please adjust! ### + op.add_column('go_prediction', sa.Column('vote_count', sa.Integer(), nullable=True)) + op.add_column('go_prediction', sa.Column('k_position', sa.Integer(), nullable=True)) + op.add_column('go_prediction', sa.Column('go_term_frequency', sa.Integer(), nullable=True)) + op.add_column('go_prediction', sa.Column('ref_annotation_density', sa.Integer(), nullable=True)) + op.add_column('go_prediction', sa.Column('neighbor_distance_std', sa.Float(), nullable=True)) + # ### end Alembic commands ### + + +def downgrade() -> None: + """Downgrade schema.""" + # ### commands auto generated by Alembic - please adjust! 
### + op.drop_column('go_prediction', 'neighbor_distance_std') + op.drop_column('go_prediction', 'ref_annotation_density') + op.drop_column('go_prediction', 'go_term_frequency') + op.drop_column('go_prediction', 'k_position') + op.drop_column('go_prediction', 'vote_count') + # ### end Alembic commands ### diff --git a/alembic/versions/47de89cf6fec_add_evaluation_result.py b/alembic/versions/47de89cf6fec_add_evaluation_result.py index e7c0792..9376c6c 100644 --- a/alembic/versions/47de89cf6fec_add_evaluation_result.py +++ b/alembic/versions/47de89cf6fec_add_evaluation_result.py @@ -5,17 +5,18 @@ Create Date: 2026-03-12 22:27:34.042479 """ -from typing import Sequence, Union +from collections.abc import Sequence -from alembic import op import sqlalchemy as sa from sqlalchemy.dialects import postgresql +from alembic import op + # revision identifiers, used by Alembic. revision: str = '47de89cf6fec' -down_revision: Union[str, Sequence[str], None] = '1f0ac8aa38a4' -branch_labels: Union[str, Sequence[str], None] = None -depends_on: Union[str, Sequence[str], None] = None +down_revision: str | Sequence[str] | None = '1f0ac8aa38a4' +branch_labels: str | Sequence[str] | None = None +depends_on: str | Sequence[str] | None = None def upgrade() -> None: diff --git a/alembic/versions/489835ed5b31_add_composite_index_pga_set_accession.py b/alembic/versions/489835ed5b31_add_composite_index_pga_set_accession.py index b99dc62..de393fa 100644 --- a/alembic/versions/489835ed5b31_add_composite_index_pga_set_accession.py +++ b/alembic/versions/489835ed5b31_add_composite_index_pga_set_accession.py @@ -5,17 +5,15 @@ Create Date: 2026-03-15 11:17:30.865922 """ -from typing import Sequence, Union +from collections.abc import Sequence from alembic import op -import sqlalchemy as sa - # revision identifiers, used by Alembic. 
revision: str = '489835ed5b31' -down_revision: Union[str, Sequence[str], None] = '7737a352d4fe' -branch_labels: Union[str, Sequence[str], None] = None -depends_on: Union[str, Sequence[str], None] = None +down_revision: str | Sequence[str] | None = '7737a352d4fe' +branch_labels: str | Sequence[str] | None = None +depends_on: str | Sequence[str] | None = None def upgrade() -> None: diff --git a/alembic/versions/4f38043a5e41_add_parent_job_id.py b/alembic/versions/4f38043a5e41_add_parent_job_id.py index 56d6655..ee3c908 100644 --- a/alembic/versions/4f38043a5e41_add_parent_job_id.py +++ b/alembic/versions/4f38043a5e41_add_parent_job_id.py @@ -5,17 +5,17 @@ Create Date: 2026-03-09 11:55:12.264352 """ -from typing import Sequence, Union +from collections.abc import Sequence -from alembic import op import sqlalchemy as sa +from alembic import op # revision identifiers, used by Alembic. revision: str = '4f38043a5e41' -down_revision: Union[str, Sequence[str], None] = 'a1b2c3d4e5f6' -branch_labels: Union[str, Sequence[str], None] = None -depends_on: Union[str, Sequence[str], None] = None +down_revision: str | Sequence[str] | None = 'a1b2c3d4e5f6' +branch_labels: str | Sequence[str] | None = None +depends_on: str | Sequence[str] | None = None def upgrade() -> None: diff --git a/alembic/versions/513355a1d933_add_scoring_config_id_to_evaluation_.py b/alembic/versions/513355a1d933_add_scoring_config_id_to_evaluation_.py index 1890a22..4b856ae 100644 --- a/alembic/versions/513355a1d933_add_scoring_config_id_to_evaluation_.py +++ b/alembic/versions/513355a1d933_add_scoring_config_id_to_evaluation_.py @@ -5,17 +5,17 @@ Create Date: 2026-03-15 12:37:19.930750 """ -from typing import Sequence, Union +from collections.abc import Sequence -from alembic import op import sqlalchemy as sa +from alembic import op # revision identifiers, used by Alembic. 
revision: str = '513355a1d933' -down_revision: Union[str, Sequence[str], None] = '489835ed5b31' -branch_labels: Union[str, Sequence[str], None] = None -depends_on: Union[str, Sequence[str], None] = None +down_revision: str | Sequence[str] | None = '489835ed5b31' +branch_labels: str | Sequence[str] | None = None +depends_on: str | Sequence[str] | None = None def upgrade() -> None: diff --git a/alembic/versions/54e758c210c8_add_ia_url_to_ontology_snapshot.py b/alembic/versions/54e758c210c8_add_ia_url_to_ontology_snapshot.py index cde8fca..8722240 100644 --- a/alembic/versions/54e758c210c8_add_ia_url_to_ontology_snapshot.py +++ b/alembic/versions/54e758c210c8_add_ia_url_to_ontology_snapshot.py @@ -5,17 +5,18 @@ Create Date: 2026-03-16 11:42:10.636169 """ -from typing import Sequence, Union +from collections.abc import Sequence -from alembic import op import sqlalchemy as sa from sqlalchemy.dialects import postgresql +from alembic import op + # revision identifiers, used by Alembic. revision: str = '54e758c210c8' -down_revision: Union[str, Sequence[str], None] = 'c1d2e3f4a5b6' -branch_labels: Union[str, Sequence[str], None] = None -depends_on: Union[str, Sequence[str], None] = None +down_revision: str | Sequence[str] | None = 'c1d2e3f4a5b6' +branch_labels: str | Sequence[str] | None = None +depends_on: str | Sequence[str] | None = None def upgrade() -> None: diff --git a/alembic/versions/5fc2eb0f986d_add_composite_indexes_for_knn.py b/alembic/versions/5fc2eb0f986d_add_composite_indexes_for_knn.py new file mode 100644 index 0000000..1fcab83 --- /dev/null +++ b/alembic/versions/5fc2eb0f986d_add_composite_indexes_for_knn.py @@ -0,0 +1,37 @@ +"""add composite indexes for KNN performance + +Revision ID: 5fc2eb0f986d +Revises: 54e758c210c8 +Create Date: 2026-03-18 12:00:00.000000 + +""" +from alembic import op + +# revision identifiers, used by Alembic. 
+revision: str = "5fc2eb0f986d" +down_revision: str = "54e758c210c8" +branch_labels = None +depends_on = None + + +def upgrade() -> None: + # Composite index for KNN GO transfer: queries are always scoped to + # a single annotation_set_id and filtered by protein_accession. + op.create_index( + "ix_pga_set_accession", + "protein_go_annotation", + ["annotation_set_id", "protein_accession"], + ) + + # Composite index for prediction export and evaluation: queries filter + # by prediction_set_id then protein_accession. + op.create_index( + "ix_go_prediction_set_accession", + "go_prediction", + ["prediction_set_id", "protein_accession"], + ) + + +def downgrade() -> None: + op.drop_index("ix_go_prediction_set_accession", table_name="go_prediction") + op.drop_index("ix_pga_set_accession", table_name="protein_go_annotation") diff --git a/alembic/versions/7737a352d4fe_merge_scoring_config_branch.py b/alembic/versions/7737a352d4fe_merge_scoring_config_branch.py index f759c30..e8c1d3f 100644 --- a/alembic/versions/7737a352d4fe_merge_scoring_config_branch.py +++ b/alembic/versions/7737a352d4fe_merge_scoring_config_branch.py @@ -5,17 +5,13 @@ Create Date: 2026-03-15 10:11:56.507967 """ -from typing import Sequence, Union - -from alembic import op -import sqlalchemy as sa - +from collections.abc import Sequence # revision identifiers, used by Alembic. 
revision: str = '7737a352d4fe' -down_revision: Union[str, Sequence[str], None] = ('47de89cf6fec', 'b1c2d3e4f5a6') -branch_labels: Union[str, Sequence[str], None] = None -depends_on: Union[str, Sequence[str], None] = None +down_revision: str | Sequence[str] | None = ('47de89cf6fec', 'b1c2d3e4f5a6') +branch_labels: str | Sequence[str] | None = None +depends_on: str | Sequence[str] | None = None def upgrade() -> None: diff --git a/alembic/versions/7c19ca08d5d4_add_support_entry_table.py b/alembic/versions/7c19ca08d5d4_add_support_entry_table.py index 599214c..b298e7a 100644 --- a/alembic/versions/7c19ca08d5d4_add_support_entry_table.py +++ b/alembic/versions/7c19ca08d5d4_add_support_entry_table.py @@ -5,17 +5,17 @@ Create Date: 2026-03-15 12:42:43.832417 """ -from typing import Sequence, Union +from collections.abc import Sequence -from alembic import op import sqlalchemy as sa +from alembic import op # revision identifiers, used by Alembic. revision: str = '7c19ca08d5d4' -down_revision: Union[str, Sequence[str], None] = '513355a1d933' -branch_labels: Union[str, Sequence[str], None] = None -depends_on: Union[str, Sequence[str], None] = None +down_revision: str | Sequence[str] | None = '513355a1d933' +branch_labels: str | Sequence[str] | None = None +depends_on: str | Sequence[str] | None = None def upgrade() -> None: diff --git a/alembic/versions/a1b2c3d4e5f6_add_esm3c_chunking_normalize_residues.py b/alembic/versions/a1b2c3d4e5f6_add_esm3c_chunking_normalize_residues.py index 7dd4e56..7b1ed45 100644 --- a/alembic/versions/a1b2c3d4e5f6_add_esm3c_chunking_normalize_residues.py +++ b/alembic/versions/a1b2c3d4e5f6_add_esm3c_chunking_normalize_residues.py @@ -19,16 +19,17 @@ + uq_seq_embedding_seq_config_chunk (sequence_id, embedding_config_id, chunk_index_s) """ -from typing import Sequence, Union +from collections.abc import Sequence import sqlalchemy as sa + from alembic import op # revision identifiers, used by Alembic. 
revision: str = "a1b2c3d4e5f6" -down_revision: Union[str, Sequence[str], None] = "cdd8510858db" -branch_labels: Union[str, Sequence[str], None] = None -depends_on: Union[str, Sequence[str], None] = None +down_revision: str | Sequence[str] | None = "cdd8510858db" +branch_labels: str | Sequence[str] | None = None +depends_on: str | Sequence[str] | None = None def upgrade() -> None: diff --git a/alembic/versions/a7b8c9d0e1f2_add_feature_engineering_to_go_prediction.py b/alembic/versions/a7b8c9d0e1f2_add_feature_engineering_to_go_prediction.py index 7d88cb1..184c96a 100644 --- a/alembic/versions/a7b8c9d0e1f2_add_feature_engineering_to_go_prediction.py +++ b/alembic/versions/a7b8c9d0e1f2_add_feature_engineering_to_go_prediction.py @@ -6,15 +6,16 @@ """ from __future__ import annotations -from typing import Sequence, Union +from collections.abc import Sequence import sqlalchemy as sa + from alembic import op revision: str = "a7b8c9d0e1f2" -down_revision: Union[str, Sequence[str], None] = "f1a2b3c4d5e6" -branch_labels: Union[str, Sequence[str], None] = None -depends_on: Union[str, Sequence[str], None] = None +down_revision: str | Sequence[str] | None = "f1a2b3c4d5e6" +branch_labels: str | Sequence[str] | None = None +depends_on: str | Sequence[str] | None = None def upgrade() -> None: diff --git a/alembic/versions/b1c2d3e4f5a6_add_scoring_config.py b/alembic/versions/b1c2d3e4f5a6_add_scoring_config.py index 5eae559..0de80aa 100644 --- a/alembic/versions/b1c2d3e4f5a6_add_scoring_config.py +++ b/alembic/versions/b1c2d3e4f5a6_add_scoring_config.py @@ -6,10 +6,11 @@ """ from __future__ import annotations -from alembic import op import sqlalchemy as sa from sqlalchemy.dialects import postgresql +from alembic import op + revision = "b1c2d3e4f5a6" down_revision = "a7b8c9d0e1f2" branch_labels = None diff --git a/alembic/versions/ba9966bd453e_add_reranker_model_table.py b/alembic/versions/ba9966bd453e_add_reranker_model_table.py new file mode 100644 index 0000000..7f9e0a5 --- 
/dev/null +++ b/alembic/versions/ba9966bd453e_add_reranker_model_table.py @@ -0,0 +1,51 @@ +"""add reranker_model table + +Revision ID: ba9966bd453e +Revises: 3884c47fe946 +Create Date: 2026-03-18 13:57:29.263810 + +""" +from collections.abc import Sequence + +import sqlalchemy as sa +from sqlalchemy.dialects import postgresql + +from alembic import op + +# revision identifiers, used by Alembic. +revision: str = 'ba9966bd453e' +down_revision: str | Sequence[str] | None = '3884c47fe946' +branch_labels: str | Sequence[str] | None = None +depends_on: str | Sequence[str] | None = None + + +def upgrade() -> None: + """Upgrade schema.""" + # ### commands auto generated by Alembic - please adjust! ### + op.create_table('reranker_model', + sa.Column('id', sa.UUID(), nullable=False), + sa.Column('name', sa.String(length=255), nullable=False), + sa.Column('prediction_set_id', sa.UUID(), nullable=True), + sa.Column('evaluation_set_id', sa.UUID(), nullable=True), + sa.Column('category', sa.String(length=10), nullable=False), + sa.Column('model_data', sa.Text(), nullable=False), + sa.Column('metrics', postgresql.JSONB(astext_type=sa.Text()), nullable=False), + sa.Column('feature_importance', postgresql.JSONB(astext_type=sa.Text()), nullable=False), + sa.Column('created_at', sa.DateTime(timezone=True), server_default=sa.text('now()'), nullable=False), + sa.ForeignKeyConstraint(['evaluation_set_id'], ['evaluation_set.id'], ondelete='SET NULL'), + sa.ForeignKeyConstraint(['prediction_set_id'], ['prediction_set.id'], ondelete='SET NULL'), + sa.PrimaryKeyConstraint('id'), + sa.UniqueConstraint('name') + ) + op.create_index(op.f('ix_reranker_model_evaluation_set_id'), 'reranker_model', ['evaluation_set_id'], unique=False) + op.create_index(op.f('ix_reranker_model_prediction_set_id'), 'reranker_model', ['prediction_set_id'], unique=False) + # ### end Alembic commands ### + + +def downgrade() -> None: + """Downgrade schema.""" + # ### commands auto generated by Alembic - please adjust! 
### + op.drop_index(op.f('ix_reranker_model_prediction_set_id'), table_name='reranker_model') + op.drop_index(op.f('ix_reranker_model_evaluation_set_id'), table_name='reranker_model') + op.drop_table('reranker_model') + # ### end Alembic commands ### diff --git a/alembic/versions/c1d2e3f4a5b6_add_evidence_weights_to_scoring_config.py b/alembic/versions/c1d2e3f4a5b6_add_evidence_weights_to_scoring_config.py index 4a3a4c9..fa88ddc 100644 --- a/alembic/versions/c1d2e3f4a5b6_add_evidence_weights_to_scoring_config.py +++ b/alembic/versions/c1d2e3f4a5b6_add_evidence_weights_to_scoring_config.py @@ -20,9 +20,10 @@ from __future__ import annotations import sqlalchemy as sa -from alembic import op from sqlalchemy.dialects import postgresql +from alembic import op + revision = "c1d2e3f4a5b6" down_revision = "7c19ca08d5d4" branch_labels = None diff --git a/alembic/versions/c3d4e5f6a7b8_add_query_set.py b/alembic/versions/c3d4e5f6a7b8_add_query_set.py index 6ff0e6e..98a6ffd 100644 --- a/alembic/versions/c3d4e5f6a7b8_add_query_set.py +++ b/alembic/versions/c3d4e5f6a7b8_add_query_set.py @@ -5,16 +5,16 @@ Create Date: 2026-03-10 00:00:00.000000 """ -from typing import Sequence, Union +from collections.abc import Sequence import sqlalchemy as sa + from alembic import op -from sqlalchemy.dialects import postgresql revision: str = "c3d4e5f6a7b8" -down_revision: Union[str, Sequence[str], None] = "4f38043a5e41" -branch_labels: Union[str, Sequence[str], None] = None -depends_on: Union[str, Sequence[str], None] = None +down_revision: str | Sequence[str] | None = "4f38043a5e41" +branch_labels: str | Sequence[str] | None = None +depends_on: str | Sequence[str] | None = None def upgrade() -> None: diff --git a/alembic/versions/cdd8510858db_initial_schema.py b/alembic/versions/cdd8510858db_initial_schema.py index 5125f7e..dfd4e5a 100644 --- a/alembic/versions/cdd8510858db_initial_schema.py +++ b/alembic/versions/cdd8510858db_initial_schema.py @@ -1,22 +1,23 @@ """initial_schema Revision ID: 
cdd8510858db -Revises: +Revises: Create Date: 2026-03-08 11:32:48.937483 """ -from typing import Sequence, Union +from collections.abc import Sequence -from alembic import op -import sqlalchemy as sa import pgvector.sqlalchemy +import sqlalchemy as sa from sqlalchemy.dialects import postgresql +from alembic import op + # revision identifiers, used by Alembic. revision: str = 'cdd8510858db' -down_revision: Union[str, Sequence[str], None] = None -branch_labels: Union[str, Sequence[str], None] = None -depends_on: Union[str, Sequence[str], None] = None +down_revision: str | Sequence[str] | None = None +branch_labels: str | Sequence[str] | None = None +depends_on: str | Sequence[str] | None = None def upgrade() -> None: diff --git a/alembic/versions/d4e5f6a7b8c9_add_query_set_id_to_prediction_set.py b/alembic/versions/d4e5f6a7b8c9_add_query_set_id_to_prediction_set.py index f1cbc97..bf51266 100644 --- a/alembic/versions/d4e5f6a7b8c9_add_query_set_id_to_prediction_set.py +++ b/alembic/versions/d4e5f6a7b8c9_add_query_set_id_to_prediction_set.py @@ -5,15 +5,16 @@ Create Date: 2026-03-10 00:00:00.000000 """ -from typing import Sequence, Union +from collections.abc import Sequence import sqlalchemy as sa + from alembic import op revision: str = "d4e5f6a7b8c9" -down_revision: Union[str, Sequence[str], None] = "c3d4e5f6a7b8" -branch_labels: Union[str, Sequence[str], None] = None -depends_on: Union[str, Sequence[str], None] = None +down_revision: str | Sequence[str] | None = "c3d4e5f6a7b8" +branch_labels: str | Sequence[str] | None = None +depends_on: str | Sequence[str] | None = None def upgrade() -> None: diff --git a/alembic/versions/e5f6a7b8c9d0_drop_go_prediction_protein_fk.py b/alembic/versions/e5f6a7b8c9d0_drop_go_prediction_protein_fk.py index 84835e4..c8a2b96 100644 --- a/alembic/versions/e5f6a7b8c9d0_drop_go_prediction_protein_fk.py +++ b/alembic/versions/e5f6a7b8c9d0_drop_go_prediction_protein_fk.py @@ -5,14 +5,14 @@ Create Date: 2026-03-10 00:00:00.000000 """ -from 
typing import Sequence, Union +from collections.abc import Sequence from alembic import op revision: str = "e5f6a7b8c9d0" -down_revision: Union[str, Sequence[str], None] = "d4e5f6a7b8c9" -branch_labels: Union[str, Sequence[str], None] = None -depends_on: Union[str, Sequence[str], None] = None +down_revision: str | Sequence[str] | None = "d4e5f6a7b8c9" +branch_labels: str | Sequence[str] | None = None +depends_on: str | Sequence[str] | None = None def upgrade() -> None: diff --git a/alembic/versions/f1a2b3c4d5e6_add_go_term_relationship.py b/alembic/versions/f1a2b3c4d5e6_add_go_term_relationship.py index a794190..76b74d6 100644 --- a/alembic/versions/f1a2b3c4d5e6_add_go_term_relationship.py +++ b/alembic/versions/f1a2b3c4d5e6_add_go_term_relationship.py @@ -6,9 +6,10 @@ """ from __future__ import annotations -from alembic import op import sqlalchemy as sa +from alembic import op + revision = "f1a2b3c4d5e6" down_revision = "e5f6a7b8c9d0" branch_labels = None diff --git a/apps/web/app/[locale]/annotations/page.tsx b/apps/web/app/[locale]/annotations/page.tsx index 73be48b..0affb9e 100644 --- a/apps/web/app/[locale]/annotations/page.tsx +++ b/apps/web/app/[locale]/annotations/page.tsx @@ -201,12 +201,12 @@ export default function AnnotationsPage() {

{t("title")}

-
+
{tabs.map((tab) => ( ))} -
+
{/* ── Annotation Sets ── */} {activeTab === "sets" && ( @@ -226,7 +226,51 @@ export default function AnnotationsPage() { {t("setsTab.refresh")} -
+ {/* Mobile card list */} +
+ {loadingSets && Array.from({ length: 3 }).map((_, i) => ( +
+
+
+
+ ))} + {!loadingSets && sets.length === 0 && ( +
+ {t("setsTab.noSetsFound")} +
+ )} + {sets.map((a) => ( +
+
+ {a.source} + +
+

{a.source_version ?? "—"} · {(a.annotation_count ?? 0).toLocaleString()} annotations

+
+ {a.meta && Object.entries(a.meta).map(([k, v]) => ( + + {k}: {Array.isArray(v) ? v.join(", ") : String(v)} + + ))} +
+
+ {shortId(a.id)} + {formatDate(a.created_at)} + {a.job_id && ( + ↗ + )} +
+
+ ))} +
+ + {/* Desktop table */} +
{t("setsTab.tableHeaders.id")}
{t("setsTab.tableHeaders.source")}
{t("setsTab.tableHeaders.version")}
{t("setsTab.tableHeaders.annotations")}
{t("setsTab.tableHeaders.meta")}
{t("setsTab.tableHeaders.created")}
@@ -278,7 +322,70 @@ export default function AnnotationsPage() { {t("snapshotsTab.refresh")}
-
+ {/* Mobile card list */} +
+ {loadingSnaps && Array.from({ length: 2 }).map((_, i) => ( +
+
+
+
+ ))} + {!loadingSnaps && snapshots.length === 0 && ( +
+ {t("snapshotsTab.noSnapshotsFound")} +
+ )} + {snapshots.map((s) => ( +
+
+ {s.obo_version} + {(s.go_term_count ?? 0).toLocaleString()} terms +
+
+ {iaEditId === s.id ? ( +
+ setIaEditValue(e.target.value)} + placeholder="https://…/IA_cafa6.tsv or file path" + className="w-full rounded border px-2 py-1.5 text-xs focus:outline-none focus:ring-1 focus:ring-blue-500" + onKeyDown={(e) => { + if (e.key === "Enter") handleSaveIa(s.id); + if (e.key === "Escape") setIaEditId(null); + }} + /> +
+ + +
+
+ ) : ( + + )} +
+
+ {shortId(s.id)} + {formatDate(s.loaded_at)} +
+
+ ))} +
+ + {/* Desktop table */} +
{t("snapshotsTab.tableHeaders.id")}
{t("snapshotsTab.tableHeaders.version")}
{t("snapshotsTab.tableHeaders.goTerms")}
{t("snapshotsTab.tableHeaders.iaUrl")}
{t("snapshotsTab.tableHeaders.loaded")}
diff --git a/apps/web/app/[locale]/embeddings/page.tsx b/apps/web/app/[locale]/embeddings/page.tsx index 7110cb8..d3411d0 100644 --- a/apps/web/app/[locale]/embeddings/page.tsx +++ b/apps/web/app/[locale]/embeddings/page.tsx @@ -5,12 +5,14 @@ import Link from "next/link"; import { useTranslations } from "next-intl"; import { useToast } from "@/components/Toast"; import { SkeletonTableRow } from "@/components/Skeleton"; +import { ContextBanner } from "@/components/ContextBanner"; import { listEmbeddingConfigs, createEmbeddingConfig, deleteEmbeddingConfig, createJob, listQuerySets, + getProteinStats, EmbeddingConfig, QuerySet, } from "@/lib/api"; @@ -95,6 +97,7 @@ export default function EmbeddingsPage() { const [cmpResult, setCmpResult] = useState<{ id: string; status: string } | null>(null); const [cmpError, setCmpError] = useState(""); const [cmpSubmitting, setCmpSubmitting] = useState(false); + const [proteinCount, setProteinCount] = useState(null); async function loadAll() { setLoading(true); @@ -106,6 +109,7 @@ export default function EmbeddingsPage() { ]); setConfigs(cfgs); setQuerySets(qsets); + getProteinStats().then((s) => setProteinCount(s.total ?? 0)).catch(() => {}); if (cfgs.length > 0 && !cmpConfigId) setCmpConfigId(cfgs[0].id); } catch (e: any) { setError(String(e)); @@ -224,6 +228,16 @@ export default function EmbeddingsPage() {

{t("title")}

+ 0, href: "/proteins" }, + { label: `${configs.length} embedding config(s)`, met: configs.length > 0 }, + ] : undefined} + nextStep={{ label: "Functional Annotation", href: "/functional-annotation" }} + /> + {error && (
           {error}
@@ -475,53 +489,92 @@ export default function EmbeddingsPage() {
               {Array.from({ length: 3 }).map((_, i) => )}
             
) : ( -
-
-
{t("configsTab.tableHeaders.description")}
-
{t("configsTab.tableHeaders.model")}
-
{t("configsTab.tableHeaders.backend")}
-
{t("configsTab.tableHeaders.layers")}
-
{t("configsTab.tableHeaders.agg")}
-
{t("configsTab.tableHeaders.pool")}
-
{t("configsTab.tableHeaders.norm")}
-
{t("configsTab.tableHeaders.created")}
-
-
- {configs.map((c) => ( -
-
- {c.description || } + <> + {/* Mobile card list */} +
+ {configs.map((c) => ( +
+
+ + {c.description || } + + +
+

{c.model_name}

+
+ {c.model_backend} + layers [{c.layer_indices.join(", ")}] + {c.layer_agg}/{c.pooling} + {c.normalize ? "norm" : "no norm"} +
+

{formatDate(c.created_at)}

-
{c.model_name}
-
{c.model_backend}
-
[{c.layer_indices.join(", ")}]
-
{c.layer_agg}
-
{c.pooling}
-
{c.normalize ? "yes" : "no"}
-
{formatDate(c.created_at)}
-
-
+ )} +
+ + {/* Desktop table */} +
+
+
{t("configsTab.tableHeaders.description")}
+
{t("configsTab.tableHeaders.model")}
+
{t("configsTab.tableHeaders.backend")}
+
{t("configsTab.tableHeaders.layers")}
+
{t("configsTab.tableHeaders.agg")}
+
{t("configsTab.tableHeaders.pool")}
+
{t("configsTab.tableHeaders.norm")}
+
{t("configsTab.tableHeaders.created")}
+
- ))} - {configs.length === 0 && ( -
- {t("configsTab.noConfigs")}{" "} - -
- )} -
+ {configs.map((c) => ( +
+
+ {c.description || } +
+
{c.model_name}
+
{c.model_backend}
+
[{c.layer_indices.join(", ")}]
+
{c.layer_agg}
+
{c.pooling}
+
{c.normalize ? "yes" : "no"}
+
{formatDate(c.created_at)}
+
+ +
+
+ ))} + {configs.length === 0 && ( +
+ {t("configsTab.noConfigs")}{" "} + +
+ )} +
+ )}
)} @@ -572,7 +625,7 @@ export default function EmbeddingsPage() {
-
+
-
+
{["BPO", "MFO", "CCO"].map((ns) => { const m = results[setting]?.[ns]; if (!m) return null; @@ -178,7 +201,10 @@ function ResultsTable({ results }: { results: Record }) {m.recall.toFixed(3)}
- {t("resultMetrics.coverage")} + + {t("resultMetrics.coverage")} + + {(m.coverage * 100).toFixed(1)}%
@@ -201,6 +227,7 @@ function EvaluationSetCard({ annotationSets, predictionSets, scoringConfigs, + rerankers: initialRerankers, isSelected, onSelect, onDeleted, @@ -209,6 +236,7 @@ function EvaluationSetCard({ annotationSets: AnnotationSet[]; predictionSets: PredictionSet[]; scoringConfigs: ScoringConfig[]; + rerankers: RerankerModel[]; isSelected: boolean; onSelect: () => void; onDeleted: () => void; @@ -219,6 +247,14 @@ function EvaluationSetCard({ const [predSetId, setPredSetId] = useState(""); const [maxDistance, setMaxDistance] = useState(""); const [scoringConfigId, setScoringConfigId] = useState(""); + // 3x3 reranker grid: category × aspect + const [rrGrid, setRrGrid] = useState>>({ + nk: { bpo: "", mfo: "", cco: "" }, + lk: { bpo: "", mfo: "", cco: "" }, + pk: { bpo: "", mfo: "", cco: "" }, + }); + const setRrCell = (cat: string, asp: string, val: string) => + setRrGrid((prev) => ({ ...prev, [cat]: { ...prev[cat], [asp]: val } })); const [running, setRunning] = useState(false); const [runError, setRunError] = useState(""); const [pendingJobId, setPendingJobId] = useState(null); @@ -277,7 +313,24 @@ function EvaluationSetCard({ try { const body: Record = { prediction_set_id: predSetId }; if (maxDistance) body.max_distance = parseFloat(maxDistance); - if (scoringConfigId) body.scoring_config_id = scoringConfigId; + // Build nested rerankers mapping from the 3×3 grid + const rerankers: Record> = {}; + let hasAnyReranker = false; + for (const cat of ["nk", "lk", "pk"]) { + const catMap: Record = {}; + for (const asp of ["bpo", "mfo", "cco"]) { + if (rrGrid[cat]?.[asp]) { + catMap[asp] = rrGrid[cat][asp]; + hasAnyReranker = true; + } + } + if (Object.keys(catMap).length > 0) rerankers[cat] = catMap; + } + if (hasAnyReranker) { + body.rerankers = rerankers; + } else if (scoringConfigId) { + body.scoring_config_id = scoringConfigId; + } const res = await apiFetch<{ id: string; status: string }>( `/annotations/evaluation-sets/${e.id}/run`, { @@ -305,7 +358,7 @@ 
function EvaluationSetCard({ className="cursor-pointer p-4 hover:bg-gray-50 rounded-t-lg" onClick={onSelect} > -
+
{evalLabel(e, annotationSets)}
-
+

{t("evaluationSetCard.runCafaEvaluator")}

-
+
setScoringConfigId(ev.target.value)} - className={selectClass} - > - - {scoringConfigs.map((c) => ( - - ))} - -
+ + {/* Scoring method — 3×3 grid (category × aspect) */} +
+ + {initialRerankers.length > 0 && ( +
+ + + + + + + + + + + {(["nk", "lk", "pk"] as const).map((cat) => ( + + + {(["bpo", "mfo", "cco"] as const).map((asp) => { + // Show models matching this category+aspect, or category+null (all-aspect models) + const candidates = initialRerankers.filter( + (r) => r.category === cat && (r.aspect === asp || r.aspect === null) + ); + return ( + + ); + })} + + ))} + +
BPOMFOCCO
{cat} + +
+
+ )} + {(() => { + const hasAnyRr = Object.values(rrGrid).some((catMap) => Object.values(catMap).some(Boolean)); + return scoringConfigs.length > 0 && !hasAnyRr ? ( +
+ + +
+ ) : null; + })()} +
{runError && (

{runError} @@ -512,22 +618,87 @@ function EvaluationSetCard({ {results.map((r) => { const pred = predictionSets.find((p) => p.id === r.prediction_set_id); const sc = scoringConfigs.find((c) => c.id === r.scoring_config_id); + const hasReranker = !!r.reranker_model_id; + const rr = initialRerankers.find((m) => m.id === r.reranker_model_id); return (

{/* Meta header */} -
+
-
+
{t("evaluationSetCard.predictionSet")} {pred ? {r.prediction_set_id.slice(0, 8)}… · {new Date(pred.created_at).toLocaleDateString()}{pred.prediction_count != null ? ` · ${pred.prediction_count.toLocaleString()} preds.` : ""} : {r.prediction_set_id.slice(0, 8)}… } + {pred && ( + +
+
Prediction Set
+
+ Config + {pred.embedding_config_name ?? pred.embedding_config_id.slice(0, 8) + "…"} +
+
+ Annotations + {pred.annotation_set_label ?? pred.annotation_set_id.slice(0, 8) + "…"} +
+
+ Ontology + {pred.ontology_snapshot_version ?? pred.ontology_snapshot_id.slice(0, 8) + "…"} +
+
+ Max dist. + {pred.distance_threshold ?? "—"} +
+
+ Limit/entry + {pred.limit_per_entry} +
+
+
+ )}
-
+
{t("evaluationSetCard.scoring")} - {sc ? sc.name : {t("evaluationSetCard.fallbackFormula")}} - {sc?.description && } + {r.reranker_config ? ( + + Re-ranker + {Object.entries(r.reranker_config).map(([cat, aspMap]) => ( + + {cat.toUpperCase()}({Object.keys(aspMap).map(a => a.toUpperCase()).join(",")}) + + ))} + + ) : hasReranker ? ( + + Re-ranker + {rr ? rr.name : "model"} + + ) : sc ? sc.name : {t("evaluationSetCard.fallbackFormula")}} + {sc && !hasReranker && ( + +
+
{sc.name}
+ {sc.description &&
{sc.description}
} +
+ Formula + {sc.formula} +
+ {Object.keys(sc.weights).length > 0 && ( +
+
Weights
+ {Object.entries(sc.weights).map(([k, v]) => ( +
+ {k} + {v} +
+ ))} +
+ )} +
+
+ )}
{new Date(r.created_at).toLocaleString()}
@@ -559,6 +730,7 @@ function EvaluationSetCard({
)}
+
)}
@@ -571,6 +743,7 @@ export default function EvaluationPage() { const [predictionSets, setPredictionSets] = useState([]); const [evaluationSets, setEvaluationSets] = useState([]); const [scoringConfigs, setScoringConfigs] = useState([]); + const [rerankers, setRerankers] = useState([]); const [loading, setLoading] = useState(true); const [oldSetId, setOldSetId] = useState(""); @@ -580,12 +753,13 @@ export default function EvaluationPage() { const [selectedEvalId, setSelectedEvalId] = useState(""); const reload = () => - Promise.all([listAnnotationSets(), listPredictionSets(), listEvaluationSets(), listScoringConfigs()]) - .then(([ann, pred, ev, sc]) => { + Promise.all([listAnnotationSets(), listPredictionSets(), listEvaluationSets(), listScoringConfigs(), listRerankers()]) + .then(([ann, pred, ev, sc, rr]) => { setAnnotationSets(ann); setPredictionSets(pred); setEvaluationSets(ev); setScoringConfigs(sc); + setRerankers(rr); }) .finally(() => setLoading(false)); @@ -616,9 +790,19 @@ export default function EvaluationPage() { if (loading) return
Loading…
; return ( -
+

{t("title")}

+ = 2, href: "/annotations" }, + { label: `${predictionSets.length} prediction set(s)`, met: predictionSets.length > 0, href: "/functional-annotation" }, + ]} + nextStep={{ label: "Scoring configs", href: "/scoring" }} + /> + {/* ── Generate Evaluation Set ───────────────────────────────── */}
@@ -627,7 +811,7 @@ export default function EvaluationPage() { {t("generateSection.description")}

-
+
setSelectedConfigId(e.target.value)} - className="rounded-md border bg-white px-2 py-1.5 text-sm text-gray-700 shadow-sm focus:outline-none focus:ring-2 focus:ring-blue-500" - > - - {scoringConfigs.map((c) => ( - - ))} - - -
- +
+
+ +
+ +
+
{selectedConfigId && ( + {/* ── Executive summary ── */} + {activeTab === "proteins" && distribution && ( +
+
+
{proteinTotal.toLocaleString()}
+
Proteins
+
+ {(["P", "F", "C"] as const).map((aspect) => ( +
+
+ {(distribution.aspect_totals[aspect] ?? 0).toLocaleString()} +
+
{ASPECT_LABELS[aspect]}
+
+ ))} +
+ )} + {/* ── Proteins ── */} {activeTab === "proteins" && (
-
-
+
+ setProteinSearchInput(e.target.value)} placeholder="Filter by accession…" - className="rounded-md border px-3 py-1.5 text-sm focus:outline-none focus:ring-2 focus:ring-blue-500 w-56" + className="rounded-md border px-3 py-1.5 text-sm focus:outline-none focus:ring-2 focus:ring-blue-500 w-full sm:w-56" /> )} - {proteinTotal.toLocaleString()} proteins + {proteinTotal.toLocaleString()} proteins +
+ + {/* Mobile card list */} +
+ {loadingProteins && Array.from({ length: 4 }).map((_, i) => ( +
+
+
+
+ ))} + {!loadingProteins && proteins.length === 0 && ( +
No proteins found.
+ )} + {!loadingProteins && proteins.map((p) => ( +
+
selectProtein(p.accession, p.in_db)} + > +
+
+ + {p.in_db ? ( + e.stopPropagation()}> + {p.accession} + + ) : ( + {p.accession} + )} +
+ {p.go_count} predicted +
+
+ dist: {p.min_distance?.toFixed(4) ?? "—"} + known/pred: {p.annotation_count}/{p.go_count} +
+
+ {selectedAccession === p.accession && ( +
+ setSelectedAccession(null)} + ontologySnapshotId={ontologySnapshotId} + scoringConfig={selectedConfig} + /> +
+ )} +
+ ))}
-
-
+ {/* Desktop table */} +
+
Accession
Predicted
Min Distance
-
Known
-
Matches
+
Known / Pred.
{loadingProteins && Array.from({ length: 8 }).map((_, i) => )} @@ -911,12 +1081,18 @@ export default function PredictionSetDetailPage({ params }: { params: Promise<{ {!loadingProteins && proteins.map((p) => (
selectProtein(p.accession, p.in_db)} >
+ {p.in_db ? (
{p.go_count}
{p.min_distance?.toFixed(4) ?? "—"}
-
0 ? "text-gray-700" : "text-gray-300"}`}> - {p.annotation_count > 0 ? p.annotation_count : "—"} -
-
0 ? "text-green-700" : "text-gray-300"}`}> - {p.match_count > 0 ? p.match_count : "—"} +
+ {p.annotation_count > 0 + ? {p.annotation_count} + : 0} + / + {p.go_count}
diff --git a/apps/web/app/[locale]/functional-annotation/page.tsx b/apps/web/app/[locale]/functional-annotation/page.tsx index 6aca0b0..92626b4 100644 --- a/apps/web/app/[locale]/functional-annotation/page.tsx +++ b/apps/web/app/[locale]/functional-annotation/page.tsx @@ -5,6 +5,7 @@ import Link from "next/link"; import { useTranslations } from "next-intl"; import { useToast } from "@/components/Toast"; import { SkeletonTableRow } from "@/components/Skeleton"; +import { ContextBanner } from "@/components/ContextBanner"; import { listEmbeddingConfigs, launchPredictGoTerms, @@ -163,6 +164,17 @@ export default function FunctionalAnnotationPage() {

{t("title")}

+ 0, href: "/embeddings" }, + { label: `${annotationSets.length} annotation set(s)`, met: annotationSets.length > 0, href: "/annotations" }, + { label: `${ontologySnapshots.length} ontology snapshot(s)`, met: ontologySnapshots.length > 0, href: "/annotations" }, + ] : undefined} + nextStep={{ label: "Evaluation", href: "/evaluation" }} + /> +
{tabs.map((tab) => (
-
+

{t("predictTab.searchBackend")}

-
+
{predFaissIndex === "IVFFlat" && ( -
+
setPredFaissNlist(parseInt(e.target.value, 10))} min={1} className={inputClass} /> @@ -364,7 +376,7 @@ export default function FunctionalAnnotationPage() {
)} {predFaissIndex === "HNSW" && ( -
+
setPredFaissHnswM(parseInt(e.target.value, 10))} min={2} className={inputClass} /> @@ -422,13 +434,14 @@ export default function FunctionalAnnotationPage() {
-
+
{t("resultsTab.tableHeaders.id")}
{t("resultsTab.tableHeaders.config")}
{t("resultsTab.tableHeaders.annotationSet")}
{t("resultsTab.tableHeaders.snapshot")}
{t("resultsTab.tableHeaders.goTerms")}
{t("resultsTab.tableHeaders.distanceThreshold")}
+
{t("resultsTab.tableHeaders.k")}
{t("resultsTab.tableHeaders.created")}
@@ -436,7 +449,7 @@ export default function FunctionalAnnotationPage() { {predictionSets.map((ps) => (
@@ -450,6 +463,7 @@ export default function FunctionalAnnotationPage() {
{ps.distance_threshold != null ? ps.distance_threshold : }
+
{ps.limit_per_entry}
{formatDate(ps.created_at)}
+
+
+ ); + } + + if (!data) { + return ( +
+
+
+ {[0, 1, 2].map((i) => ( +
+ ))} +
+
+
+ ); + } + + const hasFmax = data.best_fmax && Object.keys(data.best_fmax).length > 0; + const hasComparison = data.method_comparison && Object.keys(data.method_comparison).length > 0; + + // Available categories (only those with data) + const availableCategories = CATEGORIES.filter( + (cat) => data.best_fmax?.[cat] || data.method_comparison?.[cat] + ); + + // Current category data + const catFmax = data.best_fmax?.[activeCategory] ?? {}; + const catMethods = data.method_comparison?.[activeCategory] ?? []; + const baseline = catMethods.find((m) => m.method === "knn_baseline"); + + return ( +
+ {/* ── Hero ──────────────────────────────────────────────────── */} +
+

+ PROTEA +

+

+ {t("subtitle")} +

+
+ + {/* ── Annotate form ─────────────────────────────────────────── */} + + + {/* ── Category tabs ─────────────────────────────────────────── */} + {hasFmax ? ( + <> +
+
+

+ {t("bestResults")} +

+
+ {availableCategories.map((cat) => ( + + ))} +
+ + {CATEGORY_LABELS[activeCategory]} + +
+ + {/* ── Fmax cards ────────────────────────────────────────── */} +
+ {ASPECTS.map((aspect) => { + const d = catFmax[aspect]; + if (!d) return null; + const color = ASPECT_COLORS[aspect]; + return ( +
+
+ {d.fmax.toFixed(2)} +
+
+ {t("fmax")} {aspect} +
+
+ {ASPECT_LABELS[aspect]} +
+
+ {d.method_label} +
+
+ ); + })} +
+
+ + {/* ── Method comparison table ───────────────────────────── */} + {catMethods.length > 0 && ( +
+

+ {t("methodComparison")} + + ({activeCategory}) + +

+
+ + + + + {ASPECTS.map((a) => ( + + ))} + + + + {catMethods.map((row, i) => { + const isBest = ASPECTS.some( + (a) => catFmax[a]?.method === row.method + ); + return ( + + + {ASPECTS.map((aspect) => { + const val = (row as any)[aspect]?.fmax; + const baseVal = baseline ? (baseline as any)[aspect]?.fmax : null; + const delta = val != null && baseVal != null && row.method !== "knn_baseline" + ? val - baseVal + : null; + return ( + + ); + })} + + ); + })} + +
{t("method")} + {a} +
+ {t(METHOD_KEYS[row.method] ?? row.method)} + {isBest && ( + best + )} + + {val != null ? ( + + {val.toFixed(3)} + {delta != null && ( + 0 ? "text-green-600" : delta < 0 ? "text-red-600" : "text-gray-400"}`}> + {delta > 0 ? "+" : ""}{delta.toFixed(3)} + + )} + + ) : ( + + )} +
+
+
+ )} + + ) : ( +
+

{t("noDataYet")}

+ + {t("getStarted")} + +
+ )} + + {/* ── Pipeline diagram ──────────────────────────────────────── */} +
+

+ {t("pipeline")} +

+
+ {data.pipeline_stages.map((stage, i) => ( +
+ {i > 0 && ( +
+ → +
+ )} + +
+ ))} + {/* LLM stage (future) */} +
+
+ → +
+
+ LLM + {t("stageLlm")} + soon +
+
+
+
+ + {/* ── Stats bar ─────────────────────────────────────────────── */} +
+

+ {t("stats")} +

+
+ {([ + ["proteins", data.counts.proteins], + ["sequences", data.counts.sequences], + ["embeddings", data.counts.embeddings], + ["predictions", data.counts.predictions], + ] as [string, number][]).map(([key, count]) => ( +
+
+ {count.toLocaleString()} +
+
{t(key as any)}
+
+ ))} +
+
+ + {/* ── CTAs ──────────────────────────────────────────────────── */} +
+ + {t("exploreResults")} + + + {t("annotateProteins")} + +
+
+ ); } diff --git a/apps/web/app/[locale]/proteins/[accession]/page.tsx b/apps/web/app/[locale]/proteins/[accession]/page.tsx index 2c5b4c0..5ca2158 100644 --- a/apps/web/app/[locale]/proteins/[accession]/page.tsx +++ b/apps/web/app/[locale]/proteins/[accession]/page.tsx @@ -4,6 +4,7 @@ import { use, useEffect, useState } from "react"; import Link from "next/link"; import { useToast } from "@/components/Toast"; import { useTranslations } from "next-intl"; +import { Breadcrumbs } from "@/components/Breadcrumbs"; import { getProtein, getProteinAnnotations, getGoSubgraph, listOntologySnapshots, ProteinDetail, ProteinAnnotation, GoSubgraph } from "@/lib/api"; import dynamic from "next/dynamic"; const GoGraph = dynamic(() => import("@/components/GoGraph"), { ssr: false }); @@ -88,7 +89,7 @@ export default function ProteinDetailPage({ params }: { params: Promise<{ access <> {/* Header */}
- {t("backToProteins")} +

{protein.accession}

diff --git a/apps/web/app/[locale]/proteins/page.tsx b/apps/web/app/[locale]/proteins/page.tsx index 4360894..6334e8e 100644 --- a/apps/web/app/[locale]/proteins/page.tsx +++ b/apps/web/app/[locale]/proteins/page.tsx @@ -236,8 +236,41 @@ export default function ProteinsPage() { {t("browseTab.totalProteins", { count: total.toLocaleString() })}
- {/* Table */} -
+ {/* Mobile card list */} +
+ {loadingBrowse && Array.from({ length: 4 }).map((_, i) => ( +
+
+
+
+ ))} + {!loadingBrowse && proteins.length === 0 && ( +
+ {t("browseTab.noProteinsCta")} +
+ )} + {!loadingBrowse && proteins.map((p) => ( + +
+ {p.accession} + +
+

{p.gene_name ?? "—"}

+

{p.organism ?? "—"}

+
+ {p.entry_name ?? "—"} + {p.length != null && {p.length.toLocaleString()} aa} +
+ + ))} +
+ + {/* Desktop table */} +
{t("browseTab.tableHeaders.accession")}
{t("browseTab.tableHeaders.entryName")}
@@ -353,7 +386,7 @@ export default function ProteinsPage() { setSearchCriteria(e.target.value)} required className={inputClass} placeholder="organism_id:9606 AND reviewed:true" />

{t("insertTab.searchCriteriaHelper")}

-
+
setPageSize(parseInt(e.target.value, 10))} min={1} className={inputClass} /> @@ -395,7 +428,7 @@ export default function ProteinsPage() { setMetaCriteria(e.target.value)} required className={inputClass} placeholder="organism_id:9606 AND reviewed:true" />

{t("metadataTab.searchCriteriaHelper")}

-
+
setMetaPageSize(parseInt(e.target.value, 10))} min={1} className={inputClass} /> diff --git a/apps/web/app/[locale]/query-sets/page.tsx b/apps/web/app/[locale]/query-sets/page.tsx index fc1a63b..49b6c00 100644 --- a/apps/web/app/[locale]/query-sets/page.tsx +++ b/apps/web/app/[locale]/query-sets/page.tsx @@ -117,7 +117,7 @@ export default function QuerySetsPage() { {/* List */}
-
+
{t("tableHeaders.name")}
{t("tableHeaders.sequences")}
{t("tableHeaders.created")}
@@ -143,7 +143,7 @@ export default function QuerySetsPage() { {sets.map((qs) => (
setExpandedId(expandedId === qs.id ? null : qs.id)} >
diff --git a/apps/web/app/[locale]/reranker/page.tsx b/apps/web/app/[locale]/reranker/page.tsx new file mode 100644 index 0000000..edf0751 --- /dev/null +++ b/apps/web/app/[locale]/reranker/page.tsx @@ -0,0 +1,574 @@ +"use client"; + +import { useEffect, useState } from "react"; +import { ContextBanner } from "@/components/ContextBanner"; +import { + baseUrl, + listPredictionSets, + listAnnotationSets, + listRerankers, + trainReranker, + deleteReranker, + getRerankedTsvUrl, + getRerankerMetrics, + getTrainingDataTsvUrl, +} from "@/lib/api"; +import type { PredictionSet, AnnotationSet, RerankerModel } from "@/lib/api"; + +// --------------------------------------------------------------------------- +// Helpers +// --------------------------------------------------------------------------- + +async function apiFetch(path: string, init?: RequestInit): Promise { + const res = await fetch(`${baseUrl()}${path}`, { cache: "no-store", ...init }); + if (!res.ok) throw new Error(await res.text()); + return res.json(); +} + +type EvaluationSet = { + id: string; + old_annotation_set_id: string; + new_annotation_set_id: string; + created_at: string; + stats: Record; +}; + +const listEvaluationSets = () => apiFetch("/annotations/evaluation-sets"); + +function shortId(id: string) { return id.slice(0, 8); } + +function predLabel(p: PredictionSet) { + const parts: string[] = []; + if (p.embedding_config_name) parts.push(p.embedding_config_name); + if (p.annotation_set_label) parts.push(p.annotation_set_label); + parts.push(`k=${p.limit_per_entry}`); + if (p.prediction_count != null) parts.push(`${p.prediction_count.toLocaleString()} preds`); + return `${parts.join(" · ")} (${shortId(p.id)}…)`; +} + +function evalLabel(es: EvaluationSet, annotationSets: AnnotationSet[]) { + const oldSet = annotationSets.find((a) => a.id === es.old_annotation_set_id); + const newSet = annotationSets.find((a) => a.id === es.new_annotation_set_id); + const oldVer = oldSet ? 
`[${oldSet.source.toUpperCase()}] ${oldSet.source_version ?? "?"}` : shortId(es.old_annotation_set_id); + const newVer = newSet ? `[${newSet.source.toUpperCase()}] ${newSet.source_version ?? "?"}` : shortId(es.new_annotation_set_id); + const delta = es.stats.delta_proteins ?? "?"; + return `${oldVer} → ${newVer} · ${delta} delta proteins (${shortId(es.id)}…)`; +} + +const labelClass = "block text-sm font-medium text-gray-700 mb-1"; +const selectClass = + "w-full rounded-md border border-gray-300 px-3 py-2 text-sm focus:outline-none focus:ring-2 focus:ring-blue-500"; +const btnPrimary = + "rounded-md bg-blue-600 px-4 py-2 text-sm font-medium text-white hover:bg-blue-700 disabled:opacity-50 transition-colors"; +const btnDanger = + "rounded-md bg-red-50 border border-red-200 px-3 py-1.5 text-xs font-medium text-red-600 hover:bg-red-100 transition-colors"; + +const CATEGORY_HINTS: Record = { + nk: "No Knowledge: proteins with zero GO annotations at t0. Hardest setting — measures pure prediction ability.", + lk: "Limited Knowledge: proteins annotated in some GO namespaces but not all at t0. New annotations in previously empty namespaces.", + pk: "Partial Knowledge: proteins that already had annotations in a namespace at t0 and gained new ones at t1.", +}; + +const ASPECT_LABELS: Record = { + bpo: "BPO (Biological Process)", + mfo: "MFO (Molecular Function)", + cco: "CCO (Cellular Component)", +}; + +// --------------------------------------------------------------------------- +// Feature importance bar chart +// --------------------------------------------------------------------------- + +function FeatureImportanceChart({ importance }: { importance: Record }) { + const entries = Object.entries(importance) + .sort(([, a], [, b]) => b - a) + .filter(([, v]) => v > 0); + if (entries.length === 0) return

No feature importance data

; + const maxVal = entries[0][1]; + + return ( +
+ {entries.map(([name, val]) => ( +
+ {name} +
+
+
+ + {val >= 1000 ? `${(val / 1000).toFixed(1)}k` : val.toFixed(0)} + +
+ ))} +
+ ); +} + +// --------------------------------------------------------------------------- +// Metrics display +// --------------------------------------------------------------------------- + +function MetricsBadge({ label, value, suffix }: { label: string; value: number | string | undefined; suffix?: string }) { + if (value === undefined) return null; + const formatted = typeof value === "number" ? value.toFixed(4) : value; + return ( +
+

{label}

+

{formatted}{suffix}

+
+ ); +} + +// --------------------------------------------------------------------------- +// Reranker card +// --------------------------------------------------------------------------- + +function RerankerCard({ + model, + predictionSets, + evaluationSets, + annotationSets, + onDelete, +}: { + model: RerankerModel; + predictionSets: PredictionSet[]; + evaluationSets: EvaluationSet[]; + annotationSets: AnnotationSet[]; + onDelete: () => void; +}) { + const [expanded, setExpanded] = useState(false); + const [metricsLoading, setMetricsLoading] = useState(false); + const [metrics, setMetrics] = useState | null>(null); + const [metricsError, setMetricsError] = useState(null); + const [deleting, setDeleting] = useState(false); + + // For computing metrics on a different prediction set + const [metricsPsId, setMetricsPsId] = useState(model.prediction_set_id ?? ""); + const [metricsEsId, setMetricsEsId] = useState(model.evaluation_set_id ?? ""); + const [metricsCategory, setMetricsCategory] = useState(model.category); + + async function handleComputeMetrics() { + if (!metricsPsId || !metricsEsId) return; + setMetricsLoading(true); + setMetricsError(null); + setMetrics(null); + try { + const result = await getRerankerMetrics(metricsPsId, model.id, metricsEsId, metricsCategory); + setMetrics(result); + } catch (e: any) { + setMetricsError(e.message ?? "Failed to compute metrics"); + } finally { + setMetricsLoading(false); + } + } + + async function handleDelete() { + if (!confirm(`Delete reranker "${model.name}"?`)) return; + setDeleting(true); + try { + await deleteReranker(model.id); + onDelete(); + } catch { + setDeleting(false); + } + } + + const m = model.metrics; + + return ( +
+
setExpanded(!expanded)} + > +
+
+ {model.name} + + {model.category} + + {model.aspect && ( + + {model.aspect} + + )} +
+
+ {new Date(model.created_at).toLocaleDateString()} + {expanded ? "▲" : "▼"} +
+
+
+ AUC: {m.val_auc?.toFixed(4) ?? "—"} + F1: {m.val_f1?.toFixed(4) ?? "—"} + Precision: {m.val_precision?.toFixed(4) ?? "—"} + Recall: {m.val_recall?.toFixed(4) ?? "—"} + Positive rate: {m.positive_rate != null ? `${(m.positive_rate * 100).toFixed(2)}%` : "—"} +
+
+ + {expanded && ( +
+ {/* Validation metrics */} +
+

Validation metrics

+
+ + + + +
+
+ Train samples: {m.train_samples?.toLocaleString()} + Val samples: {m.val_samples?.toLocaleString()} +
+
+ + {/* Feature importance */} +
+

Feature importance (gain)

+ +
+ + {/* Download reranked TSV */} + {model.prediction_set_id && ( +
+

Download re-ranked predictions

+ + ↓ Download reranked TSV + +
+ )} + + {/* Compute CAFA metrics */} +
+

Compute CAFA metrics

+
+
+ + +
+
+ + +
+
+ + +

{CATEGORY_HINTS[metricsCategory]}

+
+
+ + {metricsError &&

{metricsError}

} + {metrics && ( +
+
+ + + + + + +
+ {metrics.curve && metrics.curve.length > 0 && ( +

{metrics.curve.length} PR curve points computed

+ )} +
+ )} +
+ + {/* Source info */} +
+ Prediction set: {model.prediction_set_id ? shortId(model.prediction_set_id) : "—"} + Evaluation set: {model.evaluation_set_id ? shortId(model.evaluation_set_id) : "—"} + ID: {shortId(model.id)} +
+ + {/* Delete */} +
+ +
+
+ )} +
+ ); +} + +// --------------------------------------------------------------------------- +// Main page +// --------------------------------------------------------------------------- + +export default function RerankerPage() { + const [rerankers, setRerankers] = useState([]); + const [predictionSets, setPredictionSets] = useState([]); + const [evaluationSets, setEvaluationSets] = useState([]); + const [annotationSets, setAnnotationSets] = useState([]); + const [loading, setLoading] = useState(true); + const [error, setError] = useState(null); + + // Train form + const [trainName, setTrainName] = useState(""); + const [trainPsId, setTrainPsId] = useState(""); + const [trainEsId, setTrainEsId] = useState(""); + const [trainCategory, setTrainCategory] = useState("nk"); + const [trainAspect, setTrainAspect] = useState(""); + const [trainNegPosRatio, setTrainNegPosRatio] = useState(""); + const [extraPairs, setExtraPairs] = useState<{ psId: string; esId: string }[]>([]); + const [training, setTraining] = useState(false); + const [trainError, setTrainError] = useState(null); + + async function loadAll() { + setLoading(true); + setError(null); + try { + const [r, ps, es, as_] = await Promise.all([ + listRerankers(), + listPredictionSets(), + listEvaluationSets(), + listAnnotationSets(), + ]); + setRerankers(r); + setPredictionSets(ps); + setEvaluationSets(es); + setAnnotationSets(as_); + } catch (e: any) { + setError(e.message ?? 
"Failed to load data"); + } finally { + setLoading(false); + } + } + + useEffect(() => { loadAll(); }, []); + + async function handleTrain() { + if (!trainName.trim() || !trainPsId || !trainEsId) return; + setTraining(true); + setTrainError(null); + try { + const validExtraPairs = extraPairs + .filter((p) => p.psId && p.esId) + .map((p) => ({ prediction_set_id: p.psId, evaluation_set_id: p.esId })); + const model = await trainReranker({ + name: trainName.trim(), + prediction_set_id: trainPsId, + evaluation_set_id: trainEsId, + category: trainCategory, + aspect: trainAspect || null, + neg_pos_ratio: trainNegPosRatio ? parseFloat(trainNegPosRatio) : null, + extra_pairs: validExtraPairs.length > 0 ? validExtraPairs : undefined, + }); + setRerankers((prev) => [...prev, model]); + setTrainName(""); + } catch (e: any) { + setTrainError(e.message ?? "Training failed"); + } finally { + setTraining(false); + } + } + + return ( + <> +

Re-ranker Models

+ + 0, href: "/functional-annotation" }, + { label: `${evaluationSets.length} evaluation set(s)`, met: evaluationSets.length > 0, href: "/evaluation" }, + ]} + nextStep={{ label: "Evaluation", href: "/evaluation" }} + /> +

+ LightGBM binary classifiers trained on temporal holdout data (CAFA protocol). + A re-ranker uses alignment, taxonomy, and aggregate features to re-score GO predictions + with calibrated probabilities, replacing the raw embedding distance ranking. +

+ + {/* Train new reranker */} +
+

Train new re-ranker

+
+
+ + setTrainName(e.target.value)} + placeholder="e.g. reranker-nk-bpo-v1" + className="w-full rounded-md border border-gray-300 px-3 py-2 text-sm focus:outline-none focus:ring-2 focus:ring-blue-500" + /> +
+
+ + +
+
+ + +
+
+ + +

{CATEGORY_HINTS[trainCategory]}

+
+
+ + +
+
+ + setTrainNegPosRatio(e.target.value)} + className="w-full rounded-md border border-gray-300 px-3 py-2 text-sm focus:outline-none focus:ring-2 focus:ring-blue-500" + /> +
+
+ + {/* Extra training pairs */} +
+
+ + +
+ {extraPairs.map((pair, i) => ( +
+ + + +
+ ))} + {extraPairs.length > 0 && ( +

+ Data from all pairs will be concatenated before training a single model. + {extraPairs.filter((p) => p.psId && p.esId).length > 0 && + ` (${1 + extraPairs.filter((p) => p.psId && p.esId).length} pairs total)`} +

+ )} +
+ +
+ + {trainPsId && trainEsId && ( + + ↓ Preview training data TSV + + )} +
+ {trainError &&

{trainError}

} +
+ + {/* List of rerankers */} + {loading &&

Loading...

} + {error &&

{error}

} + + {!loading && rerankers.length === 0 && ( +
+ No re-ranker models trained yet. Use the form above to train one. +
+ )} + +
+ {rerankers.map((model) => ( + setRerankers((prev) => prev.filter((r) => r.id !== model.id))} + /> + ))} +
+ + ); +} diff --git a/apps/web/components/AnnotateForm.tsx b/apps/web/components/AnnotateForm.tsx new file mode 100644 index 0000000..e28e1cf --- /dev/null +++ b/apps/web/components/AnnotateForm.tsx @@ -0,0 +1,302 @@ +"use client"; + +import { useState, useRef, useCallback, useEffect } from "react"; +import { useRouter } from "next/navigation"; +import { useTranslations } from "next-intl"; +import { + annotateProteins, + getJob, + launchPredictGoTerms, + listPredictionSets, + type AnnotateResult, +} from "@/lib/api"; + +type Stage = "idle" | "uploading" | "embedding" | "predicting" | "done" | "error"; + +const POLL_MS = 3_000; + +const EXAMPLE_FASTA = `>sp|P04637|P53_HUMAN Cellular tumor antigen p53 +MEEPQSDPSVEPPLSQETFSDLWKLLPENNVLSPLPSQAMDDLMLSPDDIEQWFTEDPGP +DEAPRMPEAAPPVAPAPAAPTPAAPAPAPSWPLSSSVPSQKTYPQGLNGTVNLPGRNSFEV +RVCACPGRDRRTEEENLHKTTGIDSFLHPEVEYFTPETDPAGPMCSRHFYQLAKTCPVQLW +VDSTPPPGTRVRAMAIYKQSQHMTEVVRRCPHERCTCGGNHGISTTTGICLICQFFLVHKP +>sp|P38398|BRCA1_HUMAN Breast cancer type 1 susceptibility protein +MDLSALRVEEVQNVINAMQKILECPICLELIKEPVSTKCDHIFCKFCMLKLLNQKKGPSQC +PLCKNDITKRSLQESTRFSQLVEELLKIICAFQLDTGLEYANSYNFAKKENNSPEHLKDEV +SIIQSMGYRNRAKRLLQSEPENPSLQETSLSVQLSNLGTVRTLRTKQRIQPQKTSVYIELG`; + +export function AnnotateForm() { + const t = useTranslations("home"); + const router = useRouter(); + + const [fasta, setFasta] = useState(""); + const [stage, setStage] = useState("idle"); + const [error, setError] = useState(null); + const [progress, setProgress] = useState(""); + const [predictionSetId, setPredictionSetId] = useState(null); + const [rerankerId, setRerankerId] = useState(null); + const fileRef = useRef(null); + const abortRef = useRef(false); + + // Drag-and-drop state + const [dragOver, setDragOver] = useState(false); + + const handleFile = (file: File) => { + const reader = new FileReader(); + reader.onload = (e) => { + const text = e.target?.result; + if (typeof text === "string") setFasta(text); + }; + reader.readAsText(file); + }; + + const 
handleDrop = (e: React.DragEvent) => { + e.preventDefault(); + setDragOver(false); + const file = e.dataTransfer.files?.[0]; + if (file) handleFile(file); + }; + + const pollJob = useCallback( + async (jobId: string): Promise<"succeeded" | "failed"> => { + while (!abortRef.current) { + try { + const job = await getJob(jobId); + if (job.progress_total && job.progress_current) { + const pct = Math.round((job.progress_current / job.progress_total) * 100); + setProgress(`${pct}%`); + } + if (job.status === "succeeded") return "succeeded"; + if (job.status === "failed" || job.status === "cancelled") return "failed"; + } catch { + // transient error, keep polling + } + await new Promise((r) => setTimeout(r, POLL_MS)); + } + return "failed"; + }, + [], + ); + + const handleSubmit = async () => { + if (!fasta.trim()) return; + abortRef.current = false; + setError(null); + setStage("uploading"); + setProgress(""); + + try { + // Step 1: Upload FASTA + create embedding job + setProgress(t("annotateUploading" as any)); + const result: AnnotateResult = await annotateProteins({ + fastaText: fasta, + name: `Annotation ${new Date().toISOString().slice(0, 16)}`, + }); + + // Step 2: Poll embedding job + setStage("embedding"); + setProgress("0%"); + const embedResult = await pollJob(result.embedding_job_id); + if (embedResult === "failed") { + throw new Error("Embedding computation failed"); + } + + // Step 3: Launch prediction + setStage("predicting"); + setProgress("0%"); + const predictJob = await launchPredictGoTerms(result.predict_payload as Parameters[0]); + + // Step 4: Poll prediction job + const predictResult = await pollJob(predictJob.id); + if (predictResult === "failed") { + throw new Error("Prediction failed"); + } + + // Step 5: Find the prediction set created for this query_set + const sets = await listPredictionSets(); + const match = sets.find( + (s) => + (s as any).query_set_id === result.query_set_id && + s.embedding_config_id === result.embedding_config_id, + ); 
+ if (match) { + setPredictionSetId(match.id); + } + if (result.reranker_id) { + setRerankerId(result.reranker_id); + } + + setStage("done"); + setProgress(""); + } catch (err: any) { + setStage("error"); + setError(err?.message ?? "Unknown error"); + } + }; + + // Auto-redirect when done + useEffect(() => { + if (stage === "done" && predictionSetId) { + const timer = setTimeout(() => { + const qs = rerankerId ? `?reranker_id=${rerankerId}` : ""; + router.push(`/functional-annotation/${predictionSetId}${qs}`); + }, 1500); + return () => clearTimeout(timer); + } + }, [stage, predictionSetId, rerankerId, router]); + + // Cleanup on unmount + useEffect(() => { + return () => { + abortRef.current = true; + }; + }, []); + + const isRunning = stage === "uploading" || stage === "embedding" || stage === "predicting"; + + return ( +
+

+ {t("annotateTitle" as any)} +

+

+ {t("annotateDescription" as any)} +

+ + {/* FASTA input */} +
{ + e.preventDefault(); + setDragOver(true); + }} + onDragLeave={() => setDragOver(false)} + onDrop={handleDrop} + > +