diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml index d6d306a..60ddd05 100644 --- a/.github/workflows/docker.yml +++ b/.github/workflows/docker.yml @@ -10,7 +10,6 @@ on: env: REGISTRY: ghcr.io IMAGE_NAME: ${{ github.repository }} - FORCE_JAVASCRIPT_ACTIONS_TO_NODE24: true jobs: build-backend: @@ -20,7 +19,7 @@ jobs: packages: write steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v6 - name: Set up Docker Buildx uses: docker/setup-buildx-action@v3 @@ -59,7 +58,7 @@ jobs: packages: write steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v6 - name: Set up Docker Buildx uses: docker/setup-buildx-action@v3 diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml index 2990e78..9b3eca3 100644 --- a/.github/workflows/docs.yml +++ b/.github/workflows/docs.yml @@ -4,9 +4,6 @@ on: push: pull_request: -env: - FORCE_JAVASCRIPT_ACTIONS_TO_NODE24: true - jobs: docs: runs-on: ubuntu-22.04 @@ -17,9 +14,9 @@ jobs: poetry-version: ["2.1.0"] steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v6 - - uses: actions/setup-python@v5 + - uses: actions/setup-python@v6 with: python-version: ${{ matrix.python-version }} diff --git a/.github/workflows/integration.yml b/.github/workflows/integration.yml index 61d5b39..142365c 100644 --- a/.github/workflows/integration.yml +++ b/.github/workflows/integration.yml @@ -5,9 +5,6 @@ on: branches: [main] pull_request: -env: - FORCE_JAVASCRIPT_ACTIONS_TO_NODE24: true - jobs: integration: runs-on: ubuntu-22.04 @@ -33,9 +30,9 @@ jobs: --health-retries 5 steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v6 - - uses: actions/setup-python@v5 + - uses: actions/setup-python@v6 with: python-version: ${{ matrix.python-version }} diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index a757709..ae0079d 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -4,9 +4,6 @@ on: push: pull_request: -env: - 
FORCE_JAVASCRIPT_ACTIONS_TO_NODE24: true - jobs: lint: runs-on: ubuntu-22.04 @@ -17,9 +14,9 @@ jobs: poetry-version: ["2.1.0"] steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v6 - - uses: actions/setup-python@v5 + - uses: actions/setup-python@v6 with: python-version: ${{ matrix.python-version }} diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index b1ed5d4..db75c06 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -4,9 +4,6 @@ on: push: pull_request: -env: - FORCE_JAVASCRIPT_ACTIONS_TO_NODE24: true - jobs: test: runs-on: ubuntu-22.04 @@ -17,9 +14,9 @@ jobs: poetry-version: ["2.1.0"] steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v6 - - uses: actions/setup-python@v5 + - uses: actions/setup-python@v6 with: python-version: ${{ matrix.python-version }} diff --git a/Dockerfile b/Dockerfile index 0df0128..17c1ecc 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,4 +1,5 @@ -FROM python:3.12-slim +# ── Stage 1: build dependencies ────────────────────────────────────────────── +FROM python:3.12-slim AS builder RUN apt-get update && apt-get install -y \ build-essential \ @@ -8,16 +9,30 @@ RUN apt-get update && apt-get install -y \ WORKDIR /app -# Install Poetry and dependencies first (layer cache) RUN pip install --no-cache-dir poetry==2.1.0 COPY pyproject.toml poetry.lock ./ RUN poetry config virtualenvs.create false \ && poetry install --without dev --no-root --no-interaction --no-ansi -# Copy source COPY protea/ ./protea/ RUN poetry install --without dev --no-interaction --no-ansi + +# ── Stage 2: runtime ──────────────────────────────────────────────────────── +FROM python:3.12-slim + +RUN apt-get update && apt-get install -y \ + libpq5 \ + && rm -rf /var/lib/apt/lists/* + +WORKDIR /app + +# Copy installed packages from builder +COPY --from=builder /usr/local/lib/python3.12/site-packages /usr/local/lib/python3.12/site-packages +COPY --from=builder /usr/local/bin /usr/local/bin + +# Copy 
application code +COPY protea/ ./protea/ COPY scripts/ ./scripts/ COPY alembic/ ./alembic/ COPY alembic.ini ./ @@ -25,7 +40,10 @@ COPY alembic.ini ./ ENV PYTHONUNBUFFERED=1 EXPOSE 8000 +HEALTHCHECK --interval=30s --timeout=5s --retries=3 \ + CMD python -c "import urllib.request; urllib.request.urlopen('http://localhost:8000/health')" || exit 1 + # Default: API server # Override CMD to run a worker: # docker run protea python scripts/worker.py --queue protea.jobs -CMD ["uvicorn", "protea.api.app:app", "--host", "0.0.0.0", "--port", "8000"] +CMD ["uvicorn", "protea.api.app:create_app", "--factory", "--host", "0.0.0.0", "--port", "8000"] diff --git a/RERANKER.md b/RERANKER.md new file mode 100644 index 0000000..2301546 --- /dev/null +++ b/RERANKER.md @@ -0,0 +1,188 @@ +# Temporal Holdout Re-Ranker for GO Term Prediction + +## Motivación + +El pipeline actual de PROTEA transfiere anotaciones GO mediante KNN sobre embeddings ESM, usando un scoring heurístico que combina distancia de embedding y pesos de evidencia. Este scoring no está optimizado para la métrica objetivo (Fmax) ni para el comportamiento real de las anotaciones GO a lo largo del tiempo. + +La hipótesis central es que existe una señal aprendible: **dado el contexto de una predicción KNN, ¿acabará este GO term apareciendo en el siguiente release de GOA para esta proteína?** Esta señal puede extraerse directamente del mecanismo de holdout temporal que ya implementa PROTEA. + +--- + +## Formulación del Problema + +Sea $\mathcal{G}_N$ el conjunto de anotaciones GO en el release $N$ de GOA (Swiss-Prot reviewed). 
Para cada par consecutivo $(\mathcal{G}_N, \mathcal{G}_{N+1})$, el delta temporal es: + +$$\Delta_{N \to N+1} = \{(p, t) \mid (p, t) \in \mathcal{G}_{N+1} \setminus \mathcal{G}_N\}$$ + +El re-ranker aprende una función: + +$$f(q, t, \mathcal{N}_K(q)) \to \hat{y} \in [0, 1]$$ + +donde: +- $q$ es la proteína query (representada por su embedding ESM) +- $t$ es el GO term candidato +- $\mathcal{N}_K(q)$ es el conjunto de $K$ vecinos más cercanos en el espacio de embeddings con referencia $\mathcal{G}_N$ +- $\hat{y}$ es la probabilidad de que $(q, t) \in \Delta_{N \to N+1}$ + +--- + +## Protocolo de Entrenamiento + +Se utiliza validación cruzada temporal con múltiples splits históricos de GOA: + +``` +Training splits: + GOA_190 → GOA_195 + GOA_195 → GOA_200 + GOA_200 → GOA_205 + GOA_205 → GOA_211 + GOA_211 → GOA_215 + GOA_215 → GOA_220 + +Test split (holdout estricto, nunca visto durante training): + GOA_220 → GOA_229 +``` + +Para cada split se generan ejemplos etiquetados: positivos $(y=1)$ si el par (proteína, GO term) aparece en el delta, negativos $(y=0)$ en caso contrario. El desbalanceo esperado es aproximadamente 1:10, manejable con técnicas estándar. + +--- + +## Arquitectura: Cross-Attention Re-Ranker + +El modelo procesa cada par (query, GO term) usando el contexto completo de los vecinos KNN que contribuyeron a esa predicción. + +``` +Inputs por predicción (query_protein, go_term): + query_embedding float32[D] ESM embedding del query (D=480 para esmc_300m) + neighbor_embeddings float32[K × D] ESM embeddings de los K vecinos contribuyentes + tabular_features float32[K × F] distancia, evidencia, alineamiento, taxonomía... + go_term_embedding float32[G] embedding semántico del GO term (G=64) + +Arquitectura: + 1. query_proj(query_embedding) → q [H=256] + 2. ref_proj(neighbor_embeddings) → tokens [K × H] + 3. feature_encoder(tabular_features) → (sumado a tokens) + 4. CrossAttention(q, tokens, tokens) → context [H] + 5. 
MLP([q ‖ context ‖ go_emb ‖ agg_features]) → score [1] +``` + +La atención cruzada permite al modelo aprender **qué vecinos son más informativos para este query concreto**, en lugar de agregar los scores de forma heurística. + +### GO Term Embeddings + +Los embeddings de los GO terms se aprenden a partir de la estructura del DAG de GO (relaciones `is_a` / `part_of`) mediante Node2Vec o TransE, de forma que términos semánticamente relacionados (padre-hijo) tengan representaciones similares. El DAG ya está disponible en PROTEA a través de los modelos `GOTerm` y `GOTermRelationship`. + +--- + +## Feature Vector + +Cada predicción (query, GO term) se caracteriza por las siguientes features tabulares, computadas por vecino que contribuyó a la predicción: + +| Feature | Descripción | Estado | +|---|---|---| +| `distance` | Distancia coseno en espacio de embeddings | Existente | +| `evidence_weight` | Peso del código de evidencia (IDA > IEA) | Existente | +| `identity_nw / sw` | Identidad de secuencia (alineamiento NW/SW) | Existente (opcional) | +| `similarity_nw / sw` | Similitud de secuencia | Existente (opcional) | +| `taxonomic_distance` | Distancia taxonómica entre query y referencia | Existente (opcional) | +| `vote_count` | Número de vecinos que coinciden en este GO term | **Nuevo** | +| `k_position` | Posición del vecino más cercano que predijo este término | **Nuevo** | +| `go_term_frequency` | Frecuencia del término en el annotation set de referencia | **Nuevo** | +| `ref_annotation_density` | Número de GO terms de la proteína de referencia | **Nuevo** | +| `neighbor_distance_std` | Desviación estándar de las distancias a los K vecinos | **Nuevo** | + +--- + +## Función de Pérdida + +Se utiliza **LambdaRank** en lugar de binary cross-entropy, ya que optimiza directamente el orden de las predicciones (proxy de NDCG / Fmax) en lugar de la calibración de probabilidades. 
+ +Para cada proteína query, las predicciones GO se rankean conjuntamente: +- Positivos: GO terms en $\Delta_{N \to N+1}$ +- Negativos: GO terms predichos pero no en el delta + +--- + +## Pipeline de Datos: WebDataset + +El volumen de datos (múltiples splits × ~1.35M predicciones por split × embeddings de 480 dim) requiere un pipeline de datos eficiente. Se propone almacenar los ejemplos de entrenamiento en formato **WebDataset** (shards tar), con un shard por split GOA: + +``` +reranker_data/ + splits/ + goa190_to_195.tar # ~2GB por shard + goa195_to_200.tar + ... + goa220_to_229.tar # test split — no tocar durante training + models/ + reranker_v1.pt + reranker_v1_config.json +``` + +Cada muestra en el WebDataset es **una proteína query** con todas sus predicciones GO para ese split: + +```python +{ + "query_accession": "P12345", + "query_embedding": float32[480], + "go_term_ids": ["GO:0006915", "GO:0005737", ...], # N_preds + "neighbor_embeddings": float32[N_preds, K, 480], + "tabular_features": float32[N_preds, K, F], + "labels": int8[N_preds], # 1 si en delta, 0 si no +} +``` + +El streaming de WebDataset permite entrenar sin cargar todo en RAM. + +--- + +## Stack Tecnológico + +| Componente | Tecnología | +|---|---| +| Modelo | PyTorch | +| Data pipeline | WebDataset + torch.utils.data | +| Baseline comparación | LightGBM (binary + LambdaRank) | +| GO embeddings | Node2Vec / PyTorch Geometric | +| Seguimiento experimentos | wandb | +| Embeddings proteína | ESM2 / ESMC (ya en PROTEA) | + +--- + +## Integración en PROTEA + +Una vez entrenado, el re-ranker se integra en el pipeline existente: + +1. Nuevo modelo ORM `RerankingModel`: almacena pesos serializados y metadata de entrenamiento +2. Campo `reranker_id` (nullable) en `PredictionSet` +3. Si `reranker_id` presente: `store_predictions` aplica el modelo y sobreescribe `score` con $\hat{y}$ +4. El threshold de Fmax se calcula igual que ahora sobre los nuevos scores +5. 
UI: selector de re-ranker en la pantalla de predicción + +--- + +## Experimentos y Ablaciones + +El diseño permite comparar directamente: + +| Configuración | Descripción | +|---|---| +| **Baseline** | KNN + scoring heurístico actual | +| **LightGBM tabular** | Re-ranker con features tabulares sin embeddings | +| **LightGBM + derived** | Features tabulares + features derivadas del embedding (density, std) | +| **MLP cross-encoder** | Arquitectura completa sin cross-attention | +| **Cross-attention (propuesto)** | Arquitectura completa | +| **+ GO DAG embeddings** | Ablación: ¿aportan los go_term_emb? | +| **+ temporal CV** | Ablación: ¿mejora añadir más splits históricos? | + +La métrica principal es **Fmax promedio sobre los 9 settings** (NK/LK/PK × BPO/MFO/CCO) en el test split GOA220→229. + +--- + +## Valor para la Tesis + +1. **Científicamente honesto**: el mismo mecanismo temporal que se usa para evaluar se usa para entrenar. No hay data leakage. +2. **Comprobable y cuantificable**: Fmax(baseline KNN) vs Fmax(re-ranker) en benchmark idéntico. +3. **Interpretable**: las feature importances (LightGBM) o los pesos de atención (cross-attention) revelan qué aspectos de una predicción KNN son más predictivos de anotaciones futuras. +4. **Generalizable**: el re-ranker aprende sobre distribuciones temporales de anotaciones GO, no sobre una proteína concreta — debería generalizar a proteínas no vistas. +5. **Extensible**: la arquitectura admite incorporar embeddings de secuencia de mayor calidad (ESM3, ProstT5) sin cambiar el pipeline. diff --git a/alembic/env.py b/alembic/env.py index 6cb72b8..ba6ce44 100644 --- a/alembic/env.py +++ b/alembic/env.py @@ -1,8 +1,7 @@ from logging.config import fileConfig from pathlib import Path -from sqlalchemy import engine_from_config -from sqlalchemy import pool +from sqlalchemy import engine_from_config, pool from alembic import context @@ -17,14 +16,14 @@ # Wire PROTEA's ORM metadata so autogenerate works. 
# All model modules must be imported before Base.metadata is used. -from protea.infrastructure.orm.base import Base -import protea.infrastructure.orm.models # noqa: F401 — registers all mappers +import protea.infrastructure.orm.models # noqa: E402, F401 — registers all mappers +from protea.infrastructure.orm.base import Base # noqa: E402 target_metadata = Base.metadata # Override the DB URL from PROTEA's settings rather than relying on the # placeholder value in alembic.ini. -from protea.infrastructure.settings import load_settings +from protea.infrastructure.settings import load_settings # noqa: E402 _project_root = Path(__file__).resolve().parents[1] _settings = load_settings(_project_root) diff --git a/alembic/versions/110a5b8cfbb9_add_reranker_model_id_to_evaluation_.py b/alembic/versions/110a5b8cfbb9_add_reranker_model_id_to_evaluation_.py new file mode 100644 index 0000000..4164b2d --- /dev/null +++ b/alembic/versions/110a5b8cfbb9_add_reranker_model_id_to_evaluation_.py @@ -0,0 +1,36 @@ +"""add reranker_model_id to evaluation_result + +Revision ID: 110a5b8cfbb9 +Revises: ba9966bd453e +Create Date: 2026-03-19 10:52:11.951459 + +""" +from collections.abc import Sequence + +import sqlalchemy as sa + +from alembic import op + +# revision identifiers, used by Alembic. +revision: str = '110a5b8cfbb9' +down_revision: str | Sequence[str] | None = 'ba9966bd453e' +branch_labels: str | Sequence[str] | None = None +depends_on: str | Sequence[str] | None = None + + +def upgrade() -> None: + """Add evaluation_result.reranker_model_id (nullable UUID) with an index and an ON DELETE SET NULL foreign key to reranker_model.id.""" + # ### commands auto generated by Alembic - please adjust! 
### + op.add_column('evaluation_result', sa.Column('reranker_model_id', sa.UUID(), nullable=True)) + op.create_index(op.f('ix_evaluation_result_reranker_model_id'), 'evaluation_result', ['reranker_model_id'], unique=False) + op.create_foreign_key('evaluation_result_reranker_model_id_fkey', 'evaluation_result', 'reranker_model', ['reranker_model_id'], ['id'], ondelete='SET NULL') + # ### end Alembic commands ### + + +def downgrade() -> None: + """Drop the foreign key, index and column added by upgrade(); the FK is dropped by explicit name because Alembic cannot drop an unnamed constraint.""" + # ### commands auto generated by Alembic - please adjust! ### + op.drop_constraint('evaluation_result_reranker_model_id_fkey', 'evaluation_result', type_='foreignkey') + op.drop_index(op.f('ix_evaluation_result_reranker_model_id'), table_name='evaluation_result') + op.drop_column('evaluation_result', 'reranker_model_id') + # ### end Alembic commands ### diff --git a/alembic/versions/1f0ac8aa38a4_add_evaluation_set.py b/alembic/versions/1f0ac8aa38a4_add_evaluation_set.py index d816d02..9602ee8 100644 --- a/alembic/versions/1f0ac8aa38a4_add_evaluation_set.py +++ b/alembic/versions/1f0ac8aa38a4_add_evaluation_set.py @@ -5,17 +5,18 @@ Create Date: 2026-03-12 22:13:05.918342 """ -from typing import Sequence, Union +from collections.abc import Sequence -from alembic import op import sqlalchemy as sa from sqlalchemy.dialects import postgresql +from alembic import op + # revision identifiers, used by Alembic. 
revision: str = '1f0ac8aa38a4' -down_revision: Union[str, Sequence[str], None] = 'a7b8c9d0e1f2' -branch_labels: Union[str, Sequence[str], None] = None -depends_on: Union[str, Sequence[str], None] = None +down_revision: str | Sequence[str] | None = 'a7b8c9d0e1f2' +branch_labels: str | Sequence[str] | None = None +depends_on: str | Sequence[str] | None = None def upgrade() -> None: diff --git a/alembic/versions/3505bfa74df6_add_aspect_to_reranker_model_and_.py b/alembic/versions/3505bfa74df6_add_aspect_to_reranker_model_and_.py new file mode 100644 index 0000000..98f21bd --- /dev/null +++ b/alembic/versions/3505bfa74df6_add_aspect_to_reranker_model_and_.py @@ -0,0 +1,35 @@ +"""add aspect to reranker_model and reranker_config to evaluation_result + +Revision ID: 3505bfa74df6 +Revises: 110a5b8cfbb9 +Create Date: 2026-03-19 15:16:18.474851 + +""" +from collections.abc import Sequence + +import sqlalchemy as sa +from sqlalchemy.dialects import postgresql + +from alembic import op + +# revision identifiers, used by Alembic. +revision: str = '3505bfa74df6' +down_revision: str | Sequence[str] | None = '110a5b8cfbb9' +branch_labels: str | Sequence[str] | None = None +depends_on: str | Sequence[str] | None = None + + +def upgrade() -> None: + """Upgrade schema.""" + # ### commands auto generated by Alembic - please adjust! ### + op.add_column('evaluation_result', sa.Column('reranker_config', postgresql.JSONB(astext_type=sa.Text()), nullable=True)) + op.add_column('reranker_model', sa.Column('aspect', sa.String(length=3), nullable=True)) + # ### end Alembic commands ### + + +def downgrade() -> None: + """Downgrade schema.""" + # ### commands auto generated by Alembic - please adjust! 
### + op.drop_column('reranker_model', 'aspect') + op.drop_column('evaluation_result', 'reranker_config') + # ### end Alembic commands ### diff --git a/alembic/versions/3884c47fe946_add_reranker_feature_columns_to_go_.py b/alembic/versions/3884c47fe946_add_reranker_feature_columns_to_go_.py new file mode 100644 index 0000000..fa60e23 --- /dev/null +++ b/alembic/versions/3884c47fe946_add_reranker_feature_columns_to_go_.py @@ -0,0 +1,40 @@ +"""add reranker feature columns to go_prediction + +Revision ID: 3884c47fe946 +Revises: 5fc2eb0f986d +Create Date: 2026-03-18 13:40:17.716092 + +""" +from collections.abc import Sequence + +import sqlalchemy as sa + +from alembic import op + +# revision identifiers, used by Alembic. +revision: str = '3884c47fe946' +down_revision: str | Sequence[str] | None = '5fc2eb0f986d' +branch_labels: str | Sequence[str] | None = None +depends_on: str | Sequence[str] | None = None + + +def upgrade() -> None: + """Upgrade schema.""" + # ### commands auto generated by Alembic - please adjust! ### + op.add_column('go_prediction', sa.Column('vote_count', sa.Integer(), nullable=True)) + op.add_column('go_prediction', sa.Column('k_position', sa.Integer(), nullable=True)) + op.add_column('go_prediction', sa.Column('go_term_frequency', sa.Integer(), nullable=True)) + op.add_column('go_prediction', sa.Column('ref_annotation_density', sa.Integer(), nullable=True)) + op.add_column('go_prediction', sa.Column('neighbor_distance_std', sa.Float(), nullable=True)) + # ### end Alembic commands ### + + +def downgrade() -> None: + """Downgrade schema.""" + # ### commands auto generated by Alembic - please adjust! 
### + op.drop_column('go_prediction', 'neighbor_distance_std') + op.drop_column('go_prediction', 'ref_annotation_density') + op.drop_column('go_prediction', 'go_term_frequency') + op.drop_column('go_prediction', 'k_position') + op.drop_column('go_prediction', 'vote_count') + # ### end Alembic commands ### diff --git a/alembic/versions/47de89cf6fec_add_evaluation_result.py b/alembic/versions/47de89cf6fec_add_evaluation_result.py index e7c0792..9376c6c 100644 --- a/alembic/versions/47de89cf6fec_add_evaluation_result.py +++ b/alembic/versions/47de89cf6fec_add_evaluation_result.py @@ -5,17 +5,18 @@ Create Date: 2026-03-12 22:27:34.042479 """ -from typing import Sequence, Union +from collections.abc import Sequence -from alembic import op import sqlalchemy as sa from sqlalchemy.dialects import postgresql +from alembic import op + # revision identifiers, used by Alembic. revision: str = '47de89cf6fec' -down_revision: Union[str, Sequence[str], None] = '1f0ac8aa38a4' -branch_labels: Union[str, Sequence[str], None] = None -depends_on: Union[str, Sequence[str], None] = None +down_revision: str | Sequence[str] | None = '1f0ac8aa38a4' +branch_labels: str | Sequence[str] | None = None +depends_on: str | Sequence[str] | None = None def upgrade() -> None: diff --git a/alembic/versions/489835ed5b31_add_composite_index_pga_set_accession.py b/alembic/versions/489835ed5b31_add_composite_index_pga_set_accession.py index b99dc62..de393fa 100644 --- a/alembic/versions/489835ed5b31_add_composite_index_pga_set_accession.py +++ b/alembic/versions/489835ed5b31_add_composite_index_pga_set_accession.py @@ -5,17 +5,15 @@ Create Date: 2026-03-15 11:17:30.865922 """ -from typing import Sequence, Union +from collections.abc import Sequence from alembic import op -import sqlalchemy as sa - # revision identifiers, used by Alembic. 
revision: str = '489835ed5b31' -down_revision: Union[str, Sequence[str], None] = '7737a352d4fe' -branch_labels: Union[str, Sequence[str], None] = None -depends_on: Union[str, Sequence[str], None] = None +down_revision: str | Sequence[str] | None = '7737a352d4fe' +branch_labels: str | Sequence[str] | None = None +depends_on: str | Sequence[str] | None = None def upgrade() -> None: diff --git a/alembic/versions/4f38043a5e41_add_parent_job_id.py b/alembic/versions/4f38043a5e41_add_parent_job_id.py index 56d6655..ee3c908 100644 --- a/alembic/versions/4f38043a5e41_add_parent_job_id.py +++ b/alembic/versions/4f38043a5e41_add_parent_job_id.py @@ -5,17 +5,17 @@ Create Date: 2026-03-09 11:55:12.264352 """ -from typing import Sequence, Union +from collections.abc import Sequence -from alembic import op import sqlalchemy as sa +from alembic import op # revision identifiers, used by Alembic. revision: str = '4f38043a5e41' -down_revision: Union[str, Sequence[str], None] = 'a1b2c3d4e5f6' -branch_labels: Union[str, Sequence[str], None] = None -depends_on: Union[str, Sequence[str], None] = None +down_revision: str | Sequence[str] | None = 'a1b2c3d4e5f6' +branch_labels: str | Sequence[str] | None = None +depends_on: str | Sequence[str] | None = None def upgrade() -> None: diff --git a/alembic/versions/513355a1d933_add_scoring_config_id_to_evaluation_.py b/alembic/versions/513355a1d933_add_scoring_config_id_to_evaluation_.py index 1890a22..4b856ae 100644 --- a/alembic/versions/513355a1d933_add_scoring_config_id_to_evaluation_.py +++ b/alembic/versions/513355a1d933_add_scoring_config_id_to_evaluation_.py @@ -5,17 +5,17 @@ Create Date: 2026-03-15 12:37:19.930750 """ -from typing import Sequence, Union +from collections.abc import Sequence -from alembic import op import sqlalchemy as sa +from alembic import op # revision identifiers, used by Alembic. 
revision: str = '513355a1d933' -down_revision: Union[str, Sequence[str], None] = '489835ed5b31' -branch_labels: Union[str, Sequence[str], None] = None -depends_on: Union[str, Sequence[str], None] = None +down_revision: str | Sequence[str] | None = '489835ed5b31' +branch_labels: str | Sequence[str] | None = None +depends_on: str | Sequence[str] | None = None def upgrade() -> None: diff --git a/alembic/versions/54e758c210c8_add_ia_url_to_ontology_snapshot.py b/alembic/versions/54e758c210c8_add_ia_url_to_ontology_snapshot.py index cde8fca..8722240 100644 --- a/alembic/versions/54e758c210c8_add_ia_url_to_ontology_snapshot.py +++ b/alembic/versions/54e758c210c8_add_ia_url_to_ontology_snapshot.py @@ -5,17 +5,18 @@ Create Date: 2026-03-16 11:42:10.636169 """ -from typing import Sequence, Union +from collections.abc import Sequence -from alembic import op import sqlalchemy as sa from sqlalchemy.dialects import postgresql +from alembic import op + # revision identifiers, used by Alembic. revision: str = '54e758c210c8' -down_revision: Union[str, Sequence[str], None] = 'c1d2e3f4a5b6' -branch_labels: Union[str, Sequence[str], None] = None -depends_on: Union[str, Sequence[str], None] = None +down_revision: str | Sequence[str] | None = 'c1d2e3f4a5b6' +branch_labels: str | Sequence[str] | None = None +depends_on: str | Sequence[str] | None = None def upgrade() -> None: diff --git a/alembic/versions/5fc2eb0f986d_add_composite_indexes_for_knn.py b/alembic/versions/5fc2eb0f986d_add_composite_indexes_for_knn.py new file mode 100644 index 0000000..1fcab83 --- /dev/null +++ b/alembic/versions/5fc2eb0f986d_add_composite_indexes_for_knn.py @@ -0,0 +1,37 @@ +"""add composite indexes for KNN performance + +Revision ID: 5fc2eb0f986d +Revises: 54e758c210c8 +Create Date: 2026-03-18 12:00:00.000000 + +""" +from alembic import op + +# revision identifiers, used by Alembic. 
+revision: str = "5fc2eb0f986d" +down_revision: str = "54e758c210c8" +branch_labels = None +depends_on = None + + +def upgrade() -> None: + # Composite index for KNN GO transfer: queries are always scoped to + # a single annotation_set_id and filtered by protein_accession. + op.create_index( + "ix_pga_set_accession", + "protein_go_annotation", + ["annotation_set_id", "protein_accession"], + ) + + # Composite index for prediction export and evaluation: queries filter + # by prediction_set_id then protein_accession. + op.create_index( + "ix_go_prediction_set_accession", + "go_prediction", + ["prediction_set_id", "protein_accession"], + ) + + +def downgrade() -> None: + op.drop_index("ix_go_prediction_set_accession", table_name="go_prediction") + op.drop_index("ix_pga_set_accession", table_name="protein_go_annotation") diff --git a/alembic/versions/7737a352d4fe_merge_scoring_config_branch.py b/alembic/versions/7737a352d4fe_merge_scoring_config_branch.py index f759c30..e8c1d3f 100644 --- a/alembic/versions/7737a352d4fe_merge_scoring_config_branch.py +++ b/alembic/versions/7737a352d4fe_merge_scoring_config_branch.py @@ -5,17 +5,13 @@ Create Date: 2026-03-15 10:11:56.507967 """ -from typing import Sequence, Union - -from alembic import op -import sqlalchemy as sa - +from collections.abc import Sequence # revision identifiers, used by Alembic. 
revision: str = '7737a352d4fe' -down_revision: Union[str, Sequence[str], None] = ('47de89cf6fec', 'b1c2d3e4f5a6') -branch_labels: Union[str, Sequence[str], None] = None -depends_on: Union[str, Sequence[str], None] = None +down_revision: str | Sequence[str] | None = ('47de89cf6fec', 'b1c2d3e4f5a6') +branch_labels: str | Sequence[str] | None = None +depends_on: str | Sequence[str] | None = None def upgrade() -> None: diff --git a/alembic/versions/7c19ca08d5d4_add_support_entry_table.py b/alembic/versions/7c19ca08d5d4_add_support_entry_table.py index 599214c..b298e7a 100644 --- a/alembic/versions/7c19ca08d5d4_add_support_entry_table.py +++ b/alembic/versions/7c19ca08d5d4_add_support_entry_table.py @@ -5,17 +5,17 @@ Create Date: 2026-03-15 12:42:43.832417 """ -from typing import Sequence, Union +from collections.abc import Sequence -from alembic import op import sqlalchemy as sa +from alembic import op # revision identifiers, used by Alembic. revision: str = '7c19ca08d5d4' -down_revision: Union[str, Sequence[str], None] = '513355a1d933' -branch_labels: Union[str, Sequence[str], None] = None -depends_on: Union[str, Sequence[str], None] = None +down_revision: str | Sequence[str] | None = '513355a1d933' +branch_labels: str | Sequence[str] | None = None +depends_on: str | Sequence[str] | None = None def upgrade() -> None: diff --git a/alembic/versions/a1b2c3d4e5f6_add_esm3c_chunking_normalize_residues.py b/alembic/versions/a1b2c3d4e5f6_add_esm3c_chunking_normalize_residues.py index 7dd4e56..7b1ed45 100644 --- a/alembic/versions/a1b2c3d4e5f6_add_esm3c_chunking_normalize_residues.py +++ b/alembic/versions/a1b2c3d4e5f6_add_esm3c_chunking_normalize_residues.py @@ -19,16 +19,17 @@ + uq_seq_embedding_seq_config_chunk (sequence_id, embedding_config_id, chunk_index_s) """ -from typing import Sequence, Union +from collections.abc import Sequence import sqlalchemy as sa + from alembic import op # revision identifiers, used by Alembic. 
revision: str = "a1b2c3d4e5f6" -down_revision: Union[str, Sequence[str], None] = "cdd8510858db" -branch_labels: Union[str, Sequence[str], None] = None -depends_on: Union[str, Sequence[str], None] = None +down_revision: str | Sequence[str] | None = "cdd8510858db" +branch_labels: str | Sequence[str] | None = None +depends_on: str | Sequence[str] | None = None def upgrade() -> None: diff --git a/alembic/versions/a7b8c9d0e1f2_add_feature_engineering_to_go_prediction.py b/alembic/versions/a7b8c9d0e1f2_add_feature_engineering_to_go_prediction.py index 7d88cb1..184c96a 100644 --- a/alembic/versions/a7b8c9d0e1f2_add_feature_engineering_to_go_prediction.py +++ b/alembic/versions/a7b8c9d0e1f2_add_feature_engineering_to_go_prediction.py @@ -6,15 +6,16 @@ """ from __future__ import annotations -from typing import Sequence, Union +from collections.abc import Sequence import sqlalchemy as sa + from alembic import op revision: str = "a7b8c9d0e1f2" -down_revision: Union[str, Sequence[str], None] = "f1a2b3c4d5e6" -branch_labels: Union[str, Sequence[str], None] = None -depends_on: Union[str, Sequence[str], None] = None +down_revision: str | Sequence[str] | None = "f1a2b3c4d5e6" +branch_labels: str | Sequence[str] | None = None +depends_on: str | Sequence[str] | None = None def upgrade() -> None: diff --git a/alembic/versions/b1c2d3e4f5a6_add_scoring_config.py b/alembic/versions/b1c2d3e4f5a6_add_scoring_config.py index 5eae559..0de80aa 100644 --- a/alembic/versions/b1c2d3e4f5a6_add_scoring_config.py +++ b/alembic/versions/b1c2d3e4f5a6_add_scoring_config.py @@ -6,10 +6,11 @@ """ from __future__ import annotations -from alembic import op import sqlalchemy as sa from sqlalchemy.dialects import postgresql +from alembic import op + revision = "b1c2d3e4f5a6" down_revision = "a7b8c9d0e1f2" branch_labels = None diff --git a/alembic/versions/ba9966bd453e_add_reranker_model_table.py b/alembic/versions/ba9966bd453e_add_reranker_model_table.py new file mode 100644 index 0000000..7f9e0a5 --- 
/dev/null +++ b/alembic/versions/ba9966bd453e_add_reranker_model_table.py @@ -0,0 +1,51 @@ +"""add reranker_model table + +Revision ID: ba9966bd453e +Revises: 3884c47fe946 +Create Date: 2026-03-18 13:57:29.263810 + +""" +from collections.abc import Sequence + +import sqlalchemy as sa +from sqlalchemy.dialects import postgresql + +from alembic import op + +# revision identifiers, used by Alembic. +revision: str = 'ba9966bd453e' +down_revision: str | Sequence[str] | None = '3884c47fe946' +branch_labels: str | Sequence[str] | None = None +depends_on: str | Sequence[str] | None = None + + +def upgrade() -> None: + """Upgrade schema.""" + # ### commands auto generated by Alembic - please adjust! ### + op.create_table('reranker_model', + sa.Column('id', sa.UUID(), nullable=False), + sa.Column('name', sa.String(length=255), nullable=False), + sa.Column('prediction_set_id', sa.UUID(), nullable=True), + sa.Column('evaluation_set_id', sa.UUID(), nullable=True), + sa.Column('category', sa.String(length=10), nullable=False), + sa.Column('model_data', sa.Text(), nullable=False), + sa.Column('metrics', postgresql.JSONB(astext_type=sa.Text()), nullable=False), + sa.Column('feature_importance', postgresql.JSONB(astext_type=sa.Text()), nullable=False), + sa.Column('created_at', sa.DateTime(timezone=True), server_default=sa.text('now()'), nullable=False), + sa.ForeignKeyConstraint(['evaluation_set_id'], ['evaluation_set.id'], ondelete='SET NULL'), + sa.ForeignKeyConstraint(['prediction_set_id'], ['prediction_set.id'], ondelete='SET NULL'), + sa.PrimaryKeyConstraint('id'), + sa.UniqueConstraint('name') + ) + op.create_index(op.f('ix_reranker_model_evaluation_set_id'), 'reranker_model', ['evaluation_set_id'], unique=False) + op.create_index(op.f('ix_reranker_model_prediction_set_id'), 'reranker_model', ['prediction_set_id'], unique=False) + # ### end Alembic commands ### + + +def downgrade() -> None: + """Downgrade schema.""" + # ### commands auto generated by Alembic - please adjust! 
### + op.drop_index(op.f('ix_reranker_model_prediction_set_id'), table_name='reranker_model') + op.drop_index(op.f('ix_reranker_model_evaluation_set_id'), table_name='reranker_model') + op.drop_table('reranker_model') + # ### end Alembic commands ### diff --git a/alembic/versions/c1d2e3f4a5b6_add_evidence_weights_to_scoring_config.py b/alembic/versions/c1d2e3f4a5b6_add_evidence_weights_to_scoring_config.py index 4a3a4c9..fa88ddc 100644 --- a/alembic/versions/c1d2e3f4a5b6_add_evidence_weights_to_scoring_config.py +++ b/alembic/versions/c1d2e3f4a5b6_add_evidence_weights_to_scoring_config.py @@ -20,9 +20,10 @@ from __future__ import annotations import sqlalchemy as sa -from alembic import op from sqlalchemy.dialects import postgresql +from alembic import op + revision = "c1d2e3f4a5b6" down_revision = "7c19ca08d5d4" branch_labels = None diff --git a/alembic/versions/c3d4e5f6a7b8_add_query_set.py b/alembic/versions/c3d4e5f6a7b8_add_query_set.py index 6ff0e6e..98a6ffd 100644 --- a/alembic/versions/c3d4e5f6a7b8_add_query_set.py +++ b/alembic/versions/c3d4e5f6a7b8_add_query_set.py @@ -5,16 +5,16 @@ Create Date: 2026-03-10 00:00:00.000000 """ -from typing import Sequence, Union +from collections.abc import Sequence import sqlalchemy as sa + from alembic import op -from sqlalchemy.dialects import postgresql revision: str = "c3d4e5f6a7b8" -down_revision: Union[str, Sequence[str], None] = "4f38043a5e41" -branch_labels: Union[str, Sequence[str], None] = None -depends_on: Union[str, Sequence[str], None] = None +down_revision: str | Sequence[str] | None = "4f38043a5e41" +branch_labels: str | Sequence[str] | None = None +depends_on: str | Sequence[str] | None = None def upgrade() -> None: diff --git a/alembic/versions/cdd8510858db_initial_schema.py b/alembic/versions/cdd8510858db_initial_schema.py index 5125f7e..dfd4e5a 100644 --- a/alembic/versions/cdd8510858db_initial_schema.py +++ b/alembic/versions/cdd8510858db_initial_schema.py @@ -1,22 +1,23 @@ """initial_schema Revision ID: 
cdd8510858db -Revises: +Revises: Create Date: 2026-03-08 11:32:48.937483 """ -from typing import Sequence, Union +from collections.abc import Sequence -from alembic import op -import sqlalchemy as sa import pgvector.sqlalchemy +import sqlalchemy as sa from sqlalchemy.dialects import postgresql +from alembic import op + # revision identifiers, used by Alembic. revision: str = 'cdd8510858db' -down_revision: Union[str, Sequence[str], None] = None -branch_labels: Union[str, Sequence[str], None] = None -depends_on: Union[str, Sequence[str], None] = None +down_revision: str | Sequence[str] | None = None +branch_labels: str | Sequence[str] | None = None +depends_on: str | Sequence[str] | None = None def upgrade() -> None: diff --git a/alembic/versions/d4e5f6a7b8c9_add_query_set_id_to_prediction_set.py b/alembic/versions/d4e5f6a7b8c9_add_query_set_id_to_prediction_set.py index f1cbc97..bf51266 100644 --- a/alembic/versions/d4e5f6a7b8c9_add_query_set_id_to_prediction_set.py +++ b/alembic/versions/d4e5f6a7b8c9_add_query_set_id_to_prediction_set.py @@ -5,15 +5,16 @@ Create Date: 2026-03-10 00:00:00.000000 """ -from typing import Sequence, Union +from collections.abc import Sequence import sqlalchemy as sa + from alembic import op revision: str = "d4e5f6a7b8c9" -down_revision: Union[str, Sequence[str], None] = "c3d4e5f6a7b8" -branch_labels: Union[str, Sequence[str], None] = None -depends_on: Union[str, Sequence[str], None] = None +down_revision: str | Sequence[str] | None = "c3d4e5f6a7b8" +branch_labels: str | Sequence[str] | None = None +depends_on: str | Sequence[str] | None = None def upgrade() -> None: diff --git a/alembic/versions/e5f6a7b8c9d0_drop_go_prediction_protein_fk.py b/alembic/versions/e5f6a7b8c9d0_drop_go_prediction_protein_fk.py index 84835e4..c8a2b96 100644 --- a/alembic/versions/e5f6a7b8c9d0_drop_go_prediction_protein_fk.py +++ b/alembic/versions/e5f6a7b8c9d0_drop_go_prediction_protein_fk.py @@ -5,14 +5,14 @@ Create Date: 2026-03-10 00:00:00.000000 """ -from 
typing import Sequence, Union +from collections.abc import Sequence from alembic import op revision: str = "e5f6a7b8c9d0" -down_revision: Union[str, Sequence[str], None] = "d4e5f6a7b8c9" -branch_labels: Union[str, Sequence[str], None] = None -depends_on: Union[str, Sequence[str], None] = None +down_revision: str | Sequence[str] | None = "d4e5f6a7b8c9" +branch_labels: str | Sequence[str] | None = None +depends_on: str | Sequence[str] | None = None def upgrade() -> None: diff --git a/alembic/versions/f1a2b3c4d5e6_add_go_term_relationship.py b/alembic/versions/f1a2b3c4d5e6_add_go_term_relationship.py index a794190..76b74d6 100644 --- a/alembic/versions/f1a2b3c4d5e6_add_go_term_relationship.py +++ b/alembic/versions/f1a2b3c4d5e6_add_go_term_relationship.py @@ -6,9 +6,10 @@ """ from __future__ import annotations -from alembic import op import sqlalchemy as sa +from alembic import op + revision = "f1a2b3c4d5e6" down_revision = "e5f6a7b8c9d0" branch_labels = None diff --git a/apps/web/app/[locale]/annotations/page.tsx b/apps/web/app/[locale]/annotations/page.tsx index 73be48b..0affb9e 100644 --- a/apps/web/app/[locale]/annotations/page.tsx +++ b/apps/web/app/[locale]/annotations/page.tsx @@ -201,12 +201,12 @@ export default function AnnotationsPage() {
{a.source_version ?? "—"} · {(a.annotation_count ?? 0).toLocaleString()} annotations
+
{error}
@@ -475,53 +489,92 @@ export default function EmbeddingsPage() {
{Array.from({ length: 3 }).map((_, i) => )}
{c.model_name}
+{formatDate(c.created_at)}
{t("evaluationSetCard.runCafaEvaluator")}
-| + | BPO | +MFO | +CCO | +
|---|---|---|---|
| {cat} | + {(["bpo", "mfo", "cco"] as const).map((asp) => { + // Show models matching this category+aspect, or category+null (all-aspect models) + const candidates = initialRerankers.filter( + (r) => r.category === cat && (r.aspect === asp || r.aspect === null) + ); + return ( ++ + | + ); + })} +
{runError} @@ -512,22 +618,87 @@ function EvaluationSetCard({ {results.map((r) => { const pred = predictionSets.find((p) => p.id === r.prediction_set_id); const sc = scoringConfigs.find((c) => c.id === r.scoring_config_id); + const hasReranker = !!r.reranker_model_id; + const rr = initialRerankers.find((m) => m.id === r.reranker_model_id); return (
{sc.formula}
+