From 37f224032871368e780a12ad8e68fe4271864fbd Mon Sep 17 00:00:00 2001 From: Maxine Levesque <170461181+maxinelevesque@users.noreply.github.com> Date: Thu, 26 Feb 2026 01:52:21 -0800 Subject: [PATCH] feat: add array format type recognition and ndarray v1.1.0 annotation display Add KNOWN_ARRAY_FORMATS constant and ARRAY_FORMAT_LABELS for the six recognized array format tokens (numpyBytes, parquetBytes, sparseBytes, structuredBytes, arrowTensor, safetensors). Update row_to_schema() to surface arrayFormat, dtype, shape, and dimensionNames from the schema body as top-level fields. Update frontend templates (schema detail, schemas list, profile, dataset detail) to display format and annotation info when present. Update MCP server descriptions to mention new formats. Co-Authored-By: Claude Opus 4.6 --- src/atdata_app/frontend/routes.py | 8 ++ .../frontend/templates/dataset.html | 14 +++ .../frontend/templates/profile.html | 3 +- src/atdata_app/frontend/templates/schema.html | 12 +++ .../frontend/templates/schemas.html | 3 +- src/atdata_app/mcp_server.py | 8 +- src/atdata_app/models.py | 39 ++++++++ tests/test_frontend.py | 97 ++++++++++++++++++- tests/test_models.py | 91 +++++++++++++++++ 9 files changed, 269 insertions(+), 6 deletions(-) diff --git a/src/atdata_app/frontend/routes.py b/src/atdata_app/frontend/routes.py index 9a815e4..d528841 100644 --- a/src/atdata_app/frontend/routes.py +++ b/src/atdata_app/frontend/routes.py @@ -105,6 +105,13 @@ async def dataset_detail(request: Request, did: str, rkey: str): if len(parts) == 3: schema_did, _, schema_rkey = parts + # Fetch the referenced schema for inline display of format/annotation info + schema_info = None + if schema_did and schema_rkey: + schema_row = await query_get_schema(pool, schema_did, schema_rkey) + if schema_row: + schema_info = row_to_schema(schema_row) + # Fetch labels pointing to this dataset dataset_uri = entry["uri"] label_rows = await query_labels_for_dataset(pool, dataset_uri) @@ -117,6 +124,7 @@ async def dataset_detail(request: Request, did: str, rkey: str): "entry": entry, "schema_did": schema_did, "schema_rkey": schema_rkey, + "schema_info": schema_info, "labels": labels, }, ) diff --git a/src/atdata_app/frontend/templates/dataset.html b/src/atdata_app/frontend/templates/dataset.html index 4b32fed..78f20b8 100644 --- a/src/atdata_app/frontend/templates/dataset.html +++ b/src/atdata_app/frontend/templates/dataset.html @@ -27,6 +27,20 @@

Details

License{{ entry.license }} {% endif %} Schema{{ entry.schemaRef }} + {% if schema_info %} + {% if schema_info.arrayFormat is defined %} + Array Format{{ schema_info.get("arrayFormatLabel", schema_info.arrayFormat) }} + {% endif %} + {% if schema_info.dtype is defined %} + Data Type{{ schema_info.dtype }} + {% endif %} + {% if schema_info.shape is defined %} + Shape{{ schema_info.shape | join(" × ") }} + {% endif %} + {% if schema_info.dimensionNames is defined %} + Dimensions{{ schema_info.dimensionNames | join(", ") }} + {% endif %} + {% endif %} {% if entry.size %} Size diff --git a/src/atdata_app/frontend/templates/profile.html b/src/atdata_app/frontend/templates/profile.html index 46026b4..04c4e29 100644 --- a/src/atdata_app/frontend/templates/profile.html +++ b/src/atdata_app/frontend/templates/profile.html @@ -34,13 +34,14 @@

{{ entry.name }}

Schemas {% if schemas %} - + {% for s in schemas %} + {% endfor %} diff --git a/src/atdata_app/frontend/templates/schema.html b/src/atdata_app/frontend/templates/schema.html index 281d9f6..16ede21 100644 --- a/src/atdata_app/frontend/templates/schema.html +++ b/src/atdata_app/frontend/templates/schema.html @@ -16,6 +16,18 @@

Details

+ {% if schema.arrayFormat is defined %} + + {% endif %} + {% if schema.dtype is defined %} + + {% endif %} + {% if schema.shape is defined %} + + {% endif %} + {% if schema.dimensionNames is defined %} + + {% endif %} diff --git a/src/atdata_app/frontend/templates/schemas.html b/src/atdata_app/frontend/templates/schemas.html index 268af39..fdf3565 100644 --- a/src/atdata_app/frontend/templates/schemas.html +++ b/src/atdata_app/frontend/templates/schemas.html @@ -7,7 +7,7 @@

Schemas

{% if schemas %}
NameVersionType
NameVersionTypeFormat
{{ s.name }} {{ s.version }} {{ s.schemaType }}{{ s.get("arrayFormatLabel", "") }}
AT-URI{{ schema.uri }}
Type{{ schema.schemaType }}
Array Format{{ schema.get("arrayFormatLabel", schema.arrayFormat) }}
Data Type{{ schema.dtype }}
Shape{{ schema.shape | join(" × ") }}
Dimensions{{ schema.dimensionNames | join(", ") }}
Version{{ schema.version }}
Created{{ schema.createdAt }}
- + {% for s in schemas %} @@ -15,6 +15,7 @@

Schemas

+ diff --git a/src/atdata_app/mcp_server.py b/src/atdata_app/mcp_server.py index 53991eb..a5d2e69 100644 --- a/src/atdata_app/mcp_server.py +++ b/src/atdata_app/mcp_server.py @@ -57,7 +57,10 @@ async def server_lifespan(server: FastMCP) -> AsyncIterator[ServerContext]: "ATProto AppView for the science.alt.dataset namespace. " "Use these tools to discover and query scientific datasets, " "schemas, and lenses (bidirectional schema transforms) published " - "on the AT Protocol network." + "on the AT Protocol network. " + "Schemas may specify an arrayFormat (numpyBytes, parquetBytes, " + "sparseBytes, structuredBytes, arrowTensor, safetensors) and " + "ndarray annotations (dtype, shape, dimensionNames)." ), lifespan=server_lifespan, ) @@ -127,7 +130,8 @@ async def get_schema(ctx: Ctx, uri: str) -> dict[str, Any]: uri: AT-URI of the schema (e.g. at://did:plc:abc/science.alt.dataset.schema/my.schema@1.0.0). Returns: - Full schema record including name, version, type, schema body, and description. + Full schema record including name, version, type, schema body, description, + and (when present) arrayFormat, dtype, shape, and dimensionNames. """ sc = _get_ctx(ctx) did, _collection, rkey = parse_at_uri(uri) diff --git a/src/atdata_app/models.py b/src/atdata_app/models.py index 1cde2f2..72e31da 100644 --- a/src/atdata_app/models.py +++ b/src/atdata_app/models.py @@ -9,6 +9,32 @@ from pydantic import BaseModel +# --------------------------------------------------------------------------- +# Known array format tokens (atdata-lexicon#21) +# --------------------------------------------------------------------------- + +KNOWN_ARRAY_FORMATS: set[str] = { + # Original formats + "numpyBytes", + "parquetBytes", + # New formats + "sparseBytes", + "structuredBytes", + "arrowTensor", + "safetensors", +} + +#: Human-friendly display names for array format tokens. +ARRAY_FORMAT_LABELS: dict[str, str] = { + "numpyBytes": "NumPy ndarray", + "parquetBytes": "Parquet", + "sparseBytes": "Sparse matrix (CSR/CSC/COO)", + "structuredBytes": "NumPy structured array", + "arrowTensor": "Arrow tensor IPC", + "safetensors": "Safetensors", +} + + # --------------------------------------------------------------------------- # AT-URI parsing # --------------------------------------------------------------------------- @@ -119,6 +145,19 @@ def row_to_schema(row) -> dict[str, Any]: } if row["description"]: d["description"] = row["description"] + + # Surface array format and ndarray v1.1.0 annotation fields for display + array_format = schema_body.get("arrayFormat") + if array_format: + d["arrayFormat"] = array_format + d["arrayFormatLabel"] = ARRAY_FORMAT_LABELS.get(array_format, array_format) + if schema_body.get("dtype"): + d["dtype"] = schema_body["dtype"] + if schema_body.get("shape"): + d["shape"] = schema_body["shape"] + if schema_body.get("dimensionNames"): + d["dimensionNames"] = schema_body["dimensionNames"] + return d diff --git a/tests/test_frontend.py b/tests/test_frontend.py index 43f714f..7e3d71e 100644 --- a/tests/test_frontend.py +++ b/tests/test_frontend.py @@ -41,6 +41,7 @@ def _make_schema_row( did: str = "did:plc:test123", rkey: str = "test@1.0.0", name: str = "TestSchema", + schema_body: str | dict = '{"type": "object"}', ) -> dict: return { "did": did, @@ -49,7 +50,7 @@ def _make_schema_row( "name": name, "version": "1.0.0", "schema_type": "jsonSchema", - "schema_body": '{"type": "object"}', + "schema_body": schema_body, "description": "A test schema", "metadata": None, "created_at": "2025-01-01T00:00:00Z", @@ -140,10 +141,12 @@ async def test_home_search(mock_search): @pytest.mark.asyncio @patch("atdata_app.frontend.routes.query_labels_for_dataset", new_callable=AsyncMock) +@patch("atdata_app.frontend.routes.query_get_schema", new_callable=AsyncMock) @patch("atdata_app.frontend.routes.query_get_entry", new_callable=AsyncMock) -async def test_dataset_detail(mock_get, mock_labels): +async def test_dataset_detail(mock_get, mock_schema, mock_labels): pool, _conn = _mock_pool() mock_get.return_value = _make_entry_row() + mock_schema.return_value = _make_schema_row() mock_labels.return_value = [_make_label_row()] app = _make_app(pool) transport = ASGITransport(app=app) @@ -260,6 +263,96 @@ async def test_about(mock_counts): assert "did:web:localhost%3A8000" in resp.text +# --------------------------------------------------------------------------- +# Schema detail — array format & ndarray annotations +# --------------------------------------------------------------------------- + + +@pytest.mark.asyncio +@patch("atdata_app.frontend.routes.query_get_schema", new_callable=AsyncMock) +async def test_schema_detail_array_format(mock_get): + pool, _conn = _mock_pool() + mock_get.return_value = _make_schema_row( + schema_body={ + "arrayFormat": "sparseBytes", + "dtype": "float32", + "shape": [100, 200], + "dimensionNames": ["samples", "features"], + }, + ) + app = _make_app(pool) + transport = ASGITransport(app=app) + async with AsyncClient(transport=transport, base_url="http://test") as client: + resp = await client.get("/schema/did:plc:test123/test@1.0.0") + assert resp.status_code == 200 + assert "Sparse matrix" in resp.text + assert "float32" in resp.text + assert "100" in resp.text + assert "samples" in resp.text + + +@pytest.mark.asyncio +@patch("atdata_app.frontend.routes.query_get_schema", new_callable=AsyncMock) +async def test_schema_detail_no_array_format(mock_get): + """Plain schemas should not show array format rows.""" + pool, _conn = _mock_pool() + mock_get.return_value = _make_schema_row() + app = _make_app(pool) + transport = ASGITransport(app=app) + async with AsyncClient(transport=transport, base_url="http://test") as client: + resp = await client.get("/schema/did:plc:test123/test@1.0.0") + assert resp.status_code == 200 + assert "Array Format" not in resp.text + assert "Data Type" not in resp.text + + +# --------------------------------------------------------------------------- +# Dataset detail — schema format info +# --------------------------------------------------------------------------- + + +@pytest.mark.asyncio +@patch("atdata_app.frontend.routes.query_labels_for_dataset", new_callable=AsyncMock) +@patch("atdata_app.frontend.routes.query_get_schema", new_callable=AsyncMock) +@patch("atdata_app.frontend.routes.query_get_entry", new_callable=AsyncMock) +async def test_dataset_detail_with_schema_format(mock_entry, mock_schema, mock_labels): + pool, _conn = _mock_pool() + mock_entry.return_value = _make_entry_row() + mock_schema.return_value = _make_schema_row( + did="did:plc:test", + rkey="test@1.0.0", + schema_body={"arrayFormat": "numpyBytes", "dtype": "float64"}, + ) + mock_labels.return_value = [] + app = _make_app(pool) + transport = ASGITransport(app=app) + async with AsyncClient(transport=transport, base_url="http://test") as client: + resp = await client.get("/dataset/did:plc:test123/3xyz") + assert resp.status_code == 200 + assert "NumPy ndarray" in resp.text + assert "float64" in resp.text + + +# --------------------------------------------------------------------------- +# Schemas list — format column +# --------------------------------------------------------------------------- + + +@pytest.mark.asyncio +@patch("atdata_app.frontend.routes.query_list_schemas", new_callable=AsyncMock) +async def test_schemas_list_shows_format(mock_list): + pool, _conn = _mock_pool() + mock_list.return_value = [ + _make_schema_row(schema_body={"arrayFormat": "safetensors"}), + ] + app = _make_app(pool) + transport = ASGITransport(app=app) + async with AsyncClient(transport=transport, base_url="http://test") as client: + resp = await client.get("/schemas") + assert resp.status_code == 200 + assert "Safetensors" in resp.text + + # --------------------------------------------------------------------------- # Static files # --------------------------------------------------------------------------- diff --git a/tests/test_models.py b/tests/test_models.py index 23f6046..91b3089 100644 --- a/tests/test_models.py +++ b/tests/test_models.py @@ -5,6 +5,8 @@ import pytest from atdata_app.models import ( + ARRAY_FORMAT_LABELS, + KNOWN_ARRAY_FORMATS, decode_cursor, encode_cursor, make_at_uri, @@ -174,6 +176,95 @@ def test_row_to_schema_json_string_body(): assert d["schema"] == {"type": "object"} +def test_row_to_schema_no_array_format_fields_when_absent(): + """Plain schemas should not gain arrayFormat/ndarray annotation keys.""" + d = row_to_schema(_SCHEMA_ROW) + assert "arrayFormat" not in d + assert "arrayFormatLabel" not in d + assert "dtype" not in d + assert "shape" not in d + assert "dimensionNames" not in d + + +# --------------------------------------------------------------------------- +# row_to_schema — array format types +# --------------------------------------------------------------------------- + + +@pytest.mark.parametrize("fmt", sorted(KNOWN_ARRAY_FORMATS)) +def test_row_to_schema_known_array_format(fmt): + """Each known format token should surface arrayFormat and a human label.""" + row = {**_SCHEMA_ROW, "schema_body": {"arrayFormat": fmt}} + d = row_to_schema(row) + assert d["arrayFormat"] == fmt + assert d["arrayFormatLabel"] == ARRAY_FORMAT_LABELS[fmt] + + +def test_row_to_schema_unknown_array_format_passes_through(): + """Unknown format tokens are stored and surfaced as-is.""" + row = {**_SCHEMA_ROW, "schema_body": {"arrayFormat": "futureFormat"}} + d = row_to_schema(row) + assert d["arrayFormat"] == "futureFormat" + assert d["arrayFormatLabel"] == "futureFormat" + + +# --------------------------------------------------------------------------- +# row_to_schema — ndarray v1.1.0 annotations +# --------------------------------------------------------------------------- + + +def test_row_to_schema_ndarray_annotations(): + """ndarray v1.1.0 annotation fields are surfaced at top level.""" + row = { + **_SCHEMA_ROW, + "schema_body": { + "arrayFormat": "numpyBytes", + "dtype": "float32", + "shape": [100, 200], + "dimensionNames": ["samples", "features"], + }, + } + d = row_to_schema(row) + assert d["arrayFormat"] == "numpyBytes" + assert d["dtype"] == "float32" + assert d["shape"] == [100, 200] + assert d["dimensionNames"] == ["samples", "features"] + + +def test_row_to_schema_ndarray_partial_annotations(): + """Only present annotation fields should appear in output.""" + row = { + **_SCHEMA_ROW, + "schema_body": {"arrayFormat": "sparseBytes", "dtype": "int64"}, + } + d = row_to_schema(row) + assert d["dtype"] == "int64" + assert "shape" not in d + assert "dimensionNames" not in d + + +# --------------------------------------------------------------------------- +# KNOWN_ARRAY_FORMATS constant +# --------------------------------------------------------------------------- + + +def test_known_array_formats_contains_all_expected(): + expected = { + "numpyBytes", + "parquetBytes", + "sparseBytes", + "structuredBytes", + "arrowTensor", + "safetensors", + } + assert KNOWN_ARRAY_FORMATS == expected + + +def test_array_format_labels_covers_all_known(): + """Every known format should have a human-readable label.""" + assert set(ARRAY_FORMAT_LABELS.keys()) == KNOWN_ARRAY_FORMATS + + # --------------------------------------------------------------------------- # row_to_label # ---------------------------------------------------------------------------
NameVersionTypeDescriptionPublisher
NameVersionTypeFormatDescriptionPublisher
{{ s.name }} {{ s.version }} {{ s.schemaType }}{{ s.get("arrayFormatLabel", "") }} {{ s.get("description", "") }} {{ s.did[:20] }}…