From 37f224032871368e780a12ad8e68fe4271864fbd Mon Sep 17 00:00:00 2001
From: Maxine Levesque <170461181+maxinelevesque@users.noreply.github.com>
Date: Thu, 26 Feb 2026 01:52:21 -0800
Subject: [PATCH] feat: add array format type recognition and ndarray v1.1.0
 annotation display

Add KNOWN_ARRAY_FORMATS constant and ARRAY_FORMAT_LABELS for the six
recognized array format tokens (numpyBytes, parquetBytes, sparseBytes,
structuredBytes, arrowTensor, safetensors). Update row_to_schema() to
surface arrayFormat, dtype, shape, and dimensionNames from the schema
body as top-level fields. Update frontend templates (schema detail,
schemas list, profile, dataset detail) to display format and annotation
info when present. Update MCP server descriptions to mention new formats.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 src/atdata_app/frontend/routes.py             |  8 ++
 .../frontend/templates/dataset.html           | 14 +++
 .../frontend/templates/profile.html           |  3 +-
 src/atdata_app/frontend/templates/schema.html | 12 +++
 .../frontend/templates/schemas.html           |  3 +-
 src/atdata_app/mcp_server.py                  |  8 +-
 src/atdata_app/models.py                      | 39 ++++++++
 tests/test_frontend.py                        | 97 ++++++++++++++++++-
 tests/test_models.py                          | 91 +++++++++++++++++
 9 files changed, 269 insertions(+), 6 deletions(-)
diff --git a/src/atdata_app/frontend/routes.py b/src/atdata_app/frontend/routes.py
index 9a815e4..d528841 100644
--- a/src/atdata_app/frontend/routes.py
+++ b/src/atdata_app/frontend/routes.py
@@ -105,6 +105,13 @@ async def dataset_detail(request: Request, did: str, rkey: str):
         if len(parts) == 3:
             schema_did, _, schema_rkey = parts
 
+    # Fetch the referenced schema for inline display of format/annotation info
+    schema_info = None
+    if schema_did and schema_rkey:
+        schema_row = await query_get_schema(pool, schema_did, schema_rkey)
+        if schema_row:
+            schema_info = row_to_schema(schema_row)
+
     # Fetch labels pointing to this dataset
     dataset_uri = entry["uri"]
     label_rows = await query_labels_for_dataset(pool, dataset_uri)
@@ -117,6 +124,7 @@ async def dataset_detail(request: Request, did: str, rkey: str):
             "entry": entry,
             "schema_did": schema_did,
             "schema_rkey": schema_rkey,
+            "schema_info": schema_info,
             "labels": labels,
         },
     )
diff --git a/src/atdata_app/frontend/templates/dataset.html b/src/atdata_app/frontend/templates/dataset.html
index 4b32fed..78f20b8 100644
--- a/src/atdata_app/frontend/templates/dataset.html
+++ b/src/atdata_app/frontend/templates/dataset.html
@@ -27,6 +27,20 @@ <h2>Details</h2>
             <tr><th>License</th><td>{{ entry.license }}</td></tr>
             {% endif %}
             <tr><th>Schema</th><td><a href="/schema/{{ schema_did }}/{{ schema_rkey }}">{{ entry.schemaRef }}</a></td></tr>
+            {% if schema_info %}
+            {% if schema_info.arrayFormat is defined %}
+            <tr><th>Array Format</th><td>{{ schema_info.get("arrayFormatLabel", schema_info.arrayFormat) }}</td></tr>
+            {% endif %}
+            {% if schema_info.dtype is defined %}
+            <tr><th>Data Type</th><td><code>{{ schema_info.dtype }}</code></td></tr>
+            {% endif %}
+            {% if schema_info.shape is defined %}
+            <tr><th>Shape</th><td><code>{{ schema_info.shape | join(" × ") }}</code></td></tr>
+            {% endif %}
+            {% if schema_info.dimensionNames is defined %}
+            <tr><th>Dimensions</th><td>{{ schema_info.dimensionNames | join(", ") }}</td></tr>
+            {% endif %}
+            {% endif %}
             {% if entry.size %}
             <tr>
                 <th>Size</th>
diff --git a/src/atdata_app/frontend/templates/profile.html b/src/atdata_app/frontend/templates/profile.html
index 46026b4..04c4e29 100644
--- a/src/atdata_app/frontend/templates/profile.html
+++ b/src/atdata_app/frontend/templates/profile.html
@@ -34,13 +34,14 @@ <h3><a href="/dataset/{{ entry.did }}/{{ entry.rkey }}">{{ entry.name }}</a></h3
     <h2>Schemas</h2>
     {% if schemas %}
     <table>
-        <thead><tr><th>Name</th><th>Version</th><th>Type</th></tr></thead>
+        <thead><tr><th>Name</th><th>Version</th><th>Type</th><th>Format</th></tr></thead>
         <tbody>
             {% for s in schemas %}
             <tr>
                 <td><a href="/schema/{{ s.did }}/{{ s.rkey }}">{{ s.name }}</a></td>
                 <td>{{ s.version }}</td>
                 <td>{{ s.schemaType }}</td>
+                <td>{{ s.get("arrayFormatLabel", "") }}</td>
             </tr>
             {% endfor %}
         </tbody>
diff --git a/src/atdata_app/frontend/templates/schema.html b/src/atdata_app/frontend/templates/schema.html
index 281d9f6..16ede21 100644
--- a/src/atdata_app/frontend/templates/schema.html
+++ b/src/atdata_app/frontend/templates/schema.html
@@ -16,6 +16,18 @@ <h2>Details</h2>
         <tbody>
             <tr><th>AT-URI</th><td><code>{{ schema.uri }}</code></td></tr>
             <tr><th>Type</th><td>{{ schema.schemaType }}</td></tr>
+            {% if schema.arrayFormat is defined %}
+            <tr><th>Array Format</th><td>{{ schema.get("arrayFormatLabel", schema.arrayFormat) }}</td></tr>
+            {% endif %}
+            {% if schema.dtype is defined %}
+            <tr><th>Data Type</th><td><code>{{ schema.dtype }}</code></td></tr>
+            {% endif %}
+            {% if schema.shape is defined %}
+            <tr><th>Shape</th><td><code>{{ schema.shape | join(" × ") }}</code></td></tr>
+            {% endif %}
+            {% if schema.dimensionNames is defined %}
+            <tr><th>Dimensions</th><td>{{ schema.dimensionNames | join(", ") }}</td></tr>
+            {% endif %}
             <tr><th>Version</th><td>{{ schema.version }}</td></tr>
             <tr><th>Created</th><td>{{ schema.createdAt }}</td></tr>
         </tbody>
diff --git a/src/atdata_app/frontend/templates/schemas.html b/src/atdata_app/frontend/templates/schemas.html
index 268af39..fdf3565 100644
--- a/src/atdata_app/frontend/templates/schemas.html
+++ b/src/atdata_app/frontend/templates/schemas.html
@@ -7,7 +7,7 @@ <h1>Schemas</h1>
     {% if schemas %}
     <table>
         <thead>
-            <tr><th>Name</th><th>Version</th><th>Type</th><th>Description</th><th>Publisher</th></tr>
+            <tr><th>Name</th><th>Version</th><th>Type</th><th>Format</th><th>Description</th><th>Publisher</th></tr>
         </thead>
         <tbody>
             {% for s in schemas %}
@@ -15,6 +15,7 @@ <h1>Schemas</h1>
                 <td><a href="/schema/{{ s.did }}/{{ s.rkey }}">{{ s.name }}</a></td>
                 <td>{{ s.version }}</td>
                 <td>{{ s.schemaType }}</td>
+                <td>{{ s.get("arrayFormatLabel", "") }}</td>
                 <td>{{ s.get("description", "") }}</td>
                 <td><a href="/profile/{{ s.did }}">{{ s.did[:20] }}…</a></td>
             </tr>
diff --git a/src/atdata_app/mcp_server.py b/src/atdata_app/mcp_server.py
index 53991eb..a5d2e69 100644
--- a/src/atdata_app/mcp_server.py
+++ b/src/atdata_app/mcp_server.py
@@ -57,7 +57,10 @@ async def server_lifespan(server: FastMCP) -> AsyncIterator[ServerContext]:
         "ATProto AppView for the science.alt.dataset namespace. "
         "Use these tools to discover and query scientific datasets, "
         "schemas, and lenses (bidirectional schema transforms) published "
-        "on the AT Protocol network."
+        "on the AT Protocol network. "
+        "Schemas may specify an arrayFormat (numpyBytes, parquetBytes, "
+        "sparseBytes, structuredBytes, arrowTensor, safetensors) and "
+        "ndarray annotations (dtype, shape, dimensionNames)."
     ),
     lifespan=server_lifespan,
 )
@@ -127,7 +130,8 @@ async def get_schema(ctx: Ctx, uri: str) -> dict[str, Any]:
         uri: AT-URI of the schema (e.g. at://did:plc:abc/science.alt.dataset.schema/my.schema@1.0.0).
 
     Returns:
-        Full schema record including name, version, type, schema body, and description.
+        Full schema record including name, version, type, schema body, description,
+        and (when present) arrayFormat, dtype, shape, and dimensionNames.
     """
     sc = _get_ctx(ctx)
     did, _collection, rkey = parse_at_uri(uri)
diff --git a/src/atdata_app/models.py b/src/atdata_app/models.py
index 1cde2f2..72e31da 100644
--- a/src/atdata_app/models.py
+++ b/src/atdata_app/models.py
@@ -9,6 +9,32 @@
 from pydantic import BaseModel
 
 
+# ---------------------------------------------------------------------------
+# Known array format tokens (atdata-lexicon#21)
+# ---------------------------------------------------------------------------
+
+KNOWN_ARRAY_FORMATS: frozenset[str] = frozenset(
+    {
+        "numpyBytes",  # original format
+        "parquetBytes",  # original format
+        "sparseBytes",  # new format
+        "structuredBytes",  # new format
+        "arrowTensor",  # new format
+        "safetensors",  # new format
+    }
+)  # frozen so importers cannot mutate the shared module-level constant
+
+#: Display labels keyed by array format token (one entry per known token).
+ARRAY_FORMAT_LABELS: dict[str, str] = {
+    "numpyBytes": "NumPy ndarray",
+    "parquetBytes": "Parquet",
+    "sparseBytes": "Sparse matrix (CSR/CSC/COO)",
+    "structuredBytes": "NumPy structured array",
+    "arrowTensor": "Arrow tensor IPC",
+    "safetensors": "Safetensors",
+}
+
+
 # ---------------------------------------------------------------------------
 # AT-URI parsing
 # ---------------------------------------------------------------------------
@@ -119,6 +145,19 @@ def row_to_schema(row) -> dict[str, Any]:
     }
     if row["description"]:
         d["description"] = row["description"]
+
+    # Surface array format and ndarray v1.1.0 annotation fields for display;
+    # skip ill-typed values (unhashable arrayFormat, non-list for join()).
+    array_format = schema_body.get("arrayFormat")
+    if array_format and isinstance(array_format, str):
+        d["arrayFormat"] = array_format
+        d["arrayFormatLabel"] = ARRAY_FORMAT_LABELS.get(array_format, array_format)
+    if schema_body.get("dtype"):
+        d["dtype"] = schema_body["dtype"]
+    for key in ("shape", "dimensionNames"):
+        if schema_body.get(key) and isinstance(schema_body[key], list):
+            d[key] = schema_body[key]
+
     return d
 
 
diff --git a/tests/test_frontend.py b/tests/test_frontend.py
index 43f714f..7e3d71e 100644
--- a/tests/test_frontend.py
+++ b/tests/test_frontend.py
@@ -41,6 +41,7 @@ def _make_schema_row(
     did: str = "did:plc:test123",
     rkey: str = "test@1.0.0",
     name: str = "TestSchema",
+    schema_body: str | dict = '{"type": "object"}',
 ) -> dict:
     return {
         "did": did,
@@ -49,7 +50,7 @@ def _make_schema_row(
         "name": name,
         "version": "1.0.0",
         "schema_type": "jsonSchema",
-        "schema_body": '{"type": "object"}',
+        "schema_body": schema_body,
         "description": "A test schema",
         "metadata": None,
         "created_at": "2025-01-01T00:00:00Z",
@@ -140,10 +141,12 @@ async def test_home_search(mock_search):
 
 @pytest.mark.asyncio
 @patch("atdata_app.frontend.routes.query_labels_for_dataset", new_callable=AsyncMock)
+@patch("atdata_app.frontend.routes.query_get_schema", new_callable=AsyncMock)
 @patch("atdata_app.frontend.routes.query_get_entry", new_callable=AsyncMock)
-async def test_dataset_detail(mock_get, mock_labels):
+async def test_dataset_detail(mock_get, mock_schema, mock_labels):
     pool, _conn = _mock_pool()
     mock_get.return_value = _make_entry_row()
+    mock_schema.return_value = _make_schema_row()
     mock_labels.return_value = [_make_label_row()]
     app = _make_app(pool)
     transport = ASGITransport(app=app)
@@ -260,6 +263,96 @@ async def test_about(mock_counts):
     assert "did:web:localhost%3A8000" in resp.text
 
 
+# ---------------------------------------------------------------------------
+# Schema detail — array format & ndarray annotations
+# ---------------------------------------------------------------------------
+
+
+@pytest.mark.asyncio
+@patch("atdata_app.frontend.routes.query_get_schema", new_callable=AsyncMock)
+async def test_schema_detail_array_format(mock_get):
+    """Schema detail page renders the format label and ndarray annotations."""
+    annotated_body = {
+        "arrayFormat": "sparseBytes",
+        "dtype": "float32",
+        "shape": [100, 200],
+        "dimensionNames": ["samples", "features"],
+    }
+    mock_get.return_value = _make_schema_row(schema_body=annotated_body)
+    pool, _conn = _mock_pool()
+    app = _make_app(pool)
+    http = ASGITransport(app=app)
+    async with AsyncClient(transport=http, base_url="http://test") as client:
+        page = await client.get("/schema/did:plc:test123/test@1.0.0")
+    assert page.status_code == 200
+    # Format label, dtype, one shape extent and one dimension name.
+    expected_snippets = ("Sparse matrix", "float32", "100", "samples")
+    for snippet in expected_snippets:
+        assert snippet in page.text
+
+
+@pytest.mark.asyncio
+@patch("atdata_app.frontend.routes.query_get_schema", new_callable=AsyncMock)
+async def test_schema_detail_no_array_format(mock_get):
+    """A schema without array metadata renders no format or dtype rows."""
+    mock_get.return_value = _make_schema_row()
+    pool, _conn = _mock_pool()
+    http = ASGITransport(app=_make_app(pool))
+    async with AsyncClient(transport=http, base_url="http://test") as client:
+        page = await client.get("/schema/did:plc:test123/test@1.0.0")
+    assert page.status_code == 200
+    # These row headers are only emitted when the matching key is present.
+    for header in ("Array Format", "Data Type"):
+        assert header not in page.text
+
+
+# ---------------------------------------------------------------------------
+# Dataset detail — schema format info
+# ---------------------------------------------------------------------------
+
+
+@pytest.mark.asyncio
+@patch("atdata_app.frontend.routes.query_labels_for_dataset", new_callable=AsyncMock)
+@patch("atdata_app.frontend.routes.query_get_schema", new_callable=AsyncMock)
+@patch("atdata_app.frontend.routes.query_get_entry", new_callable=AsyncMock)
+async def test_dataset_detail_with_schema_format(mock_entry, mock_schema, mock_labels):
+    """Dataset detail page shows format info taken from the fetched schema."""
+    numpy_body = {"arrayFormat": "numpyBytes", "dtype": "float64"}
+    mock_entry.return_value = _make_entry_row()
+    mock_labels.return_value = []
+    mock_schema.return_value = _make_schema_row(
+        did="did:plc:test", rkey="test@1.0.0", schema_body=numpy_body
+    )
+    pool, _conn = _mock_pool()
+    http = ASGITransport(app=_make_app(pool))
+    async with AsyncClient(transport=http, base_url="http://test") as client:
+        page = await client.get("/dataset/did:plc:test123/3xyz")
+    assert page.status_code == 200
+    # The display label and the dtype both originate from the mocked schema.
+    for snippet in ("NumPy ndarray", "float64"):
+        assert snippet in page.text
+
+
+# ---------------------------------------------------------------------------
+# Schemas list — format column
+# ---------------------------------------------------------------------------
+
+
+@pytest.mark.asyncio
+@patch("atdata_app.frontend.routes.query_list_schemas", new_callable=AsyncMock)
+async def test_schemas_list_shows_format(mock_list):
+    """Schemas list page renders the array format label of a listed schema."""
+    safetensors_row = _make_schema_row(schema_body={"arrayFormat": "safetensors"})
+    mock_list.return_value = [safetensors_row]
+    pool, _conn = _mock_pool()
+    http = ASGITransport(app=_make_app(pool))
+    async with AsyncClient(transport=http, base_url="http://test") as client:
+        page = await client.get("/schemas")
+    assert page.status_code == 200
+    # "Safetensors" is the display label, not the raw "safetensors" token.
+    assert "Safetensors" in page.text
+
+
 # ---------------------------------------------------------------------------
 # Static files
 # ---------------------------------------------------------------------------
diff --git a/tests/test_models.py b/tests/test_models.py
index 23f6046..91b3089 100644
--- a/tests/test_models.py
+++ b/tests/test_models.py
@@ -5,6 +5,8 @@
 import pytest
 
 from atdata_app.models import (
+    ARRAY_FORMAT_LABELS,
+    KNOWN_ARRAY_FORMATS,
     decode_cursor,
     encode_cursor,
     make_at_uri,
@@ -174,6 +176,95 @@ def test_row_to_schema_json_string_body():
     assert d["schema"] == {"type": "object"}
 
 
+def test_row_to_schema_no_array_format_fields_when_absent():
+    """A body without array metadata yields none of the surfaced keys."""
+    result = row_to_schema(_SCHEMA_ROW)
+    array_keys = ("arrayFormat", "arrayFormatLabel")
+    annotation_keys = ("dtype", "shape", "dimensionNames")
+    # Checked one key at a time, in the same order as before.
+    for key in (*array_keys, *annotation_keys):
+        assert key not in result
+
+
+# ---------------------------------------------------------------------------
+# row_to_schema — array format types
+# ---------------------------------------------------------------------------
+
+
+@pytest.mark.parametrize("fmt", sorted(KNOWN_ARRAY_FORMATS))
+def test_row_to_schema_known_array_format(fmt):
+    """A known token is echoed back together with its display label."""
+    body = {"arrayFormat": fmt}
+    result = row_to_schema(dict(_SCHEMA_ROW, schema_body=body))
+    surfaced = (result["arrayFormat"], result["arrayFormatLabel"])
+    assert surfaced == (fmt, ARRAY_FORMAT_LABELS[fmt])
+
+
+def test_row_to_schema_unknown_array_format_passes_through():
+    """An unrecognized token is surfaced verbatim, also as its own label."""
+    token = "futureFormat"
+    result = row_to_schema(dict(_SCHEMA_ROW, schema_body={"arrayFormat": token}))
+    assert result["arrayFormat"] == token
+    assert result["arrayFormatLabel"] == token
+
+
+# ---------------------------------------------------------------------------
+# row_to_schema — ndarray v1.1.0 annotations
+# ---------------------------------------------------------------------------
+
+
+def test_row_to_schema_ndarray_annotations():
+    """Every annotation given in the body is copied to the top-level result."""
+    body = {
+        "arrayFormat": "numpyBytes",
+        "dtype": "float32",
+        "shape": [100, 200],
+        "dimensionNames": ["samples", "features"],
+    }
+    result = row_to_schema(dict(_SCHEMA_ROW, schema_body=body))
+    for key, value in (
+        ("arrayFormat", "numpyBytes"),
+        ("dtype", "float32"),
+        ("shape", [100, 200]),
+        ("dimensionNames", ["samples", "features"]),
+    ):
+        assert result[key] == value
+
+
+def test_row_to_schema_ndarray_partial_annotations():
+    """Annotations missing from the body are missing from the result too."""
+    partial_body = {"arrayFormat": "sparseBytes", "dtype": "int64"}
+    row = dict(_SCHEMA_ROW, schema_body=partial_body)
+    result = row_to_schema(row)
+    # The annotation that was supplied comes through unchanged ...
+    assert result["dtype"] == "int64"
+    # ... while the two that were omitted are not invented.
+    for absent in ("shape", "dimensionNames"):
+        assert absent not in result
+
+
+# ---------------------------------------------------------------------------
+# KNOWN_ARRAY_FORMATS constant
+# ---------------------------------------------------------------------------
+
+
+def test_known_array_formats_contains_all_expected():
+    """The constant holds exactly the six recognized format tokens."""
+    assert KNOWN_ARRAY_FORMATS == {
+        "numpyBytes",
+        "parquetBytes",
+        "sparseBytes",
+        "structuredBytes",
+        "arrowTensor",
+        "safetensors",
+    }
+
+
+def test_array_format_labels_covers_all_known():
+    """The label table has an entry for each known token, and no extras."""
+    assert KNOWN_ARRAY_FORMATS == set(ARRAY_FORMAT_LABELS)
+
+
 # ---------------------------------------------------------------------------
 # row_to_label
 # ---------------------------------------------------------------------------

Name	Version	Type
Name	Version	Type	Format
{{ s.name }}	{{ s.version }}	{{ s.schemaType }}	{{ s.get("arrayFormatLabel", "") }}
AT-URI	`{{ schema.uri }}`
Type	{{ schema.schemaType }}
Array Format	{{ schema.get("arrayFormatLabel", schema.arrayFormat) }}
Data Type	`{{ schema.dtype }}`
Shape	`{{ schema.shape | join(" × ") }}`
Dimensions	{{ schema.dimensionNames | join(", ") }}
Version	{{ schema.version }}
Created	{{ schema.createdAt }}