diff --git a/src/atdata_app/frontend/routes.py b/src/atdata_app/frontend/routes.py
index 9a815e4..d528841 100644
--- a/src/atdata_app/frontend/routes.py
+++ b/src/atdata_app/frontend/routes.py
@@ -105,6 +105,13 @@ async def dataset_detail(request: Request, did: str, rkey: str):
if len(parts) == 3:
schema_did, _, schema_rkey = parts
+ # Fetch the referenced schema for inline display of format/annotation info
+ schema_info = None
+ if schema_did and schema_rkey:
+ schema_row = await query_get_schema(pool, schema_did, schema_rkey)
+ if schema_row:
+ schema_info = row_to_schema(schema_row)
+
# Fetch labels pointing to this dataset
dataset_uri = entry["uri"]
label_rows = await query_labels_for_dataset(pool, dataset_uri)
@@ -117,6 +124,7 @@ async def dataset_detail(request: Request, did: str, rkey: str):
"entry": entry,
"schema_did": schema_did,
"schema_rkey": schema_rkey,
+ "schema_info": schema_info,
"labels": labels,
},
)
diff --git a/src/atdata_app/frontend/templates/dataset.html b/src/atdata_app/frontend/templates/dataset.html
index 4b32fed..78f20b8 100644
--- a/src/atdata_app/frontend/templates/dataset.html
+++ b/src/atdata_app/frontend/templates/dataset.html
@@ -27,6 +27,20 @@
Details
| License | {{ entry.license }} |
{% endif %}
| Schema | {{ entry.schemaRef }} |
+ {% if schema_info %}
+ {% if schema_info.arrayFormat is defined %}
+ | Array Format | {{ schema_info.get("arrayFormatLabel", schema_info.arrayFormat) }} |
+ {% endif %}
+ {% if schema_info.dtype is defined %}
+ | Data Type | {{ schema_info.dtype }} |
+ {% endif %}
+ {% if schema_info.shape is defined %}
+ | Shape | {{ schema_info.shape | join(" × ") }} |
+ {% endif %}
+ {% if schema_info.dimensionNames is defined %}
+ | Dimensions | {{ schema_info.dimensionNames | join(", ") }} |
+ {% endif %}
+ {% endif %}
{% if entry.size %}
| Size |
diff --git a/src/atdata_app/frontend/templates/profile.html b/src/atdata_app/frontend/templates/profile.html
index 46026b4..04c4e29 100644
--- a/src/atdata_app/frontend/templates/profile.html
+++ b/src/atdata_app/frontend/templates/profile.html
@@ -34,13 +34,14 @@ Schemas
{% if schemas %}
- | Name | Version | Type |
+ | Name | Version | Type | Format |
{% for s in schemas %}
| {{ s.name }} |
{{ s.version }} |
{{ s.schemaType }} |
+ {{ s.get("arrayFormatLabel", "") }} |
{% endfor %}
diff --git a/src/atdata_app/frontend/templates/schema.html b/src/atdata_app/frontend/templates/schema.html
index 281d9f6..16ede21 100644
--- a/src/atdata_app/frontend/templates/schema.html
+++ b/src/atdata_app/frontend/templates/schema.html
@@ -16,6 +16,18 @@ Details
| AT-URI | {{ schema.uri }} |
| Type | {{ schema.schemaType }} |
+ {% if schema.arrayFormat is defined %}
+ | Array Format | {{ schema.get("arrayFormatLabel", schema.arrayFormat) }} |
+ {% endif %}
+ {% if schema.dtype is defined %}
+ | Data Type | {{ schema.dtype }} |
+ {% endif %}
+ {% if schema.shape is defined %}
+ | Shape | {{ schema.shape | join(" × ") }} |
+ {% endif %}
+ {% if schema.dimensionNames is defined %}
+ | Dimensions | {{ schema.dimensionNames | join(", ") }} |
+ {% endif %}
| Version | {{ schema.version }} |
| Created | {{ schema.createdAt }} |
diff --git a/src/atdata_app/frontend/templates/schemas.html b/src/atdata_app/frontend/templates/schemas.html
index 268af39..fdf3565 100644
--- a/src/atdata_app/frontend/templates/schemas.html
+++ b/src/atdata_app/frontend/templates/schemas.html
@@ -7,7 +7,7 @@ Schemas
{% if schemas %}
- | Name | Version | Type | Description | Publisher |
+ | Name | Version | Type | Format | Description | Publisher |
{% for s in schemas %}
@@ -15,6 +15,7 @@ Schemas
{{ s.name }} |
{{ s.version }} |
{{ s.schemaType }} |
+ {{ s.get("arrayFormatLabel", "") }} |
{{ s.get("description", "") }} |
{{ s.did[:20] }}… |
diff --git a/src/atdata_app/mcp_server.py b/src/atdata_app/mcp_server.py
index 53991eb..a5d2e69 100644
--- a/src/atdata_app/mcp_server.py
+++ b/src/atdata_app/mcp_server.py
@@ -57,7 +57,10 @@ async def server_lifespan(server: FastMCP) -> AsyncIterator[ServerContext]:
"ATProto AppView for the science.alt.dataset namespace. "
"Use these tools to discover and query scientific datasets, "
"schemas, and lenses (bidirectional schema transforms) published "
- "on the AT Protocol network."
+ "on the AT Protocol network. "
+ "Schemas may specify an arrayFormat (numpyBytes, parquetBytes, "
+ "sparseBytes, structuredBytes, arrowTensor, safetensors) and "
+ "ndarray annotations (dtype, shape, dimensionNames)."
),
lifespan=server_lifespan,
)
@@ -127,7 +130,8 @@ async def get_schema(ctx: Ctx, uri: str) -> dict[str, Any]:
uri: AT-URI of the schema (e.g. at://did:plc:abc/science.alt.dataset.schema/my.schema@1.0.0).
Returns:
- Full schema record including name, version, type, schema body, and description.
+ Full schema record including name, version, type, schema body, description,
+ and (when present) arrayFormat, dtype, shape, and dimensionNames.
"""
sc = _get_ctx(ctx)
did, _collection, rkey = parse_at_uri(uri)
diff --git a/src/atdata_app/models.py b/src/atdata_app/models.py
index 1cde2f2..72e31da 100644
--- a/src/atdata_app/models.py
+++ b/src/atdata_app/models.py
@@ -9,6 +9,32 @@
from pydantic import BaseModel
+# ---------------------------------------------------------------------------
+# Known array format tokens (atdata-lexicon#21)
+# ---------------------------------------------------------------------------
+
+KNOWN_ARRAY_FORMATS: set[str] = {
+ # Original formats
+ "numpyBytes",
+ "parquetBytes",
+ # Formats added after the original two
+ "sparseBytes",
+ "structuredBytes",
+ "arrowTensor",
+ "safetensors",
+}
+
+#: Human-friendly display names for array format tokens.
+ARRAY_FORMAT_LABELS: dict[str, str] = {
+ "numpyBytes": "NumPy ndarray",
+ "parquetBytes": "Parquet",
+ "sparseBytes": "Sparse matrix (CSR/CSC/COO)",
+ "structuredBytes": "NumPy structured array",
+ "arrowTensor": "Arrow tensor IPC",
+ "safetensors": "Safetensors",
+}
+
+
# ---------------------------------------------------------------------------
# AT-URI parsing
# ---------------------------------------------------------------------------
@@ -119,6 +145,19 @@ def row_to_schema(row) -> dict[str, Any]:
}
if row["description"]:
d["description"] = row["description"]
+
+ # Surface array format and ndarray v1.1.0 annotation fields for display
+ array_format = schema_body.get("arrayFormat")
+ if array_format:
+ d["arrayFormat"] = array_format
+ d["arrayFormatLabel"] = ARRAY_FORMAT_LABELS.get(array_format, array_format)
+ if schema_body.get("dtype"):
+ d["dtype"] = schema_body["dtype"]
+ if schema_body.get("shape"):
+ d["shape"] = schema_body["shape"]
+ if schema_body.get("dimensionNames"):
+ d["dimensionNames"] = schema_body["dimensionNames"]
+
return d
diff --git a/tests/test_frontend.py b/tests/test_frontend.py
index 43f714f..7e3d71e 100644
--- a/tests/test_frontend.py
+++ b/tests/test_frontend.py
@@ -41,6 +41,7 @@ def _make_schema_row(
did: str = "did:plc:test123",
rkey: str = "test@1.0.0",
name: str = "TestSchema",
+ schema_body: str | dict = '{"type": "object"}',
) -> dict:
return {
"did": did,
@@ -49,7 +50,7 @@ def _make_schema_row(
"name": name,
"version": "1.0.0",
"schema_type": "jsonSchema",
- "schema_body": '{"type": "object"}',
+ "schema_body": schema_body,
"description": "A test schema",
"metadata": None,
"created_at": "2025-01-01T00:00:00Z",
@@ -140,10 +141,12 @@ async def test_home_search(mock_search):
@pytest.mark.asyncio
@patch("atdata_app.frontend.routes.query_labels_for_dataset", new_callable=AsyncMock)
+@patch("atdata_app.frontend.routes.query_get_schema", new_callable=AsyncMock)
@patch("atdata_app.frontend.routes.query_get_entry", new_callable=AsyncMock)
-async def test_dataset_detail(mock_get, mock_labels):
+async def test_dataset_detail(mock_get, mock_schema, mock_labels):
pool, _conn = _mock_pool()
mock_get.return_value = _make_entry_row()
+ mock_schema.return_value = _make_schema_row()
mock_labels.return_value = [_make_label_row()]
app = _make_app(pool)
transport = ASGITransport(app=app)
@@ -260,6 +263,96 @@ async def test_about(mock_counts):
assert "did:web:localhost%3A8000" in resp.text
+# ---------------------------------------------------------------------------
+# Schema detail — array format & ndarray annotations
+# ---------------------------------------------------------------------------
+
+
+@pytest.mark.asyncio
+@patch("atdata_app.frontend.routes.query_get_schema", new_callable=AsyncMock)
+async def test_schema_detail_array_format(mock_get):  # schema page shows format label, dtype, shape, dims
+ pool, _conn = _mock_pool()
+ mock_get.return_value = _make_schema_row(
+ schema_body={
+ "arrayFormat": "sparseBytes",
+ "dtype": "float32",
+ "shape": [100, 200],
+ "dimensionNames": ["samples", "features"],
+ },
+ )
+ app = _make_app(pool)
+ transport = ASGITransport(app=app)
+ async with AsyncClient(transport=transport, base_url="http://test") as client:
+ resp = await client.get("/schema/did:plc:test123/test@1.0.0")
+ assert resp.status_code == 200
+ assert "Sparse matrix" in resp.text
+ assert "float32" in resp.text
+ assert "100 × 200" in resp.text  # whole shape, joined with the template's " × " separator
+ assert "samples, features" in resp.text  # every dimension name, joined with ", "
+
+
+@pytest.mark.asyncio
+@patch("atdata_app.frontend.routes.query_get_schema", new_callable=AsyncMock)
+async def test_schema_detail_no_array_format(mock_get):
+ """Plain schemas should not show array format rows."""
+ pool, _conn = _mock_pool()
+ mock_get.return_value = _make_schema_row()
+ app = _make_app(pool)
+ transport = ASGITransport(app=app)
+ async with AsyncClient(transport=transport, base_url="http://test") as client:
+ resp = await client.get("/schema/did:plc:test123/test@1.0.0")
+ assert resp.status_code == 200
+ assert "Array Format" not in resp.text
+ assert "Data Type" not in resp.text
+
+
+# ---------------------------------------------------------------------------
+# Dataset detail — schema format info
+# ---------------------------------------------------------------------------
+
+
+@pytest.mark.asyncio
+@patch("atdata_app.frontend.routes.query_labels_for_dataset", new_callable=AsyncMock)
+@patch("atdata_app.frontend.routes.query_get_schema", new_callable=AsyncMock)
+@patch("atdata_app.frontend.routes.query_get_entry", new_callable=AsyncMock)
+async def test_dataset_detail_with_schema_format(mock_entry, mock_schema, mock_labels):
+ pool, _conn = _mock_pool()
+ mock_entry.return_value = _make_entry_row()
+ mock_schema.return_value = _make_schema_row(
+ did="did:plc:test",
+ rkey="test@1.0.0",
+ schema_body={"arrayFormat": "numpyBytes", "dtype": "float64"},
+ )
+ mock_labels.return_value = []
+ app = _make_app(pool)
+ transport = ASGITransport(app=app)
+ async with AsyncClient(transport=transport, base_url="http://test") as client:
+ resp = await client.get("/dataset/did:plc:test123/3xyz")
+ assert resp.status_code == 200
+ assert "NumPy ndarray" in resp.text
+ assert "float64" in resp.text
+
+
+# ---------------------------------------------------------------------------
+# Schemas list — format column
+# ---------------------------------------------------------------------------
+
+
+@pytest.mark.asyncio
+@patch("atdata_app.frontend.routes.query_list_schemas", new_callable=AsyncMock)
+async def test_schemas_list_shows_format(mock_list):
+ pool, _conn = _mock_pool()
+ mock_list.return_value = [
+ _make_schema_row(schema_body={"arrayFormat": "safetensors"}),
+ ]
+ app = _make_app(pool)
+ transport = ASGITransport(app=app)
+ async with AsyncClient(transport=transport, base_url="http://test") as client:
+ resp = await client.get("/schemas")
+ assert resp.status_code == 200
+ assert "Safetensors" in resp.text
+
+
# ---------------------------------------------------------------------------
# Static files
# ---------------------------------------------------------------------------
diff --git a/tests/test_models.py b/tests/test_models.py
index 23f6046..91b3089 100644
--- a/tests/test_models.py
+++ b/tests/test_models.py
@@ -5,6 +5,8 @@
import pytest
from atdata_app.models import (
+ ARRAY_FORMAT_LABELS,
+ KNOWN_ARRAY_FORMATS,
decode_cursor,
encode_cursor,
make_at_uri,
@@ -174,6 +176,95 @@ def test_row_to_schema_json_string_body():
assert d["schema"] == {"type": "object"}
+def test_row_to_schema_no_array_format_fields_when_absent():
+ """Plain schemas should not gain arrayFormat/ndarray annotation keys."""
+ d = row_to_schema(_SCHEMA_ROW)
+ assert "arrayFormat" not in d
+ assert "arrayFormatLabel" not in d
+ assert "dtype" not in d
+ assert "shape" not in d
+ assert "dimensionNames" not in d
+
+
+# ---------------------------------------------------------------------------
+# row_to_schema — array format types
+# ---------------------------------------------------------------------------
+
+
+@pytest.mark.parametrize("fmt", sorted(KNOWN_ARRAY_FORMATS))
+def test_row_to_schema_known_array_format(fmt):
+ """Each known format token should surface arrayFormat and a human label."""
+ row = {**_SCHEMA_ROW, "schema_body": {"arrayFormat": fmt}}
+ d = row_to_schema(row)
+ assert d["arrayFormat"] == fmt
+ assert d["arrayFormatLabel"] == ARRAY_FORMAT_LABELS[fmt]
+
+
+def test_row_to_schema_unknown_array_format_passes_through():
+ """Unknown format tokens are stored and surfaced as-is."""
+ row = {**_SCHEMA_ROW, "schema_body": {"arrayFormat": "futureFormat"}}
+ d = row_to_schema(row)
+ assert d["arrayFormat"] == "futureFormat"
+ assert d["arrayFormatLabel"] == "futureFormat"
+
+
+# ---------------------------------------------------------------------------
+# row_to_schema — ndarray v1.1.0 annotations
+# ---------------------------------------------------------------------------
+
+
+def test_row_to_schema_ndarray_annotations():
+ """ndarray v1.1.0 annotation fields are surfaced at top level."""
+ row = {
+ **_SCHEMA_ROW,
+ "schema_body": {
+ "arrayFormat": "numpyBytes",
+ "dtype": "float32",
+ "shape": [100, 200],
+ "dimensionNames": ["samples", "features"],
+ },
+ }
+ d = row_to_schema(row)
+ assert d["arrayFormat"] == "numpyBytes"
+ assert d["dtype"] == "float32"
+ assert d["shape"] == [100, 200]
+ assert d["dimensionNames"] == ["samples", "features"]
+
+
+def test_row_to_schema_ndarray_partial_annotations():
+ """Only present annotation fields should appear in output."""
+ row = {
+ **_SCHEMA_ROW,
+ "schema_body": {"arrayFormat": "sparseBytes", "dtype": "int64"},
+ }
+ d = row_to_schema(row)
+ assert d["dtype"] == "int64"
+ assert "shape" not in d
+ assert "dimensionNames" not in d
+
+
+# ---------------------------------------------------------------------------
+# KNOWN_ARRAY_FORMATS constant
+# ---------------------------------------------------------------------------
+
+
+def test_known_array_formats_contains_all_expected():
+ expected = {
+ "numpyBytes",
+ "parquetBytes",
+ "sparseBytes",
+ "structuredBytes",
+ "arrowTensor",
+ "safetensors",
+ }
+ assert KNOWN_ARRAY_FORMATS == expected
+
+
+def test_array_format_labels_covers_all_known():
+ """Every known format should have a human-readable label."""
+ assert set(ARRAY_FORMAT_LABELS.keys()) == KNOWN_ARRAY_FORMATS
+
+
# ---------------------------------------------------------------------------
# row_to_label
# ---------------------------------------------------------------------------