From 3a03dd1f067e915e38a736aaeff7ab8b51c94d94 Mon Sep 17 00:00:00 2001 From: Danny Meijer Date: Tue, 26 May 2026 00:11:15 +0200 Subject: [PATCH 1/4] feature - implement nested scalar functions (#37) --- docs/language/reference/functions/index.md | 4 +- docs/language/reference/functions/nested.md | 58 +++++++++ docs/release_notes/v0_1.md | 1 + docs/rfcs/020_nested_data_functions.md | 44 +++---- src/functions/mod.incn | 22 ++++ src/functions/nested/array.incn | 38 ++++++ src/functions/nested/array_contains.incn | 34 ++++++ src/functions/nested/array_distinct.incn | 33 ++++++ src/functions/nested/array_except.incn | 34 ++++++ src/functions/nested/array_flatten.incn | 37 ++++++ src/functions/nested/array_intersect.incn | 34 ++++++ src/functions/nested/array_join.incn | 34 ++++++ src/functions/nested/array_position.incn | 34 ++++++ src/functions/nested/array_reverse.incn | 33 ++++++ src/functions/nested/array_slice.incn | 35 ++++++ src/functions/nested/array_sort.incn | 33 ++++++ src/functions/nested/array_union.incn | 34 ++++++ src/functions/nested/arrays_overlap.incn | 34 ++++++ src/functions/nested/cardinality.incn | 33 ++++++ src/functions/nested/common.incn | 33 ++++++ src/functions/nested/element_at.incn | 34 ++++++ src/functions/nested/map_contains_key.incn | 35 ++++++ src/functions/nested/map_entries.incn | 33 ++++++ src/functions/nested/map_extract.incn | 34 ++++++ src/functions/nested/map_from_arrays.incn | 34 ++++++ src/functions/nested/map_keys.incn | 33 ++++++ src/functions/nested/map_values.incn | 33 ++++++ src/functions/nested/mod.incn | 24 ++++ src/functions/nested/named_struct.incn | 34 ++++++ src/lib.incn | 22 ++++ src/substrait/function_extensions.incn | 21 ++++ tests/test_function_registry.incn | 92 ++++++++++++++- tests/test_nested_data_functions.incn | 124 ++++++++++++++++++++ tests/test_session_projection.incn | 42 +++++++ tests/test_substrait_plan.incn | 54 +++++++++ 35 files changed, 1267 insertions(+), 24 deletions(-) create mode 100644 docs/language/reference/functions/nested.md create mode 100644 src/functions/nested/array.incn create mode 100644 src/functions/nested/array_contains.incn create mode 100644 src/functions/nested/array_distinct.incn create mode 100644 src/functions/nested/array_except.incn create mode 100644 src/functions/nested/array_flatten.incn create mode 100644 src/functions/nested/array_intersect.incn create mode 100644 src/functions/nested/array_join.incn create mode 100644 src/functions/nested/array_position.incn create mode 100644 src/functions/nested/array_reverse.incn create mode 100644 src/functions/nested/array_slice.incn create mode 100644 src/functions/nested/array_sort.incn create mode 100644 src/functions/nested/array_union.incn create mode 100644 src/functions/nested/arrays_overlap.incn create mode 100644 src/functions/nested/cardinality.incn create mode 100644 src/functions/nested/common.incn create mode 100644 src/functions/nested/element_at.incn create mode 100644 src/functions/nested/map_contains_key.incn create mode 100644 src/functions/nested/map_entries.incn create mode 100644 src/functions/nested/map_extract.incn create mode 100644 src/functions/nested/map_from_arrays.incn create mode 100644 src/functions/nested/map_keys.incn create mode 100644 src/functions/nested/map_values.incn create mode 100644 src/functions/nested/mod.incn create mode 100644 src/functions/nested/named_struct.incn create mode 100644 tests/test_nested_data_functions.incn diff --git a/docs/language/reference/functions/index.md b/docs/language/reference/functions/index.md index bf7c4ce..b0ed988 100644 --- a/docs/language/reference/functions/index.md +++ b/docs/language/reference/functions/index.md @@ -7,10 +7,11 @@ Today the concrete shipped surfaces are documented here: - [Filter builders](../builders/filters.md) - [Aggregate builders](../builders/aggregates.md) - [Projection builders](../builders/projections.md) +- [Nested data functions](nested.md) The canonical scalar literal helper is `lit(...)`. Typed literal helpers construct the same scalar-expression representation. -The current registry-backed helper surface is registered in the package-owned function registry. Registry types live in `src/function_registry.incn`, the shared package registry lives in `src/functions/registry.incn`, and concrete public helper entries are produced by `register_function(...)` decorators in individual `src/functions//.incn` modules. The registry-backed families are references, literals, casts, operators, predicates, conditionals, math, ordering, and aggregates. Each runtime entry exposes a stable function reference such as `inql.functions.col`, namespace, canonical name, typed lifecycle metadata (`since`, versioned changes, and optional deprecation), InQL RFC 024 policy category, function class, null behavior, alias policy, aggregate modifier policy, and Substrait mapping metadata. Checked public helpers provide the signature and, by default, the canonical name; decorator metadata may override the canonical name only for source spelling constraints such as the reserved-word `mod` case. +The current registry-backed helper surface is registered in the package-owned function registry. Registry types live in `src/function_registry.incn`, the shared package registry lives in `src/functions/registry.incn`, and concrete public helper entries are produced by `register_function(...)` decorators in individual `src/functions//.incn` modules. The registry-backed families are references, literals, casts, operators, predicates, conditionals, math, ordering, aggregates, and nested data. Each runtime entry exposes a stable function reference such as `inql.functions.col`, namespace, canonical name, typed lifecycle metadata (`since`, versioned changes, and optional deprecation), InQL RFC 024 policy category, function class, null behavior, alias policy, aggregate modifier policy, and Substrait mapping metadata. Checked public helpers provide the signature and, by default, the canonical name; decorator metadata may override the canonical name only for source spelling constraints such as the reserved-word `mod` case. The registry is the source for non-derivable machine facts. Public helper declarations are the source for argument names, argument types, and return types. Docstrings remain human-facing explanation, examples, and parameter intent. The `registry-metadata` check validates the checked API metadata projections produced from public facade aliases, registry decorators, and decorated callable signatures. Runtime registry entries are lazy and process-local: they support helper execution and lowering for loaded helpers, while the complete public catalog comes from checked metadata. This matters for generated docs, diagnostics, Prism lowering, and backend capability checks as the catalog grows. @@ -31,6 +32,7 @@ The registered helper surface currently includes: | `coalesce(...)`, `nullif(...)`, `case_when(...)` | scalar | registered Substrait mappings; `case_when(...)` lowers as built-in `IfThen` | | `in_(...)`, `between(...)` | scalar | built-in membership/range lowering (`SingularOrList` and `between`) | | `abs(...)`, `ceil(...)`, `floor(...)`, `round(...)` | scalar | registered Substrait math scalar mappings; `round(...)` is currently the single-argument form | +| `array(...)`, `cardinality(...)`, `array_contains(...)`, `arrays_overlap(...)`, `array_position(...)`, `element_at(...)`, `array_sort(...)`, `array_distinct(...)`, `array_except(...)`, `array_intersect(...)`, `array_union(...)`, `array_join(...)`, `array_slice(...)`, `array_reverse(...)`, `array_flatten(...)`, `map_from_arrays(...)`, `map_extract(...)`, `map_contains_key(...)`, `map_keys(...)`, `map_values(...)`, `map_entries(...)`, `named_struct(...)` | scalar | registered nested scalar helpers backed by Substrait extension mappings; `map_contains_key(...)` lowers as a documented predicate rewrite | | `asc(...)`, `desc(...)`, `asc_nulls_first(...)`, `asc_nulls_last(...)`, `desc_nulls_first(...)`, `desc_nulls_last(...)` | ordering | structural sort-field helpers consumed by `order_by(...)` and lowered to Substrait `SortRel.sorts` | | `sum(...)`, `count(...)`, `count_expr(...)`, `count_distinct(...)`, `count_if(...)`, `avg(...)`, `min(...)`, `max(...)` | aggregate | registered Substrait extension functions for core aggregates plus compatibility rewrites for `count_expr(...)`, `count_distinct(...)`, and `count_if(...)`; core aggregates allow `DISTINCT` and aggregate-local `FILTER` where the aggregate shape is valid | diff --git a/docs/language/reference/functions/nested.md b/docs/language/reference/functions/nested.md new file mode 100644 index 0000000..644e1ad --- /dev/null +++ b/docs/language/reference/functions/nested.md @@ -0,0 +1,58 @@ +# Nested Data Functions (Reference) + +Nested data helpers build and inspect row-level arrays, maps, and structs. They are scalar expressions: every helper returns one value for each input row and does not change relation cardinality. + +Generator or table-valued operations such as row-expanding `explode(...)` are separate from this page. + +## Arrays + +| Function | Meaning | +| --- | --- | +| `array(values)` | Build an array expression from one or more scalar expressions. | +| `cardinality(value)` | Return the size of an array or map. | +| `array_contains(array_expr, value)` | Return whether an array contains a value. | +| `arrays_overlap(left, right)` | Return whether two arrays have any elements in common. | +| `array_position(array_expr, value)` | Return the one-based position of a value. | +| `element_at(array_expr, index)` | Return an array element by one-based index. | +| `array_sort(array_expr)` | Sort one array value. | +| `array_distinct(array_expr)` | Remove duplicate elements from one array value. | +| `array_except(left, right)` | Return elements from `left` that are not in `right`. | +| `array_intersect(left, right)` | Return elements shared by both arrays. | +| `array_union(left, right)` | Return the union of both arrays. | +| `array_join(array_expr, delimiter)` | Join a string array into one string. | +| `array_slice(array_expr, start, stop)` | Return a one-based array slice using the backend adapter's slice contract. | +| `array_reverse(array_expr)` | Reverse one array value. | +| `array_flatten(array_expr)` | Flatten an array-of-arrays into one row-level array value. | + +## Maps And Structs + +| Function | Meaning | +| --- | --- | +| `map_from_arrays(keys, values)` | Build a map from key and value arrays. | +| `map_extract(map_expr, key)` | Return the values associated with a key. | +| `map_contains_key(map_expr, key)` | Return whether `map_extract(...)` finds at least one value for the key. | +| `map_keys(map_expr)` | Return the map's keys as an array. | +| `map_values(map_expr)` | Return the map's values as an array. | +| `map_entries(map_expr)` | Return map entries. | +| `named_struct(field_names, values)` | Build a struct expression with explicit field names. | + +## Example + +```incan +from pub::inql.functions import array, array_contains, cardinality, col, element_at, lit + +projected = ( + events + .with_column("tags", array([lit("paid"), col("source")])) + .with_column("tag_count", cardinality(col("tags"))) + .with_column("has_paid_tag", array_contains(col("tags"), lit("paid"))) + .with_column("first_tag", element_at(col("tags"), lit(1))) +) +``` + +## Semantics + +- Array indexing is one-based for `element_at(...)`, `array_position(...)`, and `array_slice(...)`. +- `element_at(...)` currently maps to the portable array-element adapter path. Out-of-range behavior follows the current backend adapter's recoverable result until InQL has a richer static/runtime error-policy split for strict versus try-style element access. +- `array_flatten(...)` is intentionally named to avoid colliding with future table-valued or generator `flatten(...)` forms. +- Grouping or ordering by nested values is not documented as portable until equality and ordering semantics for arrays, maps, and structs are specified. diff --git a/docs/release_notes/v0_1.md b/docs/release_notes/v0_1.md index ca3eb03..39785c7 100644 --- a/docs/release_notes/v0_1.md +++ b/docs/release_notes/v0_1.md @@ -14,6 +14,7 @@ Entries will be filled in as work lands (link RFCs and PRs when applicable). - **Scalar expressions:** RFC 012 unifies filter predicates, computed projection values, grouping keys, and aggregate inputs around one `ColumnExpr` surface with canonical `lit(...)` and typed literal helpers. - **Core scalar functions:** RFC 015 adds registry-backed scalar function applications and the first core helper slice for casts, comparisons, boolean logic, null/NaN predicates, arithmetic, conditionals, membership/range predicates, and ordering expressions. Implemented helpers lower to Substrait IR through registry metadata, built-in Rex shapes, or structural sort-field lowering; DataFusion remains the first execution adapter rather than the semantic boundary. - **Common scalar functions:** The first RFC 018 slice adds registry-backed math helpers for `abs(...)`, `ceil(...)`, `floor(...)`, and single-argument `round(...)`, with Substrait mappings and DataFusion-backed execution coverage. +- **Nested data functions:** RFC 020 adds registry-backed scalar helpers for array construction/access, cardinality, containment, overlap, sorting, set-like operations, joining, slicing, reversing, scalar array flattening, map construction/access, map key/value/entry extraction, map key containment, and named struct construction. These helpers lower through Substrait extension metadata and execute through the DataFusion-backed Session path without introducing generator semantics. - **Function registry:** RFC 014 adds declaration-site registry decorators for the current public helper surface, including stable function references, checked signature projection, lifecycle metadata, behavior categories, alias policy, Substrait mapping categories, and checked API metadata drift validation. - **Function extension policy:** InQL RFC 024 policy metadata now distinguishes portable core functions, namespaced extension-only functions, opt-in compatibility aliases, engine-specific functions, and rejected compatibility requests without adding an extension plugin system or backend-owned semantics. - **Projection:** builder-based `with_column`, `add`, `mul`, and literal expression helpers now lower derived columns through Prism, Substrait, and Session execution. diff --git a/docs/rfcs/020_nested_data_functions.md b/docs/rfcs/020_nested_data_functions.md index c3f0d35..2680ec3 100644 --- a/docs/rfcs/020_nested_data_functions.md +++ b/docs/rfcs/020_nested_data_functions.md @@ -1,6 +1,6 @@ # InQL RFC 020: Nested data functions -- **Status:** Draft +- **Status:** Implemented - **Created:** 2026-04-27 - **Author(s):** Danny Meijer (@dannymeijer) - **Related:** @@ -11,12 +11,12 @@ - InQL RFC 021 (generator and table-valued functions) - **Issue:** [InQL #37](https://github.com/dannys-code-corner/InQL/issues/37) - **RFC PR:** — -- **Written against:** Incan v0.2 -- **Shipped in:** — +- **Written against:** Incan v0.3-era InQL +- **Shipped in:** v0.1 ## Summary -This RFC defines InQL functions for nested scalar values: arrays, maps, and structs. It covers construction, element access, cardinality, containment, sorting, set-like array operations, map entry access, and higher-order collection functions as a later extension point. Nested functions remain scalar when they produce one value per input row; cardinality-changing operations such as `explode` belong to a separate generator RFC. +This RFC defines InQL functions for nested scalar values: arrays, maps, and structs. It covers construction, element access, cardinality, containment, overlap checks, sorting, set-like array operations, scalar array flattening, map entry access, and higher-order collection functions as a later extension point. Nested functions remain scalar when they produce one value per input row; cardinality-changing operations such as `explode` belong to a separate generator RFC. ## Motivation @@ -28,7 +28,7 @@ The split matters. `array_contains(.items, "x")` is a row-level scalar predicate - Define scalar functions for arrays, maps, and structs. - Distinguish nested scalar operations from generators. -- Define element access and safe element access. +- Define element access with an explicit one-based indexing policy. - Define collection size, containment, sorting, and set-like operations. - Leave lambda-based higher-order functions as a later design decision unless the host language surface is ready. @@ -41,16 +41,16 @@ The split matters. `array_contains(.items, "x")` is a row-level scalar predicate ## Guide-level explanation (how authors think about it) -Authors should be able to inspect and manipulate nested values without changing relation cardinality: +Authors can inspect and manipulate nested values without changing relation cardinality: ```incan -from pub::inql.functions import array_contains, cardinality, col, element_at, map_keys +from pub::inql.functions import array_contains, cardinality, col, element_at, lit, map_keys enriched = ( events - .filter(array_contains(col("tags"), "purchase")) + .filter(array_contains(col("tags"), lit("purchase"))) .with_column("tag_count", cardinality(col("tags"))) - .with_column("first_item", element_at(col("items"), 1)) + .with_column("first_item", element_at(col("items"), lit(1))) .with_column("metadata_keys", map_keys(col("metadata"))) ) ``` @@ -59,17 +59,17 @@ If an author wants one output row per item, that is a generator/table-valued ope ## Reference-level explanation (precise rules) -InQL should define array construction with `array`, struct construction with `struct` or `named_struct`, and map construction with `create_map` or an equivalent canonical name. +InQL defines array construction with `array`, struct construction with `named_struct`, and map construction with `map_from_arrays`. -InQL should define `cardinality` as the canonical size function for arrays and maps. Compatibility aliases such as `size`, `array_size`, and `array_length` may resolve to `cardinality` where semantics match. +InQL defines `cardinality` as the canonical size function for arrays and maps. Compatibility aliases such as `size`, `array_size`, and `array_length` may resolve to `cardinality` where semantics match, but the initial implemented surface keeps the canonical spelling. -InQL should define element access functions including `element_at`, `try_element_at`, and `get`. Strict element access must fail or diagnose according to its registry error policy when an index or key is invalid. `try_element_at` must produce the recoverable result defined by its registry entry. +InQL defines array element access with `element_at(array_expr, index)`. Indexes are one-based. Current lowering maps to the portable array-element adapter path and uses the backend adapter's recoverable out-of-range behavior until InQL has a richer static/runtime error-policy split for strict versus try-style element access. -InQL should define array predicates and transforms including `array_contains`, `array_position`, `array_sort`, `array_distinct`, `array_except`, `array_intersect`, `array_union`, `array_join`, `arrays_overlap`, `flatten`, `slice`, and `reverse` where type and null semantics are specified. +InQL defines array predicates and transforms including `array_contains`, `array_position`, `array_sort`, `array_distinct`, `array_except`, `array_intersect`, `array_union`, `array_join`, `arrays_overlap`, `array_flatten`, `array_slice`, and `array_reverse` where type and null semantics are specified by the registry and backend adapter boundary. The scalar array-flattening helper is named `array_flatten` so table-valued or generator `flatten` remains available for RFC 021. -InQL should define map functions including `map_contains_key`, `map_entries`, `map_from_arrays`, `map_from_entries`, `map_keys`, and `map_values`. +InQL defines map functions including `map_contains_key`, `map_entries`, `map_extract`, `map_from_arrays`, `map_keys`, and `map_values`. -InQL should account for object-style warehouse functions such as `object_construct`, `object_construct_keep_null`, `object_delete`, `object_insert`, `object_keys`, and `object_pick`. These should be modeled through typed object/map semantics where possible and through a variant/semi-structured family only when dynamic value semantics are required. +Object-style warehouse functions such as `object_construct`, `object_construct_keep_null`, `object_delete`, `object_insert`, `object_keys`, and `object_pick` are accounted for as semi-structured and dynamic-object concerns. They should be modeled through typed object/map semantics where possible and through the RFC 022 semi-structured family only when dynamic value semantics are required. Higher-order functions such as `transform`, `filter`, `exists`, `forall`, `aggregate`, `reduce`, `zip_with`, `map_filter`, `transform_keys`, and `transform_values` must not reach Planned status until lambda or equivalent callback semantics are specified for InQL expressions. @@ -87,7 +87,7 @@ Index origin, invalid-index behavior, null container behavior, null element beha ### Interaction with other InQL surfaces -Nested functions may appear wherever scalar expressions of their result type are valid. Grouping by nested values may be restricted until equality and ordering semantics for nested values are fully specified. +Nested functions may appear wherever scalar expressions of their result type are valid. Grouping by nested values is not documented as portable until equality and ordering semantics for nested values are fully specified. ### Compatibility / migration @@ -113,10 +113,12 @@ No current InQL APIs are expected to break. Nested functions should be additive - **Execution / interchange** — Prism and Substrait lowering must preserve nested value semantics or diagnose unsupported operations. - **Documentation** — docs should separate nested scalar operations from generator functions. -## Unresolved questions +## Design Decisions -- Should element access use one-based indexing for SQL/Spark compatibility or zero-based indexing for host-language familiarity? -- What should strict `element_at` do on out-of-range indexes? -- Should grouping and ordering over arrays, maps, and structs be allowed initially? +### Resolved - +- Element access, array position results, and array slice boundaries are one-based for SQL/Spark compatibility. +- `element_at(...)` uses the current adapter's recoverable array-element behavior for out-of-range indexes. A separate strict/try split is deferred until registry error policy can distinguish static validation failures from runtime recoverable results. +- Grouping and ordering over arrays, maps, and structs are not documented as portable in the initial implementation. +- Scalar `array_flatten(...)` is separate from RFC 021 table-valued or generator flattening. +- Higher-order collection functions remain deferred until InQL expression callback or lambda semantics are specified. diff --git a/src/functions/mod.incn b/src/functions/mod.incn index 24f00ee..f2471bd 100644 --- a/src/functions/mod.incn +++ b/src/functions/mod.incn @@ -37,6 +37,28 @@ pub from functions.math.abs import abs pub from functions.math.ceil import ceil pub from functions.math.floor import floor pub from functions.math.round import round +pub from functions.nested.array import array +pub from functions.nested.array_contains import array_contains +pub from functions.nested.array_distinct import array_distinct +pub from functions.nested.array_except import array_except +pub from functions.nested.array_flatten import array_flatten +pub from functions.nested.array_intersect import array_intersect +pub from functions.nested.array_join import array_join +pub from functions.nested.array_position import array_position +pub from functions.nested.array_reverse import array_reverse +pub from functions.nested.array_slice import array_slice +pub from functions.nested.array_sort import array_sort +pub from functions.nested.array_union import array_union +pub from functions.nested.arrays_overlap import arrays_overlap +pub from functions.nested.cardinality import cardinality +pub from functions.nested.element_at import element_at +pub from functions.nested.map_contains_key import map_contains_key +pub from functions.nested.map_entries import map_entries +pub from functions.nested.map_extract import map_extract +pub from functions.nested.map_from_arrays import map_from_arrays +pub from functions.nested.map_keys import map_keys +pub from functions.nested.map_values import map_values +pub from functions.nested.named_struct import named_struct pub from functions.operators.add import add pub from functions.operators.and_ import and_ pub from functions.operators.div import div diff --git a/src/functions/nested/array.incn b/src/functions/nested/array.incn new file mode 100644 index 0000000..3c681e7 --- /dev/null +++ b/src/functions/nested/array.incn @@ -0,0 +1,38 @@ +""" +Array construction helper. + +`array` builds a row-level nested scalar value and does not change relation cardinality. +""" + +from function_registry import ( + FunctionClass, + FunctionLifecycle, + FunctionNullBehavior, + deterministic_spec, + extension_mapping, + v0_1, +) +from functions.nested.common import nested_application, require_non_empty_args +from functions.registry import function_registry +from projection_builders import ColumnExpr +from substrait.function_extensions import MAKE_ARRAY_FUNCTION_ANCHOR + + +@function_registry.add("array", deterministic_spec( + FunctionClass.Scalar, + FunctionLifecycle(since=v0_1, changed=[], deprecated=None), + FunctionNullBehavior.DependsOnInputs, + extension_mapping("make_array", MAKE_ARRAY_FUNCTION_ANCHOR), +)) +pub def array(values: list[ColumnExpr]) -> ColumnExpr: + """ + Build an array expression from one or more scalar values. + + Examples: + tags = array([str_lit("paid"), col("status")]) + + Parameters: + values: Element expressions in array order. + """ + require_non_empty_args(values) + return nested_application("array", values) diff --git a/src/functions/nested/array_contains.incn b/src/functions/nested/array_contains.incn new file mode 100644 index 0000000..fc0f480 --- /dev/null +++ b/src/functions/nested/array_contains.incn @@ -0,0 +1,34 @@ +"""Array containment predicate helper.""" + +from function_registry import ( + FunctionClass, + FunctionLifecycle, + FunctionNullBehavior, + deterministic_spec, + extension_mapping, + v0_1, +) +from functions.nested.common import nested_application +from functions.registry import function_registry +from projection_builders import ColumnExpr +from substrait.function_extensions import ARRAY_HAS_FUNCTION_ANCHOR + + +@function_registry.add("array_contains", deterministic_spec( + FunctionClass.Scalar, + FunctionLifecycle(since=v0_1, changed=[], deprecated=None), + FunctionNullBehavior.DependsOnInputs, + extension_mapping("array_has", ARRAY_HAS_FUNCTION_ANCHOR), +)) +pub def array_contains(array_expr: ColumnExpr, value: ColumnExpr) -> ColumnExpr: + """ + Return whether an array contains a value. + + Examples: + has_purchase = array_contains(col("tags"), str_lit("purchase")) + + Parameters: + array_expr: Array expression to search. + value: Value expression to find. + """ + return nested_application("array_contains", [array_expr, value]) diff --git a/src/functions/nested/array_distinct.incn b/src/functions/nested/array_distinct.incn new file mode 100644 index 0000000..2464b9d --- /dev/null +++ b/src/functions/nested/array_distinct.incn @@ -0,0 +1,33 @@ +"""Array distinct helper.""" + +from function_registry import ( + FunctionClass, + FunctionLifecycle, + FunctionNullBehavior, + deterministic_spec, + extension_mapping, + v0_1, +) +from functions.nested.common import nested_application +from functions.registry import function_registry +from projection_builders import ColumnExpr +from substrait.function_extensions import ARRAY_DISTINCT_FUNCTION_ANCHOR + + +@function_registry.add("array_distinct", deterministic_spec( + FunctionClass.Scalar, + FunctionLifecycle(since=v0_1, changed=[], deprecated=None), + FunctionNullBehavior.DependsOnInputs, + extension_mapping("array_distinct", ARRAY_DISTINCT_FUNCTION_ANCHOR), +)) +pub def array_distinct(array_expr: ColumnExpr) -> ColumnExpr: + """ + Return an array with duplicate values removed. + + Examples: + unique_tags = array_distinct(col("tags")) + + Parameters: + array_expr: Array expression to de-duplicate. + """ + return nested_application("array_distinct", [array_expr]) diff --git a/src/functions/nested/array_except.incn b/src/functions/nested/array_except.incn new file mode 100644 index 0000000..10a2caf --- /dev/null +++ b/src/functions/nested/array_except.incn @@ -0,0 +1,34 @@ +"""Array set-difference helper.""" + +from function_registry import ( + FunctionClass, + FunctionLifecycle, + FunctionNullBehavior, + deterministic_spec, + extension_mapping, + v0_1, +) +from functions.nested.common import nested_application +from functions.registry import function_registry +from projection_builders import ColumnExpr +from substrait.function_extensions import ARRAY_EXCEPT_FUNCTION_ANCHOR + + +@function_registry.add("array_except", deterministic_spec( + FunctionClass.Scalar, + FunctionLifecycle(since=v0_1, changed=[], deprecated=None), + FunctionNullBehavior.DependsOnInputs, + extension_mapping("array_except", ARRAY_EXCEPT_FUNCTION_ANCHOR), +)) +pub def array_except(left: ColumnExpr, right: ColumnExpr) -> ColumnExpr: + """ + Return values from one array that are not present in another array. + + Examples: + missing_tags = array_except(col("expected_tags"), col("actual_tags")) + + Parameters: + left: Array expression that supplies candidate values. + right: Array expression containing values to remove. + """ + return nested_application("array_except", [left, right]) diff --git a/src/functions/nested/array_flatten.incn b/src/functions/nested/array_flatten.incn new file mode 100644 index 0000000..8403ee1 --- /dev/null +++ b/src/functions/nested/array_flatten.incn @@ -0,0 +1,37 @@ +""" +Array flattening helper. + +`array_flatten` is scalar: it flattens an array value in one input row and does not produce more rows. +""" + +from function_registry import ( + FunctionClass, + FunctionLifecycle, + FunctionNullBehavior, + deterministic_spec, + extension_mapping, + v0_1, +) +from functions.nested.common import nested_application +from functions.registry import function_registry +from projection_builders import ColumnExpr +from substrait.function_extensions import ARRAY_FLATTEN_FUNCTION_ANCHOR + + +@function_registry.add("array_flatten", deterministic_spec( + FunctionClass.Scalar, + FunctionLifecycle(since=v0_1, changed=[], deprecated=None), + FunctionNullBehavior.DependsOnInputs, + extension_mapping("flatten", ARRAY_FLATTEN_FUNCTION_ANCHOR), +)) +pub def array_flatten(array_expr: ColumnExpr) -> ColumnExpr: + """ + Flatten an array-of-arrays into one row-level array value. + + Examples: + flattened = array_flatten(col("nested_tags")) + + Parameters: + array_expr: Array expression to flatten. + """ + return nested_application("array_flatten", [array_expr]) diff --git a/src/functions/nested/array_intersect.incn b/src/functions/nested/array_intersect.incn new file mode 100644 index 0000000..a457da4 --- /dev/null +++ b/src/functions/nested/array_intersect.incn @@ -0,0 +1,34 @@ +"""Array set-intersection helper.""" + +from function_registry import ( + FunctionClass, + FunctionLifecycle, + FunctionNullBehavior, + deterministic_spec, + extension_mapping, + v0_1, +) +from functions.nested.common import nested_application +from functions.registry import function_registry +from projection_builders import ColumnExpr +from substrait.function_extensions import ARRAY_INTERSECT_FUNCTION_ANCHOR + + +@function_registry.add("array_intersect", deterministic_spec( + FunctionClass.Scalar, + FunctionLifecycle(since=v0_1, changed=[], deprecated=None), + FunctionNullBehavior.DependsOnInputs, + extension_mapping("array_intersect", ARRAY_INTERSECT_FUNCTION_ANCHOR), +)) +pub def array_intersect(left: ColumnExpr, right: ColumnExpr) -> ColumnExpr: + """ + Return values that are present in both arrays. + + Examples: + shared_tags = array_intersect(col("expected_tags"), col("actual_tags")) + + Parameters: + left: First array expression. + right: Second array expression. + """ + return nested_application("array_intersect", [left, right]) diff --git a/src/functions/nested/array_join.incn b/src/functions/nested/array_join.incn new file mode 100644 index 0000000..600a1d9 --- /dev/null +++ b/src/functions/nested/array_join.incn @@ -0,0 +1,34 @@ +"""Array-to-string join helper.""" + +from function_registry import ( + FunctionClass, + FunctionLifecycle, + FunctionNullBehavior, + deterministic_spec, + extension_mapping, + v0_1, +) +from functions.nested.common import nested_application +from functions.registry import function_registry +from projection_builders import ColumnExpr +from substrait.function_extensions import ARRAY_TO_STRING_FUNCTION_ANCHOR + + +@function_registry.add("array_join", deterministic_spec( + FunctionClass.Scalar, + FunctionLifecycle(since=v0_1, changed=[], deprecated=None), + FunctionNullBehavior.DependsOnInputs, + extension_mapping("array_to_string", ARRAY_TO_STRING_FUNCTION_ANCHOR), +)) +pub def array_join(array_expr: ColumnExpr, delimiter: ColumnExpr) -> ColumnExpr: + """ + Join array elements into one string with a delimiter. + + Examples: + tag_text = array_join(col("tags"), str_lit(",")) + + Parameters: + array_expr: Array expression to render. + delimiter: String delimiter expression placed between elements. + """ + return nested_application("array_join", [array_expr, delimiter]) diff --git a/src/functions/nested/array_position.incn b/src/functions/nested/array_position.incn new file mode 100644 index 0000000..e443508 --- /dev/null +++ b/src/functions/nested/array_position.incn @@ -0,0 +1,34 @@ +"""Array first-position helper.""" + +from function_registry import ( + FunctionClass, + FunctionLifecycle, + FunctionNullBehavior, + deterministic_spec, + extension_mapping, + v0_1, +) +from functions.nested.common import nested_application +from functions.registry import function_registry +from projection_builders import ColumnExpr +from substrait.function_extensions import ARRAY_POSITION_FUNCTION_ANCHOR + + +@function_registry.add("array_position", deterministic_spec( + FunctionClass.Scalar, + FunctionLifecycle(since=v0_1, changed=[], deprecated=None), + FunctionNullBehavior.DependsOnInputs, + extension_mapping("array_position", ARRAY_POSITION_FUNCTION_ANCHOR), +)) +pub def array_position(array_expr: ColumnExpr, value: ColumnExpr) -> ColumnExpr: + """ + Return the one-based first position of a value in an array, or null when absent. + + Examples: + first_paid = array_position(col("tags"), str_lit("paid")) + + Parameters: + array_expr: Array expression to search. + value: Value expression to find. + """ + return nested_application("array_position", [array_expr, value]) diff --git a/src/functions/nested/array_reverse.incn b/src/functions/nested/array_reverse.incn new file mode 100644 index 0000000..8e4c13c --- /dev/null +++ b/src/functions/nested/array_reverse.incn @@ -0,0 +1,33 @@ +"""Array reverse helper.""" + +from function_registry import ( + FunctionClass, + FunctionLifecycle, + FunctionNullBehavior, + deterministic_spec, + extension_mapping, + v0_1, +) +from functions.nested.common import nested_application +from functions.registry import function_registry +from projection_builders import ColumnExpr +from substrait.function_extensions import ARRAY_REVERSE_FUNCTION_ANCHOR + + +@function_registry.add("array_reverse", deterministic_spec( + FunctionClass.Scalar, + FunctionLifecycle(since=v0_1, changed=[], deprecated=None), + FunctionNullBehavior.DependsOnInputs, + extension_mapping("array_reverse", ARRAY_REVERSE_FUNCTION_ANCHOR), +)) +pub def array_reverse(array_expr: ColumnExpr) -> ColumnExpr: + """ + Return an array with elements in reverse order. + + Examples: + newest_first = array_reverse(col("events")) + + Parameters: + array_expr: Array expression to reverse. + """ + return nested_application("array_reverse", [array_expr]) diff --git a/src/functions/nested/array_slice.incn b/src/functions/nested/array_slice.incn new file mode 100644 index 0000000..08aa946 --- /dev/null +++ b/src/functions/nested/array_slice.incn @@ -0,0 +1,35 @@ +"""Array slice helper.""" + +from function_registry import ( + FunctionClass, + FunctionLifecycle, + FunctionNullBehavior, + deterministic_spec, + extension_mapping, + v0_1, +) +from functions.nested.common import nested_application +from functions.registry import function_registry +from projection_builders import ColumnExpr +from substrait.function_extensions import ARRAY_SLICE_FUNCTION_ANCHOR + + +@function_registry.add("array_slice", deterministic_spec( + FunctionClass.Scalar, + FunctionLifecycle(since=v0_1, changed=[], deprecated=None), + FunctionNullBehavior.DependsOnInputs, + extension_mapping("array_slice", ARRAY_SLICE_FUNCTION_ANCHOR), +)) +pub def array_slice(array_expr: ColumnExpr, start: ColumnExpr, stop: ColumnExpr) -> ColumnExpr: + """ + Return a one-based array slice. + + Examples: + first_two = array_slice(col("tags"), int_lit(1), int_lit(2)) + + Parameters: + array_expr: Array expression to slice. + start: One-based start index. + stop: One-based stop index following the backend adapter's `array_slice` contract. + """ + return nested_application("array_slice", [array_expr, start, stop]) diff --git a/src/functions/nested/array_sort.incn b/src/functions/nested/array_sort.incn new file mode 100644 index 0000000..73c7f75 --- /dev/null +++ b/src/functions/nested/array_sort.incn @@ -0,0 +1,33 @@ +"""Array sort helper.""" + +from function_registry import ( + FunctionClass, + FunctionLifecycle, + FunctionNullBehavior, + deterministic_spec, + extension_mapping, + v0_1, +) +from functions.nested.common import nested_application +from functions.registry import function_registry +from projection_builders import ColumnExpr +from substrait.function_extensions import ARRAY_SORT_FUNCTION_ANCHOR + + +@function_registry.add("array_sort", deterministic_spec( + FunctionClass.Scalar, + FunctionLifecycle(since=v0_1, changed=[], deprecated=None), + FunctionNullBehavior.DependsOnInputs, + extension_mapping("array_sort", ARRAY_SORT_FUNCTION_ANCHOR), +)) +pub def array_sort(array_expr: ColumnExpr) -> ColumnExpr: + """ + Return an array with comparable elements sorted into backend-default order. + + Examples: + sorted_tags = array_sort(col("tags")) + + Parameters: + array_expr: Array expression to sort. + """ + return nested_application("array_sort", [array_expr]) diff --git a/src/functions/nested/array_union.incn b/src/functions/nested/array_union.incn new file mode 100644 index 0000000..6aaba15 --- /dev/null +++ b/src/functions/nested/array_union.incn @@ -0,0 +1,34 @@ +"""Array set-union helper.""" + +from function_registry import ( + FunctionClass, + FunctionLifecycle, + FunctionNullBehavior, + deterministic_spec, + extension_mapping, + v0_1, +) +from functions.nested.common import nested_application +from functions.registry import function_registry +from projection_builders import ColumnExpr +from substrait.function_extensions import ARRAY_UNION_FUNCTION_ANCHOR + + +@function_registry.add("array_union", deterministic_spec( + FunctionClass.Scalar, + FunctionLifecycle(since=v0_1, changed=[], deprecated=None), + FunctionNullBehavior.DependsOnInputs, + extension_mapping("array_union", ARRAY_UNION_FUNCTION_ANCHOR), +)) +pub def array_union(left: ColumnExpr, right: ColumnExpr) -> ColumnExpr: + """ + Return the distinct union of two arrays. + + Examples: + all_tags = array_union(col("left_tags"), col("right_tags")) + + Parameters: + left: First array expression. + right: Second array expression. + """ + return nested_application("array_union", [left, right]) diff --git a/src/functions/nested/arrays_overlap.incn b/src/functions/nested/arrays_overlap.incn new file mode 100644 index 0000000..cedebed --- /dev/null +++ b/src/functions/nested/arrays_overlap.incn @@ -0,0 +1,34 @@ +"""Array overlap predicate helper.""" + +from function_registry import ( + FunctionClass, + FunctionLifecycle, + FunctionNullBehavior, + deterministic_spec, + extension_mapping, + v0_1, +) +from functions.nested.common import nested_application +from functions.registry import function_registry +from projection_builders import ColumnExpr +from substrait.function_extensions import ARRAY_HAS_ANY_FUNCTION_ANCHOR + + +@function_registry.add("arrays_overlap", deterministic_spec( + FunctionClass.Scalar, + FunctionLifecycle(since=v0_1, changed=[], deprecated=None), + FunctionNullBehavior.DependsOnInputs, + extension_mapping("array_has_any", ARRAY_HAS_ANY_FUNCTION_ANCHOR), +)) +pub def arrays_overlap(left: ColumnExpr, right: ColumnExpr) -> ColumnExpr: + """ + Return whether two arrays have any elements in common. + + Examples: + has_shared_tag = arrays_overlap(col("tags"), array([str_lit("paid")])) + + Parameters: + left: First array expression. + right: Second array expression. + """ + return nested_application("arrays_overlap", [left, right]) diff --git a/src/functions/nested/cardinality.incn b/src/functions/nested/cardinality.incn new file mode 100644 index 0000000..9049ccc --- /dev/null +++ b/src/functions/nested/cardinality.incn @@ -0,0 +1,33 @@ +"""Nested collection cardinality helper.""" + +from function_registry import ( + FunctionClass, + FunctionLifecycle, + FunctionNullBehavior, + deterministic_spec, + extension_mapping, + v0_1, +) +from functions.nested.common import nested_application +from functions.registry import function_registry +from projection_builders import ColumnExpr +from substrait.function_extensions import CARDINALITY_FUNCTION_ANCHOR + + +@function_registry.add("cardinality", deterministic_spec( + FunctionClass.Scalar, + FunctionLifecycle(since=v0_1, changed=[], deprecated=None), + FunctionNullBehavior.DependsOnInputs, + extension_mapping("cardinality", CARDINALITY_FUNCTION_ANCHOR), +)) +pub def cardinality(value: ColumnExpr) -> ColumnExpr: + """ + Return the number of entries in an array or map expression. + + Examples: + tag_count = cardinality(col("tags")) + + Parameters: + value: Array or map expression to size. + """ + return nested_application("cardinality", [value]) diff --git a/src/functions/nested/common.incn b/src/functions/nested/common.incn new file mode 100644 index 0000000..319c115 --- /dev/null +++ b/src/functions/nested/common.incn @@ -0,0 +1,33 @@ +"""Shared implementation helpers for nested scalar functions.""" + +from rust::incan_stdlib::errors import raise_value_error +from functions.registry import registered_application +from projection_builders import ColumnExpr, str_expr + + +pub def nested_application(canonical_name: str, arguments: list[ColumnExpr]) -> ColumnExpr: + """Build one registry-backed nested scalar function application.""" + return registered_application(canonical_name, arguments) + + +pub def require_non_empty_args(arguments: list[ColumnExpr]) -> None: + """Reject empty variadic nested constructors that cannot infer a value type.""" + if len(arguments) == 0: + return raise_value_error("nested constructor requires at least one scalar expression") + return + + +pub def named_struct_arguments(field_names: list[str], values: list[ColumnExpr]) -> list[ColumnExpr]: + """Build alternating field-name/value arguments for a named struct function call.""" + if len(field_names) == 0: + return raise_value_error("named_struct requires at least one field") + if len(field_names) != len(values): + return raise_value_error("named_struct requires one value for each field name") + + mut arguments: list[ColumnExpr] = [] + for idx, field_name in enumerate(field_names): + if len(field_name) == 0: + return raise_value_error("named_struct field names must be non-empty") + arguments.append(str_expr(field_name)) + arguments.append(values[idx]) + return arguments diff --git a/src/functions/nested/element_at.incn b/src/functions/nested/element_at.incn new file mode 100644 index 0000000..e726baa --- /dev/null +++ b/src/functions/nested/element_at.incn @@ -0,0 +1,34 @@ +"""Array element access helper.""" + +from function_registry import ( + FunctionClass, + FunctionLifecycle, + FunctionNullBehavior, + deterministic_spec, + extension_mapping, + v0_1, +) +from functions.nested.common import nested_application +from functions.registry import function_registry +from projection_builders import ColumnExpr +from substrait.function_extensions import ARRAY_ELEMENT_FUNCTION_ANCHOR + + +@function_registry.add("element_at", deterministic_spec( + FunctionClass.Scalar, + FunctionLifecycle(since=v0_1, changed=[], deprecated=None), + FunctionNullBehavior.DependsOnInputs, + extension_mapping("array_element", ARRAY_ELEMENT_FUNCTION_ANCHOR), +)) +pub def element_at(array_expr: ColumnExpr, index: ColumnExpr) -> ColumnExpr: + """ + Return an array element by one-based index. + + Examples: + first_tag = element_at(col("tags"), int_lit(1)) + + Parameters: + array_expr: Array expression to access. + index: One-based element index. Negative indexes count from the end where supported by the backend adapter. + """ + return nested_application("element_at", [array_expr, index]) diff --git a/src/functions/nested/map_contains_key.incn b/src/functions/nested/map_contains_key.incn new file mode 100644 index 0000000..8d0d02e --- /dev/null +++ b/src/functions/nested/map_contains_key.incn @@ -0,0 +1,35 @@ +"""Map key containment predicate helper.""" + +from function_registry import ( + FunctionClass, + FunctionLifecycle, + FunctionNullBehavior, + deterministic_spec, + rewrite_mapping, + v0_1, +) +from functions.nested.cardinality import cardinality +from functions.nested.map_extract import map_extract +from functions.operators.gt import gt +from functions.registry import function_registry +from projection_builders import ColumnExpr, int_expr + + +@function_registry.add("map_contains_key", deterministic_spec( + FunctionClass.Scalar, + FunctionLifecycle(since=v0_1, changed=[], deprecated=None), + FunctionNullBehavior.DependsOnInputs, + rewrite_mapping("gt(cardinality(map_extract(map_expr, key)), int_expr(0))"), +)) +pub def map_contains_key(map_expr: ColumnExpr, key: ColumnExpr) -> ColumnExpr: + """ + Return whether a map contains a key. + + Examples: + has_status = map_contains_key(col("attributes"), str_lit("status")) + + Parameters: + map_expr: Map expression to inspect. + key: Key expression to look up. + """ + return gt(cardinality(map_extract(map_expr, key)), int_expr(0)) diff --git a/src/functions/nested/map_entries.incn b/src/functions/nested/map_entries.incn new file mode 100644 index 0000000..49c12ff --- /dev/null +++ b/src/functions/nested/map_entries.incn @@ -0,0 +1,33 @@ +"""Map entries helper.""" + +from function_registry import ( + FunctionClass, + FunctionLifecycle, + FunctionNullBehavior, + deterministic_spec, + extension_mapping, + v0_1, +) +from functions.nested.common import nested_application +from functions.registry import function_registry +from projection_builders import ColumnExpr +from substrait.function_extensions import MAP_ENTRIES_FUNCTION_ANCHOR + + +@function_registry.add("map_entries", deterministic_spec( + FunctionClass.Scalar, + FunctionLifecycle(since=v0_1, changed=[], deprecated=None), + FunctionNullBehavior.DependsOnInputs, + extension_mapping("map_entries", MAP_ENTRIES_FUNCTION_ANCHOR), +)) +pub def map_entries(map_expr: ColumnExpr) -> ColumnExpr: + """ + Return a map as an array of key/value entry structs. + + Examples: + entries = map_entries(col("attributes")) + + Parameters: + map_expr: Map expression to inspect. + """ + return nested_application("map_entries", [map_expr]) diff --git a/src/functions/nested/map_extract.incn b/src/functions/nested/map_extract.incn new file mode 100644 index 0000000..30e4e7b --- /dev/null +++ b/src/functions/nested/map_extract.incn @@ -0,0 +1,34 @@ +"""Map key extraction helper.""" + +from function_registry import ( + FunctionClass, + FunctionLifecycle, + FunctionNullBehavior, + deterministic_spec, + extension_mapping, + v0_1, +) +from functions.nested.common import nested_application +from functions.registry import function_registry +from projection_builders import ColumnExpr +from substrait.function_extensions import MAP_EXTRACT_FUNCTION_ANCHOR + + +@function_registry.add("map_extract", deterministic_spec( + FunctionClass.Scalar, + FunctionLifecycle(since=v0_1, changed=[], deprecated=None), + FunctionNullBehavior.DependsOnInputs, + extension_mapping("map_extract", MAP_EXTRACT_FUNCTION_ANCHOR), +)) +pub def map_extract(map_expr: ColumnExpr, key: ColumnExpr) -> ColumnExpr: + """ + Return the value-list associated with a map key. + + Examples: + status_value = map_extract(col("attributes"), str_lit("status")) + + Parameters: + map_expr: Map expression to inspect. + key: Key expression to look up. + """ + return nested_application("map_extract", [map_expr, key]) diff --git a/src/functions/nested/map_from_arrays.incn b/src/functions/nested/map_from_arrays.incn new file mode 100644 index 0000000..1c94165 --- /dev/null +++ b/src/functions/nested/map_from_arrays.incn @@ -0,0 +1,34 @@ +"""Map construction helper from key and value arrays.""" + +from function_registry import ( + FunctionClass, + FunctionLifecycle, + FunctionNullBehavior, + deterministic_spec, + extension_mapping, + v0_1, +) +from functions.nested.common import nested_application +from functions.registry import function_registry +from projection_builders import ColumnExpr +from substrait.function_extensions import MAP_FUNCTION_ANCHOR + + +@function_registry.add("map_from_arrays", deterministic_spec( + FunctionClass.Scalar, + FunctionLifecycle(since=v0_1, changed=[], deprecated=None), + FunctionNullBehavior.DependsOnInputs, + extension_mapping("map", MAP_FUNCTION_ANCHOR), +)) +pub def map_from_arrays(keys: ColumnExpr, values: ColumnExpr) -> ColumnExpr: + """ + Build a map from equal-length key and value arrays. + + Examples: + attrs = map_from_arrays(array([str_lit("status")]), array([col("status")])) + + Parameters: + keys: Array expression containing non-null map keys. + values: Array expression containing map values. + """ + return nested_application("map_from_arrays", [keys, values]) diff --git a/src/functions/nested/map_keys.incn b/src/functions/nested/map_keys.incn new file mode 100644 index 0000000..d223229 --- /dev/null +++ b/src/functions/nested/map_keys.incn @@ -0,0 +1,33 @@ +"""Map keys helper.""" + +from function_registry import ( + FunctionClass, + FunctionLifecycle, + FunctionNullBehavior, + deterministic_spec, + extension_mapping, + v0_1, +) +from functions.nested.common import nested_application +from functions.registry import function_registry +from projection_builders import ColumnExpr +from substrait.function_extensions import MAP_KEYS_FUNCTION_ANCHOR + + +@function_registry.add("map_keys", deterministic_spec( + FunctionClass.Scalar, + FunctionLifecycle(since=v0_1, changed=[], deprecated=None), + FunctionNullBehavior.DependsOnInputs, + extension_mapping("map_keys", MAP_KEYS_FUNCTION_ANCHOR), +)) +pub def map_keys(map_expr: ColumnExpr) -> ColumnExpr: + """ + Return the keys of a map as an array. + + Examples: + keys = map_keys(col("attributes")) + + Parameters: + map_expr: Map expression to inspect. + """ + return nested_application("map_keys", [map_expr]) diff --git a/src/functions/nested/map_values.incn b/src/functions/nested/map_values.incn new file mode 100644 index 0000000..35ec19a --- /dev/null +++ b/src/functions/nested/map_values.incn @@ -0,0 +1,33 @@ +"""Map values helper.""" + +from function_registry import ( + FunctionClass, + FunctionLifecycle, + FunctionNullBehavior, + deterministic_spec, + extension_mapping, + v0_1, +) +from functions.nested.common import nested_application +from functions.registry import function_registry +from projection_builders import ColumnExpr +from substrait.function_extensions import MAP_VALUES_FUNCTION_ANCHOR + + +@function_registry.add("map_values", deterministic_spec( + FunctionClass.Scalar, + FunctionLifecycle(since=v0_1, changed=[], deprecated=None), + FunctionNullBehavior.DependsOnInputs, + extension_mapping("map_values", MAP_VALUES_FUNCTION_ANCHOR), +)) +pub def map_values(map_expr: ColumnExpr) -> ColumnExpr: + """ + Return the values of a map as an array. + + Examples: + values = map_values(col("attributes")) + + Parameters: + map_expr: Map expression to inspect. + """ + return nested_application("map_values", [map_expr]) diff --git a/src/functions/nested/mod.incn b/src/functions/nested/mod.incn new file mode 100644 index 0000000..bdbdff1 --- /dev/null +++ b/src/functions/nested/mod.incn @@ -0,0 +1,24 @@ +"""Nested scalar function helpers for arrays, maps, and structs.""" + +pub from functions.nested.array import array +pub from functions.nested.array_contains import array_contains +pub from functions.nested.array_distinct import array_distinct +pub from functions.nested.array_except import array_except +pub from functions.nested.array_flatten import array_flatten +pub from functions.nested.array_intersect import array_intersect +pub from functions.nested.array_join import array_join +pub from functions.nested.array_position import array_position +pub from functions.nested.array_reverse import array_reverse +pub from functions.nested.array_slice import array_slice +pub from functions.nested.array_sort import array_sort +pub from functions.nested.array_union import array_union +pub from functions.nested.arrays_overlap import arrays_overlap +pub from functions.nested.cardinality import cardinality +pub from functions.nested.element_at import element_at +pub from functions.nested.map_contains_key import map_contains_key +pub from functions.nested.map_entries import map_entries +pub from functions.nested.map_extract import map_extract +pub from functions.nested.map_from_arrays import map_from_arrays +pub from functions.nested.map_keys import map_keys +pub from functions.nested.map_values import map_values +pub from functions.nested.named_struct import named_struct diff --git a/src/functions/nested/named_struct.incn b/src/functions/nested/named_struct.incn new file mode 100644 index 0000000..2f18a30 --- /dev/null +++ b/src/functions/nested/named_struct.incn @@ -0,0 +1,34 @@ +"""Named struct construction helper.""" + +from function_registry import ( + FunctionClass, + FunctionLifecycle, + FunctionNullBehavior, + deterministic_spec, + extension_mapping, + v0_1, +) +from functions.nested.common import named_struct_arguments, nested_application +from functions.registry import function_registry +from projection_builders import ColumnExpr +from substrait.function_extensions import NAMED_STRUCT_FUNCTION_ANCHOR + + +@function_registry.add("named_struct", deterministic_spec( + FunctionClass.Scalar, + FunctionLifecycle(since=v0_1, changed=[], deprecated=None), + FunctionNullBehavior.DependsOnInputs, + extension_mapping("named_struct", NAMED_STRUCT_FUNCTION_ANCHOR), +)) +pub def named_struct(field_names: list[str], values: list[ColumnExpr]) -> ColumnExpr: + """ + Build a struct expression with explicit field names. + + Examples: + event = named_struct(["status", "amount"], [col("status"), col("amount")]) + + Parameters: + field_names: Non-empty struct field names. + values: Field value expressions in the same order as `field_names`. + """ + return nested_application("named_struct", named_struct_arguments(field_names, values)) diff --git a/src/lib.incn b/src/lib.incn index 3520592..51136d6 100644 --- a/src/lib.incn +++ b/src/lib.incn @@ -58,6 +58,28 @@ pub from functions.math.abs import abs pub from functions.math.ceil import ceil pub from functions.math.floor import floor pub from functions.math.round import round +pub from functions.nested.array import array +pub from functions.nested.array_contains import array_contains +pub from functions.nested.array_distinct import array_distinct +pub from functions.nested.array_except import array_except +pub from functions.nested.array_flatten import array_flatten +pub from functions.nested.array_intersect import array_intersect +pub from functions.nested.array_join import array_join +pub from functions.nested.array_position import array_position +pub from functions.nested.array_reverse import array_reverse +pub from functions.nested.array_slice import array_slice +pub from functions.nested.array_sort import array_sort +pub from functions.nested.array_union import array_union +pub from functions.nested.arrays_overlap import arrays_overlap +pub from functions.nested.cardinality import cardinality +pub from functions.nested.element_at import element_at +pub from functions.nested.map_contains_key import map_contains_key +pub from functions.nested.map_entries import map_entries +pub from functions.nested.map_extract import map_extract +pub from functions.nested.map_from_arrays import map_from_arrays +pub from functions.nested.map_keys import map_keys +pub from functions.nested.map_values import map_values +pub from functions.nested.named_struct import named_struct pub from functions.operators.add import add pub from functions.operators.and_ import and_ pub from functions.operators.div import div diff --git a/src/substrait/function_extensions.incn b/src/substrait/function_extensions.incn index 0ad86f1..490f93c 100644 --- a/src/substrait/function_extensions.incn +++ b/src/substrait/function_extensions.incn @@ -54,6 +54,27 @@ pub const ABS_FUNCTION_ANCHOR: u32 = 27 pub const CEIL_FUNCTION_ANCHOR: u32 = 28 pub const FLOOR_FUNCTION_ANCHOR: u32 = 29 pub const ROUND_FUNCTION_ANCHOR: u32 = 30 +pub const MAKE_ARRAY_FUNCTION_ANCHOR: u32 = 31 +pub const CARDINALITY_FUNCTION_ANCHOR: u32 = 32 +pub const ARRAY_HAS_FUNCTION_ANCHOR: u32 = 33 +pub const ARRAY_POSITION_FUNCTION_ANCHOR: u32 = 34 +pub const ARRAY_ELEMENT_FUNCTION_ANCHOR: u32 = 35 +pub const ARRAY_SORT_FUNCTION_ANCHOR: u32 = 36 +pub const ARRAY_DISTINCT_FUNCTION_ANCHOR: u32 = 37 +pub const ARRAY_EXCEPT_FUNCTION_ANCHOR: u32 = 38 +pub const ARRAY_INTERSECT_FUNCTION_ANCHOR: u32 = 39 +pub const ARRAY_UNION_FUNCTION_ANCHOR: u32 = 40 +pub const ARRAY_TO_STRING_FUNCTION_ANCHOR: u32 = 41 +pub const ARRAY_SLICE_FUNCTION_ANCHOR: u32 = 42 +pub const ARRAY_REVERSE_FUNCTION_ANCHOR: u32 = 43 +pub const MAP_FUNCTION_ANCHOR: u32 = 44 +pub const MAP_KEYS_FUNCTION_ANCHOR: u32 = 45 +pub const MAP_VALUES_FUNCTION_ANCHOR: u32 = 46 +pub const MAP_ENTRIES_FUNCTION_ANCHOR: u32 = 47 +pub const MAP_EXTRACT_FUNCTION_ANCHOR: u32 = 48 +pub const NAMED_STRUCT_FUNCTION_ANCHOR: u32 = 49 +pub const ARRAY_HAS_ANY_FUNCTION_ANCHOR: u32 = 50 +pub const ARRAY_FLATTEN_FUNCTION_ANCHOR: u32 = 51 const FUNCTION_EXTENSION_URI: str = "https://inql.io/extensions/v0.1/functions.yaml" const EXPLODE_EXTENSION_URI: str = "https://inql.io/extensions/v0.1/unnest.yaml#explode" diff --git a/tests/test_function_registry.incn b/tests/test_function_registry.incn index 134eb32..129158b 100644 --- a/tests/test_function_registry.incn +++ b/tests/test_function_registry.incn @@ -11,6 +11,19 @@ from functions import ( asc, asc_nulls_first, asc_nulls_last, + array, + array_contains, + array_distinct, + array_except, + array_flatten, + array_intersect, + array_join, + array_position, + array_reverse, + array_slice, + array_sort, + array_union, + arrays_overlap, avg, between, bool_expr, @@ -20,6 +33,7 @@ from functions import ( ceil, col, coalesce, + cardinality, count, count_distinct, count_expr, @@ -30,6 +44,7 @@ from functions import ( div, eq, equal_null, + element_at, floor, float_expr, function_registry_canonical_names, @@ -50,12 +65,19 @@ from functions import ( lit, lt, lte, + map_contains_key, + map_entries, + map_extract, + map_from_arrays, + map_keys, + map_values, max, min, modulo, mul, ne, neg, + named_struct, not_, nullif, or_, @@ -93,8 +115,22 @@ from substrait.function_extensions import ( ABS_FUNCTION_ANCHOR, ADD_FUNCTION_ANCHOR, AND_FUNCTION_ANCHOR, + ARRAY_DISTINCT_FUNCTION_ANCHOR, + ARRAY_ELEMENT_FUNCTION_ANCHOR, + ARRAY_EXCEPT_FUNCTION_ANCHOR, + ARRAY_FLATTEN_FUNCTION_ANCHOR, + ARRAY_HAS_FUNCTION_ANCHOR, + ARRAY_HAS_ANY_FUNCTION_ANCHOR, + ARRAY_INTERSECT_FUNCTION_ANCHOR, + ARRAY_POSITION_FUNCTION_ANCHOR, + ARRAY_REVERSE_FUNCTION_ANCHOR, + ARRAY_SLICE_FUNCTION_ANCHOR, + ARRAY_SORT_FUNCTION_ANCHOR, + ARRAY_TO_STRING_FUNCTION_ANCHOR, + ARRAY_UNION_FUNCTION_ANCHOR, AVG_FUNCTION_ANCHOR, BETWEEN_FUNCTION_ANCHOR, + CARDINALITY_FUNCTION_ANCHOR, CEIL_FUNCTION_ANCHOR, COALESCE_FUNCTION_ANCHOR, COUNT_FUNCTION_ANCHOR, @@ -109,10 +145,17 @@ from substrait.function_extensions import ( IS_NULL_FUNCTION_ANCHOR, LT_FUNCTION_ANCHOR, LTE_FUNCTION_ANCHOR, + MAKE_ARRAY_FUNCTION_ANCHOR, + MAP_ENTRIES_FUNCTION_ANCHOR, + MAP_EXTRACT_FUNCTION_ANCHOR, + MAP_FUNCTION_ANCHOR, + MAP_KEYS_FUNCTION_ANCHOR, + MAP_VALUES_FUNCTION_ANCHOR, MAX_FUNCTION_ANCHOR, MIN_FUNCTION_ANCHOR, MODULUS_FUNCTION_ANCHOR, MULTIPLY_FUNCTION_ANCHOR, + NAMED_STRUCT_FUNCTION_ANCHOR, NEGATE_FUNCTION_ANCHOR, NOT_EQUAL_FUNCTION_ANCHOR, NOT_FUNCTION_ANCHOR, @@ -180,18 +223,21 @@ def _local_entry_by_namespace_and_name_or_fail( def _expected_registry_names() -> list[str]: """Return the expected registered public helper names.""" - return ["col", "lit", "sum", "count", "count_expr", "count_distinct", "count_if", "avg", "min", "max", "int_expr", "float_expr", "str_expr", "bool_expr", "add", "mul", "int_lit", "str_lit", "bool_lit", "always_true", "always_false", "eq", "gt", "cast", "try_cast", "ne", "lt", "lte", "gte", "equal_null", "and_", "or_", "not_", "is_null", "is_not_null", "is_nan", "is_not_nan", "sub", "div", "mod", "neg", "coalesce", "nullif", "case_when", "in_", "between", "asc", "desc", "asc_nulls_first", "asc_nulls_last", "desc_nulls_first", "desc_nulls_last", "abs", "ceil", "floor", "round"] + return ["col", "lit", "sum", "count", "count_expr", "count_distinct", "count_if", "avg", "min", "max", "int_expr", "float_expr", "str_expr", "bool_expr", "add", "mul", "int_lit", "str_lit", "bool_lit", "always_true", "always_false", "eq", "gt", "cast", "try_cast", "ne", "lt", "lte", "gte", "equal_null", "and_", "or_", "not_", "is_null", "is_not_null", "is_nan", "is_not_nan", "sub", "div", "mod", "neg", "coalesce", "nullif", "case_when", "in_", "between", "asc", "desc", "asc_nulls_first", "asc_nulls_last", "desc_nulls_first", "desc_nulls_last", "abs", "ceil", "floor", "round", "array", "array_contains", "array_distinct", "array_except", "array_flatten", "array_intersect", "array_join", "array_position", "array_reverse", "array_slice", "array_sort", "array_union", "arrays_overlap", "cardinality", "element_at", "map_contains_key", "map_entries", "map_extract", "map_from_arrays", "map_keys", "map_values", "named_struct"] def _expected_substrait_mapped_names() -> list[str]: """Return helpers with concrete Substrait extension-function mappings.""" - return ["sum", "count", "avg", "min", "max", "add", "mul", "eq", "gt", "ne", "lt", "lte", "gte", "equal_null", "and_", "or_", "not_", "is_null", "is_not_null", "is_nan", "sub", "div", "mod", "neg", "coalesce", "nullif", "between", "abs", "ceil", "floor", "round"] + return ["sum", "count", "avg", "min", "max", "add", "mul", "eq", "gt", "ne", "lt", "lte", "gte", "equal_null", "and_", "or_", "not_", "is_null", "is_not_null", "is_nan", "sub", "div", "mod", "neg", "coalesce", "nullif", "between", "abs", "ceil", "floor", "round", "array", "array_contains", "array_distinct", "array_except", "array_flatten", "array_intersect", "array_join", "array_position", "array_reverse", "array_slice", "array_sort", "array_union", "arrays_overlap", "cardinality", "element_at", "map_entries", "map_extract", "map_from_arrays", "map_keys", "map_values", "named_struct"] def _exercise_current_public_helpers() -> None: """Touch each current registered helper so runtime registry tests observe loaded modules only.""" amount = col("amount") status = col("status") + tags = array([str_lit("paid"), status]) + backup_tags = array([str_lit("open"), str_lit("paid")]) + attr_map = map_from_arrays(array([str_lit("status")]), array([status])) lit(1) int_expr(1) float_expr(1.5) @@ -248,6 +294,26 @@ def _exercise_current_public_helpers() -> None: ceil(amount) floor(amount) round(amount) + array_contains(tags, str_lit("paid")) + array_distinct(tags) + array_except(tags, backup_tags) + array_flatten(array([tags, backup_tags])) + array_intersect(tags, backup_tags) + array_join(tags, str_lit("|")) + array_position(tags, str_lit("paid")) + array_reverse(tags) + array_slice(tags, int_lit(1), int_lit(2)) + array_sort(tags) + array_union(tags, backup_tags) + arrays_overlap(tags, backup_tags) + cardinality(tags) + element_at(tags, int_lit(1)) + map_contains_key(attr_map, str_lit("status")) + map_entries(attr_map) + map_extract(attr_map, str_lit("status")) + map_keys(attr_map) + map_values(attr_map) + named_struct(["status", "amount"], [status, amount]) return @@ -527,6 +593,27 @@ def test_function_registry__substrait_extension_mappings_are_structured() -> Non _assert_extension_mapping("ceil", "ceil", CEIL_FUNCTION_ANCHOR) _assert_extension_mapping("floor", "floor", FLOOR_FUNCTION_ANCHOR) _assert_extension_mapping("round", "round", ROUND_FUNCTION_ANCHOR) + _assert_extension_mapping("array", "make_array", MAKE_ARRAY_FUNCTION_ANCHOR) + _assert_extension_mapping("array_contains", "array_has", ARRAY_HAS_FUNCTION_ANCHOR) + _assert_extension_mapping("array_distinct", "array_distinct", ARRAY_DISTINCT_FUNCTION_ANCHOR) + _assert_extension_mapping("array_except", "array_except", ARRAY_EXCEPT_FUNCTION_ANCHOR) + _assert_extension_mapping("array_flatten", "flatten", ARRAY_FLATTEN_FUNCTION_ANCHOR) + _assert_extension_mapping("array_intersect", "array_intersect", ARRAY_INTERSECT_FUNCTION_ANCHOR) + _assert_extension_mapping("array_join", "array_to_string", ARRAY_TO_STRING_FUNCTION_ANCHOR) + _assert_extension_mapping("array_position", "array_position", ARRAY_POSITION_FUNCTION_ANCHOR) + _assert_extension_mapping("array_reverse", "array_reverse", ARRAY_REVERSE_FUNCTION_ANCHOR) + _assert_extension_mapping("array_slice", "array_slice", ARRAY_SLICE_FUNCTION_ANCHOR) + _assert_extension_mapping("array_sort", "array_sort", ARRAY_SORT_FUNCTION_ANCHOR) + _assert_extension_mapping("array_union", "array_union", ARRAY_UNION_FUNCTION_ANCHOR) + _assert_extension_mapping("arrays_overlap", "array_has_any", ARRAY_HAS_ANY_FUNCTION_ANCHOR) + _assert_extension_mapping("cardinality", "cardinality", CARDINALITY_FUNCTION_ANCHOR) + _assert_extension_mapping("element_at", "array_element", ARRAY_ELEMENT_FUNCTION_ANCHOR) + _assert_extension_mapping("map_entries", "map_entries", MAP_ENTRIES_FUNCTION_ANCHOR) + _assert_extension_mapping("map_extract", "map_extract", MAP_EXTRACT_FUNCTION_ANCHOR) + _assert_extension_mapping("map_from_arrays", "map", MAP_FUNCTION_ANCHOR) + _assert_extension_mapping("map_keys", "map_keys", MAP_KEYS_FUNCTION_ANCHOR) + _assert_extension_mapping("map_values", "map_values", MAP_VALUES_FUNCTION_ANCHOR) + _assert_extension_mapping("named_struct", "named_struct", NAMED_STRUCT_FUNCTION_ANCHOR) def test_function_registry__ordering_helpers_are_contextual_sort_fields() -> None: @@ -576,6 +663,7 @@ def test_function_registry__rewrite_mappings_identify_non_extension_helpers() -> assert always_true_entry.substrait.kind == SubstraitMappingKind.Rewrite, "always_true should lower as a literal rewrite" assert always_false_entry.substrait.kind == SubstraitMappingKind.Rewrite, "always_false should lower as a literal rewrite" _assert_rewrite_mapping("is_not_nan", "not_(is_nan(expr))") + _assert_rewrite_mapping("map_contains_key", "gt(cardinality(map_extract(map_expr, key)), int_expr(0))") assert always_true_entry.null_behavior == FunctionNullBehavior.Predicate, "predicate helpers should expose predicate null behavior" assert always_false_entry.null_behavior == FunctionNullBehavior.Predicate, "predicate helpers should expose predicate null behavior" diff --git a/tests/test_nested_data_functions.incn b/tests/test_nested_data_functions.incn new file mode 100644 index 0000000..f0b53be --- /dev/null +++ b/tests/test_nested_data_functions.incn @@ -0,0 +1,124 @@ +"""Test: RFC 020 nested scalar helper surface.""" + +from std.testing import assert_raises +from functions import ( + array, + array_contains, + array_distinct, + array_except, + array_flatten, + array_intersect, + array_join, + array_position, + array_reverse, + array_slice, + array_sort, + array_union, + arrays_overlap, + cardinality, + col, + element_at, + int_lit, + map_contains_key, + map_entries, + map_extract, + map_from_arrays, + map_keys, + map_values, + named_struct, + str_lit, +) +from function_registry import function_ref_for +from projection_builders import ( + ColumnExpr, + ColumnExprKind, + column_expr_argument_count, + column_expr_function_name, + column_expr_function_ref, + column_expr_kind, +) + + +def _assert_nested_application(expr: ColumnExpr, expected_name: str, expected_args: int) -> None: + """Assert one helper uses the shared registry-backed scalar application node.""" + assert column_expr_kind(expr) == ColumnExprKind.ScalarFunction, f"{expected_name} should use the scalar function kind" + assert column_expr_function_name(expr) == expected_name, f"{expected_name} should preserve its canonical name" + assert column_expr_function_ref(expr) == function_ref_for(expected_name), "nested helper should preserve its registry function ref" + assert column_expr_argument_count(expr) == expected_args, f"{expected_name} should carry its scalar arguments" + + +def _call_empty_array() -> None: + """Call array with no values for ValueError assertions.""" + array([]) + return + + +def _call_empty_named_struct() -> None: + """Call named_struct with no fields for ValueError assertions.""" + named_struct([], []) + return + + +def _call_mismatched_named_struct() -> None: + """Call named_struct with mismatched fields and values for ValueError assertions.""" + named_struct(["status"], []) + return + + +def _call_empty_named_struct_field() -> None: + """Call named_struct with an empty field name for ValueError assertions.""" + named_struct([""], [str_lit("paid")]) + return + + +def test_nested_data_functions__array_helpers_share_scalar_application_node() -> None: + """Assert array helpers use the shared registry-backed scalar expression model.""" + # -- Arrange -- + tags = array([str_lit("paid"), col("status")]) + other_tags = array([str_lit("open"), str_lit("paid")]) + + # -- Act / Assert -- + _assert_nested_application(tags, "array", 2) + _assert_nested_application(cardinality(tags), "cardinality", 1) + _assert_nested_application(array_contains(tags, str_lit("paid")), "array_contains", 2) + _assert_nested_application(array_position(tags, str_lit("paid")), "array_position", 2) + _assert_nested_application(element_at(tags, int_lit(1)), "element_at", 2) + _assert_nested_application(array_sort(tags), "array_sort", 1) + _assert_nested_application(array_distinct(tags), "array_distinct", 1) + _assert_nested_application(array_except(tags, other_tags), "array_except", 2) + _assert_nested_application(array_flatten(array([tags, other_tags])), "array_flatten", 1) + _assert_nested_application(array_intersect(tags, other_tags), "array_intersect", 2) + _assert_nested_application(array_union(tags, other_tags), "array_union", 2) + _assert_nested_application(arrays_overlap(tags, other_tags), "arrays_overlap", 2) + _assert_nested_application(array_join(tags, str_lit("|")), "array_join", 2) + _assert_nested_application(array_slice(tags, int_lit(1), int_lit(2)), "array_slice", 3) + _assert_nested_application(array_reverse(tags), "array_reverse", 1) + + +def test_nested_data_functions__map_and_struct_helpers_share_scalar_application_node() -> None: + """Assert map and struct helpers use scalar expressions rather than relation-shaping nodes.""" + # -- Arrange -- + keys = array([str_lit("status")]) + values = array([col("status")]) + attr_map = map_from_arrays(keys, values) + contains_key = map_contains_key(attr_map, str_lit("status")) + + # -- Act / Assert -- + _assert_nested_application(attr_map, "map_from_arrays", 2) + _assert_nested_application(map_entries(attr_map), "map_entries", 1) + _assert_nested_application(map_extract(attr_map, str_lit("status")), "map_extract", 2) + _assert_nested_application(map_keys(attr_map), "map_keys", 1) + _assert_nested_application(map_values(attr_map), "map_values", 1) + _assert_nested_application(named_struct(["status", "amount"], [col("status"), col("amount")]), "named_struct", 4) + assert column_expr_kind(contains_key) == ColumnExprKind.ScalarFunction, "map_contains_key rewrite should still be scalar" + assert column_expr_function_name(contains_key) == "gt", "map_contains_key should lower through its documented predicate rewrite" + assert column_expr_function_ref(contains_key) == function_ref_for("gt"), "rewrite should use the registered greater-than helper" + + +def test_nested_data_functions__constructor_shape_errors_raise_value_error() -> None: + """Assert nested constructors reject shapes that cannot produce typed scalar values.""" + # -- Arrange / Act / Assert -- + assert_raises[ValueError](_call_empty_array) + assert_raises[ValueError](_call_empty_named_struct) + assert_raises[ValueError](_call_mismatched_named_struct) + assert_raises[ValueError](_call_empty_named_struct_field) diff --git a/tests/test_session_projection.incn b/tests/test_session_projection.incn index 0c2777e..fb6207e 100644 --- a/tests/test_session_projection.incn +++ b/tests/test_session_projection.incn @@ -3,6 +3,11 @@ from functions import ( abs, add, + array, + array_contains, + array_distinct, + array_join, + array_position, case_when, cast, ceil, @@ -20,6 +25,8 @@ from functions import ( round, sub, try_cast, + cardinality, + element_at, ) from dataset import DataFrame, LazyFrame from session import Session, SessionErrorKind @@ -185,6 +192,41 @@ def test_session_projection__collect_executes_common_math_scalar_projection_func assert payload.contains("3"), "round projection should include round(10 / 4.0)" +def test_session_projection__collect_executes_nested_scalar_projection_functions() -> None: + """collect should execute RFC 020 nested scalar helpers through DataFusion.""" + # -- Arrange -- + mut session = Session.default() + + # -- Act -- + lazy: LazyFrame[AggregateOrder] = assert_is_ok( + session.read_csv("aggregate_orders", AGGREGATE_ORDERS_CSV_FIXTURE), + "aggregate orders fixture should load", + ) + tags = array([lit("paid"), col("customer_id"), lit("paid")]) + with_count = lazy.with_column("tag_count", cardinality(tags)) + with_contains = with_count.with_column("has_paid", array_contains(tags, lit("paid"))) + with_first = with_contains.with_column("first_tag", element_at(tags, lit(1))) + with_position = with_first.with_column("paid_position", array_position(tags, lit("paid"))) + projected = with_position.with_column("joined_tags", array_join(array_distinct(tags), lit("|"))) + df = _collect_or_fail(session, projected) + payload = df.preview_text() + resolved = df.resolved_columns() + + # -- Assert -- + assert df.row_count() == 3, "nested scalar projections should preserve the input rows" + assert len(resolved) == 7, "projection should expose all appended nested outputs" + assert payload.contains("tag_count"), "cardinality projection should materialize its alias" + assert payload.contains("has_paid"), "array_contains projection should materialize its alias" + assert payload.contains("first_tag"), "element_at projection should materialize its alias" + assert payload.contains("paid_position"), "array_position projection should materialize its alias" + assert payload.contains("joined_tags"), "array_join projection should materialize its alias" + assert payload.contains("3"), "cardinality should report three input array elements" + assert payload.contains("true"), "array_contains should find the paid tag" + assert payload.contains("paid"), "element_at should return the first tag" + assert payload.contains("1"), "array_position should use one-based positions" + assert payload.contains("paid|A"), "array_join should materialize distinct string tags" + + def test_session_projection__collect_executes_identity_select() -> None: # -- Arrange -- mut session = Session.default() diff --git a/tests/test_substrait_plan.incn b/tests/test_substrait_plan.incn index f0b6c9e..3bd3b3b 100644 --- a/tests/test_substrait_plan.incn +++ b/tests/test_substrait_plan.incn @@ -6,6 +6,19 @@ from functions import ( add, always_true, and_, + array, + array_contains, + array_distinct, + array_except, + array_flatten, + array_intersect, + array_join, + array_position, + array_reverse, + array_slice, + array_sort, + array_union, + arrays_overlap, asc, asc_nulls_last, avg, @@ -33,12 +46,19 @@ from functions import ( lit, lt, lte, + map_contains_key, + map_entries, + map_extract, + map_from_arrays, + map_keys, + map_values, max, min, modulo, mul, ne, neg, + named_struct, not_, nullif, or_, @@ -46,6 +66,8 @@ from functions import ( sub, sum, try_cast, + cardinality, + element_at, ) from projection_builders import ColumnExpr, with_column_assignment from substrait.errors import SubstraitLoweringErrorKind @@ -373,6 +395,38 @@ def test_plan__core_scalar_extension_mappings_lower_to_substrait() -> None: _assert_scalar_expr_lowers(round(div(col("amount"), lit(4.0)))) +def test_plan__nested_scalar_extension_mappings_lower_to_substrait() -> None: + """Assert RFC 020 nested scalar helpers emit Substrait scalar functions.""" + # -- Arrange -- + tags = array([lit("paid"), lit("open"), col("status")]) + other_tags = array([lit("paid"), lit("closed")]) + attr_map = map_from_arrays(array([lit("status")]), array([col("status")])) + + # -- Act / Assert -- + _assert_scalar_expr_lowers(tags) + _assert_scalar_expr_lowers(cardinality(tags)) + _assert_scalar_expr_lowers(array_contains(tags, lit("paid"))) + _assert_scalar_expr_lowers(array_position(tags, lit("paid"))) + _assert_scalar_expr_lowers(element_at(tags, lit(1))) + _assert_scalar_expr_lowers(array_sort(tags)) + _assert_scalar_expr_lowers(array_distinct(tags)) + _assert_scalar_expr_lowers(array_except(tags, other_tags)) + _assert_scalar_expr_lowers(array_flatten(array([tags, other_tags]))) + _assert_scalar_expr_lowers(array_intersect(tags, other_tags)) + _assert_scalar_expr_lowers(array_union(tags, other_tags)) + _assert_scalar_expr_lowers(arrays_overlap(tags, other_tags)) + _assert_scalar_expr_lowers(array_join(tags, lit("|"))) + _assert_scalar_expr_lowers(array_slice(tags, lit(1), lit(2))) + _assert_scalar_expr_lowers(array_reverse(tags)) + _assert_scalar_expr_lowers(attr_map) + _assert_scalar_expr_lowers(map_entries(attr_map)) + _assert_scalar_expr_lowers(map_extract(attr_map, lit("status"))) + _assert_scalar_expr_lowers(map_keys(attr_map)) + _assert_scalar_expr_lowers(map_values(attr_map)) + _assert_scalar_expr_lowers(map_contains_key(attr_map, lit("status"))) + _assert_scalar_expr_lowers(named_struct(["status", "amount"], [col("status"), col("amount")])) + + def test_plan__aggregate_rel_surfaces_group_and_measure_output_columns() -> None: # -- Arrange -- _register_orders_schema() From 6d2edd946764b216da84c4566d2367ead4a12d4c Mon Sep 17 00:00:00 2001 From: Danny Meijer Date: Wed, 27 May 2026 16:03:23 +0200 Subject: [PATCH 2/4] fix - 37 align nested function registry metadata --- scripts/check_function_registry_metadata.incn | 54 ++++++++++++++----- src/functions/nested/array.incn | 12 ++--- src/functions/nested/array_contains.incn | 12 ++--- src/functions/nested/array_distinct.incn | 12 ++--- src/functions/nested/array_except.incn | 12 ++--- src/functions/nested/array_flatten.incn | 12 ++--- src/functions/nested/array_intersect.incn | 12 ++--- src/functions/nested/array_join.incn | 12 ++--- src/functions/nested/array_position.incn | 12 ++--- src/functions/nested/array_reverse.incn | 12 ++--- src/functions/nested/array_slice.incn | 12 ++--- src/functions/nested/array_sort.incn | 12 ++--- src/functions/nested/array_union.incn | 12 ++--- src/functions/nested/arrays_overlap.incn | 12 ++--- src/functions/nested/cardinality.incn | 12 ++--- src/functions/nested/element_at.incn | 12 ++--- src/functions/nested/map_contains_key.incn | 12 ++--- src/functions/nested/map_entries.incn | 12 ++--- src/functions/nested/map_extract.incn | 12 ++--- src/functions/nested/map_from_arrays.incn | 12 ++--- src/functions/nested/map_keys.incn | 12 ++--- src/functions/nested/map_values.incn | 12 ++--- src/functions/nested/named_struct.incn | 12 ++--- 23 files changed, 174 insertions(+), 144 deletions(-) diff --git a/scripts/check_function_registry_metadata.incn b/scripts/check_function_registry_metadata.incn index 1cf9918..f9bd80b 100644 --- a/scripts/check_function_registry_metadata.incn +++ b/scripts/check_function_registry_metadata.incn @@ -5,7 +5,7 @@ from std.json import JsonValue from std.testing import fail_t const METADATA_PATH: str = "target/function_registry_api_metadata.json" -const REGISTRY_DECORATOR: str = "function_registry.add" +const REGISTRY_DECORATOR: str = "register_function" def metadata_fail[T](message: str) -> T: @@ -207,21 +207,51 @@ def decorator_source_matches(decorator: JsonValue) -> bool: None => return false -def decorator_canonical_name(decorator: JsonValue, context: str) -> str: - """Return the canonical name from a registry decorator.""" +def registry_spec_args(decorator: JsonValue, context: str) -> list[JsonValue]: + """Return the named metadata arguments passed to one registry spec call.""" args = require_array(require_field(decorator, "args", context), f"{context}.decorator.args") - if len(args) != 2: - return metadata_fail[str](f"{context}: registry decorator must have canonical name and typed metadata args") + if len(args) != 1: + return metadata_fail[list[JsonValue]](f"{context}: registry decorator must have one typed metadata arg") if require_string_field(args[0], "kind", context) != "positional": - return metadata_fail[str](f"{context}: registry decorator canonical name must be positional") + return metadata_fail[list[JsonValue]](f"{context}: registry decorator metadata must be positional") value = require_field(args[0], "value", context) + if require_string_field(value, "kind", context) != "call": + return metadata_fail[list[JsonValue]](f"{context}: registry decorator metadata must be a spec call") + return require_array(require_field(value, "args", context), f"{context}.decorator.spec.args") + + +def named_arg_value(args: list[JsonValue], name: str, context: str) -> Option[JsonValue]: + """Return one named call argument value when present.""" + for arg in args: + if require_string_field(arg, "kind", context) == "named" and require_string_field(arg, "name", context) == name: + return Some(require_field(arg, "value", context)) + return None + + +def canonical_name_override(args: list[JsonValue], context: str) -> Option[str]: + """Return the explicit canonical-name override from decorator metadata.""" + value = match named_arg_value(args, "canonical_name", context): + Some(item) => item + None => return None + if require_string_field(value, "kind", context) != "literal": - return metadata_fail[str](f"{context}: registry decorator canonical name must be a literal string") + return metadata_fail[Option[str]](f"{context}.canonical_name: expected string or None literal") + literal = require_field(value, "value", context) - if require_string_field(literal, "kind", context) != "string": - return metadata_fail[str](f"{context}: registry decorator canonical name must be a string") - return require_string_field(literal, "value", context) + literal_kind = require_string_field(literal, "kind", context) + if literal_kind == "string": + return Some(require_string_field(literal, "value", context)) + if literal_kind == "none": + return None + return metadata_fail[Option[str]](f"{context}.canonical_name: expected string or None literal") + + +def decorator_canonical_name(decorator: JsonValue, source_function: JsonValue, context: str) -> str: + """Return the canonical registry name for one declaration-side decorator.""" + match canonical_name_override(registry_spec_args(decorator, context), context): + Some(name) => return name + None => return require_string_field(source_function, "name", context) def projected_function(alias: JsonValue, alias_name: str) -> JsonValue: @@ -296,7 +326,7 @@ def check_registered_helper(alias_name: str, alias: JsonValue, source_function: f"{alias_name}: expected exactly one registry decorator, found {len(decorators)}", ) - canonical_name = decorator_canonical_name(decorators[0], alias_name) + canonical_name = decorator_canonical_name(decorators[0], source_function, alias_name) assert_decorated_callable_matches_source(decorators[0], source_function, canonical_name) match optional_string_field(source_function, "docstring"): @@ -318,7 +348,7 @@ def check_unmatched_decorators(package_value: JsonValue, seen_names: dict[str, b function = functions[function_name] decorators = registry_decorators(function) for decorator in decorators: - canonical_name = decorator_canonical_name(decorator, function_name) + canonical_name = decorator_canonical_name(decorator, function, function_name) if not has_key(seen_names, canonical_name): return metadata_fail[None]( f"{canonical_name}: registry decorator is not exposed through the public functions facade", diff --git a/src/functions/nested/array.incn b/src/functions/nested/array.incn index 3c681e7..885a33c 100644 --- a/src/functions/nested/array.incn +++ b/src/functions/nested/array.incn @@ -13,16 +13,16 @@ from function_registry import ( v0_1, ) from functions.nested.common import nested_application, require_non_empty_args -from functions.registry import function_registry +from functions.registry import register_function from projection_builders import ColumnExpr from substrait.function_extensions import MAKE_ARRAY_FUNCTION_ANCHOR -@function_registry.add("array", deterministic_spec( - FunctionClass.Scalar, - FunctionLifecycle(since=v0_1, changed=[], deprecated=None), - FunctionNullBehavior.DependsOnInputs, - extension_mapping("make_array", MAKE_ARRAY_FUNCTION_ANCHOR), +@register_function(deterministic_spec( + function_class=FunctionClass.Scalar, + lifecycle=FunctionLifecycle(since=v0_1, changed=[], deprecated=None), + null_behavior=FunctionNullBehavior.DependsOnInputs, + substrait=extension_mapping("make_array", MAKE_ARRAY_FUNCTION_ANCHOR), )) pub def array(values: list[ColumnExpr]) -> ColumnExpr: """ diff --git a/src/functions/nested/array_contains.incn b/src/functions/nested/array_contains.incn index fc0f480..5996bf7 100644 --- a/src/functions/nested/array_contains.incn +++ b/src/functions/nested/array_contains.incn @@ -9,16 +9,16 @@ from function_registry import ( v0_1, ) from functions.nested.common import nested_application -from functions.registry import function_registry +from functions.registry import register_function from projection_builders import ColumnExpr from substrait.function_extensions import ARRAY_HAS_FUNCTION_ANCHOR -@function_registry.add("array_contains", deterministic_spec( - FunctionClass.Scalar, - FunctionLifecycle(since=v0_1, changed=[], deprecated=None), - FunctionNullBehavior.DependsOnInputs, - extension_mapping("array_has", ARRAY_HAS_FUNCTION_ANCHOR), +@register_function(deterministic_spec( + function_class=FunctionClass.Scalar, + lifecycle=FunctionLifecycle(since=v0_1, changed=[], deprecated=None), + null_behavior=FunctionNullBehavior.DependsOnInputs, + substrait=extension_mapping("array_has", ARRAY_HAS_FUNCTION_ANCHOR), )) pub def array_contains(array_expr: ColumnExpr, value: ColumnExpr) -> ColumnExpr: """ diff --git a/src/functions/nested/array_distinct.incn b/src/functions/nested/array_distinct.incn index 2464b9d..5503680 100644 --- a/src/functions/nested/array_distinct.incn +++ b/src/functions/nested/array_distinct.incn @@ -9,16 +9,16 @@ from function_registry import ( v0_1, ) from functions.nested.common import nested_application -from functions.registry import function_registry +from functions.registry import register_function from projection_builders import ColumnExpr from substrait.function_extensions import ARRAY_DISTINCT_FUNCTION_ANCHOR -@function_registry.add("array_distinct", deterministic_spec( - FunctionClass.Scalar, - FunctionLifecycle(since=v0_1, changed=[], deprecated=None), - FunctionNullBehavior.DependsOnInputs, - extension_mapping("array_distinct", ARRAY_DISTINCT_FUNCTION_ANCHOR), +@register_function(deterministic_spec( + function_class=FunctionClass.Scalar, + lifecycle=FunctionLifecycle(since=v0_1, changed=[], deprecated=None), + null_behavior=FunctionNullBehavior.DependsOnInputs, + substrait=extension_mapping("array_distinct", ARRAY_DISTINCT_FUNCTION_ANCHOR), )) pub def array_distinct(array_expr: ColumnExpr) -> ColumnExpr: """ diff --git a/src/functions/nested/array_except.incn b/src/functions/nested/array_except.incn index 10a2caf..e2e86e2 100644 --- a/src/functions/nested/array_except.incn +++ b/src/functions/nested/array_except.incn @@ -9,16 +9,16 @@ from function_registry import ( v0_1, ) from functions.nested.common import nested_application -from functions.registry import function_registry +from functions.registry import register_function from projection_builders import ColumnExpr from substrait.function_extensions import ARRAY_EXCEPT_FUNCTION_ANCHOR -@function_registry.add("array_except", deterministic_spec( - FunctionClass.Scalar, - FunctionLifecycle(since=v0_1, changed=[], deprecated=None), - FunctionNullBehavior.DependsOnInputs, - extension_mapping("array_except", ARRAY_EXCEPT_FUNCTION_ANCHOR), +@register_function(deterministic_spec( + function_class=FunctionClass.Scalar, + lifecycle=FunctionLifecycle(since=v0_1, changed=[], deprecated=None), + null_behavior=FunctionNullBehavior.DependsOnInputs, + substrait=extension_mapping("array_except", ARRAY_EXCEPT_FUNCTION_ANCHOR), )) pub def array_except(left: ColumnExpr, right: ColumnExpr) -> ColumnExpr: """ diff --git a/src/functions/nested/array_flatten.incn b/src/functions/nested/array_flatten.incn index 8403ee1..5545495 100644 --- a/src/functions/nested/array_flatten.incn +++ b/src/functions/nested/array_flatten.incn @@ -13,16 +13,16 @@ from function_registry import ( v0_1, ) from functions.nested.common import nested_application -from functions.registry import function_registry +from functions.registry import register_function from projection_builders import ColumnExpr from substrait.function_extensions import ARRAY_FLATTEN_FUNCTION_ANCHOR -@function_registry.add("array_flatten", deterministic_spec( - FunctionClass.Scalar, - FunctionLifecycle(since=v0_1, changed=[], deprecated=None), - FunctionNullBehavior.DependsOnInputs, - extension_mapping("flatten", ARRAY_FLATTEN_FUNCTION_ANCHOR), +@register_function(deterministic_spec( + function_class=FunctionClass.Scalar, + lifecycle=FunctionLifecycle(since=v0_1, changed=[], deprecated=None), + null_behavior=FunctionNullBehavior.DependsOnInputs, + substrait=extension_mapping("flatten", ARRAY_FLATTEN_FUNCTION_ANCHOR), )) pub def array_flatten(array_expr: ColumnExpr) -> ColumnExpr: """ diff --git a/src/functions/nested/array_intersect.incn b/src/functions/nested/array_intersect.incn index a457da4..3ead98e 100644 --- a/src/functions/nested/array_intersect.incn +++ b/src/functions/nested/array_intersect.incn @@ -9,16 +9,16 @@ from function_registry import ( v0_1, ) from functions.nested.common import nested_application -from functions.registry import function_registry +from functions.registry import register_function from projection_builders import ColumnExpr from substrait.function_extensions import ARRAY_INTERSECT_FUNCTION_ANCHOR -@function_registry.add("array_intersect", deterministic_spec( - FunctionClass.Scalar, - FunctionLifecycle(since=v0_1, changed=[], deprecated=None), - FunctionNullBehavior.DependsOnInputs, - extension_mapping("array_intersect", ARRAY_INTERSECT_FUNCTION_ANCHOR), +@register_function(deterministic_spec( + function_class=FunctionClass.Scalar, + lifecycle=FunctionLifecycle(since=v0_1, changed=[], deprecated=None), + null_behavior=FunctionNullBehavior.DependsOnInputs, + substrait=extension_mapping("array_intersect", ARRAY_INTERSECT_FUNCTION_ANCHOR), )) pub def array_intersect(left: ColumnExpr, right: ColumnExpr) -> ColumnExpr: """ diff --git a/src/functions/nested/array_join.incn b/src/functions/nested/array_join.incn index 600a1d9..39918f7 100644 --- a/src/functions/nested/array_join.incn +++ b/src/functions/nested/array_join.incn @@ -9,16 +9,16 @@ from function_registry import ( v0_1, ) from functions.nested.common import nested_application -from functions.registry import function_registry +from functions.registry import register_function from projection_builders import ColumnExpr from substrait.function_extensions import ARRAY_TO_STRING_FUNCTION_ANCHOR -@function_registry.add("array_join", deterministic_spec( - FunctionClass.Scalar, - FunctionLifecycle(since=v0_1, changed=[], deprecated=None), - FunctionNullBehavior.DependsOnInputs, - extension_mapping("array_to_string", ARRAY_TO_STRING_FUNCTION_ANCHOR), +@register_function(deterministic_spec( + function_class=FunctionClass.Scalar, + lifecycle=FunctionLifecycle(since=v0_1, changed=[], deprecated=None), + null_behavior=FunctionNullBehavior.DependsOnInputs, + substrait=extension_mapping("array_to_string", ARRAY_TO_STRING_FUNCTION_ANCHOR), )) pub def array_join(array_expr: ColumnExpr, delimiter: ColumnExpr) -> ColumnExpr: """ diff --git a/src/functions/nested/array_position.incn b/src/functions/nested/array_position.incn index e443508..97e639e 100644 --- a/src/functions/nested/array_position.incn +++ b/src/functions/nested/array_position.incn @@ -9,16 +9,16 @@ from function_registry import ( v0_1, ) from functions.nested.common import nested_application -from functions.registry import function_registry +from functions.registry import register_function from projection_builders import ColumnExpr from substrait.function_extensions import ARRAY_POSITION_FUNCTION_ANCHOR -@function_registry.add("array_position", deterministic_spec( - FunctionClass.Scalar, - FunctionLifecycle(since=v0_1, changed=[], deprecated=None), - FunctionNullBehavior.DependsOnInputs, - extension_mapping("array_position", ARRAY_POSITION_FUNCTION_ANCHOR), +@register_function(deterministic_spec( + function_class=FunctionClass.Scalar, + lifecycle=FunctionLifecycle(since=v0_1, changed=[], deprecated=None), + null_behavior=FunctionNullBehavior.DependsOnInputs, + substrait=extension_mapping("array_position", ARRAY_POSITION_FUNCTION_ANCHOR), )) pub def array_position(array_expr: ColumnExpr, value: ColumnExpr) -> ColumnExpr: """ diff --git a/src/functions/nested/array_reverse.incn b/src/functions/nested/array_reverse.incn index 8e4c13c..25994bc 100644 --- a/src/functions/nested/array_reverse.incn +++ b/src/functions/nested/array_reverse.incn @@ -9,16 +9,16 @@ from function_registry import ( v0_1, ) from functions.nested.common import nested_application -from functions.registry import function_registry +from functions.registry import register_function from projection_builders import ColumnExpr from substrait.function_extensions import ARRAY_REVERSE_FUNCTION_ANCHOR -@function_registry.add("array_reverse", deterministic_spec( - FunctionClass.Scalar, - FunctionLifecycle(since=v0_1, changed=[], deprecated=None), - FunctionNullBehavior.DependsOnInputs, - extension_mapping("array_reverse", ARRAY_REVERSE_FUNCTION_ANCHOR), +@register_function(deterministic_spec( + function_class=FunctionClass.Scalar, + lifecycle=FunctionLifecycle(since=v0_1, changed=[], deprecated=None), + null_behavior=FunctionNullBehavior.DependsOnInputs, + substrait=extension_mapping("array_reverse", ARRAY_REVERSE_FUNCTION_ANCHOR), )) pub def array_reverse(array_expr: ColumnExpr) -> ColumnExpr: """ diff --git a/src/functions/nested/array_slice.incn b/src/functions/nested/array_slice.incn index 08aa946..82f14e6 100644 --- a/src/functions/nested/array_slice.incn +++ b/src/functions/nested/array_slice.incn @@ -9,16 +9,16 @@ from function_registry import ( v0_1, ) from functions.nested.common import nested_application -from functions.registry import function_registry +from functions.registry import register_function from projection_builders import ColumnExpr from substrait.function_extensions import ARRAY_SLICE_FUNCTION_ANCHOR -@function_registry.add("array_slice", deterministic_spec( - FunctionClass.Scalar, - FunctionLifecycle(since=v0_1, changed=[], deprecated=None), - FunctionNullBehavior.DependsOnInputs, - extension_mapping("array_slice", ARRAY_SLICE_FUNCTION_ANCHOR), +@register_function(deterministic_spec( + function_class=FunctionClass.Scalar, + lifecycle=FunctionLifecycle(since=v0_1, changed=[], deprecated=None), + null_behavior=FunctionNullBehavior.DependsOnInputs, + substrait=extension_mapping("array_slice", ARRAY_SLICE_FUNCTION_ANCHOR), )) pub def array_slice(array_expr: ColumnExpr, start: ColumnExpr, stop: ColumnExpr) -> ColumnExpr: """ diff --git a/src/functions/nested/array_sort.incn b/src/functions/nested/array_sort.incn index 73c7f75..335b9ae 100644 --- a/src/functions/nested/array_sort.incn +++ b/src/functions/nested/array_sort.incn @@ -9,16 +9,16 @@ from function_registry import ( v0_1, ) from functions.nested.common import nested_application -from functions.registry import function_registry +from functions.registry import register_function from projection_builders import ColumnExpr from substrait.function_extensions import ARRAY_SORT_FUNCTION_ANCHOR -@function_registry.add("array_sort", deterministic_spec( - FunctionClass.Scalar, - FunctionLifecycle(since=v0_1, changed=[], deprecated=None), - FunctionNullBehavior.DependsOnInputs, - extension_mapping("array_sort", ARRAY_SORT_FUNCTION_ANCHOR), +@register_function(deterministic_spec( + function_class=FunctionClass.Scalar, + lifecycle=FunctionLifecycle(since=v0_1, changed=[], deprecated=None), + null_behavior=FunctionNullBehavior.DependsOnInputs, + substrait=extension_mapping("array_sort", ARRAY_SORT_FUNCTION_ANCHOR), )) pub def array_sort(array_expr: ColumnExpr) -> ColumnExpr: """ diff --git a/src/functions/nested/array_union.incn b/src/functions/nested/array_union.incn index 6aaba15..be73cd4 100644 --- a/src/functions/nested/array_union.incn +++ b/src/functions/nested/array_union.incn @@ -9,16 +9,16 @@ from function_registry import ( v0_1, ) from functions.nested.common import nested_application -from functions.registry import function_registry +from functions.registry import register_function from projection_builders import ColumnExpr from substrait.function_extensions import ARRAY_UNION_FUNCTION_ANCHOR -@function_registry.add("array_union", deterministic_spec( - FunctionClass.Scalar, - FunctionLifecycle(since=v0_1, changed=[], deprecated=None), - FunctionNullBehavior.DependsOnInputs, - extension_mapping("array_union", ARRAY_UNION_FUNCTION_ANCHOR), +@register_function(deterministic_spec( + function_class=FunctionClass.Scalar, + lifecycle=FunctionLifecycle(since=v0_1, changed=[], deprecated=None), + null_behavior=FunctionNullBehavior.DependsOnInputs, + substrait=extension_mapping("array_union", ARRAY_UNION_FUNCTION_ANCHOR), )) pub def array_union(left: ColumnExpr, right: ColumnExpr) -> ColumnExpr: """ diff --git a/src/functions/nested/arrays_overlap.incn b/src/functions/nested/arrays_overlap.incn index cedebed..c06d128 100644 --- a/src/functions/nested/arrays_overlap.incn +++ b/src/functions/nested/arrays_overlap.incn @@ -9,16 +9,16 @@ from function_registry import ( v0_1, ) from functions.nested.common import nested_application -from functions.registry import function_registry +from functions.registry import register_function from projection_builders import ColumnExpr from substrait.function_extensions import ARRAY_HAS_ANY_FUNCTION_ANCHOR -@function_registry.add("arrays_overlap", deterministic_spec( - FunctionClass.Scalar, - FunctionLifecycle(since=v0_1, changed=[], deprecated=None), - FunctionNullBehavior.DependsOnInputs, - extension_mapping("array_has_any", ARRAY_HAS_ANY_FUNCTION_ANCHOR), +@register_function(deterministic_spec( + function_class=FunctionClass.Scalar, + lifecycle=FunctionLifecycle(since=v0_1, changed=[], deprecated=None), + null_behavior=FunctionNullBehavior.DependsOnInputs, + substrait=extension_mapping("array_has_any", ARRAY_HAS_ANY_FUNCTION_ANCHOR), )) pub def arrays_overlap(left: ColumnExpr, right: ColumnExpr) -> ColumnExpr: """ diff --git a/src/functions/nested/cardinality.incn b/src/functions/nested/cardinality.incn index 9049ccc..81b4c31 100644 --- a/src/functions/nested/cardinality.incn +++ b/src/functions/nested/cardinality.incn @@ -9,16 +9,16 @@ from function_registry import ( v0_1, ) from functions.nested.common import nested_application -from functions.registry import function_registry +from functions.registry import register_function from projection_builders import ColumnExpr from substrait.function_extensions import CARDINALITY_FUNCTION_ANCHOR -@function_registry.add("cardinality", deterministic_spec( - FunctionClass.Scalar, - FunctionLifecycle(since=v0_1, changed=[], deprecated=None), - FunctionNullBehavior.DependsOnInputs, - extension_mapping("cardinality", CARDINALITY_FUNCTION_ANCHOR), +@register_function(deterministic_spec( + function_class=FunctionClass.Scalar, + lifecycle=FunctionLifecycle(since=v0_1, changed=[], deprecated=None), + null_behavior=FunctionNullBehavior.DependsOnInputs, + substrait=extension_mapping("cardinality", CARDINALITY_FUNCTION_ANCHOR), )) pub def cardinality(value: ColumnExpr) -> ColumnExpr: """ diff --git a/src/functions/nested/element_at.incn b/src/functions/nested/element_at.incn index e726baa..c6f514f 100644 --- a/src/functions/nested/element_at.incn +++ b/src/functions/nested/element_at.incn @@ -9,16 +9,16 @@ from function_registry import ( v0_1, ) from functions.nested.common import nested_application -from functions.registry import function_registry +from functions.registry import register_function from projection_builders import ColumnExpr from substrait.function_extensions import ARRAY_ELEMENT_FUNCTION_ANCHOR -@function_registry.add("element_at", deterministic_spec( - FunctionClass.Scalar, - FunctionLifecycle(since=v0_1, changed=[], deprecated=None), - FunctionNullBehavior.DependsOnInputs, - extension_mapping("array_element", ARRAY_ELEMENT_FUNCTION_ANCHOR), +@register_function(deterministic_spec( + function_class=FunctionClass.Scalar, + lifecycle=FunctionLifecycle(since=v0_1, changed=[], deprecated=None), + null_behavior=FunctionNullBehavior.DependsOnInputs, + substrait=extension_mapping("array_element", ARRAY_ELEMENT_FUNCTION_ANCHOR), )) pub def element_at(array_expr: ColumnExpr, index: ColumnExpr) -> ColumnExpr: """ diff --git a/src/functions/nested/map_contains_key.incn b/src/functions/nested/map_contains_key.incn index 8d0d02e..4389cdb 100644 --- a/src/functions/nested/map_contains_key.incn +++ b/src/functions/nested/map_contains_key.incn @@ -11,15 +11,15 @@ from function_registry import ( from functions.nested.cardinality import cardinality from functions.nested.map_extract import map_extract from functions.operators.gt import gt -from functions.registry import function_registry +from functions.registry import register_function from projection_builders import ColumnExpr, int_expr -@function_registry.add("map_contains_key", deterministic_spec( - FunctionClass.Scalar, - FunctionLifecycle(since=v0_1, changed=[], deprecated=None), - FunctionNullBehavior.DependsOnInputs, - rewrite_mapping("gt(cardinality(map_extract(map_expr, key)), int_expr(0))"), +@register_function(deterministic_spec( + function_class=FunctionClass.Scalar, + lifecycle=FunctionLifecycle(since=v0_1, changed=[], deprecated=None), + null_behavior=FunctionNullBehavior.DependsOnInputs, + substrait=rewrite_mapping("gt(cardinality(map_extract(map_expr, key)), int_expr(0))"), )) pub def map_contains_key(map_expr: ColumnExpr, key: ColumnExpr) -> ColumnExpr: """ diff --git a/src/functions/nested/map_entries.incn b/src/functions/nested/map_entries.incn index 49c12ff..75d063a 100644 --- a/src/functions/nested/map_entries.incn +++ b/src/functions/nested/map_entries.incn @@ -9,16 +9,16 @@ from function_registry import ( v0_1, ) from functions.nested.common import nested_application -from functions.registry import function_registry +from functions.registry import register_function from projection_builders import ColumnExpr from substrait.function_extensions import MAP_ENTRIES_FUNCTION_ANCHOR -@function_registry.add("map_entries", deterministic_spec( - FunctionClass.Scalar, - FunctionLifecycle(since=v0_1, changed=[], deprecated=None), - FunctionNullBehavior.DependsOnInputs, - extension_mapping("map_entries", MAP_ENTRIES_FUNCTION_ANCHOR), +@register_function(deterministic_spec( + function_class=FunctionClass.Scalar, + lifecycle=FunctionLifecycle(since=v0_1, changed=[], deprecated=None), + null_behavior=FunctionNullBehavior.DependsOnInputs, + substrait=extension_mapping("map_entries", MAP_ENTRIES_FUNCTION_ANCHOR), )) pub def map_entries(map_expr: ColumnExpr) -> ColumnExpr: """ diff --git a/src/functions/nested/map_extract.incn b/src/functions/nested/map_extract.incn index 30e4e7b..ef115ff 100644 --- a/src/functions/nested/map_extract.incn +++ b/src/functions/nested/map_extract.incn @@ -9,16 +9,16 @@ from function_registry import ( v0_1, ) from functions.nested.common import nested_application -from functions.registry import function_registry +from functions.registry import register_function from projection_builders import ColumnExpr from substrait.function_extensions import MAP_EXTRACT_FUNCTION_ANCHOR -@function_registry.add("map_extract", deterministic_spec( - FunctionClass.Scalar, - FunctionLifecycle(since=v0_1, changed=[], deprecated=None), - FunctionNullBehavior.DependsOnInputs, - extension_mapping("map_extract", MAP_EXTRACT_FUNCTION_ANCHOR), +@register_function(deterministic_spec( + function_class=FunctionClass.Scalar, + lifecycle=FunctionLifecycle(since=v0_1, changed=[], deprecated=None), + null_behavior=FunctionNullBehavior.DependsOnInputs, + substrait=extension_mapping("map_extract", MAP_EXTRACT_FUNCTION_ANCHOR), )) pub def map_extract(map_expr: ColumnExpr, key: ColumnExpr) -> ColumnExpr: """ diff --git a/src/functions/nested/map_from_arrays.incn b/src/functions/nested/map_from_arrays.incn index 1c94165..edb3edb 100644 --- a/src/functions/nested/map_from_arrays.incn +++ b/src/functions/nested/map_from_arrays.incn @@ -9,16 +9,16 @@ from function_registry import ( v0_1, ) from functions.nested.common import nested_application -from functions.registry import function_registry +from functions.registry import register_function from projection_builders import ColumnExpr from substrait.function_extensions import MAP_FUNCTION_ANCHOR -@function_registry.add("map_from_arrays", deterministic_spec( - FunctionClass.Scalar, - FunctionLifecycle(since=v0_1, changed=[], deprecated=None), - FunctionNullBehavior.DependsOnInputs, - extension_mapping("map", MAP_FUNCTION_ANCHOR), +@register_function(deterministic_spec( + function_class=FunctionClass.Scalar, + lifecycle=FunctionLifecycle(since=v0_1, changed=[], deprecated=None), + null_behavior=FunctionNullBehavior.DependsOnInputs, + substrait=extension_mapping("map", MAP_FUNCTION_ANCHOR), )) pub def map_from_arrays(keys: ColumnExpr, values: ColumnExpr) -> ColumnExpr: """ diff --git a/src/functions/nested/map_keys.incn b/src/functions/nested/map_keys.incn index d223229..8deb3bb 100644 --- a/src/functions/nested/map_keys.incn +++ b/src/functions/nested/map_keys.incn @@ -9,16 +9,16 @@ from function_registry import ( v0_1, ) from functions.nested.common import nested_application -from functions.registry import function_registry +from functions.registry import register_function from projection_builders import ColumnExpr from substrait.function_extensions import MAP_KEYS_FUNCTION_ANCHOR -@function_registry.add("map_keys", deterministic_spec( - FunctionClass.Scalar, - FunctionLifecycle(since=v0_1, changed=[], deprecated=None), - FunctionNullBehavior.DependsOnInputs, - extension_mapping("map_keys", MAP_KEYS_FUNCTION_ANCHOR), +@register_function(deterministic_spec( + function_class=FunctionClass.Scalar, + lifecycle=FunctionLifecycle(since=v0_1, changed=[], deprecated=None), + null_behavior=FunctionNullBehavior.DependsOnInputs, + substrait=extension_mapping("map_keys", MAP_KEYS_FUNCTION_ANCHOR), )) pub def map_keys(map_expr: ColumnExpr) -> ColumnExpr: """ diff --git a/src/functions/nested/map_values.incn b/src/functions/nested/map_values.incn index 35ec19a..18528f9 100644 --- a/src/functions/nested/map_values.incn +++ b/src/functions/nested/map_values.incn @@ -9,16 +9,16 @@ from function_registry import ( v0_1, ) from functions.nested.common import nested_application -from functions.registry import function_registry +from functions.registry import register_function from projection_builders import ColumnExpr from substrait.function_extensions import MAP_VALUES_FUNCTION_ANCHOR -@function_registry.add("map_values", deterministic_spec( - FunctionClass.Scalar, - FunctionLifecycle(since=v0_1, changed=[], deprecated=None), - FunctionNullBehavior.DependsOnInputs, - extension_mapping("map_values", MAP_VALUES_FUNCTION_ANCHOR), +@register_function(deterministic_spec( + function_class=FunctionClass.Scalar, + lifecycle=FunctionLifecycle(since=v0_1, changed=[], deprecated=None), + null_behavior=FunctionNullBehavior.DependsOnInputs, + substrait=extension_mapping("map_values", MAP_VALUES_FUNCTION_ANCHOR), )) pub def map_values(map_expr: ColumnExpr) -> ColumnExpr: """ diff --git a/src/functions/nested/named_struct.incn b/src/functions/nested/named_struct.incn index 2f18a30..6c5b002 100644 --- a/src/functions/nested/named_struct.incn +++ b/src/functions/nested/named_struct.incn @@ -9,16 +9,16 @@ from function_registry import ( v0_1, ) from functions.nested.common import named_struct_arguments, nested_application -from functions.registry import function_registry +from functions.registry import register_function from projection_builders import ColumnExpr from substrait.function_extensions import NAMED_STRUCT_FUNCTION_ANCHOR -@function_registry.add("named_struct", deterministic_spec( - FunctionClass.Scalar, - FunctionLifecycle(since=v0_1, changed=[], deprecated=None), - FunctionNullBehavior.DependsOnInputs, - extension_mapping("named_struct", NAMED_STRUCT_FUNCTION_ANCHOR), +@register_function(deterministic_spec( + function_class=FunctionClass.Scalar, + lifecycle=FunctionLifecycle(since=v0_1, changed=[], deprecated=None), + null_behavior=FunctionNullBehavior.DependsOnInputs, + substrait=extension_mapping("named_struct", NAMED_STRUCT_FUNCTION_ANCHOR), )) pub def named_struct(field_names: list[str], values: list[ColumnExpr]) -> ColumnExpr: """ From ba64dbfc7b9f9efbbf7ae9fe6e76e22e5ac9ba27 Mon Sep 17 00:00:00 2001 From: Danny Meijer Date: Wed, 27 May 2026 16:12:41 +0200 Subject: [PATCH 3/4] refactor - 37 remove nested registry wrapper --- src/functions/nested/array.incn | 6 +++--- src/functions/nested/array_contains.incn | 5 ++--- src/functions/nested/array_distinct.incn | 5 ++--- src/functions/nested/array_except.incn | 5 ++--- src/functions/nested/array_flatten.incn | 5 ++--- src/functions/nested/array_intersect.incn | 5 ++--- src/functions/nested/array_join.incn | 5 ++--- src/functions/nested/array_position.incn | 5 ++--- src/functions/nested/array_reverse.incn | 5 ++--- src/functions/nested/array_slice.incn | 5 ++--- src/functions/nested/array_sort.incn | 5 ++--- src/functions/nested/array_union.incn | 5 ++--- src/functions/nested/arrays_overlap.incn | 5 ++--- src/functions/nested/cardinality.incn | 5 ++--- src/functions/nested/common.incn | 6 ------ src/functions/nested/element_at.incn | 5 ++--- src/functions/nested/map_entries.incn | 5 ++--- src/functions/nested/map_extract.incn | 5 ++--- src/functions/nested/map_from_arrays.incn | 5 ++--- src/functions/nested/map_keys.incn | 5 ++--- src/functions/nested/map_values.incn | 5 ++--- src/functions/nested/named_struct.incn | 6 +++--- 22 files changed, 44 insertions(+), 69 deletions(-) diff --git a/src/functions/nested/array.incn b/src/functions/nested/array.incn index 885a33c..4f6e809 100644 --- a/src/functions/nested/array.incn +++ b/src/functions/nested/array.incn @@ -12,8 +12,8 @@ from function_registry import ( extension_mapping, v0_1, ) -from functions.nested.common import nested_application, require_non_empty_args -from functions.registry import register_function +from functions.nested.common import require_non_empty_args +from functions.registry import register_function, registered_application from projection_builders import ColumnExpr from substrait.function_extensions import MAKE_ARRAY_FUNCTION_ANCHOR @@ -35,4 +35,4 @@ pub def array(values: list[ColumnExpr]) -> ColumnExpr: values: Element expressions in array order. """ require_non_empty_args(values) - return nested_application("array", values) + return registered_application("array", values) diff --git a/src/functions/nested/array_contains.incn b/src/functions/nested/array_contains.incn index 5996bf7..87c5d63 100644 --- a/src/functions/nested/array_contains.incn +++ b/src/functions/nested/array_contains.incn @@ -8,8 +8,7 @@ from function_registry import ( extension_mapping, v0_1, ) -from functions.nested.common import nested_application -from functions.registry import register_function +from functions.registry import register_function, registered_application from projection_builders import ColumnExpr from substrait.function_extensions import ARRAY_HAS_FUNCTION_ANCHOR @@ -31,4 +30,4 @@ pub def array_contains(array_expr: ColumnExpr, value: ColumnExpr) -> ColumnExpr: array_expr: Array expression to search. value: Value expression to find. """ - return nested_application("array_contains", [array_expr, value]) + return registered_application("array_contains", [array_expr, value]) diff --git a/src/functions/nested/array_distinct.incn b/src/functions/nested/array_distinct.incn index 5503680..6f7e2db 100644 --- a/src/functions/nested/array_distinct.incn +++ b/src/functions/nested/array_distinct.incn @@ -8,8 +8,7 @@ from function_registry import ( extension_mapping, v0_1, ) -from functions.nested.common import nested_application -from functions.registry import register_function +from functions.registry import register_function, registered_application from projection_builders import ColumnExpr from substrait.function_extensions import ARRAY_DISTINCT_FUNCTION_ANCHOR @@ -30,4 +29,4 @@ pub def array_distinct(array_expr: ColumnExpr) -> ColumnExpr: Parameters: array_expr: Array expression to de-duplicate. """ - return nested_application("array_distinct", [array_expr]) + return registered_application("array_distinct", [array_expr]) diff --git a/src/functions/nested/array_except.incn b/src/functions/nested/array_except.incn index e2e86e2..d12effe 100644 --- a/src/functions/nested/array_except.incn +++ b/src/functions/nested/array_except.incn @@ -8,8 +8,7 @@ from function_registry import ( extension_mapping, v0_1, ) -from functions.nested.common import nested_application -from functions.registry import register_function +from functions.registry import register_function, registered_application from projection_builders import ColumnExpr from substrait.function_extensions import ARRAY_EXCEPT_FUNCTION_ANCHOR @@ -31,4 +30,4 @@ pub def array_except(left: ColumnExpr, right: ColumnExpr) -> ColumnExpr: left: Array expression that supplies candidate values. right: Array expression containing values to remove. """ - return nested_application("array_except", [left, right]) + return registered_application("array_except", [left, right]) diff --git a/src/functions/nested/array_flatten.incn b/src/functions/nested/array_flatten.incn index 5545495..6e1ab99 100644 --- a/src/functions/nested/array_flatten.incn +++ b/src/functions/nested/array_flatten.incn @@ -12,8 +12,7 @@ from function_registry import ( extension_mapping, v0_1, ) -from functions.nested.common import nested_application -from functions.registry import register_function +from functions.registry import register_function, registered_application from projection_builders import ColumnExpr from substrait.function_extensions import ARRAY_FLATTEN_FUNCTION_ANCHOR @@ -34,4 +33,4 @@ pub def array_flatten(array_expr: ColumnExpr) -> ColumnExpr: Parameters: array_expr: Array expression to flatten. """ - return nested_application("array_flatten", [array_expr]) + return registered_application("array_flatten", [array_expr]) diff --git a/src/functions/nested/array_intersect.incn b/src/functions/nested/array_intersect.incn index 3ead98e..93b5e5b 100644 --- a/src/functions/nested/array_intersect.incn +++ b/src/functions/nested/array_intersect.incn @@ -8,8 +8,7 @@ from function_registry import ( extension_mapping, v0_1, ) -from functions.nested.common import nested_application -from functions.registry import register_function +from functions.registry import register_function, registered_application from projection_builders import ColumnExpr from substrait.function_extensions import ARRAY_INTERSECT_FUNCTION_ANCHOR @@ -31,4 +30,4 @@ pub def array_intersect(left: ColumnExpr, right: ColumnExpr) -> ColumnExpr: left: First array expression. right: Second array expression. """ - return nested_application("array_intersect", [left, right]) + return registered_application("array_intersect", [left, right]) diff --git a/src/functions/nested/array_join.incn b/src/functions/nested/array_join.incn index 39918f7..0981dd4 100644 --- a/src/functions/nested/array_join.incn +++ b/src/functions/nested/array_join.incn @@ -8,8 +8,7 @@ from function_registry import ( extension_mapping, v0_1, ) -from functions.nested.common import nested_application -from functions.registry import register_function +from functions.registry import register_function, registered_application from projection_builders import ColumnExpr from substrait.function_extensions import ARRAY_TO_STRING_FUNCTION_ANCHOR @@ -31,4 +30,4 @@ pub def array_join(array_expr: ColumnExpr, delimiter: ColumnExpr) -> ColumnExpr: array_expr: Array expression to render. delimiter: String delimiter expression placed between elements. """ - return nested_application("array_join", [array_expr, delimiter]) + return registered_application("array_join", [array_expr, delimiter]) diff --git a/src/functions/nested/array_position.incn b/src/functions/nested/array_position.incn index 97e639e..47160e9 100644 --- a/src/functions/nested/array_position.incn +++ b/src/functions/nested/array_position.incn @@ -8,8 +8,7 @@ from function_registry import ( extension_mapping, v0_1, ) -from functions.nested.common import nested_application -from functions.registry import register_function +from functions.registry import register_function, registered_application from projection_builders import ColumnExpr from substrait.function_extensions import ARRAY_POSITION_FUNCTION_ANCHOR @@ -31,4 +30,4 @@ pub def array_position(array_expr: ColumnExpr, value: ColumnExpr) -> ColumnExpr: array_expr: Array expression to search. value: Value expression to find. """ - return nested_application("array_position", [array_expr, value]) + return registered_application("array_position", [array_expr, value]) diff --git a/src/functions/nested/array_reverse.incn b/src/functions/nested/array_reverse.incn index 25994bc..89da6fe 100644 --- a/src/functions/nested/array_reverse.incn +++ b/src/functions/nested/array_reverse.incn @@ -8,8 +8,7 @@ from function_registry import ( extension_mapping, v0_1, ) -from functions.nested.common import nested_application -from functions.registry import register_function +from functions.registry import register_function, registered_application from projection_builders import ColumnExpr from substrait.function_extensions import ARRAY_REVERSE_FUNCTION_ANCHOR @@ -30,4 +29,4 @@ pub def array_reverse(array_expr: ColumnExpr) -> ColumnExpr: Parameters: array_expr: Array expression to reverse. """ - return nested_application("array_reverse", [array_expr]) + return registered_application("array_reverse", [array_expr]) diff --git a/src/functions/nested/array_slice.incn b/src/functions/nested/array_slice.incn index 82f14e6..fde8de6 100644 --- a/src/functions/nested/array_slice.incn +++ b/src/functions/nested/array_slice.incn @@ -8,8 +8,7 @@ from function_registry import ( extension_mapping, v0_1, ) -from functions.nested.common import nested_application -from functions.registry import register_function +from functions.registry import register_function, registered_application from projection_builders import ColumnExpr from substrait.function_extensions import ARRAY_SLICE_FUNCTION_ANCHOR @@ -32,4 +31,4 @@ pub def array_slice(array_expr: ColumnExpr, start: ColumnExpr, stop: ColumnExpr) start: One-based start index. stop: One-based stop index following the backend adapter's `array_slice` contract. """ - return nested_application("array_slice", [array_expr, start, stop]) + return registered_application("array_slice", [array_expr, start, stop]) diff --git a/src/functions/nested/array_sort.incn b/src/functions/nested/array_sort.incn index 335b9ae..1bc7340 100644 --- a/src/functions/nested/array_sort.incn +++ b/src/functions/nested/array_sort.incn @@ -8,8 +8,7 @@ from function_registry import ( extension_mapping, v0_1, ) -from functions.nested.common import nested_application -from functions.registry import register_function +from functions.registry import register_function, registered_application from projection_builders import ColumnExpr from substrait.function_extensions import ARRAY_SORT_FUNCTION_ANCHOR @@ -30,4 +29,4 @@ pub def array_sort(array_expr: ColumnExpr) -> ColumnExpr: Parameters: array_expr: Array expression to sort. """ - return nested_application("array_sort", [array_expr]) + return registered_application("array_sort", [array_expr]) diff --git a/src/functions/nested/array_union.incn b/src/functions/nested/array_union.incn index be73cd4..6f6a33b 100644 --- a/src/functions/nested/array_union.incn +++ b/src/functions/nested/array_union.incn @@ -8,8 +8,7 @@ from function_registry import ( extension_mapping, v0_1, ) -from functions.nested.common import nested_application -from functions.registry import register_function +from functions.registry import register_function, registered_application from projection_builders import ColumnExpr from substrait.function_extensions import ARRAY_UNION_FUNCTION_ANCHOR @@ -31,4 +30,4 @@ pub def array_union(left: ColumnExpr, right: ColumnExpr) -> ColumnExpr: left: First array expression. right: Second array expression. """ - return nested_application("array_union", [left, right]) + return registered_application("array_union", [left, right]) diff --git a/src/functions/nested/arrays_overlap.incn b/src/functions/nested/arrays_overlap.incn index c06d128..61fd885 100644 --- a/src/functions/nested/arrays_overlap.incn +++ b/src/functions/nested/arrays_overlap.incn @@ -8,8 +8,7 @@ from function_registry import ( extension_mapping, v0_1, ) -from functions.nested.common import nested_application -from functions.registry import register_function +from functions.registry import register_function, registered_application from projection_builders import ColumnExpr from substrait.function_extensions import ARRAY_HAS_ANY_FUNCTION_ANCHOR @@ -31,4 +30,4 @@ pub def arrays_overlap(left: ColumnExpr, right: ColumnExpr) -> ColumnExpr: left: First array expression. right: Second array expression. """ - return nested_application("arrays_overlap", [left, right]) + return registered_application("arrays_overlap", [left, right]) diff --git a/src/functions/nested/cardinality.incn b/src/functions/nested/cardinality.incn index 81b4c31..1c504fa 100644 --- a/src/functions/nested/cardinality.incn +++ b/src/functions/nested/cardinality.incn @@ -8,8 +8,7 @@ from function_registry import ( extension_mapping, v0_1, ) -from functions.nested.common import nested_application -from functions.registry import register_function +from functions.registry import register_function, registered_application from projection_builders import ColumnExpr from substrait.function_extensions import CARDINALITY_FUNCTION_ANCHOR @@ -30,4 +29,4 @@ pub def cardinality(value: ColumnExpr) -> ColumnExpr: Parameters: value: Array or map expression to size. """ - return nested_application("cardinality", [value]) + return registered_application("cardinality", [value]) diff --git a/src/functions/nested/common.incn b/src/functions/nested/common.incn index 319c115..8be230c 100644 --- a/src/functions/nested/common.incn +++ b/src/functions/nested/common.incn @@ -1,15 +1,9 @@ """Shared implementation helpers for nested scalar functions.""" from rust::incan_stdlib::errors import raise_value_error -from functions.registry import registered_application from projection_builders import ColumnExpr, str_expr -pub def nested_application(canonical_name: str, arguments: list[ColumnExpr]) -> ColumnExpr: - """Build one registry-backed nested scalar function application.""" - return registered_application(canonical_name, arguments) - - pub def require_non_empty_args(arguments: list[ColumnExpr]) -> None: """Reject empty variadic nested constructors that cannot infer a value type.""" if len(arguments) == 0: diff --git a/src/functions/nested/element_at.incn b/src/functions/nested/element_at.incn index c6f514f..238fb72 100644 --- a/src/functions/nested/element_at.incn +++ b/src/functions/nested/element_at.incn @@ -8,8 +8,7 @@ from function_registry import ( extension_mapping, v0_1, ) -from functions.nested.common import nested_application -from functions.registry import register_function +from functions.registry import register_function, registered_application from projection_builders import ColumnExpr from substrait.function_extensions import ARRAY_ELEMENT_FUNCTION_ANCHOR @@ -31,4 +30,4 @@ pub def element_at(array_expr: ColumnExpr, index: ColumnExpr) -> ColumnExpr: array_expr: Array expression to access. index: One-based element index. Negative indexes count from the end where supported by the backend adapter. """ - return nested_application("element_at", [array_expr, index]) + return registered_application("element_at", [array_expr, index]) diff --git a/src/functions/nested/map_entries.incn b/src/functions/nested/map_entries.incn index 75d063a..9b6115c 100644 --- a/src/functions/nested/map_entries.incn +++ b/src/functions/nested/map_entries.incn @@ -8,8 +8,7 @@ from function_registry import ( extension_mapping, v0_1, ) -from functions.nested.common import nested_application -from functions.registry import register_function +from functions.registry import register_function, registered_application from projection_builders import ColumnExpr from substrait.function_extensions import MAP_ENTRIES_FUNCTION_ANCHOR @@ -30,4 +29,4 @@ pub def map_entries(map_expr: ColumnExpr) -> ColumnExpr: Parameters: map_expr: Map expression to inspect. """ - return nested_application("map_entries", [map_expr]) + return registered_application("map_entries", [map_expr]) diff --git a/src/functions/nested/map_extract.incn b/src/functions/nested/map_extract.incn index ef115ff..3fabb29 100644 --- a/src/functions/nested/map_extract.incn +++ b/src/functions/nested/map_extract.incn @@ -8,8 +8,7 @@ from function_registry import ( extension_mapping, v0_1, ) -from functions.nested.common import nested_application -from functions.registry import register_function +from functions.registry import register_function, registered_application from projection_builders import ColumnExpr from substrait.function_extensions import MAP_EXTRACT_FUNCTION_ANCHOR @@ -31,4 +30,4 @@ pub def map_extract(map_expr: ColumnExpr, key: ColumnExpr) -> ColumnExpr: map_expr: Map expression to inspect. key: Key expression to look up. """ - return nested_application("map_extract", [map_expr, key]) + return registered_application("map_extract", [map_expr, key]) diff --git a/src/functions/nested/map_from_arrays.incn b/src/functions/nested/map_from_arrays.incn index edb3edb..39315db 100644 --- a/src/functions/nested/map_from_arrays.incn +++ b/src/functions/nested/map_from_arrays.incn @@ -8,8 +8,7 @@ from function_registry import ( extension_mapping, v0_1, ) -from functions.nested.common import nested_application -from functions.registry import register_function +from functions.registry import register_function, registered_application from projection_builders import ColumnExpr from substrait.function_extensions import MAP_FUNCTION_ANCHOR @@ -31,4 +30,4 @@ pub def map_from_arrays(keys: ColumnExpr, values: ColumnExpr) -> ColumnExpr: keys: Array expression containing non-null map keys. values: Array expression containing map values. """ - return nested_application("map_from_arrays", [keys, values]) + return registered_application("map_from_arrays", [keys, values]) diff --git a/src/functions/nested/map_keys.incn b/src/functions/nested/map_keys.incn index 8deb3bb..8b3f745 100644 --- a/src/functions/nested/map_keys.incn +++ b/src/functions/nested/map_keys.incn @@ -8,8 +8,7 @@ from function_registry import ( extension_mapping, v0_1, ) -from functions.nested.common import nested_application -from functions.registry import register_function +from functions.registry import register_function, registered_application from projection_builders import ColumnExpr from substrait.function_extensions import MAP_KEYS_FUNCTION_ANCHOR @@ -30,4 +29,4 @@ pub def map_keys(map_expr: ColumnExpr) -> ColumnExpr: Parameters: map_expr: Map expression to inspect. """ - return nested_application("map_keys", [map_expr]) + return registered_application("map_keys", [map_expr]) diff --git a/src/functions/nested/map_values.incn b/src/functions/nested/map_values.incn index 18528f9..db7c60b 100644 --- a/src/functions/nested/map_values.incn +++ b/src/functions/nested/map_values.incn @@ -8,8 +8,7 @@ from function_registry import ( extension_mapping, v0_1, ) -from functions.nested.common import nested_application -from functions.registry import register_function +from functions.registry import register_function, registered_application from projection_builders import ColumnExpr from substrait.function_extensions import MAP_VALUES_FUNCTION_ANCHOR @@ -30,4 +29,4 @@ pub def map_values(map_expr: ColumnExpr) -> ColumnExpr: Parameters: map_expr: Map expression to inspect. """ - return nested_application("map_values", [map_expr]) + return registered_application("map_values", [map_expr]) diff --git a/src/functions/nested/named_struct.incn b/src/functions/nested/named_struct.incn index 6c5b002..b48b246 100644 --- a/src/functions/nested/named_struct.incn +++ b/src/functions/nested/named_struct.incn @@ -8,8 +8,8 @@ from function_registry import ( extension_mapping, v0_1, ) -from functions.nested.common import named_struct_arguments, nested_application -from functions.registry import register_function +from functions.nested.common import named_struct_arguments +from functions.registry import register_function, registered_application from projection_builders import ColumnExpr from substrait.function_extensions import NAMED_STRUCT_FUNCTION_ANCHOR @@ -31,4 +31,4 @@ pub def named_struct(field_names: list[str], values: list[ColumnExpr]) -> Column field_names: Non-empty struct field names. values: Field value expressions in the same order as `field_names`. """ - return nested_application("named_struct", named_struct_arguments(field_names, values)) + return registered_application("named_struct", named_struct_arguments(field_names, values)) From b8b24e0a9d620ed9be6009f1006dfd12b697fe78 Mon Sep 17 00:00:00 2001 From: Danny Meijer Date: Wed, 27 May 2026 17:00:25 +0200 Subject: [PATCH 4/4] fix - 37 address rfc020 review findings --- docs/language/reference/functions/index.md | 4 +-- docs/release_notes/v0_1.md | 2 +- docs/rfcs/020_nested_data_functions.md | 2 +- docs/rfcs/README.md | 2 +- tests/test_session_projection.incn | 31 ++++++++++++++++++---- 5 files changed, 31 insertions(+), 10 deletions(-) diff --git a/docs/language/reference/functions/index.md b/docs/language/reference/functions/index.md index b0ed988..5bcbe86 100644 --- a/docs/language/reference/functions/index.md +++ b/docs/language/reference/functions/index.md @@ -11,11 +11,11 @@ Today the concrete shipped surfaces are documented here: The canonical scalar literal helper is `lit(...)`. Typed literal helpers construct the same scalar-expression representation. -The current registry-backed helper surface is registered in the package-owned function registry. Registry types live in `src/function_registry.incn`, the shared package registry lives in `src/functions/registry.incn`, and concrete public helper entries are produced by `register_function(...)` decorators in individual `src/functions//.incn` modules. The registry-backed families are references, literals, casts, operators, predicates, conditionals, math, ordering, aggregates, and nested data. Each runtime entry exposes a stable function reference such as `inql.functions.col`, namespace, canonical name, typed lifecycle metadata (`since`, versioned changes, and optional deprecation), InQL RFC 024 policy category, function class, null behavior, alias policy, aggregate modifier policy, and Substrait mapping metadata. Checked public helpers provide the signature and, by default, the canonical name; decorator metadata may override the canonical name only for source spelling constraints such as the reserved-word `mod` case. +The current registry-backed helper surface covers references, literals, casts, operators, predicates, conditionals, math, ordering, aggregates, and nested data. Each runtime entry exposes a stable function reference such as `inql.functions.col`, namespace, canonical name, typed lifecycle metadata (`since`, versioned changes, and optional deprecation), function policy category, function class, null behavior, alias policy, aggregate modifier policy, and Substrait mapping metadata. Checked public helpers provide the signature and, by default, the canonical name; metadata may override the canonical name only for source spelling constraints such as the reserved-word `mod` case. The registry is the source for non-derivable machine facts. Public helper declarations are the source for argument names, argument types, and return types. Docstrings remain human-facing explanation, examples, and parameter intent. The `registry-metadata` check validates the checked API metadata projections produced from public facade aliases, registry decorators, and decorated callable signatures. Runtime registry entries are lazy and process-local: they support helper execution and lowering for loaded helpers, while the complete public catalog comes from checked metadata. This matters for generated docs, diagnostics, Prism lowering, and backend capability checks as the catalog grows. -InQL RFC 024 policy category is separate from function class. Function class describes the semantic shape (`scalar`, `aggregate`, `ordering`, and later table-valued or partition-transform shapes). Policy category describes where the function belongs: portable core, explicitly namespaced extension-only, opt-in compatibility alias, engine-specific, or rejected compatibility request. Name-only registry lookup remains core-scoped; extension and engine-specific entries use namespace-qualified lookup so compatibility names cannot silently shadow portable core names. Rejected requests are documented as rejection metadata, not as lowerable registry entries or fake Substrait mappings. +Function policy category is separate from function class. Function class describes the semantic shape (`scalar`, `aggregate`, `ordering`, and later table-valued or partition-transform shapes). Policy category describes where the function belongs: portable core, explicitly namespaced extension-only, opt-in compatibility alias, engine-specific, or rejected compatibility request. Name-only registry lookup remains core-scoped; extension and engine-specific entries use namespace-qualified lookup so compatibility names cannot silently shadow portable core names. Rejected requests are documented as rejection metadata, not as lowerable registry entries or fake Substrait mappings. The registered helper surface currently includes: diff --git a/docs/release_notes/v0_1.md b/docs/release_notes/v0_1.md index 39785c7..9dea513 100644 --- a/docs/release_notes/v0_1.md +++ b/docs/release_notes/v0_1.md @@ -14,7 +14,7 @@ Entries will be filled in as work lands (link RFCs and PRs when applicable). - **Scalar expressions:** RFC 012 unifies filter predicates, computed projection values, grouping keys, and aggregate inputs around one `ColumnExpr` surface with canonical `lit(...)` and typed literal helpers. - **Core scalar functions:** RFC 015 adds registry-backed scalar function applications and the first core helper slice for casts, comparisons, boolean logic, null/NaN predicates, arithmetic, conditionals, membership/range predicates, and ordering expressions. Implemented helpers lower to Substrait IR through registry metadata, built-in Rex shapes, or structural sort-field lowering; DataFusion remains the first execution adapter rather than the semantic boundary. - **Common scalar functions:** The first RFC 018 slice adds registry-backed math helpers for `abs(...)`, `ceil(...)`, `floor(...)`, and single-argument `round(...)`, with Substrait mappings and DataFusion-backed execution coverage. -- **Nested data functions:** RFC 020 adds registry-backed scalar helpers for array construction/access, cardinality, containment, overlap, sorting, set-like operations, joining, slicing, reversing, scalar array flattening, map construction/access, map key/value/entry extraction, map key containment, and named struct construction. These helpers lower through Substrait extension metadata and execute through the DataFusion-backed Session path without introducing generator semantics. +- **Nested data functions:** RFC 020 adds registry-backed scalar helpers for array construction/access, cardinality, containment, overlap, sorting, set-like operations, joining, slicing, reversing, scalar array flattening, map construction/access, map key/value/entry extraction, map key containment, and named struct construction. These helpers lower through Substrait extension metadata without introducing generator semantics, with representative DataFusion-backed Session coverage for composable array projection paths. - **Function registry:** RFC 014 adds declaration-site registry decorators for the current public helper surface, including stable function references, checked signature projection, lifecycle metadata, behavior categories, alias policy, Substrait mapping categories, and checked API metadata drift validation. - **Function extension policy:** InQL RFC 024 policy metadata now distinguishes portable core functions, namespaced extension-only functions, opt-in compatibility aliases, engine-specific functions, and rejected compatibility requests without adding an extension plugin system or backend-owned semantics. - **Projection:** builder-based `with_column`, `add`, `mul`, and literal expression helpers now lower derived columns through Prism, Substrait, and Session execution. diff --git a/docs/rfcs/020_nested_data_functions.md b/docs/rfcs/020_nested_data_functions.md index 2680ec3..f6590a6 100644 --- a/docs/rfcs/020_nested_data_functions.md +++ b/docs/rfcs/020_nested_data_functions.md @@ -10,7 +10,7 @@ - InQL RFC 014 (function registry and catalog governance) - InQL RFC 021 (generator and table-valued functions) - **Issue:** [InQL #37](https://github.com/dannys-code-corner/InQL/issues/37) -- **RFC PR:** — +- **RFC PR:** [InQL #46](https://github.com/dannys-code-corner/InQL/pull/46) - **Written against:** Incan v0.3-era InQL - **Shipped in:** v0.1 diff --git a/docs/rfcs/README.md b/docs/rfcs/README.md index b555643..32ac4e0 100644 --- a/docs/rfcs/README.md +++ b/docs/rfcs/README.md @@ -26,7 +26,7 @@ InQL uses its **own** RFC series (starting at 000), independent of the [Incan la | [017][rfc-017] | Implemented | Aggregate modifiers | | | [018][rfc-018] | In Progress | Common scalar function catalog | | | [019][rfc-019] | Draft | Window functions | | -| [020][rfc-020] | Draft | Nested data functions | | +| [020][rfc-020] | Implemented | Nested data functions | | | [021][rfc-021] | Draft | Generator and table-valued functions | | | [022][rfc-022] | Draft | Semi-structured and format functions | | | [023][rfc-023] | Draft | Approximate and sketch functions | | diff --git a/tests/test_session_projection.incn b/tests/test_session_projection.incn index fb6207e..ee91282 100644 --- a/tests/test_session_projection.incn +++ b/tests/test_session_projection.incn @@ -50,6 +50,22 @@ def _collect_or_fail(mut session: Session, projected: LazyFrame[AggregateOrder]) Err(err) => return fail_t(err.error_message()) +def _preview_line_contains_all(line: str, expected_cells: list[str]) -> bool: + """Return whether one rendered preview row contains every expected cell value.""" + for cell in expected_cells: + if not line.contains(cell): + return false + return true + + +def _assert_preview_row_contains(payload: str, expected_cells: list[str], context: str) -> None: + """Assert one rendered preview row carries the expected materialized cells together.""" + for line in payload.split("\n"): + if _preview_line_contains_all(line, expected_cells): + return + return fail_t(context) + + def test_session_projection__plan_root_names_match_append_projection() -> None: # -- Arrange -- mut session = Session.default() @@ -220,11 +236,16 @@ def test_session_projection__collect_executes_nested_scalar_projection_functions assert payload.contains("first_tag"), "element_at projection should materialize its alias" assert payload.contains("paid_position"), "array_position projection should materialize its alias" assert payload.contains("joined_tags"), "array_join projection should materialize its alias" - assert payload.contains("3"), "cardinality should report three input array elements" - assert payload.contains("true"), "array_contains should find the paid tag" - assert payload.contains("paid"), "element_at should return the first tag" - assert payload.contains("1"), "array_position should use one-based positions" - assert payload.contains("paid|A"), "array_join should materialize distinct string tags" + _assert_preview_row_contains( + payload, + ["A", "10", "3", "true", "paid", "1", "paid|A"], + "nested scalar projections should materialize the expected values for customer A amount 10", + ) + _assert_preview_row_contains( + payload, + ["B", "7", "3", "true", "paid", "1", "paid|B"], + "nested scalar projections should materialize the expected values for customer B amount 7", + ) def test_session_projection__collect_executes_identity_select() -> None: