From 45994220dd958c997465abb6ec5364f032fac00b Mon Sep 17 00:00:00 2001 From: Megha Hegde Date: Sat, 29 Nov 2025 15:34:57 -0500 Subject: [PATCH 01/28] tasks 1 and 2 implementation --- src/palimpzest/core/data/dataset.py | 44 ++++++++++++++ src/palimpzest/core/elements/groupbysig.py | 5 -- src/palimpzest/query/operators/logical.py | 68 ++++++++++++++++++---- 3 files changed, 101 insertions(+), 16 deletions(-) diff --git a/src/palimpzest/core/data/dataset.py b/src/palimpzest/core/data/dataset.py index 25cff1d02..e2d8458f5 100644 --- a/src/palimpzest/core/data/dataset.py +++ b/src/palimpzest/core/data/dataset.py @@ -573,10 +573,54 @@ def max(self) -> Dataset: return Dataset(sources=[self], operator=operator, schema=operator.output_schema) def groupby(self, groupby: GroupBySig) -> Dataset: + # update this!! output_schema = groupby.output_schema() operator = GroupByAggregate(input_schema=self.schema, output_schema=output_schema, group_by_sig=groupby) return Dataset(sources=[self], operator=operator, schema=output_schema) + def sem_groupby(self, gby_fields: list[str], agg_fields: list[str], agg_funcs: list[str]) -> Dataset: + """ + Apply a semantic group by operation to this set using an LLM. This operator groups records + by the specified `gby_fields` and applies the `agg_funcs` to the `agg_fields` for each group. 
+ + Args: + gby_fields: List of field names to group by (e.g., ['complaint']) + agg_fields: List of field names to aggregate (e.g., ['contents']) + agg_funcs: List of aggregation functions to apply (e.g., ['count']) + + Example: + ds = pz.TextFileDataset(id="reviews", dir="product-reviews/") + ds = ds.sem_groupby(gby_fields=['complaint'], agg_fields=['contents'], agg_funcs=['count']) + """ + from typing import Any + + # Construct the output schema dynamically based on gby_fields and agg_funcs + fields = [] + + # Add group by fields to output schema + for g in gby_fields: + f = {"name": g, "type": Any, "desc": f"Group by field: {g}"} + fields.append(f) + + # Add aggregation fields to output schema + for i, agg_func in enumerate(agg_funcs): + agg_field_name = f"{agg_func}({agg_fields[i]})" + f = {"name": agg_field_name, "type": Any, "desc": f"Aggregate field: {agg_field_name}"} + fields.append(f) + + output_schema = create_schema_from_fields(fields) + + # Create logical operator with direct parameters (no GroupBySig) + operator = GroupByAggregate( + input_schema=self.schema, + output_schema=output_schema, + gby_fields=gby_fields, + agg_fields=agg_fields, + agg_funcs=agg_funcs + ) + + return Dataset(sources=[self], operator=operator, schema=output_schema) + def sem_agg(self, col: dict | type[BaseModel], agg: str, depends_on: str | list[str] | None = None) -> Dataset: """ Apply a semantic aggregation to this set. 
The `agg` string will be applied using an LLM diff --git a/src/palimpzest/core/elements/groupbysig.py b/src/palimpzest/core/elements/groupbysig.py index 3390e0870..ef443c096 100644 --- a/src/palimpzest/core/elements/groupbysig.py +++ b/src/palimpzest/core/elements/groupbysig.py @@ -11,11 +11,6 @@ # - construct the correct output schema using the input schema and the group by and aggregation fields # - remove/update all other references to GroupBySig in the codebase -# TODO: -# - move the arguments for group_by_fields, agg_funcs, and agg_fields into the Dataset.groupby() operator -# - construct the correct output schema using the input schema and the group by and aggregation fields -# - remove/update all other references to GroupBySig in the codebase - # signature for a group by aggregate that applies # group and aggregation to an input tuple class GroupBySig: diff --git a/src/palimpzest/query/operators/logical.py b/src/palimpzest/query/operators/logical.py index d933ef0f7..36f6cd84c 100644 --- a/src/palimpzest/query/operators/logical.py +++ b/src/palimpzest/query/operators/logical.py @@ -381,33 +381,79 @@ def get_logical_op_params(self) -> dict: class GroupByAggregate(LogicalOperator): def __init__( self, - group_by_sig: GroupBySig, + group_by_sig: GroupBySig | None = None, + gby_fields: list[str] | None = None, + agg_fields: list[str] | None = None, + agg_funcs: list[str] | None = None, *args, **kwargs, ): super().__init__(*args, **kwargs) if not self.input_schema: raise ValueError("GroupByAggregate requires an input schema") - (valid, error) = group_by_sig.validate_schema(self.input_schema) - if not valid: - raise TypeError(error) - self.group_by_sig = group_by_sig + + # Support both old GroupBySig and new direct parameters + if group_by_sig is not None: + # Old API: using GroupBySig + (valid, error) = group_by_sig.validate_schema(self.input_schema) + if not valid: + raise TypeError(error) + self.group_by_sig = group_by_sig + self.gby_fields = 
group_by_sig.group_by_fields + self.agg_fields = group_by_sig.agg_fields + self.agg_funcs = group_by_sig.agg_funcs + else: + # New API: using direct parameters + if gby_fields is None or agg_fields is None or agg_funcs is None: + raise ValueError("Must provide either group_by_sig or all of (gby_fields, agg_fields, agg_funcs)") + + # Validate fields exist in input schema + for f in gby_fields: + if f not in self.input_schema.model_fields: + raise TypeError(f"Supplied schema has no field {f}") + for f in agg_fields: + if f not in self.input_schema.model_fields: + raise TypeError(f"Supplied schema has no field {f}") + + self.group_by_sig = None + self.gby_fields = gby_fields + self.agg_fields = agg_fields + self.agg_funcs = agg_funcs def __str__(self): - return f"GroupBy({self.group_by_sig.serialize()})" + if self.group_by_sig is not None: + return f"GroupBy({self.group_by_sig.serialize()})" + else: + return f"GroupBy(gby_fields={self.gby_fields}, agg_fields={self.agg_fields}, agg_funcs={self.agg_funcs})" def get_logical_id_params(self) -> dict: logical_id_params = super().get_logical_id_params() - logical_id_params = {"group_by_sig": self.group_by_sig, **logical_id_params} + if self.group_by_sig is not None: + logical_id_params = {"group_by_sig": self.group_by_sig, **logical_id_params} + else: + logical_id_params = { + "gby_fields": self.gby_fields, + "agg_fields": self.agg_fields, + "agg_funcs": self.agg_funcs, + **logical_id_params + } return logical_id_params def get_logical_op_params(self) -> dict: logical_op_params = super().get_logical_op_params() - logical_op_params = { - "group_by_sig": self.group_by_sig, - **logical_op_params, - } + if self.group_by_sig is not None: + logical_op_params = { + "group_by_sig": self.group_by_sig, + **logical_op_params, + } + else: + logical_op_params = { + "gby_fields": self.gby_fields, + "agg_fields": self.agg_fields, + "agg_funcs": self.agg_funcs, + **logical_op_params, + } return logical_op_params From 
b643be085af27b1ee454bba0111da077e527236b Mon Sep 17 00:00:00 2001 From: Megha Hegde Date: Mon, 1 Dec 2025 20:14:35 -0500 Subject: [PATCH 02/28] code for task 3 --- src/palimpzest/query/operators/aggregate.py | 275 +++++++++++++++++++- 1 file changed, 271 insertions(+), 4 deletions(-) diff --git a/src/palimpzest/query/operators/aggregate.py b/src/palimpzest/query/operators/aggregate.py index 6e93a6a43..244233df4 100644 --- a/src/palimpzest/query/operators/aggregate.py +++ b/src/palimpzest/query/operators/aggregate.py @@ -34,12 +34,37 @@ def __call__(self, candidates: list[DataRecord]) -> DataRecordSet: class ApplyGroupByOp(AggregateOp): """ Implementation of a GroupBy operator. This operator groups records by a set of fields - and applies a function to each group. The group_by_sig object contains the fields to - group by and the aggregation functions to apply to each group. + and applies a function to each group. + + Can be initialized in two ways: + 1. Legacy: group_by_sig parameter containing fields and functions + 2. 
New: gby_fields, agg_fields, agg_funcs parameters directly """ - def __init__(self, group_by_sig: GroupBySig, *args, **kwargs): + def __init__(self, group_by_sig: GroupBySig = None, gby_fields: list[str] = None, + agg_fields: list[str] = None, agg_funcs: list[str] = None, *args, **kwargs): super().__init__(*args, **kwargs) - self.group_by_sig = group_by_sig + + # Support both old API (group_by_sig) and new API (individual fields) + if group_by_sig is not None: + # Legacy API: use group_by_sig + self.group_by_sig = group_by_sig + self.gby_fields = group_by_sig.group_by_fields + self.agg_fields = group_by_sig.agg_fields + self.agg_funcs = group_by_sig.agg_funcs + elif gby_fields is not None and agg_fields is not None and agg_funcs is not None: + # New API: construct group_by_sig from individual fields + self.gby_fields = gby_fields + self.agg_fields = agg_fields + self.agg_funcs = agg_funcs + # Create a GroupBySig for backwards compatibility with existing code + from palimpzest.core.elements.groupbysig import GroupBySig + self.group_by_sig = GroupBySig( + group_by_fields=gby_fields, + agg_fields=agg_fields, + agg_funcs=agg_funcs + ) + else: + raise ValueError("Either group_by_sig or (gby_fields, agg_fields, agg_funcs) must be provided") def __str__(self): op = super().__str__() @@ -664,3 +689,245 @@ def __call__(self, candidates: list[DataRecord]) -> DataRecordSet: ) return DataRecordSet([dr], [record_op_stats]) + +class SemanticGroupByOp(AggregateOp): + """ + Implementation of a semantic GroupBy operator using LLMs. This operator groups records by a set + of fields and applies aggregation functions to each group using an LLM to determine the groups. 
+ """ + def __init__(self, gby_fields: list[str], agg_fields: list[str], agg_funcs: list[str], + model: Model | None = None, prompt_strategy: PromptStrategy = PromptStrategy.AGG, + reasoning_effort: str | None = None, *args, **kwargs): + super().__init__(*args, **kwargs) + self.gby_fields = gby_fields + self.agg_fields = agg_fields + self.agg_funcs = agg_funcs + self.model = model if model is not None else Model.GPT_4O_MINI + self.prompt_strategy = prompt_strategy + self.reasoning_effort = reasoning_effort + + # Initialize the generator for LLM calls + self.generator = Generator(self.model, self.prompt_strategy, self.reasoning_effort, self.api_base) + + def __str__(self): + op = super().__str__() + op += f" Group-by Fields: {self.gby_fields}\n" + op += f" Agg. Fields: {self.agg_fields}\n" + op += f" Agg. Funcs: {self.agg_funcs}\n" + op += f" Model: {self.model.value}\n" + op += f" Prompt Strategy: {self.prompt_strategy}\n" + return op + + def get_id_params(self): + id_params = super().get_id_params() + return { + "gby_fields": self.gby_fields, + "agg_fields": self.agg_fields, + "agg_funcs": self.agg_funcs, + "model": self.model.value, + "prompt_strategy": self.prompt_strategy.value, + "reasoning_effort": self.reasoning_effort, + **id_params + } + + def get_op_params(self): + op_params = super().get_op_params() + return { + "gby_fields": self.gby_fields, + "agg_fields": self.agg_fields, + "agg_funcs": self.agg_funcs, + "model": self.model, + "prompt_strategy": self.prompt_strategy, + "reasoning_effort": self.reasoning_effort, + **op_params + } + + def get_model_name(self) -> str: + return self.model.value + + def get_fields_to_generate(self, candidate: DataRecord) -> list[str]: + """ + For aggregation operators, we need to generate ALL output fields (including group-by fields), + not just the new fields. This overrides the default behavior. 
+ """ + return list(self.output_schema.model_fields.keys()) + + def naive_cost_estimates(self, source_op_cost_estimates: OperatorCostEstimates) -> OperatorCostEstimates: + """ + Compute naive cost estimates for the semantic group by operation using an LLM. + """ + # estimate number of input and output tokens + est_num_input_tokens = NAIVE_EST_NUM_INPUT_TOKENS * source_op_cost_estimates.cardinality + est_num_output_tokens = NAIVE_EST_NUM_OUTPUT_TOKENS * NAIVE_EST_NUM_GROUPS + + # get est. of conversion time per record from model card + model_name = self.model.value + model_conversion_time_per_record = MODEL_CARDS[model_name]["seconds_per_output_token"] * est_num_output_tokens + + # get est. of conversion cost (in USD) per record from model card + usd_per_input_token = MODEL_CARDS[model_name].get("usd_per_input_token") + model_conversion_usd_per_record = ( + usd_per_input_token * est_num_input_tokens + + MODEL_CARDS[model_name]["usd_per_output_token"] * est_num_output_tokens + ) + + # estimate quality of output based on the strength of the model being used + quality = (MODEL_CARDS[model_name]["overall"] / 100.0) + + return OperatorCostEstimates( + cardinality=NAIVE_EST_NUM_GROUPS, + time_per_record=model_conversion_time_per_record, + cost_per_record=model_conversion_usd_per_record, + quality=quality, + ) + + def __call__(self, candidates: list[DataRecord]) -> DataRecordSet: + """ + Execute the semantic group by operation on the given candidates using a two-phase approach: + Phase 1: LLM assigns each record to a group (MAP) + Phase 2: Apply aggregation functions to each group (REDUCE) + + Args: + candidates: List of DataRecords to group and aggregate + + Returns: + DataRecordSet containing one DataRecord per group with aggregated values + """ + start_time = time.time() + + # Handle empty input + if len(candidates) == 0: + return DataRecordSet([], []) + + # Use LLM to assign each record to a semantic group + group_assignments, phase1_stats = 
self._assign_groups_llm(candidates) + + # Group candidates by their assigned group labels + groups = {} + for candidate, group_label in zip(candidates, group_assignments): + if group_label not in groups: + groups[group_label] = [] + groups[group_label].append(candidate) + + # Apply aggregation functions to each group + drs = [] + record_op_stats_lst = [] + total_cost = phase1_stats.cost_per_record * len(candidates) + + for group_label, group_members in groups.items(): + # Build aggregated data item for this group + data_item = {} + + # Add group-by field value + data_item[self.gby_fields[0]] = group_label + + # Use LLM to compute aggregations for this group + # Use LLM to compute aggregations for this group + fields_to_generate = [agg_func.lower() for agg_func in self.agg_funcs] + fields = {field: self.output_schema.model_fields[field].annotation for field in fields_to_generate} + + # Construct generation kwargs + gen_kwargs = { + "project_cols": self.get_input_fields(), + "output_schema": self.output_schema, + "agg_instruction": f"Compute the following aggregations for group '{group_label}': {', '.join([f'{func}({field})' for func, field in zip(self.agg_funcs, self.agg_fields)])}" + } + + # Generate aggregation results + field_answers, _, agg_stats, _ = self.generator(group_members, fields, **gen_kwargs) + agg_results = {field: field_answers[field][0] for field in fields_to_generate} + + # Add aggregation results to data_item + for agg_field, agg_func in zip(self.agg_fields, self.agg_funcs): + output_field_name = agg_func.lower() + data_item[output_field_name] = agg_results.get(output_field_name) + + # Accumulate aggregation costs + total_cost += agg_stats.cost_per_record * len(group_members) + + # Create the DataRecord for this group + data_item_obj = self.output_schema(**data_item) + dr = DataRecord.from_agg_parents(data_item_obj, parent_records=group_members) + drs.append(dr) + + # Create RecordOpStats for this group + record_op_stats = RecordOpStats( + 
record_id=dr._id, + record_parent_ids=dr._parent_ids, + record_source_indices=dr._source_indices, + record_state=dr.to_dict(include_bytes=False), + full_op_id=self.get_full_op_id(), + logical_op_id=self.logical_op_id or "semantic-groupby", + op_name=self.op_name(), + time_per_record=(time.time() - start_time) / len(groups), + cost_per_record=total_cost / len(groups), + model_name=self.get_model_name(), + input_fields=self.get_input_fields(), + generated_fields=list(self.output_schema.model_fields.keys()), + total_input_tokens=phase1_stats.total_input_tokens, + total_output_tokens=phase1_stats.total_output_tokens, + total_input_cost=phase1_stats.total_input_cost, + total_output_cost=phase1_stats.total_output_cost, + llm_call_duration_secs=phase1_stats.llm_call_duration_secs, + fn_call_duration_secs=phase1_stats.fn_call_duration_secs, + total_llm_calls=phase1_stats.total_llm_calls, + op_details={k: str(v) for k, v in self.get_id_params().items()}, + ) + record_op_stats_lst.append(record_op_stats) + + return DataRecordSet(drs, record_op_stats_lst) + + def _assign_groups_llm(self, candidates: list[DataRecord]) -> tuple[list[str], any]: + """ + Phase 1: Use LLM to assign each candidate to a semantic group. 
+ + Args: + candidates: List of DataRecords to classify into groups + + Returns: + Tuple of (list of group labels, generation stats) + """ + # Create a schema that just extracts the group-by field + from palimpzest.core.lib.schemas import create_schema_from_fields + groupby_schema = create_schema_from_fields([ + {"name": self.gby_fields[0], "type": str, "desc": f"The semantic category for {self.gby_fields[0]}"} + ]) + + # Process candidates to extract group labels + group_labels = [] + total_stats = None + + # Get input fields once + input_fields = self.get_input_fields() + fields = {self.gby_fields[0]: str} + + for candidate in candidates: + # Ask LLM to classify this record - pass single candidate, not list + gen_kwargs = { + "project_cols": input_fields, + "output_schema": groupby_schema, + "agg_instruction": f"Determine the '{self.gby_fields[0]}' category for this record." + } + + field_answers, _, gen_stats, _ = self.generator(candidate, fields, **gen_kwargs) + + # Extract the group label - field_answers returns dict with field->list mapping + group_label = field_answers.get(self.gby_fields[0], [None])[0] + if group_label is None: + # Fallback: use a default group + group_label = "unknown" + group_labels.append(group_label) + + # Accumulate stats + if total_stats is None: + total_stats = gen_stats + else: + # Sum up the stats + total_stats.total_input_tokens += gen_stats.total_input_tokens + total_stats.total_output_tokens += gen_stats.total_output_tokens + total_stats.total_input_cost += gen_stats.total_input_cost + total_stats.total_output_cost += gen_stats.total_output_cost + total_stats.llm_call_duration_secs += gen_stats.llm_call_duration_secs + total_stats.total_llm_calls += gen_stats.total_llm_calls + + return group_labels, total_stats \ No newline at end of file From b65d5de94cf63fcd616e20f583ed9189d5ab6efd Mon Sep 17 00:00:00 2001 From: Megha Hegde Date: Mon, 1 Dec 2025 20:21:45 -0500 Subject: [PATCH 03/28] tests for Semantic GroupBy --- 
tests/pytest/test_semantic_groupby.py | 127 ++++++++++++++++++++++++++ 1 file changed, 127 insertions(+) create mode 100644 tests/pytest/test_semantic_groupby.py diff --git a/tests/pytest/test_semantic_groupby.py b/tests/pytest/test_semantic_groupby.py new file mode 100644 index 000000000..18f4b8345 --- /dev/null +++ b/tests/pytest/test_semantic_groupby.py @@ -0,0 +1,127 @@ +#!/usr/bin/env python3 +""" +Test script for semantic group by operation. + +This script tests the SemanticGroupByOp implementation by creating a small dataset +of product reviews and grouping them by complaint type. +""" + +import pandas as pd +import palimpzest as pz +from palimpzest.query.operators.aggregate import SemanticGroupByOp +from palimpzest.constants import Model + +# Define columns for the review schema +review_cols = [ + {"name": "complaint", "type": str, "desc": "The type of complaint mentioned in the review (e.g., size, quality, shipping, description mismatch, ergonomics)"}, +] + +def test_semantic_groupby_basic(): + """Test basic semantic group by functionality using the physical operator directly.""" + print("Testing SemanticGroupByOp basic functionality...") + + try: + # Create list of candidates from text file dataset with schema + ds = pz.TextFileDataset(id="reviews", path="product-reviews/") + ds = ds.sem_map(review_cols) # Add schema to extract complaint types + output = ds.run() + candidates = [dr for dr in output] + + print(f"Loaded {len(candidates)} review candidates with schema") + print(f"Sample candidate fields: {list(candidates[0].to_dict().keys()) if candidates else 'none'}") + + # Get input schema from the candidates + input_schema = candidates[0].schema if candidates else None + + # Create output schema (group by field + count) + from palimpzest.core.lib.schemas import create_schema_from_fields + output_schema = create_schema_from_fields([ + {"name": "complaint", "type": str, "desc": "The complaint type"}, + {"name": "count", "type": int, "desc": "Count of 
reviews in this group"} + ]) + + # Create instance of the physical operator + sem_group_by_op = SemanticGroupByOp( + gby_fields=['complaint'], + agg_fields=['contents'], + agg_funcs=['count'], + input_schema=input_schema, + output_schema=output_schema, + model=Model.GPT_4o_MINI, + logical_op_id="test_semantic_groupby", # Required for RecordOpStats + verbose=False + ) + + print(f"Created SemanticGroupByOp: {sem_group_by_op}") + + # Execute the group by operation + grouped_output = sem_group_by_op(candidates) + + # Convert to DataFrame and print + df = pd.DataFrame([dr.to_dict() for dr in grouped_output]) + print("\nGrouped Results:") + print(df) + print(f"\nTotal groups: {len(df)}") + # print(f"Total cost: ${grouped_output.stats.cost:.4f}") + # print(f"Total time: {grouped_output.stats.time:.2f}s") + + return True + + except Exception as e: + print(f"Error during test: {e}") + import traceback + traceback.print_exc() + return False + +def test_semantic_groupby_via_dataset(): + """Test semantic group by via Dataset API.""" + print("\nTesting sem_groupby via Dataset API...") + + try: + # Create dataset and add schema + ds = pz.TextFileDataset(id="reviews", path="product-reviews/") + ds = ds.sem_map(review_cols) # Add schema to extract complaint types + + # Apply semantic group by operation + ds = ds.sem_groupby( + gby_fields=['complaint'], + agg_fields=['contents'], + agg_funcs=['count'] + ) + + # Run the query + output = ds.run() + + # Convert to DataFrame and print + df = output.to_df() + print("\nGrouped Results:") + print(df) + print(f"\nTotal groups: {len(df)}") + # print(f"Total cost: ${output.stats.cost:.4f}") + # print(f"Total time: {output.stats.time:.2f}s") + + return True + + except Exception as e: + print(f"Error during test: {e}") + import traceback + traceback.print_exc() + return False + +if __name__ == "__main__": + print("=" * 80) + print("Semantic GroupBy Test Suite") + print("=" * 80) + + print("\nRunning tests...\n") + + # Run tests + print("Test 
1: Basic SemanticGroupByOp") + test_semantic_groupby_basic() + + print("\n" + "=" * 80) + print("Test 2: Dataset.sem_groupby() API") + test_semantic_groupby_via_dataset() + + print("\n" + "=" * 80) + print("All tests completed!") From f07f611fd8eff921c6563c70cc8689b6dd77f561 Mon Sep 17 00:00:00 2001 From: kepler11c <73941237+kepler11c@users.noreply.github.com> Date: Sun, 4 Jan 2026 19:12:24 +0530 Subject: [PATCH 04/28] Removed back compatibility with GroupBySig --- src/palimpzest/__init__.py | 2 - src/palimpzest/core/data/dataset.py | 25 +++++-- src/palimpzest/core/elements/groupbysig.py | 70 ------------------ src/palimpzest/query/operators/aggregate.py | 75 +++++++++----------- src/palimpzest/query/operators/logical.py | 78 +++++++-------------- 5 files changed, 79 insertions(+), 171 deletions(-) delete mode 100644 src/palimpzest/core/elements/groupbysig.py diff --git a/src/palimpzest/__init__.py b/src/palimpzest/__init__.py index a5df07411..5a2c28203 100644 --- a/src/palimpzest/__init__.py +++ b/src/palimpzest/__init__.py @@ -13,7 +13,6 @@ TextFileDataset, XLSFileDataset, ) -from palimpzest.core.elements.groupbysig import GroupBySig from palimpzest.core.lib.schemas import AudioBase64, AudioFilepath, ImageBase64, ImageFilepath, ImageURL from palimpzest.policy import ( MaxQuality, @@ -37,7 +36,6 @@ "Cardinality", "Model", # core - "GroupBySig", "Context", "TextFileContext", "Dataset", diff --git a/src/palimpzest/core/data/dataset.py b/src/palimpzest/core/data/dataset.py index e2d8458f5..5b2ff60a9 100644 --- a/src/palimpzest/core/data/dataset.py +++ b/src/palimpzest/core/data/dataset.py @@ -9,7 +9,6 @@ from palimpzest.constants import AggFunc, Cardinality from palimpzest.core.elements.filters import Filter -from palimpzest.core.elements.groupbysig import GroupBySig from palimpzest.core.lib.schemas import create_schema_from_fields, project, relax_schema, union_schemas from palimpzest.policy import construct_policy_from_kwargs from palimpzest.query.operators.logical 
import ( @@ -572,10 +571,26 @@ def max(self) -> Dataset: operator = Aggregate(input_schema=self.schema, agg_func=AggFunc.MAX) return Dataset(sources=[self], operator=operator, schema=operator.output_schema) - def groupby(self, groupby: GroupBySig) -> Dataset: - # update this!! - output_schema = groupby.output_schema() - operator = GroupByAggregate(input_schema=self.schema, output_schema=output_schema, group_by_sig=groupby) + def groupby(self, gby_fields, agg_fields, agg_funcs) -> Dataset: + """Apply a group by operation to this dataset.""" + from typing import Any + + # Construct the output schema dynamically based on gby_fields and agg_funcs + fields = [] + + # Add group by fields to output schema + for g in gby_fields: + f = {"name": g, "type": Any, "desc": f"Group by field: {g}"} + fields.append(f) + + # Add aggregation fields to output schema + for i, agg_func in enumerate(agg_funcs): + agg_field_name = f"{agg_func}({agg_fields[i]})" + f = {"name": agg_field_name, "type": Any, "desc": f"Aggregate field: {agg_field_name}"} + fields.append(f) + + output_schema = create_schema_from_fields(fields) + operator = GroupByAggregate(input_schema=self.schema, gby_fields=gby_fields, agg_fields=agg_fields, agg_funcs=agg_funcs) return Dataset(sources=[self], operator=operator, schema=output_schema) def sem_groupby(self, gby_fields: list[str], agg_fields: list[str], agg_funcs: list[str]) -> Dataset: diff --git a/src/palimpzest/core/elements/groupbysig.py b/src/palimpzest/core/elements/groupbysig.py deleted file mode 100644 index ef443c096..000000000 --- a/src/palimpzest/core/elements/groupbysig.py +++ /dev/null @@ -1,70 +0,0 @@ -from __future__ import annotations - -from typing import Any - -from pydantic import BaseModel - -from palimpzest.core.lib.schemas import create_schema_from_fields - -# TODO: -# - move the arguments for group_by_fields, agg_funcs, and agg_fields into the Dataset.groupby() operator -# - construct the correct output schema using the input schema and the 
group by and aggregation fields -# - remove/update all other references to GroupBySig in the codebase - -# signature for a group by aggregate that applies -# group and aggregation to an input tuple -class GroupBySig: - def __init__(self, group_by_fields: list[str], agg_funcs: list[str], agg_fields: list[str]): - self.group_by_fields = group_by_fields - self.agg_funcs = agg_funcs - self.agg_fields = agg_fields - - def validate_schema(self, input_schema: type[BaseModel]) -> tuple[bool, str | None]: - for f in self.group_by_fields: - if f not in input_schema.model_fields: - return (False, "Supplied schema has no field " + f) - for f in self.agg_fields: - if f not in input_schema.model_fields: - return (False, "Supplied schema has no field " + f) - return (True, None) - - def serialize(self) -> dict[str, Any]: - out = { - "group_by_fields": self.group_by_fields, - "agg_funcs": self.agg_funcs, - "agg_fields": self.agg_fields, - } - return out - - def __str__(self) -> str: - return "GroupBy(" + repr(self.serialize()) + ")" - - def __hash__(self) -> int: - # custom hash function - return hash(repr(self.serialize())) - - def __eq__(self, other) -> bool: - # __eq__ should be defined for consistency with __hash__ - return isinstance(other, GroupBySig) and self.serialize() == other.serialize() - - def get_agg_field_names(self) -> list[str]: - ops = [] - for i in range(0, len(self.agg_fields)): - ops.append(self.agg_funcs[i] + "(" + self.agg_fields[i] + ")") - return ops - - # TODO: output schema needs to account for input schema types and create new output schema types - def output_schema(self) -> type[BaseModel]: - # the output class varies depending on the group by, so here - # we dynamically construct this output - fields = [] - for g in self.group_by_fields: - f = {"name": g, "type": Any, "desc": f"Group by field: {g}"} - fields.append(f) - - ops = self.get_agg_field_names() - for op in ops: - f = {"name": op, "type": Any, "desc": f"Aggregate field: {op}"} - 
fields.append(f) - - return create_schema_from_fields(fields) diff --git a/src/palimpzest/query/operators/aggregate.py b/src/palimpzest/query/operators/aggregate.py index 244233df4..411ca0091 100644 --- a/src/palimpzest/query/operators/aggregate.py +++ b/src/palimpzest/query/operators/aggregate.py @@ -13,7 +13,6 @@ Model, PromptStrategy, ) -from palimpzest.core.elements.groupbysig import GroupBySig from palimpzest.core.elements.records import DataRecord, DataRecordSet from palimpzest.core.lib.schemas import Average, Count, Max, Min, Sum from palimpzest.core.models import OperatorCostEstimates, RecordOpStats @@ -40,44 +39,39 @@ class ApplyGroupByOp(AggregateOp): 1. Legacy: group_by_sig parameter containing fields and functions 2. New: gby_fields, agg_fields, agg_funcs parameters directly """ - def __init__(self, group_by_sig: GroupBySig = None, gby_fields: list[str] = None, + def __init__(self, gby_fields: list[str] = None, agg_fields: list[str] = None, agg_funcs: list[str] = None, *args, **kwargs): super().__init__(*args, **kwargs) - # Support both old API (group_by_sig) and new API (individual fields) - if group_by_sig is not None: - # Legacy API: use group_by_sig - self.group_by_sig = group_by_sig - self.gby_fields = group_by_sig.group_by_fields - self.agg_fields = group_by_sig.agg_fields - self.agg_funcs = group_by_sig.agg_funcs - elif gby_fields is not None and agg_fields is not None and agg_funcs is not None: - # New API: construct group_by_sig from individual fields - self.gby_fields = gby_fields - self.agg_fields = agg_fields - self.agg_funcs = agg_funcs - # Create a GroupBySig for backwards compatibility with existing code - from palimpzest.core.elements.groupbysig import GroupBySig - self.group_by_sig = GroupBySig( - group_by_fields=gby_fields, - agg_fields=agg_fields, - agg_funcs=agg_funcs - ) - else: - raise ValueError("Either group_by_sig or (gby_fields, agg_fields, agg_funcs) must be provided") + # New API: construct group_by_sig from individual fields 
+ self.gby_fields = gby_fields + self.agg_fields = agg_fields + self.agg_funcs = agg_funcs def __str__(self): op = super().__str__() - op += f" Group-by Signature: {str(self.group_by_sig)}\n" + op += f" Group-by Fields: {self.gby_fields}\n" + op += f" Agg. Fields: {self.agg_fields}\n" + op += f" Agg. Funcs: {self.agg_funcs}\n" return op def get_id_params(self): id_params = super().get_id_params() - return {"group_by_sig": str(self.group_by_sig.serialize()), **id_params} + return { + "gby_fields": self.gby_fields, + "agg_fields": self.agg_fields, + "agg_funcs": self.agg_funcs, + **id_params + } def get_op_params(self): op_params = super().get_op_params() - return {"group_by_sig": self.group_by_sig, **op_params} + return { + "gby_fields": self.gby_fields, + "agg_fields": self.agg_fields, + "agg_funcs": self.agg_funcs, + **op_params + } def naive_cost_estimates(self, source_op_cost_estimates: OperatorCostEstimates) -> OperatorCostEstimates: # for now, assume applying the groupby takes negligible additional time (and no cost in USD) @@ -154,28 +148,29 @@ def __call__(self, candidates: list[DataRecord]) -> DataRecordSet: agg_state = {} for candidate in candidates: group = () - for f in self.group_by_sig.group_by_fields: - if not hasattr(candidate, f): - raise TypeError(f"ApplyGroupByOp record missing expected field {f}") + for f in self.gby_fields: + # if not hasattr(candidate, f): + # raise TypeError(f"ApplyGroupByOp record missing expected field {f}") group = group + (getattr(candidate, f),) if group in agg_state: state = agg_state[group] else: state = [] - for fun in self.group_by_sig.agg_funcs: + for fun in self.agg_funcs: state.append(ApplyGroupByOp.agg_init(fun)) - for i in range(0, len(self.group_by_sig.agg_funcs)): - fun = self.group_by_sig.agg_funcs[i] - if not hasattr(candidate, self.group_by_sig.agg_fields[i]): - raise TypeError(f"ApplyGroupByOp record missing expected field {self.group_by_sig.agg_fields[i]}") - field = getattr(candidate, 
self.group_by_sig.agg_fields[i]) + for i in range(0, len(self.agg_funcs)): + fun = self.agg_funcs[i] + # if not hasattr(candidate, self.agg_fields[i]): + # raise TypeError(f"ApplyGroupByOp record missing expected field {self.agg_fields[i]}") + field = getattr(candidate, self.agg_fields[i]) state[i] = ApplyGroupByOp.agg_merge(fun, state[i], field) agg_state[group] = state # return list of data records (one per group) drs: list[DataRecord] = [] - group_by_fields = self.group_by_sig.group_by_fields - agg_fields = self.group_by_sig.get_agg_field_names() + group_by_fields = self.gby_fields + # Construct aggregation field names: "func(field)" + agg_field_names = [f"{func}({field})" for func, field in zip(self.agg_funcs, self.agg_fields)] for g in agg_state: # build up data item data_item = {} @@ -184,11 +179,11 @@ def __call__(self, candidates: list[DataRecord]) -> DataRecordSet: data_item[group_by_fields[i]] = k vals = agg_state[g] for i in range(0, len(vals)): - v = ApplyGroupByOp.agg_final(self.group_by_sig.agg_funcs[i], vals[i]) - data_item[agg_fields[i]] = v + v = ApplyGroupByOp.agg_final(self.agg_funcs[i], vals[i]) + data_item[agg_field_names[i]] = v # create new DataRecord - schema = self.group_by_sig.output_schema() + schema = self.output_schema data_item = schema(**data_item) dr = DataRecord.from_agg_parents(data_item, parent_records=candidates) drs.append(dr) diff --git a/src/palimpzest/query/operators/logical.py b/src/palimpzest/query/operators/logical.py index 36f6cd84c..ad4874598 100644 --- a/src/palimpzest/query/operators/logical.py +++ b/src/palimpzest/query/operators/logical.py @@ -8,7 +8,6 @@ from palimpzest.constants import AggFunc, Cardinality from palimpzest.core.data import context, dataset from palimpzest.core.elements.filters import Filter -from palimpzest.core.elements.groupbysig import GroupBySig from palimpzest.core.lib.schemas import Average, Count, Max, Min, Sum from palimpzest.utils.hash_helpers import hash_for_id @@ -381,7 +380,6 @@ def 
get_logical_op_params(self) -> dict: class GroupByAggregate(LogicalOperator): def __init__( self, - group_by_sig: GroupBySig | None = None, gby_fields: list[str] | None = None, agg_fields: list[str] | None = None, agg_funcs: list[str] | None = None, @@ -392,68 +390,40 @@ def __init__( if not self.input_schema: raise ValueError("GroupByAggregate requires an input schema") - # Support both old GroupBySig and new direct parameters - if group_by_sig is not None: - # Old API: using GroupBySig - (valid, error) = group_by_sig.validate_schema(self.input_schema) - if not valid: - raise TypeError(error) - self.group_by_sig = group_by_sig - self.gby_fields = group_by_sig.group_by_fields - self.agg_fields = group_by_sig.agg_fields - self.agg_funcs = group_by_sig.agg_funcs - else: - # New API: using direct parameters - if gby_fields is None or agg_fields is None or agg_funcs is None: - raise ValueError("Must provide either group_by_sig or all of (gby_fields, agg_fields, agg_funcs)") - - # Validate fields exist in input schema - for f in gby_fields: - if f not in self.input_schema.model_fields: - raise TypeError(f"Supplied schema has no field {f}") - for f in agg_fields: - if f not in self.input_schema.model_fields: - raise TypeError(f"Supplied schema has no field {f}") - - self.group_by_sig = None - self.gby_fields = gby_fields - self.agg_fields = agg_fields - self.agg_funcs = agg_funcs + # Validate that all required parameters are provided + if gby_fields is None or agg_fields is None or agg_funcs is None: + raise ValueError("Must provide all of (gby_fields, agg_fields, agg_funcs)") + + for f in agg_fields: + if f not in self.input_schema.model_fields: + raise TypeError(f"Supplied schema has no field {f}") + + self.gby_fields = gby_fields + self.agg_fields = agg_fields + self.agg_funcs = agg_funcs def __str__(self): - if self.group_by_sig is not None: - return f"GroupBy({self.group_by_sig.serialize()})" - else: - return f"GroupBy(gby_fields={self.gby_fields}, 
agg_fields={self.agg_fields}, agg_funcs={self.agg_funcs})" + return f"GroupBy(gby_fields={self.gby_fields}, agg_fields={self.agg_fields}, agg_funcs={self.agg_funcs})" def get_logical_id_params(self) -> dict: logical_id_params = super().get_logical_id_params() - if self.group_by_sig is not None: - logical_id_params = {"group_by_sig": self.group_by_sig, **logical_id_params} - else: - logical_id_params = { - "gby_fields": self.gby_fields, - "agg_fields": self.agg_fields, - "agg_funcs": self.agg_funcs, - **logical_id_params - } + logical_id_params = { + "gby_fields": self.gby_fields, + "agg_fields": self.agg_fields, + "agg_funcs": self.agg_funcs, + **logical_id_params, + } return logical_id_params def get_logical_op_params(self) -> dict: logical_op_params = super().get_logical_op_params() - if self.group_by_sig is not None: - logical_op_params = { - "group_by_sig": self.group_by_sig, - **logical_op_params, - } - else: - logical_op_params = { - "gby_fields": self.gby_fields, - "agg_fields": self.agg_fields, - "agg_funcs": self.agg_funcs, - **logical_op_params, - } + logical_op_params = { + "gby_fields": self.gby_fields, + "agg_fields": self.agg_fields, + "agg_funcs": self.agg_funcs, + **logical_op_params, + } return logical_op_params From 9351efc1efa47c1f7319d638aed9b7abc6970ae9 Mon Sep 17 00:00:00 2001 From: kepler11c <73941237+kepler11c@users.noreply.github.com> Date: Sun, 4 Jan 2026 19:26:15 +0530 Subject: [PATCH 05/28] restored field check in ApplyGroupByOp --- src/palimpzest/query/operators/aggregate.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/palimpzest/query/operators/aggregate.py b/src/palimpzest/query/operators/aggregate.py index 411ca0091..1500c2b11 100644 --- a/src/palimpzest/query/operators/aggregate.py +++ b/src/palimpzest/query/operators/aggregate.py @@ -149,8 +149,8 @@ def __call__(self, candidates: list[DataRecord]) -> DataRecordSet: for candidate in candidates: group = () for f in self.gby_fields: - # if not 
hasattr(candidate, f): - # raise TypeError(f"ApplyGroupByOp record missing expected field {f}") + if not hasattr(candidate, f): + raise TypeError(f"ApplyGroupByOp record missing expected field {f}") group = group + (getattr(candidate, f),) if group in agg_state: state = agg_state[group] @@ -160,8 +160,8 @@ def __call__(self, candidates: list[DataRecord]) -> DataRecordSet: state.append(ApplyGroupByOp.agg_init(fun)) for i in range(0, len(self.agg_funcs)): fun = self.agg_funcs[i] - # if not hasattr(candidate, self.agg_fields[i]): - # raise TypeError(f"ApplyGroupByOp record missing expected field {self.agg_fields[i]}") + if not hasattr(candidate, self.agg_fields[i]): + raise TypeError(f"ApplyGroupByOp record missing expected field {self.agg_fields[i]}") field = getattr(candidate, self.agg_fields[i]) state[i] = ApplyGroupByOp.agg_merge(fun, state[i], field) agg_state[group] = state From 8393b259e293db282089f4222ea258edf03a4d49 Mon Sep 17 00:00:00 2001 From: kepler11c <73941237+kepler11c@users.noreply.github.com> Date: Sun, 4 Jan 2026 21:21:50 +0530 Subject: [PATCH 06/28] Simplified aggregation logic in Semantic GroupBy's call --- src/palimpzest/query/operators/aggregate.py | 106 +++++++++----------- tests/pytest/product-reviews/review1.txt | 1 + tests/pytest/product-reviews/review2.txt | 1 + tests/pytest/product-reviews/review3.txt | 1 + 4 files changed, 52 insertions(+), 57 deletions(-) create mode 100644 tests/pytest/product-reviews/review1.txt create mode 100644 tests/pytest/product-reviews/review2.txt create mode 100644 tests/pytest/product-reviews/review3.txt diff --git a/src/palimpzest/query/operators/aggregate.py b/src/palimpzest/query/operators/aggregate.py index 1500c2b11..65e108a98 100644 --- a/src/palimpzest/query/operators/aggregate.py +++ b/src/palimpzest/query/operators/aggregate.py @@ -15,7 +15,7 @@ ) from palimpzest.core.elements.records import DataRecord, DataRecordSet from palimpzest.core.lib.schemas import Average, Count, Max, Min, Sum -from
palimpzest.core.models import OperatorCostEstimates, RecordOpStats +from palimpzest.core.models import OperatorCostEstimates, RecordOpStats, GenerationStats from palimpzest.query.generators.generators import Generator from palimpzest.query.operators.physical import PhysicalOperator @@ -633,7 +633,6 @@ def naive_cost_estimates(self, source_op_cost_estimates: OperatorCostEstimates) def __call__(self, candidates: list[DataRecord]) -> DataRecordSet: start_time = time.time() - # TODO: if candidates is an empty list, return an empty DataRecordSet if len(candidates) == 0: return DataRecordSet([], []) @@ -697,7 +696,7 @@ def __init__(self, gby_fields: list[str], agg_fields: list[str], agg_funcs: list self.gby_fields = gby_fields self.agg_fields = agg_fields self.agg_funcs = agg_funcs - self.model = model if model is not None else Model.GPT_4O_MINI + self.model = model self.prompt_strategy = prompt_strategy self.reasoning_effort = reasoning_effort @@ -795,57 +794,59 @@ def __call__(self, candidates: list[DataRecord]) -> DataRecordSet: return DataRecordSet([], []) # Use LLM to assign each record to a semantic group - group_assignments, phase1_stats = self._assign_groups_llm(candidates) + group_assignments, gen_stats = self._assign_groups_llm(candidates) - # Group candidates by their assigned group labels - groups = {} + # Group candidates by their assigned group labels and compute aggregations + # Using the same approach as ApplyGroupByOp but with LLM-determined groups + agg_state = {} for candidate, group_label in zip(candidates, group_assignments): - if group_label not in groups: - groups[group_label] = [] - groups[group_label].append(candidate) + # Use group_label as the group key (tuple with single element) + group = (group_label,) + + # Initialize aggregation state for new groups + if group not in agg_state: + state = [] + for fun in self.agg_funcs: + state.append(ApplyGroupByOp.agg_init(fun)) + else: + state = agg_state[group] + + # Merge values from this candidate 
into the aggregation state + for i in range(0, len(self.agg_funcs)): + fun = self.agg_funcs[i] + if not hasattr(candidate, self.agg_fields[i]): + raise TypeError(f"SemanticGroupByOp record missing expected field {self.agg_fields[i]}") + field = getattr(candidate, self.agg_fields[i]) + state[i] = ApplyGroupByOp.agg_merge(fun, state[i], field) + + agg_state[group] = state - # Apply aggregation functions to each group + # Create output DataRecords (one per group) drs = [] record_op_stats_lst = [] - total_cost = phase1_stats.cost_per_record * len(candidates) - for group_label, group_members in groups.items(): + for group_key in agg_state: # Build aggregated data item for this group data_item = {} - # Add group-by field value - data_item[self.gby_fields[0]] = group_label - - # Use LLM to compute aggregations for this group - # Use LLM to compute aggregations for this group - fields_to_generate = [agg_func.lower() for agg_func in self.agg_funcs] - fields = {field: self.output_schema.model_fields[field].annotation for field in fields_to_generate} - - # Construct generation kwargs - gen_kwargs = { - "project_cols": self.get_input_fields(), - "output_schema": self.output_schema, - "agg_instruction": f"Compute the following aggregations for group '{group_label}': {', '.join([f'{func}({field})' for func, field in zip(self.agg_funcs, self.agg_fields)])}" - } - - # Generate aggregation results - field_answers, _, agg_stats, _ = self.generator(group_members, fields, **gen_kwargs) - agg_results = {field: field_answers[field][0] for field in fields_to_generate} + # Add group-by field value (extract from tuple) + data_item[self.gby_fields[0]] = group_key[0] - # Add aggregation results to data_item - for agg_field, agg_func in zip(self.agg_fields, self.agg_funcs): + # Add aggregation results (using agg_final to compute final values) + vals = agg_state[group_key] + for i in range(0, len(vals)): + agg_func = self.agg_funcs[i] output_field_name = agg_func.lower() - 
data_item[output_field_name] = agg_results.get(output_field_name) - - # Accumulate aggregation costs - total_cost += agg_stats.cost_per_record * len(group_members) + v = ApplyGroupByOp.agg_final(agg_func, vals[i]) + data_item[output_field_name] = v # Create the DataRecord for this group data_item_obj = self.output_schema(**data_item) - dr = DataRecord.from_agg_parents(data_item_obj, parent_records=group_members) + dr = DataRecord.from_agg_parents(data_item_obj, parent_records=candidates) drs.append(dr) # Create RecordOpStats for this group + # Cost is from LLM group assignment only (aggregation is free) record_op_stats = RecordOpStats( record_id=dr._id, record_parent_ids=dr._parent_ids, @@ -854,18 +855,18 @@ def __call__(self, candidates: list[DataRecord]) -> DataRecordSet: full_op_id=self.get_full_op_id(), logical_op_id=self.logical_op_id or "semantic-groupby", op_name=self.op_name(), - time_per_record=(time.time() - start_time) / len(groups), - cost_per_record=total_cost / len(groups), + time_per_record=(time.time() - start_time) / len(agg_state), + cost_per_record=gen_stats.total_output_cost / len(agg_state), model_name=self.get_model_name(), input_fields=self.get_input_fields(), generated_fields=list(self.output_schema.model_fields.keys()), - total_input_tokens=phase1_stats.total_input_tokens, - total_output_tokens=phase1_stats.total_output_tokens, - total_input_cost=phase1_stats.total_input_cost, - total_output_cost=phase1_stats.total_output_cost, - llm_call_duration_secs=phase1_stats.llm_call_duration_secs, - fn_call_duration_secs=phase1_stats.fn_call_duration_secs, - total_llm_calls=phase1_stats.total_llm_calls, + total_input_tokens=gen_stats.total_input_tokens, + total_output_tokens=gen_stats.total_output_tokens, + total_input_cost=gen_stats.total_input_cost, + total_output_cost=gen_stats.total_output_cost, + llm_call_duration_secs=gen_stats.llm_call_duration_secs, + fn_call_duration_secs=gen_stats.fn_call_duration_secs, + 
total_llm_calls=gen_stats.total_llm_calls, op_details={k: str(v) for k, v in self.get_id_params().items()}, ) record_op_stats_lst.append(record_op_stats) @@ -890,7 +891,7 @@ def _assign_groups_llm(self, candidates: list[DataRecord]) -> tuple[list[str], a # Process candidates to extract group labels group_labels = [] - total_stats = None + total_stats = GenerationStats() # Get input fields once input_fields = self.get_input_fields() @@ -914,15 +915,6 @@ def _assign_groups_llm(self, candidates: list[DataRecord]) -> tuple[list[str], a group_labels.append(group_label) # Accumulate stats - if total_stats is None: - total_stats = gen_stats - else: - # Sum up the stats - total_stats.total_input_tokens += gen_stats.total_input_tokens - total_stats.total_output_tokens += gen_stats.total_output_tokens - total_stats.total_input_cost += gen_stats.total_input_cost - total_stats.total_output_cost += gen_stats.total_output_cost - total_stats.llm_call_duration_secs += gen_stats.llm_call_duration_secs - total_stats.total_llm_calls += gen_stats.total_llm_calls + total_stats += gen_stats return group_labels, total_stats \ No newline at end of file diff --git a/tests/pytest/product-reviews/review1.txt b/tests/pytest/product-reviews/review1.txt new file mode 100644 index 000000000..9532e00ba --- /dev/null +++ b/tests/pytest/product-reviews/review1.txt @@ -0,0 +1 @@ +Shipping took forever. I am very disappointed. 
\ No newline at end of file diff --git a/tests/pytest/product-reviews/review2.txt b/tests/pytest/product-reviews/review2.txt new file mode 100644 index 000000000..287e028d7 --- /dev/null +++ b/tests/pytest/product-reviews/review2.txt @@ -0,0 +1 @@ +The quality of the product is poor and it fell apart quickly \ No newline at end of file diff --git a/tests/pytest/product-reviews/review3.txt b/tests/pytest/product-reviews/review3.txt new file mode 100644 index 000000000..67fffaa58 --- /dev/null +++ b/tests/pytest/product-reviews/review3.txt @@ -0,0 +1 @@ +This is too small for my needs. \ No newline at end of file From fdecc4786f271c7f7482acc78e12ff73f625d1c0 Mon Sep 17 00:00:00 2001 From: kepler11c <73941237+kepler11c@users.noreply.github.com> Date: Sun, 4 Jan 2026 21:35:58 +0530 Subject: [PATCH 07/28] Added Implementation Rule for Semantic GroupBy --- src/palimpzest/query/optimizer/rules.py | 30 +++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/src/palimpzest/query/optimizer/rules.py b/src/palimpzest/query/optimizer/rules.py index a1861290d..fe0a3c828 100644 --- a/src/palimpzest/query/optimizer/rules.py +++ b/src/palimpzest/query/optimizer/rules.py @@ -1086,3 +1086,33 @@ def substitute(cls, logical_expression: LogicalExpression, **runtime_kwargs) -> logger.debug(f"Substituting BasicSubstitutionRule for {logical_expression}") physical_op_class = cls.LOGICAL_OP_CLASS_TO_PHYSICAL_OP_CLASS_MAP[logical_expression.operator.__class__] return cls._perform_substitution(logical_expression, physical_op_class, runtime_kwargs) + + +class SemanticGroupBy(ImplementationRule): + """ + Substitute a logical expression for a GroupBy with an llm physical implementation. 
+ """ + + @classmethod + def matches_pattern(cls, logical_expression: LogicalExpression) -> bool: + is_match = isinstance(logical_expression.operator, GroupByAggregate) and logical_expression.operator.group_by_fn is None + logger.debug(f"SemanticGroupBy matches_pattern: {is_match} for {logical_expression}") + return is_match + + @classmethod + def substitute(cls, logical_expression: LogicalExpression, **runtime_kwargs) -> set[PhysicalExpression]: + logger.debug(f"Substituting SemanticGroupBy for {logical_expression}") + + # create variable physical operator kwargs for each model which can implement this logical_expression + models = [model for model in runtime_kwargs["available_models"] if cls._model_matches_input(model, logical_expression)] + no_reasoning = runtime_kwargs["reasoning_effort"] in [None, "minimal", "low"] + variable_op_kwargs = [ + { + "model": model, + "prompt_strategy": PromptStrategy.GROUP_BY_NO_REASONING if model.is_reasoning_model() and no_reasoning else PromptStrategy.GROUP_BY, + "reasoning_effort": runtime_kwargs["reasoning_effort"] + } + for model in models + ] + + return cls._perform_substitution(logical_expression, SemanticGroupByOp, runtime_kwargs, variable_op_kwargs) \ No newline at end of file From 12ba5f1e724c75c2665256b20ea889b4db4e8c67 Mon Sep 17 00:00:00 2001 From: kepler11c <73941237+kepler11c@users.noreply.github.com> Date: Sun, 4 Jan 2026 22:01:36 +0530 Subject: [PATCH 08/28] Updated implementation rule and added distinction between semantic and non-semantic groupbys --- src/palimpzest/core/data/dataset.py | 1 + src/palimpzest/query/operators/logical.py | 1 + src/palimpzest/query/optimizer/rules.py | 2 +- 3 files changed, 3 insertions(+), 1 deletion(-) diff --git a/src/palimpzest/core/data/dataset.py b/src/palimpzest/core/data/dataset.py index 5b2ff60a9..719352237 100644 --- a/src/palimpzest/core/data/dataset.py +++ b/src/palimpzest/core/data/dataset.py @@ -628,6 +628,7 @@ def sem_groupby(self, gby_fields: list[str], agg_fields: 
list[str], agg_funcs: l # Create logical operator with direct parameters (no GroupBySig) operator = GroupByAggregate( input_schema=self.schema, + is_semantic=True, output_schema=output_schema, gby_fields=gby_fields, agg_fields=agg_fields, diff --git a/src/palimpzest/query/operators/logical.py b/src/palimpzest/query/operators/logical.py index ad4874598..dc0339089 100644 --- a/src/palimpzest/query/operators/logical.py +++ b/src/palimpzest/query/operators/logical.py @@ -380,6 +380,7 @@ def get_logical_op_params(self) -> dict: class GroupByAggregate(LogicalOperator): def __init__( self, + is_semantic: bool = False, gby_fields: list[str] | None = None, agg_fields: list[str] | None = None, agg_funcs: list[str] | None = None, diff --git a/src/palimpzest/query/optimizer/rules.py b/src/palimpzest/query/optimizer/rules.py index fe0a3c828..991a11344 100644 --- a/src/palimpzest/query/optimizer/rules.py +++ b/src/palimpzest/query/optimizer/rules.py @@ -1095,7 +1095,7 @@ class SemanticGroupBy(ImplementationRule): @classmethod def matches_pattern(cls, logical_expression: LogicalExpression) -> bool: - is_match = isinstance(logical_expression.operator, GroupByAggregate) and logical_expression.operator.group_by_fn is None + is_match = isinstance(logical_expression.operator, GroupByAggregate) and logical_expression.operator.is_semantic == True logger.debug(f"SemanticGroupBy matches_pattern: {is_match} for {logical_expression}") return is_match From dd9dd0bccc801759ba9f4c8fde2f1ba76bcac76b Mon Sep 17 00:00:00 2001 From: kepler11c <73941237+kepler11c@users.noreply.github.com> Date: Tue, 6 Jan 2026 17:18:00 +0530 Subject: [PATCH 09/28] New Implementation Rule for Non Semantic GroupBys --- src/palimpzest/query/operators/logical.py | 3 +++ src/palimpzest/query/optimizer/__init__.py | 12 ++++++++++-- src/palimpzest/query/optimizer/rules.py | 19 ++++++++++++++++++- 3 files changed, 31 insertions(+), 3 deletions(-) diff --git a/src/palimpzest/query/operators/logical.py 
b/src/palimpzest/query/operators/logical.py index dc0339089..f9008b2d6 100644 --- a/src/palimpzest/query/operators/logical.py +++ b/src/palimpzest/query/operators/logical.py @@ -399,6 +399,7 @@ def __init__( if f not in self.input_schema.model_fields: raise TypeError(f"Supplied schema has no field {f}") + self.is_semantic = is_semantic self.gby_fields = gby_fields self.agg_fields = agg_fields self.agg_funcs = agg_funcs @@ -409,6 +410,7 @@ def __str__(self): def get_logical_id_params(self) -> dict: logical_id_params = super().get_logical_id_params() logical_id_params = { + "is_semantic": self.is_semantic, "gby_fields": self.gby_fields, "agg_fields": self.agg_fields, "agg_funcs": self.agg_funcs, @@ -420,6 +422,7 @@ def get_logical_id_params(self) -> dict: def get_logical_op_params(self) -> dict: logical_op_params = super().get_logical_op_params() logical_op_params = { + "is_semantic": self.is_semantic, "gby_fields": self.gby_fields, "agg_fields": self.agg_fields, "agg_funcs": self.agg_funcs, diff --git a/src/palimpzest/query/optimizer/__init__.py b/src/palimpzest/query/optimizer/__init__.py index 880806003..cc894b5e3 100644 --- a/src/palimpzest/query/optimizer/__init__.py +++ b/src/palimpzest/query/optimizer/__init__.py @@ -32,6 +32,9 @@ from palimpzest.query.optimizer.rules import ( NonLLMFilterRule as _NonLLMFilterRule, ) +from palimpzest.query.optimizer.rules import ( + NonSemanticGroupBy as _NonSemanticGroupBy, +) from palimpzest.query.optimizer.rules import ( PushDownFilter as _PushDownFilter, ) @@ -50,6 +53,9 @@ from palimpzest.query.optimizer.rules import ( SemanticAggregateRule as _SemanticAggregateRule, ) +from palimpzest.query.optimizer.rules import ( + SemanticGroupBy as _SemanticGroupBy, +) from palimpzest.query.optimizer.rules import ( SplitRule as _SplitRule, ) @@ -73,14 +79,16 @@ _MixtureOfAgentsRule, _NonLLMConvertRule, _NonLLMFilterRule, + _NonSemanticGroupBy, _PushDownFilter, _RAGRule, _RelationalJoinRule, _ReorderConverts, - _TopKRule, - _Rule, 
_SemanticAggregateRule, + _SemanticGroupBy, _SplitRule, + _TopKRule, + _Rule, _TransformationRule, ] diff --git a/src/palimpzest/query/optimizer/rules.py b/src/palimpzest/query/optimizer/rules.py index 991a11344..c84954670 100644 --- a/src/palimpzest/query/optimizer/rules.py +++ b/src/palimpzest/query/optimizer/rules.py @@ -19,6 +19,7 @@ MaxAggregateOp, MinAggregateOp, SemanticAggregate, + SemanticGroupByOp, SumAggregateOp, ) from palimpzest.query.operators.compute import SmolAgentsCompute @@ -1071,7 +1072,6 @@ class BasicSubstitutionRule(ImplementationRule): Distinct: DistinctOp, LimitScan: LimitScanOp, Project: ProjectOp, - GroupByAggregate: ApplyGroupByOp, } @classmethod @@ -1088,6 +1088,23 @@ def substitute(cls, logical_expression: LogicalExpression, **runtime_kwargs) -> return cls._perform_substitution(logical_expression, physical_op_class, runtime_kwargs) +class NonSemanticGroupBy(ImplementationRule): + """ + Substitute a logical expression for a non-semantic GroupBy with ApplyGroupByOp. + """ + + @classmethod + def matches_pattern(cls, logical_expression: LogicalExpression) -> bool: + is_match = isinstance(logical_expression.operator, GroupByAggregate) and logical_expression.operator.is_semantic == False + logger.debug(f"NonSemanticGroupBy matches_pattern: {is_match} for {logical_expression}") + return is_match + + @classmethod + def substitute(cls, logical_expression: LogicalExpression, **runtime_kwargs) -> set[PhysicalExpression]: + logger.debug(f"Substituting NonSemanticGroupBy for {logical_expression}") + return cls._perform_substitution(logical_expression, ApplyGroupByOp, runtime_kwargs) + + class SemanticGroupBy(ImplementationRule): """ Substitute a logical expression for a GroupBy with an llm physical implementation. 
From ebe125d203562894d9ddd0766f2fbbb0815d92fb Mon Sep 17 00:00:00 2001 From: kepler11c <73941237+kepler11c@users.noreply.github.com> Date: Tue, 6 Jan 2026 17:20:38 +0530 Subject: [PATCH 10/28] Deleted get_fields_to_generate from SemanticGroupByOp --- src/palimpzest/query/operators/aggregate.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/src/palimpzest/query/operators/aggregate.py b/src/palimpzest/query/operators/aggregate.py index 65e108a98..897ca857c 100644 --- a/src/palimpzest/query/operators/aggregate.py +++ b/src/palimpzest/query/operators/aggregate.py @@ -738,13 +738,6 @@ def get_op_params(self): def get_model_name(self) -> str: return self.model.value - - def get_fields_to_generate(self, candidate: DataRecord) -> list[str]: - """ - For aggregation operators, we need to generate ALL output fields (including group-by fields), - not just the new fields. This overrides the default behavior. - """ - return list(self.output_schema.model_fields.keys()) def naive_cost_estimates(self, source_op_cost_estimates: OperatorCostEstimates) -> OperatorCostEstimates: """ From ba1ec682ae6a372a1d7da8e65022b8392caf19ba Mon Sep 17 00:00:00 2001 From: kepler11c <73941237+kepler11c@users.noreply.github.com> Date: Tue, 6 Jan 2026 17:52:20 +0530 Subject: [PATCH 11/28] updated prompt strategy in SemanticGroupBy's implementation rule --- src/palimpzest/query/optimizer/rules.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/palimpzest/query/optimizer/rules.py b/src/palimpzest/query/optimizer/rules.py index c84954670..15efc523e 100644 --- a/src/palimpzest/query/optimizer/rules.py +++ b/src/palimpzest/query/optimizer/rules.py @@ -1126,7 +1126,7 @@ def substitute(cls, logical_expression: LogicalExpression, **runtime_kwargs) -> variable_op_kwargs = [ { "model": model, - "prompt_strategy": PromptStrategy.GROUP_BY_NO_REASONING if model.is_reasoning_model() and no_reasoning else PromptStrategy.GROUP_BY, + "prompt_strategy": PromptStrategy.AGG_NO_REASONING if 
model.is_reasoning_model() and no_reasoning else PromptStrategy.AGG, "reasoning_effort": runtime_kwargs["reasoning_effort"] } for model in models From c45312b2b5516ad975bb1b25ec51b3fe71facb25 Mon Sep 17 00:00:00 2001 From: kepler11c <73941237+kepler11c@users.noreply.github.com> Date: Tue, 6 Jan 2026 17:59:10 +0530 Subject: [PATCH 12/28] SemanticGroupByOp's call uses output_schema to set output_field_names --- src/palimpzest/query/operators/aggregate.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/palimpzest/query/operators/aggregate.py b/src/palimpzest/query/operators/aggregate.py index 897ca857c..155ca211e 100644 --- a/src/palimpzest/query/operators/aggregate.py +++ b/src/palimpzest/query/operators/aggregate.py @@ -818,6 +818,9 @@ def __call__(self, candidates: list[DataRecord]) -> DataRecordSet: drs = [] record_op_stats_lst = [] + # Get the output field names from the output schema + output_field_names = [f for f in self.output_schema.model_fields.keys() if f not in self.gby_fields] + for group_key in agg_state: # Build aggregated data item for this group data_item = {} @@ -829,7 +832,7 @@ def __call__(self, candidates: list[DataRecord]) -> DataRecordSet: vals = agg_state[group_key] for i in range(0, len(vals)): agg_func = self.agg_funcs[i] - output_field_name = agg_func.lower() + output_field_name = output_field_names[i] v = ApplyGroupByOp.agg_final(agg_func, vals[i]) data_item[output_field_name] = v From d6ba70d74983f16ba12cab5f210b623ade5cdadb Mon Sep 17 00:00:00 2001 From: kepler11c <73941237+kepler11c@users.noreply.github.com> Date: Tue, 6 Jan 2026 18:06:21 +0530 Subject: [PATCH 13/28] updated schema initialization in test_semantic_groupby --- tests/pytest/test_semantic_groupby.py | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/tests/pytest/test_semantic_groupby.py b/tests/pytest/test_semantic_groupby.py index 18f4b8345..02584bdec 100644 --- a/tests/pytest/test_semantic_groupby.py +++ 
b/tests/pytest/test_semantic_groupby.py @@ -23,7 +23,6 @@ def test_semantic_groupby_basic(): try: # Create list of candidates from text file dataset with schema ds = pz.TextFileDataset(id="reviews", path="product-reviews/") - ds = ds.sem_map(review_cols) # Add schema to extract complaint types output = ds.run() candidates = [dr for dr in output] @@ -34,11 +33,25 @@ def test_semantic_groupby_basic(): input_schema = candidates[0].schema if candidates else None # Create output schema (group by field + count) + # Using the same naming convention as Dataset.sem_groupby() from palimpzest.core.lib.schemas import create_schema_from_fields - output_schema = create_schema_from_fields([ - {"name": "complaint", "type": str, "desc": "The complaint type"}, - {"name": "count", "type": int, "desc": "Count of reviews in this group"} - ]) + from typing import Any + + fields = [] + # Add group by fields to output schema + for g in ['complaint']: + f = {"name": g, "type": Any, "desc": f"Group by field: {g}"} + fields.append(f) + + # Add aggregation fields to output schema + agg_fields_list = ['contents'] + agg_funcs_list = ['count'] + for i, agg_func in enumerate(agg_funcs_list): + agg_field_name = f"{agg_func}({agg_fields_list[i]})" + f = {"name": agg_field_name, "type": Any, "desc": f"Aggregate field: {agg_field_name}"} + fields.append(f) + + output_schema = create_schema_from_fields(fields) # Create instance of the physical operator sem_group_by_op = SemanticGroupByOp( @@ -80,7 +93,6 @@ def test_semantic_groupby_via_dataset(): try: # Create dataset and add schema ds = pz.TextFileDataset(id="reviews", path="product-reviews/") - ds = ds.sem_map(review_cols) # Add schema to extract complaint types # Apply semantic group by operation ds = ds.sem_groupby( From b1d8861a3be50e68b3ea4d7ce80cd84eb893dd05 Mon Sep 17 00:00:00 2001 From: kepler11c <73941237+kepler11c@users.noreply.github.com> Date: Sun, 11 Jan 2026 12:01:37 +0800 Subject: [PATCH 14/28] updated total cost parameter --- 
src/palimpzest/query/operators/aggregate.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/palimpzest/query/operators/aggregate.py b/src/palimpzest/query/operators/aggregate.py index 155ca211e..b7c8c33aa 100644 --- a/src/palimpzest/query/operators/aggregate.py +++ b/src/palimpzest/query/operators/aggregate.py @@ -852,7 +852,7 @@ def __call__(self, candidates: list[DataRecord]) -> DataRecordSet: logical_op_id=self.logical_op_id or "semantic-groupby", op_name=self.op_name(), time_per_record=(time.time() - start_time) / len(agg_state), - cost_per_record=gen_stats.total_output_cost / len(agg_state), + cost_per_record=gen_stats.cost_per_record / len(agg_state), model_name=self.get_model_name(), input_fields=self.get_input_fields(), generated_fields=list(self.output_schema.model_fields.keys()), From ff2a5c3030d901620402c94f200404357054405e Mon Sep 17 00:00:00 2001 From: kepler11c <73941237+kepler11c@users.noreply.github.com> Date: Sun, 11 Jan 2026 12:07:00 +0800 Subject: [PATCH 15/28] Added output schema during groupByAggregate creation --- src/palimpzest/core/data/dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/palimpzest/core/data/dataset.py b/src/palimpzest/core/data/dataset.py index 719352237..83de59bc0 100644 --- a/src/palimpzest/core/data/dataset.py +++ b/src/palimpzest/core/data/dataset.py @@ -590,7 +590,7 @@ def groupby(self, gby_fields, agg_fields, agg_funcs) -> Dataset: fields.append(f) output_schema = create_schema_from_fields(fields) - operator = GroupByAggregate(input_schema=self.schema, gby_fields=gby_fields, agg_fields=agg_fields, agg_funcs=agg_funcs) + operator = GroupByAggregate(input_schema=self.schema, output_schema=output_schema, gby_fields=gby_fields, agg_fields=agg_fields, agg_funcs=agg_funcs) return Dataset(sources=[self], operator=operator, schema=output_schema) def sem_groupby(self, gby_fields: list[str], agg_fields: list[str], agg_funcs: list[str]) -> Dataset: From 
142be5bf701d9644b7ae46219d76ffbb965cd02b Mon Sep 17 00:00:00 2001 From: kepler11c <73941237+kepler11c@users.noreply.github.com> Date: Sun, 11 Jan 2026 12:12:44 +0800 Subject: [PATCH 16/28] Created schema from fields helper for groupBy functions --- src/palimpzest/core/data/dataset.py | 38 +++-------------------------- src/palimpzest/core/lib/schemas.py | 17 ++++++++++++- 2 files changed, 19 insertions(+), 36 deletions(-) diff --git a/src/palimpzest/core/data/dataset.py b/src/palimpzest/core/data/dataset.py index 83de59bc0..99b084f71 100644 --- a/src/palimpzest/core/data/dataset.py +++ b/src/palimpzest/core/data/dataset.py @@ -9,7 +9,7 @@ from palimpzest.constants import AggFunc, Cardinality from palimpzest.core.elements.filters import Filter -from palimpzest.core.lib.schemas import create_schema_from_fields, project, relax_schema, union_schemas +from palimpzest.core.lib.schemas import create_schema_from_fields, create_groupby_schema_from_fields, project, relax_schema, union_schemas from palimpzest.policy import construct_policy_from_kwargs from palimpzest.query.operators.logical import ( Aggregate, @@ -573,23 +573,7 @@ def max(self) -> Dataset: def groupby(self, gby_fields, agg_fields, agg_funcs) -> Dataset: """Apply a group by operation to this dataset.""" - from typing import Any - - # Construct the output schema dynamically based on gby_fields and agg_funcs - fields = [] - - # Add group by fields to output schema - for g in gby_fields: - f = {"name": g, "type": Any, "desc": f"Group by field: {g}"} - fields.append(f) - - # Add aggregation fields to output schema - for i, agg_func in enumerate(agg_funcs): - agg_field_name = f"{agg_func}({agg_fields[i]})" - f = {"name": agg_field_name, "type": Any, "desc": f"Aggregate field: {agg_field_name}"} - fields.append(f) - - output_schema = create_schema_from_fields(fields) + output_schema = create_groupby_schema_from_fields(self.schema, gby_fields, agg_fields, agg_funcs) operator = 
GroupByAggregate(input_schema=self.schema, output_schema=output_schema, gby_fields=gby_fields, agg_fields=agg_fields, agg_funcs=agg_funcs) return Dataset(sources=[self], operator=operator, schema=output_schema) @@ -607,23 +591,7 @@ def sem_groupby(self, gby_fields: list[str], agg_fields: list[str], agg_funcs: l ds = pz.TextFileDataset(id="reviews", dir="product-reviews/") ds = ds.sem_groupby(gby_fields=['complaint'], agg_fields=['contents'], agg_funcs=['count']) """ - from typing import Any - - # Construct the output schema dynamically based on gby_fields and agg_funcs - fields = [] - - # Add group by fields to output schema - for g in gby_fields: - f = {"name": g, "type": Any, "desc": f"Group by field: {g}"} - fields.append(f) - - # Add aggregation fields to output schema - for i, agg_func in enumerate(agg_funcs): - agg_field_name = f"{agg_func}({agg_fields[i]})" - f = {"name": agg_field_name, "type": Any, "desc": f"Aggregate field: {agg_field_name}"} - fields.append(f) - - output_schema = create_schema_from_fields(fields) + output_schema = create_groupby_schema_from_fields(self.schema, gby_fields, agg_fields, agg_funcs) # Create logical operator with direct parameters (no GroupBySig) operator = GroupByAggregate( diff --git a/src/palimpzest/core/lib/schemas.py b/src/palimpzest/core/lib/schemas.py index f2df7743b..f9af60fe0 100644 --- a/src/palimpzest/core/lib/schemas.py +++ b/src/palimpzest/core/lib/schemas.py @@ -141,7 +141,22 @@ def create_schema_from_df(df: pd.DataFrame) -> type[BaseModel]: # create and return the new schema return _create_pickleable_model(fields) - +def create_groupby_schema_from_fields(gby_fields: list[str], agg_fields: list[str]): + # construct the output schema dynamically based on groupby and aggregate fields + fields = [] + + # add group by fields to output schema + for g in gby_fields: + f = {"name": g, "type": Any, "desc": f"Group by field: {g}"} + fields.append(f) + + # add aggregation fields to output schema + for agg_field in 
agg_fields: + f = {"name": agg_field, "type": Any, "desc": f"Aggregate field: {agg_field}"} + fields.append(f) + + return create_schema_from_fields(fields) + def union_schemas(models: list[type[BaseModel]], join: bool = False, on: list[str] | None = None) -> type[BaseModel]: """Union multiple Pydantic models into a single model.""" # convert on to empty list if None From 1f4d8706a4e269378c5140cce98ff92b675cfd10 Mon Sep 17 00:00:00 2001 From: kepler11c <73941237+kepler11c@users.noreply.github.com> Date: Sun, 11 Jan 2026 12:21:06 +0800 Subject: [PATCH 17/28] updated agg_field_name align with previous changes --- src/palimpzest/query/operators/aggregate.py | 2 +- tests/pytest/test_semantic_groupby.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/palimpzest/query/operators/aggregate.py b/src/palimpzest/query/operators/aggregate.py index b7c8c33aa..c28354290 100644 --- a/src/palimpzest/query/operators/aggregate.py +++ b/src/palimpzest/query/operators/aggregate.py @@ -170,7 +170,7 @@ def __call__(self, candidates: list[DataRecord]) -> DataRecordSet: drs: list[DataRecord] = [] group_by_fields = self.gby_fields # Construct aggregation field names: "func(field)" - agg_field_names = [f"{func}({field})" for func, field in zip(self.agg_funcs, self.agg_fields)] + agg_field_names = [f"({field})" for field in (self.agg_fields)] for g in agg_state: # build up data item data_item = {} diff --git a/tests/pytest/test_semantic_groupby.py b/tests/pytest/test_semantic_groupby.py index 02584bdec..2d84d63e8 100644 --- a/tests/pytest/test_semantic_groupby.py +++ b/tests/pytest/test_semantic_groupby.py @@ -47,7 +47,7 @@ def test_semantic_groupby_basic(): agg_fields_list = ['contents'] agg_funcs_list = ['count'] for i, agg_func in enumerate(agg_funcs_list): - agg_field_name = f"{agg_func}({agg_fields_list[i]})" + agg_field_name = f"({agg_fields_list[i]})" f = {"name": agg_field_name, "type": Any, "desc": f"Aggregate field: {agg_field_name}"} fields.append(f) From 
f9b4631ea409056b297332330062593a35d6c4ec Mon Sep 17 00:00:00 2001 From: kepler11c <73941237+kepler11c@users.noreply.github.com> Date: Sun, 11 Jan 2026 12:32:09 +0800 Subject: [PATCH 18/28] Updated input parameters in groupby schema to field helper --- src/palimpzest/core/data/dataset.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/palimpzest/core/data/dataset.py b/src/palimpzest/core/data/dataset.py index 99b084f71..bdb4cecce 100644 --- a/src/palimpzest/core/data/dataset.py +++ b/src/palimpzest/core/data/dataset.py @@ -573,7 +573,7 @@ def max(self) -> Dataset: def groupby(self, gby_fields, agg_fields, agg_funcs) -> Dataset: """Apply a group by operation to this dataset.""" - output_schema = create_groupby_schema_from_fields(self.schema, gby_fields, agg_fields, agg_funcs) + output_schema = create_groupby_schema_from_fields(gby_fields, agg_fields) operator = GroupByAggregate(input_schema=self.schema, output_schema=output_schema, gby_fields=gby_fields, agg_fields=agg_fields, agg_funcs=agg_funcs) return Dataset(sources=[self], operator=operator, schema=output_schema) @@ -591,7 +591,7 @@ def sem_groupby(self, gby_fields: list[str], agg_fields: list[str], agg_funcs: l ds = pz.TextFileDataset(id="reviews", dir="product-reviews/") ds = ds.sem_groupby(gby_fields=['complaint'], agg_fields=['contents'], agg_funcs=['count']) """ - output_schema = create_groupby_schema_from_fields(self.schema, gby_fields, agg_fields, agg_funcs) + output_schema = create_groupby_schema_from_fields(gby_fields, agg_fields) # Create logical operator with direct parameters (no GroupBySig) operator = GroupByAggregate( From 1fe6063ac904460dea6bb1de91641dbfde8ba639 Mon Sep 17 00:00:00 2001 From: Matthew Russo Date: Tue, 13 Jan 2026 17:56:39 -0500 Subject: [PATCH 19/28] minor --- src/palimpzest/query/operators/aggregate.py | 8 +- .../{ => data}/product-reviews/review1.txt | 0 .../{ => data}/product-reviews/review2.txt | 0 .../{ => data}/product-reviews/review3.txt | 0 
tests/pytest/test_semantic_groupby.py | 190 +++++++----------- 5 files changed, 81 insertions(+), 117 deletions(-) rename tests/pytest/{ => data}/product-reviews/review1.txt (100%) rename tests/pytest/{ => data}/product-reviews/review2.txt (100%) rename tests/pytest/{ => data}/product-reviews/review3.txt (100%) diff --git a/src/palimpzest/query/operators/aggregate.py b/src/palimpzest/query/operators/aggregate.py index c28354290..c31647fd1 100644 --- a/src/palimpzest/query/operators/aggregate.py +++ b/src/palimpzest/query/operators/aggregate.py @@ -169,8 +169,6 @@ def __call__(self, candidates: list[DataRecord]) -> DataRecordSet: # return list of data records (one per group) drs: list[DataRecord] = [] group_by_fields = self.gby_fields - # Construct aggregation field names: "func(field)" - agg_field_names = [f"({field})" for field in (self.agg_fields)] for g in agg_state: # build up data item data_item = {} @@ -180,7 +178,7 @@ def __call__(self, candidates: list[DataRecord]) -> DataRecordSet: vals = agg_state[g] for i in range(0, len(vals)): v = ApplyGroupByOp.agg_final(self.agg_funcs[i], vals[i]) - data_item[agg_field_names[i]] = v + data_item[self.agg_fields[i]] = v # create new DataRecord schema = self.output_schema @@ -819,7 +817,7 @@ def __call__(self, candidates: list[DataRecord]) -> DataRecordSet: record_op_stats_lst = [] # Get the output field names from the output schema - output_field_names = [f for f in self.output_schema.model_fields.keys() if f not in self.gby_fields] + output_field_names = [f for f in self.output_schema.model_fields if f not in self.gby_fields] for group_key in agg_state: # Build aggregated data item for this group @@ -869,7 +867,7 @@ def __call__(self, candidates: list[DataRecord]) -> DataRecordSet: return DataRecordSet(drs, record_op_stats_lst) - def _assign_groups_llm(self, candidates: list[DataRecord]) -> tuple[list[str], any]: + def _assign_groups_llm(self, candidates: list[DataRecord]) -> tuple[list[str], GenerationStats]: """ 
Phase 1: Use LLM to assign each candidate to a semantic group. diff --git a/tests/pytest/product-reviews/review1.txt b/tests/pytest/data/product-reviews/review1.txt similarity index 100% rename from tests/pytest/product-reviews/review1.txt rename to tests/pytest/data/product-reviews/review1.txt diff --git a/tests/pytest/product-reviews/review2.txt b/tests/pytest/data/product-reviews/review2.txt similarity index 100% rename from tests/pytest/product-reviews/review2.txt rename to tests/pytest/data/product-reviews/review2.txt diff --git a/tests/pytest/product-reviews/review3.txt b/tests/pytest/data/product-reviews/review3.txt similarity index 100% rename from tests/pytest/product-reviews/review3.txt rename to tests/pytest/data/product-reviews/review3.txt diff --git a/tests/pytest/test_semantic_groupby.py b/tests/pytest/test_semantic_groupby.py index 2d84d63e8..9660213f4 100644 --- a/tests/pytest/test_semantic_groupby.py +++ b/tests/pytest/test_semantic_groupby.py @@ -7,9 +7,10 @@ """ import pandas as pd + import palimpzest as pz -from palimpzest.query.operators.aggregate import SemanticGroupByOp from palimpzest.constants import Model +from palimpzest.query.operators.aggregate import SemanticGroupByOp # Define columns for the review schema review_cols = [ @@ -18,122 +19,87 @@ def test_semantic_groupby_basic(): """Test basic semantic group by functionality using the physical operator directly.""" - print("Testing SemanticGroupByOp basic functionality...") + # Create list of candidates from text file dataset with schema + ds = pz.TextFileDataset(id="reviews", path="tests/pytest/data/product-reviews/") + output = ds.run() + candidates = [dr for dr in output] + + print(f"Loaded {len(candidates)} review candidates with schema") + print(f"Sample candidate fields: {list(candidates[0].to_dict().keys()) if candidates else 'none'}") - try: - # Create list of candidates from text file dataset with schema - ds = pz.TextFileDataset(id="reviews", path="product-reviews/") - output = 
ds.run() - candidates = [dr for dr in output] - - print(f"Loaded {len(candidates)} review candidates with schema") - print(f"Sample candidate fields: {list(candidates[0].to_dict().keys()) if candidates else 'none'}") - - # Get input schema from the candidates - input_schema = candidates[0].schema if candidates else None - - # Create output schema (group by field + count) - # Using the same naming convention as Dataset.sem_groupby() - from palimpzest.core.lib.schemas import create_schema_from_fields - from typing import Any - - fields = [] - # Add group by fields to output schema - for g in ['complaint']: - f = {"name": g, "type": Any, "desc": f"Group by field: {g}"} - fields.append(f) - - # Add aggregation fields to output schema - agg_fields_list = ['contents'] - agg_funcs_list = ['count'] - for i, agg_func in enumerate(agg_funcs_list): - agg_field_name = f"({agg_fields_list[i]})" - f = {"name": agg_field_name, "type": Any, "desc": f"Aggregate field: {agg_field_name}"} - fields.append(f) - - output_schema = create_schema_from_fields(fields) - - # Create instance of the physical operator - sem_group_by_op = SemanticGroupByOp( - gby_fields=['complaint'], - agg_fields=['contents'], - agg_funcs=['count'], - input_schema=input_schema, - output_schema=output_schema, - model=Model.GPT_4o_MINI, - logical_op_id="test_semantic_groupby", # Required for RecordOpStats - verbose=False - ) - - print(f"Created SemanticGroupByOp: {sem_group_by_op}") - - # Execute the group by operation - grouped_output = sem_group_by_op(candidates) - - # Convert to DataFrame and print - df = pd.DataFrame([dr.to_dict() for dr in grouped_output]) - print("\nGrouped Results:") - print(df) - print(f"\nTotal groups: {len(df)}") - # print(f"Total cost: ${grouped_output.stats.cost:.4f}") - # print(f"Total time: {grouped_output.stats.time:.2f}s") - - return True - - except Exception as e: - print(f"Error during test: {e}") - import traceback - traceback.print_exc() - return False + # Get input schema from 
the candidates + input_schema = candidates[0].schema if candidates else None + + # Create output schema (group by field + count) + # Using the same naming convention as Dataset.sem_groupby() + from typing import Any -def test_semantic_groupby_via_dataset(): - """Test semantic group by via Dataset API.""" - print("\nTesting sem_groupby via Dataset API...") + from palimpzest.core.lib.schemas import create_schema_from_fields + + # define the groupby and aggregate fields + gby_fields = ['complaint'] + agg_fields = ['contents'] + agg_funcs = ['count'] - try: - # Create dataset and add schema - ds = pz.TextFileDataset(id="reviews", path="product-reviews/") - - # Apply semantic group by operation - ds = ds.sem_groupby( - gby_fields=['complaint'], - agg_fields=['contents'], - agg_funcs=['count'] - ) - - # Run the query - output = ds.run() - - # Convert to DataFrame and print - df = output.to_df() - print("\nGrouped Results:") - print(df) - print(f"\nTotal groups: {len(df)}") - # print(f"Total cost: ${output.stats.cost:.4f}") - # print(f"Total time: {output.stats.time:.2f}s") - - return True - - except Exception as e: - print(f"Error during test: {e}") - import traceback - traceback.print_exc() - return False + fields = [] + # Add group by fields to output schema + for g in gby_fields: + f = {"name": g, "type": Any, "desc": f"Group by field: {g}"} + fields.append(f) + + # Add aggregation fields to output schema + for agg_field_name in agg_fields: + f = {"name": agg_field_name, "type": Any, "desc": f"Aggregate field: {agg_field_name}"} + fields.append(f) + + output_schema = create_schema_from_fields(fields) -if __name__ == "__main__": - print("=" * 80) - print("Semantic GroupBy Test Suite") - print("=" * 80) + # Create instance of the physical operator + sem_group_by_op = SemanticGroupByOp( + gby_fields=gby_fields, + agg_fields=agg_fields, + agg_funcs=agg_funcs, + input_schema=input_schema, + output_schema=output_schema, + model=Model.GPT_4o_MINI, + 
logical_op_id="test_semantic_groupby", # Required for RecordOpStats + verbose=False + ) + + print(f"Created SemanticGroupByOp: {sem_group_by_op}") + + # Execute the group by operation + grouped_output = sem_group_by_op(candidates) - print("\nRunning tests...\n") + # Convert to DataFrame and print + df = pd.DataFrame([dr.to_dict() for dr in grouped_output]) + print("\nGrouped Results:") + print(df) + print(f"\nTotal groups: {len(df)}") + # print(f"Total cost: ${grouped_output.stats.cost:.4f}") + # print(f"Total time: {grouped_output.stats.time:.2f}s") + + assert False + +def test_semantic_groupby_via_dataset(): + """Test semantic group by via Dataset API.""" + # Create dataset and add schema + ds = pz.TextFileDataset(id="reviews", path="tests/pytest/data/product-reviews/") - # Run tests - print("Test 1: Basic SemanticGroupByOp") - test_semantic_groupby_basic() + # Apply semantic group by operation + ds = ds.sem_groupby( + gby_fields=['complaint'], + agg_fields=['contents'], + agg_funcs=['count'] + ) - print("\n" + "=" * 80) - print("Test 2: Dataset.sem_groupby() API") - test_semantic_groupby_via_dataset() + # Run the query + output = ds.run() - print("\n" + "=" * 80) - print("All tests completed!") + # Convert to DataFrame and print + df = output.to_df() + print("\nGrouped Results:") + print(df) + print(f"\nTotal groups: {len(df)}") + # print(f"Total cost: ${output.stats.cost:.4f}") + # print(f"Total time: {output.stats.time:.2f}s") From 197564c2062d6114e053dc6685649557907a92b2 Mon Sep 17 00:00:00 2001 From: kepler11c <73941237+kepler11c@users.noreply.github.com> Date: Thu, 29 Jan 2026 15:56:39 +0530 Subject: [PATCH 20/28] formatted queries for wildlife, ecommerce and amazon reviews --- .../amazon reviews/amazon_1.py | 26 +++++++++++++ .../amazon reviews/amazon_2.py | 30 +++++++++++++++ .../ecommerce/ecommerce_1.py | 28 ++++++++++++++ .../ecommerce/ecommerce_2.py | 37 +++++++++++++++++++ .../wildlife/wildlife_1.py | 28 ++++++++++++++ .../wildlife/wildlife_2.py | 34 
+++++++++++++++++ .../wildlife/wildlife_3.py | 29 +++++++++++++++ 7 files changed, 212 insertions(+) create mode 100644 tests/semantic groupBy tests/ground truth results/amazon reviews/amazon_1.py create mode 100644 tests/semantic groupBy tests/ground truth results/amazon reviews/amazon_2.py create mode 100644 tests/semantic groupBy tests/ground truth results/ecommerce/ecommerce_1.py create mode 100644 tests/semantic groupBy tests/ground truth results/ecommerce/ecommerce_2.py create mode 100644 tests/semantic groupBy tests/ground truth results/wildlife/wildlife_1.py create mode 100644 tests/semantic groupBy tests/ground truth results/wildlife/wildlife_2.py create mode 100644 tests/semantic groupBy tests/ground truth results/wildlife/wildlife_3.py diff --git a/tests/semantic groupBy tests/ground truth results/amazon reviews/amazon_1.py b/tests/semantic groupBy tests/ground truth results/amazon reviews/amazon_1.py new file mode 100644 index 000000000..979936355 --- /dev/null +++ b/tests/semantic groupBy tests/ground truth results/amazon reviews/amazon_1.py @@ -0,0 +1,26 @@ +""" +Amazon Sales — Review Analysis + +Query NL: "Group by review type and return average cost of the products" + +group_cols: [LLM("reviewText")] +agg_cols: ["price"] +semantic group: yes (review type/sentiment inferred from review text) +semantic agg: no (average is a standard aggregate) +""" + +import pandas as pd + +df = pd.read_csv("amazon.csv") +# assume columns: productID, reviewText, price, reviewType (LLM inferred: positive/negative/neutral) + +# Group by review type and compute average price +result = ( + df + .groupby("reviewType") + .agg({"price": "mean"}) + .reset_index() + .rename(columns={"price": "avg_price"}) +) + +result.to_csv("amazon-review-type-avg-price.csv", index=False) \ No newline at end of file diff --git a/tests/semantic groupBy tests/ground truth results/amazon reviews/amazon_2.py b/tests/semantic groupBy tests/ground truth results/amazon reviews/amazon_2.py new file 
mode 100644 index 000000000..44cfea61e --- /dev/null +++ b/tests/semantic groupBy tests/ground truth results/amazon reviews/amazon_2.py @@ -0,0 +1,30 @@ +""" +Amazon Sales — Product Sentiment + +Query NL: "Group by user product review title" +Categories: +- Good overall +- Neutral +- Bad overall + +group_cols: [LLM("reviewTitle")] +agg_cols: ["productID"] +semantic group: yes (sentiment category inferred from review title) +semantic agg: no +""" + +import pandas as pd + +df = pd.read_csv("amazon_sales.csv") +# assume columns: productID, reviewTitle, sentimentCategory (LLM inferred: good_overall/good_with_negatives/bad_with_positives/bad_overall) + +# Group by sentiment category and count products +result = ( + df + .groupby("sentimentCategory") + .agg({"productID": "count"}) + .reset_index() + .rename(columns={"productID": "product_count"}) +) + +result.to_csv("amazon-sentiment-category-count.csv", index=False) diff --git a/tests/semantic groupBy tests/ground truth results/ecommerce/ecommerce_1.py b/tests/semantic groupBy tests/ground truth results/ecommerce/ecommerce_1.py new file mode 100644 index 000000000..7fa462024 --- /dev/null +++ b/tests/semantic groupBy tests/ground truth results/ecommerce/ecommerce_1.py @@ -0,0 +1,28 @@ +""" +E-Commerce — Color Analysis + +Query NL: "Group by color of images and return the count" + +group_cols: [LLM("imageFile")] +agg_cols: ["productID"] +semantic group: yes (color inferred from product image) +semantic agg: no +""" + +import pandas as pd + +df = pd.read_csv("ecommerce_products.csv") +# assume columns: productID, imageFile, productColor (LLM inferred from image) + +# Group by color and count products +result = ( + df + .groupby("baseColour") + .agg({"productID": "count"}) + .reset_index() + .rename(columns={"productID": "product_count"}) +) + +result.to_csv("ecommerce_1.csv", index=False) + +#TODO: join images.csv and styles.csv by productID to get imageFile and productColor \ No newline at end of file diff --git 
a/tests/semantic groupBy tests/ground truth results/ecommerce/ecommerce_2.py b/tests/semantic groupBy tests/ground truth results/ecommerce/ecommerce_2.py new file mode 100644 index 000000000..253a685fa --- /dev/null +++ b/tests/semantic groupBy tests/ground truth results/ecommerce/ecommerce_2.py @@ -0,0 +1,37 @@ +""" +E-Commerce — Brand Grouping + +Query NL: "Group by brand and by color return the ratio between topwear + (apparel and accessories that are worn above the waist) and + bottomwear (worn at and below the waist)" + +group_cols: ["color", LLM("productDisplayName, imageFile")] +agg_cols: [LLM("productDisplayName")] +semantic group: mixed (color is direct, brand inferred from display name and image) +semantic agg: yes (clothing category inferred from product name/image) +""" + +import pandas as pd + +def topwear_bottomwear_ratio(series): + topwear_count = (series == "topwear").sum() + bottomwear_count = (series == "bottomwear").sum() + if bottomwear_count == 0: + return float('inf') if topwear_count > 0 else 0 + return topwear_count / bottomwear_count + +df = pd.read_csv("ecommerce_products.csv") +# assume columns: productID, brand, productDisplayName, productColor (LLM inferred), clothingCategory (LLM inferred: topwear/bottomwear) + +# Group by brand and color, compute ratio +result = ( + df + .groupby(["brand", "baseColour"]) + .agg({"subCategory": topwear_bottomwear_ratio}) + .reset_index() + .rename(columns={"subCategory": "topwear_bottomwear_ratio"}) +) + +result.to_csv("ecommerce_2.csv", index=False) + +#TODO: augmenting the brand to styles.csv \ No newline at end of file diff --git a/tests/semantic groupBy tests/ground truth results/wildlife/wildlife_1.py b/tests/semantic groupBy tests/ground truth results/wildlife/wildlife_1.py new file mode 100644 index 000000000..8479a8faa --- /dev/null +++ b/tests/semantic groupBy tests/ground truth results/wildlife/wildlife_1.py @@ -0,0 +1,28 @@ +""" +Wildlife — Audio-to-Logic + +Query NL: "Group by animals that 
are carnivorous (from audio) and return the count for all such animals." + +group_cols: [LLM("audioFile")] +agg_cols: ["animalID"] +semantic group: yes (diet type inferred from audio) +semantic agg: no +""" + +import pandas as pd + +df = pd.read_csv("wildlife_audio.csv") +# assume columns: animalID, animalName, audioFile, dietType (LLM inferred from audio) + +# Filter by carnivorous animals (LLM output already materialized) +carnivorous_df = df[df["dietType"] == "carnivorous"] + +# Count the number of carnivorous animals +result = pd.DataFrame({ + "dietType": ["carnivorous"], + "animal_count": [len(carnivorous_df)] +}) + +result.to_csv("wildlife_1.csv", index=False) + +#TODO: Augment dietType to the dataset \ No newline at end of file diff --git a/tests/semantic groupBy tests/ground truth results/wildlife/wildlife_2.py b/tests/semantic groupBy tests/ground truth results/wildlife/wildlife_2.py new file mode 100644 index 000000000..7bcf63d80 --- /dev/null +++ b/tests/semantic groupBy tests/ground truth results/wildlife/wildlife_2.py @@ -0,0 +1,34 @@ +""" +Wildlife — Lat/Long Extraction + +Query NL: "Group by country (from the longitude and latitude). + Compute the count of {animal} for every country." 
+ +group_cols: [LLM("latitude", "longitude")] +agg_cols: [LLM("imageFile")] +semantic group: yes (country inferred from coordinates) +semantic agg: yes (animal type inferred from image) +""" + +import pandas as pd + +df = pd.read_csv("wildlife_location.csv") +# assume columns: animalID, latitude, longitude, imageFile, country (LLM inferred), animalType (LLM inferred from image) + +ANIMAL_TYPE = "lion" + +# Filter by animal type +filtered_df = df[df["animalType"] == ANIMAL_TYPE] + +# Group by country and animal type, count animals +result = ( + filtered_df + .groupby(["country", "animalType"]) + .agg({"animalID": "count"}) + .reset_index() + .rename(columns={"animalID": "animal_count"}) +) + +result.to_csv("wildlife_2.csv", index=False) + +#TODO: Augment country to the dataset \ No newline at end of file diff --git a/tests/semantic groupBy tests/ground truth results/wildlife/wildlife_3.py b/tests/semantic groupBy tests/ground truth results/wildlife/wildlife_3.py new file mode 100644 index 000000000..1bdb14b5a --- /dev/null +++ b/tests/semantic groupBy tests/ground truth results/wildlife/wildlife_3.py @@ -0,0 +1,29 @@ +""" +Wildlife — Average Age + +Query NL: "Group by small animals (from image) and return their average age." 
+Note: Small = an animal that weighs less than 50kg and has dimensions less than 1m + +group_cols: [LLM("imageFile")] +agg_cols: ["age"] +semantic group: yes (size category inferred from image, weight and dimensions) +semantic agg: no +""" + +import pandas as pd + +df = pd.read_csv("wildlife_detailed.csv") +# assume columns: animalID, imageFile, age, weight_kg, max_dimension_m, isSmall (LLM inferred: weight < 50kg AND dimension < 1m) + +# Filter by small animals (LLM output already materialized) +small_animals_df = df[df["isSmall"] == True] + +# Calculate average age +result = pd.DataFrame({ + "size_category": ["small"], + "avg_age": [small_animals_df["age"].mean()] +}) + +result.to_csv("wildlife_3.csv", index=False) + +# TODO: Augment size_category to the dataset \ No newline at end of file From c1dfee921a8eecd8c2410d23f8c703d7416bf554 Mon Sep 17 00:00:00 2001 From: kepler11c <73941237+kepler11c@users.noreply.github.com> Date: Thu, 29 Jan 2026 16:07:44 +0530 Subject: [PATCH 21/28] formatted queries - movies dataset --- .../ground truth results/movies/movies_1.py | 28 +++++++++++ .../ground truth results/movies/movies_2.py | 48 +++++++++++++++++++ 2 files changed, 76 insertions(+) create mode 100644 tests/semantic groupBy tests/ground truth results/movies/movies_1.py create mode 100644 tests/semantic groupBy tests/ground truth results/movies/movies_2.py diff --git a/tests/semantic groupBy tests/ground truth results/movies/movies_1.py b/tests/semantic groupBy tests/ground truth results/movies/movies_1.py new file mode 100644 index 000000000..8b0d099d5 --- /dev/null +++ b/tests/semantic groupBy tests/ground truth results/movies/movies_1.py @@ -0,0 +1,28 @@ +""" +Movies - Sentiment Analysis + +Query NL: "Group by criticName and compute the fraction of reviews with positive sentiment" +- group_cols: ["criticName"] +- agg_cols: [LLM("reviewText")] +- semantic group: no +- semantic agg: yes +""" + +import pandas as pd + +def frac_positive(series): + num_pos = (series == 
"POSITIVE").sum() + total = len(series) + return num_pos / total + +df = pd.read_csv("movie_reviews.csv") +# assume columns: criticName, reviewText, scoreSentiment + +result = ( + df + .groupby("criticName") + .agg({"scoreSentiment": frac_positive}) + .reset_index() +) + +result.to_csv("movies_1.csv", index=False) \ No newline at end of file diff --git a/tests/semantic groupBy tests/ground truth results/movies/movies_2.py b/tests/semantic groupBy tests/ground truth results/movies/movies_2.py new file mode 100644 index 000000000..35038d705 --- /dev/null +++ b/tests/semantic groupBy tests/ground truth results/movies/movies_2.py @@ -0,0 +1,48 @@ +""" +Movies — Templated Query + +Query NL: "Group by director and genre, and count movies with directed by {director} in {genre}." +Categories: +- Adventure +- Action +- Comedy +- Mystery/Crime +- Fantasy +- Horror +- Romance +- Sci-fi + +group_cols: [Director, LLM("Genre", "reviewText")] +agg_cols: [] +semantic group: mixed (director name is literal, genre inferred from movie metadata) +semantic agg: no +""" + +import pandas as pd + +# Parameters for the templated query +DIRECTOR = "Christopher Nolan" +GENRE = "Science Fiction" + +df = pd.read_csv("movies_reviews.csv") +# assume columns: Director, Genre, reviewText, scoreSentiment, movieTitle + +# Filter by director and genre +filtered_df = df[ + (df["Director"] == DIRECTOR) & + (df["Genre"] == GENRE) +] + +# Group by Director and Genre, count the number of movies +result = ( + filtered_df + .groupby(["Director", "Genre"]) + .agg({"movieTitle": "count"}) + .reset_index() + .rename(columns={"movieTitle": "movie_count"}) +) + +result.to_csv("movies_2.csv", index=False) + +# TODO: Augment genre to the dataset +# TODO: join the datasets From 094d14d456376821f901bc6aa894c4206da7d7c6 Mon Sep 17 00:00:00 2001 From: kepler11c <73941237+kepler11c@users.noreply.github.com> Date: Mon, 9 Feb 2026 11:40:59 -0500 Subject: [PATCH 22/28] PZ program for movies query 1 + added functionality 
to handle usd per audio token --- src/palimpzest/query/operators/aggregate.py | 52 ++++-- .../movies/movies_1_pz.py | 151 ++++++++++++++++++ 2 files changed, 192 insertions(+), 11 deletions(-) create mode 100644 tests/semantic groupBy tests/ground truth results/movies/movies_1_pz.py diff --git a/src/palimpzest/query/operators/aggregate.py b/src/palimpzest/query/operators/aggregate.py index c31647fd1..d91093d85 100644 --- a/src/palimpzest/query/operators/aggregate.py +++ b/src/palimpzest/query/operators/aggregate.py @@ -169,6 +169,8 @@ def __call__(self, candidates: list[DataRecord]) -> DataRecordSet: # return list of data records (one per group) drs: list[DataRecord] = [] group_by_fields = self.gby_fields + # Construct aggregation field names: "func(field)" + agg_field_names = [f"{field}" for field in self.agg_fields] for g in agg_state: # build up data item data_item = {} @@ -178,7 +180,7 @@ def __call__(self, candidates: list[DataRecord]) -> DataRecordSet: vals = agg_state[g] for i in range(0, len(vals)): v = ApplyGroupByOp.agg_final(self.agg_funcs[i], vals[i]) - data_item[self.agg_fields[i]] = v + data_item[agg_field_names[i]] = v # create new DataRecord schema = self.output_schema @@ -609,9 +611,17 @@ def naive_cost_estimates(self, source_op_cost_estimates: OperatorCostEstimates) model_conversion_time_per_record = MODEL_CARDS[model_name]["seconds_per_output_token"] * est_num_output_tokens # get est. 
of conversion cost (in USD) per record from model card - usd_per_input_token = MODEL_CARDS[model_name].get("usd_per_input_token") - if getattr(self, "prompt_strategy", None) is not None and self.prompt_strategy.is_audio_prompt(): + # Check for audio models first + if "usd_per_audio_input_token" in MODEL_CARDS[model_name]: usd_per_input_token = MODEL_CARDS[model_name]["usd_per_audio_input_token"] + else: + usd_per_input_token = MODEL_CARDS[model_name].get("usd_per_input_token") + + if usd_per_input_token is None: + raise ValueError( + f"Model '{model_name}' has usd_per_input_token=None in MODEL_CARDS. " + f"This model may not support cost estimation. Model card: {MODEL_CARDS[model_name]}" + ) model_conversion_usd_per_record = ( usd_per_input_token * est_num_input_tokens @@ -750,7 +760,18 @@ def naive_cost_estimates(self, source_op_cost_estimates: OperatorCostEstimates) model_conversion_time_per_record = MODEL_CARDS[model_name]["seconds_per_output_token"] * est_num_output_tokens # get est. of conversion cost (in USD) per record from model card - usd_per_input_token = MODEL_CARDS[model_name].get("usd_per_input_token") + # Check for audio models first + if "usd_per_audio_input_token" in MODEL_CARDS[model_name]: + usd_per_input_token = MODEL_CARDS[model_name]["usd_per_audio_input_token"] + else: + usd_per_input_token = MODEL_CARDS[model_name].get("usd_per_input_token") + + if usd_per_input_token is None: + raise ValueError( + f"Model '{model_name}' has usd_per_input_token=None in MODEL_CARDS. " + f"This model may not support cost estimation. 
Model card: {MODEL_CARDS[model_name]}" + ) + model_conversion_usd_per_record = ( usd_per_input_token * est_num_input_tokens + MODEL_CARDS[model_name]["usd_per_output_token"] * est_num_output_tokens @@ -817,7 +838,7 @@ def __call__(self, candidates: list[DataRecord]) -> DataRecordSet: record_op_stats_lst = [] # Get the output field names from the output schema - output_field_names = [f for f in self.output_schema.model_fields if f not in self.gby_fields] + output_field_names = [f for f in self.output_schema.model_fields.keys() if f not in self.gby_fields] for group_key in agg_state: # Build aggregated data item for this group @@ -867,7 +888,7 @@ def __call__(self, candidates: list[DataRecord]) -> DataRecordSet: return DataRecordSet(drs, record_op_stats_lst) - def _assign_groups_llm(self, candidates: list[DataRecord]) -> tuple[list[str], GenerationStats]: + def _assign_groups_llm(self, candidates: list[DataRecord]) -> tuple[list[str], any]: """ Phase 1: Use LLM to assign each candidate to a semantic group. 
@@ -887,16 +908,24 @@ def _assign_groups_llm(self, candidates: list[DataRecord]) -> tuple[list[str], G group_labels = [] total_stats = GenerationStats() - # Get input fields once - input_fields = self.get_input_fields() + # Get input fields - but only use the groupby field to avoid image detection issues + # Since ImageFilepath is just an alias for str, passing all string fields causes + # the prompt factory to try to open them as image files + input_fields = [self.gby_fields[0]] # Only pass the groupby field + fields = {self.gby_fields[0]: str} - for candidate in candidates: - # Ask LLM to classify this record - pass single candidate, not list + print(f"\nSemanticGroupByOp: Processing {len(candidates)} records for group assignment...") + for idx, candidate in enumerate(candidates): + # Show progress every 10 records + if idx % 10 == 0: + print(f" Processing record {idx+1}/{len(candidates)}...") + + # Ask LLM to extract/normalize the groupby field value - pass single candidate, not list gen_kwargs = { "project_cols": input_fields, "output_schema": groupby_schema, - "agg_instruction": f"Determine the '{self.gby_fields[0]}' category for this record." + "agg_instruction": f"Extract the value of '{self.gby_fields[0]}' from this record." } field_answers, _, gen_stats, _ = self.generator(candidate, fields, **gen_kwargs) @@ -911,4 +940,5 @@ def _assign_groups_llm(self, candidates: list[DataRecord]) -> tuple[list[str], G # Accumulate stats total_stats += gen_stats + print(f" Completed! 
Found {len(set(group_labels))} unique groups from {len(candidates)} records") return group_labels, total_stats \ No newline at end of file diff --git a/tests/semantic groupBy tests/ground truth results/movies/movies_1_pz.py b/tests/semantic groupBy tests/ground truth results/movies/movies_1_pz.py new file mode 100644 index 000000000..4cf5391bf --- /dev/null +++ b/tests/semantic groupBy tests/ground truth results/movies/movies_1_pz.py @@ -0,0 +1,151 @@ +#!/usr/bin/env python3 +""" +Movies - Sentiment Analysis with Palimpzest + +This program uses Palimpzest to: +1. Read movie reviews from CSV file +2. Parse the sentiment (POSITIVE/NEGATIVE) from each review +3. Group by critic name +4. Compute the fraction of positive reviews per critic +""" + +import argparse +import os +import sys +import time +from pathlib import Path + +import pandas as pd +from dotenv import load_dotenv + +# Add the src directory to the path to import palimpzest +repo_root = Path(__file__).resolve().parents[4] +sys.path.insert(0, str(repo_root / "src")) + +import palimpzest as pz + +load_dotenv() + + +def custom_frac_positive(group_data): + """ + Custom aggregation function to compute fraction of positive sentiments. + This will be used for semantic aggregation. 
+ """ + sentiments = [record.scoreSentiment for record in group_data] + num_pos = sum(1 for s in sentiments if s == "POSITIVE") + total = len(sentiments) + return num_pos / total if total > 0 else 0.0 + + +def main(): + # Parse arguments + parser = argparse.ArgumentParser(description="Run movies sentiment analysis with Palimpzest") + parser.add_argument("--verbose", default=False, action="store_true", help="Print verbose output") + parser.add_argument("--profile", default=False, action="store_true", help="Profile execution") + parser.add_argument( + "--policy", + type=str, + help="One of 'mincost', 'mintime', 'maxquality'", + default="maxquality", + ) + parser.add_argument( + "--execution-strategy", + type=str, + help="The execution strategy to use. One of sequential, pipelined, parallel", + default="sequential", + ) + parser.add_argument( + "--output", + type=str, + help="Output CSV file path", + default="movies_1_pz_output.csv", + ) + + args = parser.parse_args() + + # Set policy + policy = pz.MaxQuality() + if args.policy == "mincost": + policy = pz.MinCost() + elif args.policy == "mintime": + policy = pz.MinTime() + elif args.policy == "maxquality": + policy = pz.MaxQuality() + else: + print("Policy not supported") + exit(1) + + # Check for API keys + if os.getenv("OPENAI_API_KEY") is None and os.getenv("TOGETHER_API_KEY") is None and os.getenv("ANTHROPIC_API_KEY") is None: + print("WARNING: OPENAI_API_KEY, TOGETHER_API_KEY, and ANTHROPIC_API_KEY are unset") + + # Get the path to the CSV file + script_dir = Path(__file__).parent + csv_path = script_dir / "movie_reviews.csv" + + print(f"Loading movie reviews from: {csv_path}") + start_time = time.time() + + # Read CSV file into memory using pandas (limit to first 500 rows) + csv_df = pd.read_csv(csv_path).head(500) + print(f"Loaded {len(csv_df)} reviews from CSV") + + # Build the Palimpzest query plan using MemoryDataset + # Let MemoryDataset infer the schema from the DataFrame + # This avoids type inference 
issues + reviews = pz.MemoryDataset(id="movie-reviews", vals=csv_df) + + # Data is already in the right format, no need for sem_map + # Define the GroupBy operation + # Group by criticName and compute fraction of positive reviews + gby_fields = ["criticName"] + agg_fields = ["scoreSentiment"] + agg_funcs = ["count"] # We'll use count initially to demonstrate grouping + + grouped_reviews = reviews.groupby(gby_fields, agg_fields, agg_funcs) + + # Configure and run the query + config = pz.QueryProcessorConfig( + policy=policy, + verbose=args.verbose, + execution_strategy=args.execution_strategy, + ) + + print(f"Policy: {str(policy)}") + print("Running Palimpzest query...") + + # Pass policy as kwarg based on policy type + policy_kwargs = {} + if isinstance(policy, pz.MaxQuality): + policy_kwargs["max_quality"] = True + elif isinstance(policy, pz.MinCost): + policy_kwargs["min_cost"] = True + elif isinstance(policy, pz.MinTime): + policy_kwargs["min_time"] = True + + print(f"Policy kwargs: {policy_kwargs}") # Debug: show what we're passing + data_record_collection = grouped_reviews.run(config, **policy_kwargs) + + end_time = time.time() + print(f"Elapsed time: {end_time - start_time:.2f} seconds") + + # Convert results to DataFrame + results_df = data_record_collection.to_df() + print(f"\nResults shape: {results_df.shape}") + print("\nFirst 10 results:") + # print(results_df.head(10)) + + # Save results to CSV + output_path = script_dir / args.output + results_df.to_csv(output_path, index=False) + print(f"\nResults saved to: {output_path}") + + # Print execution statistics + if hasattr(data_record_collection, 'execution_stats'): + print("\nExecution Statistics:") + print(data_record_collection.execution_stats) + + +if __name__ == "__main__": + main() From 3250c2920a7f4726fafc6b139bd2687f35256dc4 Mon Sep 17 00:00:00 2001 From: kepler11c Date: Sun, 15 Feb 2026 19:59:52 -0500 Subject: [PATCH 23/28] testing --- a.txt | 0 1 file changed, 0 insertions(+), 0 deletions(-) 
create mode 100644 a.txt diff --git a/a.txt b/a.txt new file mode 100644 index 000000000..e69de29bb From 593303e12410866b090535b9e53d3513557b7790 Mon Sep 17 00:00:00 2001 From: kepler11c Date: Mon, 23 Feb 2026 20:46:13 -0500 Subject: [PATCH 24/28] updated sem_groupBy --- src/palimpzest/core/data/dataset.py | 123 +++++++++++++++-- src/palimpzest/query/operators/aggregate.py | 36 ++++- src/palimpzest/query/operators/logical.py | 23 ++-- .../movies/pz-programs/query_1_pz.py | 126 ++++++++++++++++++ .../movies/pz-programs/query_2_pz.py | 103 ++++++++++++++ .../movies/pz-programs/query_3_pz.py | 120 +++++++++++++++++ .../movies/pz-programs/query_4_pz.py | 110 +++++++++++++++ .../movies/pz-programs/query_5_pz.py | 112 ++++++++++++++++ .../movies/pz-programs/query_6_pz.py | 0 9 files changed, 731 insertions(+), 22 deletions(-) create mode 100644 tests/semantic groupBy tests/movies/pz-programs/query_1_pz.py create mode 100644 tests/semantic groupBy tests/movies/pz-programs/query_2_pz.py create mode 100644 tests/semantic groupBy tests/movies/pz-programs/query_3_pz.py create mode 100644 tests/semantic groupBy tests/movies/pz-programs/query_4_pz.py create mode 100644 tests/semantic groupBy tests/movies/pz-programs/query_5_pz.py create mode 100644 tests/semantic groupBy tests/movies/pz-programs/query_6_pz.py diff --git a/src/palimpzest/core/data/dataset.py b/src/palimpzest/core/data/dataset.py index bdb4cecce..a37ccc71c 100644 --- a/src/palimpzest/core/data/dataset.py +++ b/src/palimpzest/core/data/dataset.py @@ -577,29 +577,135 @@ def groupby(self, gby_fields, agg_fields, agg_funcs) -> Dataset: operator = GroupByAggregate(input_schema=self.schema, output_schema=output_schema, gby_fields=gby_fields, agg_fields=agg_fields, agg_funcs=agg_funcs) return Dataset(sources=[self], operator=operator, schema=output_schema) - def sem_groupby(self, gby_fields: list[str], agg_fields: list[str], agg_funcs: list[str]) -> Dataset: + def group_by( + self, + group_cols: list[str] | list[dict], 
+ agg_func: Callable, + output_col: str, + ) -> Dataset: + """ + Apply a semantic group by operation with detailed field specifications. + + Args: + group_cols: List of group-by field specifications. Each can be: + - A string (field name): Uses default grouping behavior + - A dict with keys: 'name', 'desc', 'type', and optionally 'model' + agg_func: Aggregation function to apply (e.g., count, sum, average) + output_col: Name of the output aggregation column + + Example: + ds.group_by( + group_cols=[ + {'name': 'era', 'desc': 'Era bucket: pre-2000, 2000s, 2010s, or 2020s', 'type': str} + ], + agg_func=count_reviews, + output_col="review_count" + ) + """ + # Normalize group_cols to list of dicts + normalized_group_cols = [] + for col in group_cols: + if isinstance(col, str): + normalized_group_cols.append({ + 'name': col, + 'desc': f'Group by {col}', + 'type': str + }) + elif isinstance(col, dict): + normalized_group_cols.append(col) + else: + raise ValueError("group_cols must be a list of strings or dicts") + + # Extract field names for the logical operator + gby_field_names = [col['name'] for col in normalized_group_cols] + + # Infer aggregation function name from the callable + # For now, we'll use 'count' as default - user can extend this + agg_func_name = agg_func.__name__ if hasattr(agg_func, '__name__') else 'count' + if 'count' in agg_func_name.lower(): + agg_func_str = 'count' + else: + # Default to custom function - will need to be handled + agg_func_str = 'count' # fallback + + # Create output schema + output_schema = create_groupby_schema_from_fields(gby_field_names, [output_col]) + + # Create logical operator + operator = GroupByAggregate( + input_schema=self.schema, + is_semantic=True, + output_schema=output_schema, + gby_fields=normalized_group_cols, # Pass full dict specifications + agg_fields=[output_col], + agg_funcs=[agg_func_str] + ) + + return Dataset(sources=[self], operator=operator, schema=output_schema) + + def sem_groupby(self, gby_fields: 
list[str] | list[dict], agg_fields: list[str] | list[dict], agg_funcs: list[str]) -> Dataset: """ Apply a semantic group by operation to this set using an LLM. This operator groups records by the specified `gby_fields` and applies the `agg_funcs` to the `agg_fields` for each group. Args: - gby_fields: List of field names to group by (e.g., ['complaint']) - agg_fields: List of field names to aggregate (e.g., ['contents']) + gby_fields: List of field specifications to group by. Each can be: + - A string (field name): Uses default grouping behavior + - A dict with keys: 'name', 'desc', 'type', and optionally 'model' + agg_fields: List of field specifications to aggregate. Each can be: + - A string (field name): Uses default aggregation behavior + - A dict with keys: 'name', 'desc', 'type', and optionally 'model' agg_funcs: List of aggregation functions to apply (e.g., ['count']) Example: ds = pz.TextFileDataset(id="reviews", dir="product-reviews/") - ds = ds.sem_groupby(gby_fields=['complaint'], agg_fields=['contents'], agg_funcs=['count']) + ds = ds.sem_groupby( + gby_fields=[{'name': 'complaint', 'desc': 'Type of complaint', 'type': str}], + agg_fields=['contents'], + agg_funcs=['count'] + ) """ - output_schema = create_groupby_schema_from_fields(gby_fields, agg_fields) + # Normalize gby_fields to list of dicts + normalized_gby_fields = [] + for field in gby_fields: + if isinstance(field, str): + normalized_gby_fields.append({ + 'name': field, + 'desc': f'Group by {field}', + 'type': str + }) + elif isinstance(field, dict): + normalized_gby_fields.append(field) + else: + raise ValueError("gby_fields must be a list of strings or dicts") + + # Normalize agg_fields to list of dicts + normalized_agg_fields = [] + for field in agg_fields: + if isinstance(field, str): + normalized_agg_fields.append({ + 'name': field, + 'desc': f'Aggregate {field}', + 'type': str + }) + elif isinstance(field, dict): + normalized_agg_fields.append(field) + else: + raise 
ValueError("agg_fields must be a list of strings or dicts") + + # Extract field names for schema creation + gby_field_names = [f['name'] for f in normalized_gby_fields] + agg_field_names = [f['name'] for f in normalized_agg_fields] + + output_schema = create_groupby_schema_from_fields(gby_field_names, agg_field_names) - # Create logical operator with direct parameters (no GroupBySig) + # Create logical operator with full dict specifications operator = GroupByAggregate( input_schema=self.schema, is_semantic=True, output_schema=output_schema, - gby_fields=gby_fields, - agg_fields=agg_fields, + gby_fields=normalized_gby_fields, + agg_fields=normalized_agg_fields, agg_funcs=agg_funcs ) @@ -697,6 +803,7 @@ def run(self, config: QueryProcessorConfig | None = None, **kwargs): """Invoke the QueryProcessor to execute the query. `kwargs` will be applied to the QueryProcessorConfig.""" # TODO: this import currently needs to be here to avoid a circular import; we should fix this in a subsequent PR from palimpzest.query.processor.query_processor_factory import QueryProcessorFactory + print("Running Query Processor...") # as syntactic sugar, we will allow some keyword arguments to parameterize our policies policy = construct_policy_from_kwargs(**kwargs) diff --git a/src/palimpzest/query/operators/aggregate.py b/src/palimpzest/query/operators/aggregate.py index d91093d85..aee9d606b 100644 --- a/src/palimpzest/query/operators/aggregate.py +++ b/src/palimpzest/query/operators/aggregate.py @@ -697,12 +697,19 @@ class SemanticGroupByOp(AggregateOp): Implementation of a semantic GroupBy operator using LLMs. This operator groups records by a set of fields and applies aggregation functions to each group using an LLM to determine the groups. 
""" - def __init__(self, gby_fields: list[str], agg_fields: list[str], agg_funcs: list[str], + def __init__(self, gby_fields: list[str] | list[dict], agg_fields: list[str] | list[dict], agg_funcs: list[str], model: Model | None = None, prompt_strategy: PromptStrategy = PromptStrategy.AGG, reasoning_effort: str | None = None, *args, **kwargs): super().__init__(*args, **kwargs) - self.gby_fields = gby_fields - self.agg_fields = agg_fields + + # Store original field specifications (may be dicts or strings) + self.gby_fields_spec = gby_fields + self.agg_fields_spec = agg_fields + + # Extract field names for backward compatibility + self.gby_fields = [f['name'] if isinstance(f, dict) else f for f in gby_fields] + self.agg_fields = [f['name'] if isinstance(f, dict) else f for f in agg_fields] + self.agg_funcs = agg_funcs self.model = model self.prompt_strategy = prompt_strategy @@ -899,9 +906,21 @@ def _assign_groups_llm(self, candidates: list[DataRecord]) -> tuple[list[str], a Tuple of (list of group labels, generation stats) """ # Create a schema that just extracts the group-by field + # Use the description from the field spec if available from palimpzest.core.lib.schemas import create_schema_from_fields + + first_gby_spec = self.gby_fields_spec[0] + if isinstance(first_gby_spec, dict): + field_desc = first_gby_spec.get('desc', f"The semantic category for {first_gby_spec['name']}") + field_name = first_gby_spec['name'] + field_type = first_gby_spec.get('type', str) + else: + field_desc = f"The semantic category for {first_gby_spec}" + field_name = first_gby_spec + field_type = str + groupby_schema = create_schema_from_fields([ - {"name": self.gby_fields[0], "type": str, "desc": f"The semantic category for {self.gby_fields[0]}"} + {"name": field_name, "type": field_type, "desc": field_desc} ]) # Process candidates to extract group labels @@ -915,17 +934,22 @@ def _assign_groups_llm(self, candidates: list[DataRecord]) -> tuple[list[str], a fields = {self.gby_fields[0]: 
str} + # Build the aggregation instruction that includes the field description + # This tells the LLM HOW to categorize/group the values semantically + agg_instruction = f"Categorize this record into a semantic group based on the field '{field_name}' Return the category name (one of those specified in '{field_desc}'s)" + print(f"\nSemanticGroupByOp: Processing {len(candidates)} records for group assignment...") + print(f" Grouping instruction: {agg_instruction}") for idx, candidate in enumerate(candidates): # Show progress every 10 records if idx % 10 == 0: print(f" Processing record {idx+1}/{len(candidates)}...") - # Ask LLM to extract/normalize the groupby field value - pass single candidate, not list + # Ask LLM to categorize the record according to the field description gen_kwargs = { "project_cols": input_fields, "output_schema": groupby_schema, - "agg_instruction": f"Extract the value of '{self.gby_fields[0]}' from this record." + "agg_instruction": agg_instruction } field_answers, _, gen_stats, _ = self.generator(candidate, fields, **gen_kwargs) diff --git a/src/palimpzest/query/operators/logical.py b/src/palimpzest/query/operators/logical.py index f9008b2d6..ffb34a218 100644 --- a/src/palimpzest/query/operators/logical.py +++ b/src/palimpzest/query/operators/logical.py @@ -381,8 +381,8 @@ class GroupByAggregate(LogicalOperator): def __init__( self, is_semantic: bool = False, - gby_fields: list[str] | None = None, - agg_fields: list[str] | None = None, + gby_fields: list[str] | list[dict] | None = None, + agg_fields: list[str] | list[dict] | None = None, agg_funcs: list[str] | None = None, *args, **kwargs, @@ -395,14 +395,21 @@ def __init__( if gby_fields is None or agg_fields is None or agg_funcs is None: raise ValueError("Must provide all of (gby_fields, agg_fields, agg_funcs)") - for f in agg_fields: + # Store original field specifications (may be dicts or strings) + self.gby_fields_spec = gby_fields + self.agg_fields_spec = agg_fields + self.agg_funcs = 
agg_funcs + + # Extract field names for ID computation and validation + self.gby_fields = [f['name'] if isinstance(f, dict) else f for f in gby_fields] + self.agg_fields = [f['name'] if isinstance(f, dict) else f for f in agg_fields] + + # Validate agg fields exist in schema + for f in self.agg_fields: if f not in self.input_schema.model_fields: raise TypeError(f"Supplied schema has no field {f}") self.is_semantic = is_semantic - self.gby_fields = gby_fields - self.agg_fields = agg_fields - self.agg_funcs = agg_funcs def __str__(self): return f"GroupBy(gby_fields={self.gby_fields}, agg_fields={self.agg_fields}, agg_funcs={self.agg_funcs})" @@ -423,8 +430,8 @@ def get_logical_op_params(self) -> dict: logical_op_params = super().get_logical_op_params() logical_op_params = { "is_semantic": self.is_semantic, - "gby_fields": self.gby_fields, - "agg_fields": self.agg_fields, + "gby_fields": self.gby_fields_spec, # Pass full dict specs to physical operators + "agg_fields": self.agg_fields_spec, # Pass full dict specs to physical operators "agg_funcs": self.agg_funcs, **logical_op_params, } diff --git a/tests/semantic groupBy tests/movies/pz-programs/query_1_pz.py b/tests/semantic groupBy tests/movies/pz-programs/query_1_pz.py new file mode 100644 index 000000000..82ebaa146 --- /dev/null +++ b/tests/semantic groupBy tests/movies/pz-programs/query_1_pz.py @@ -0,0 +1,126 @@ +#!/usr/bin/env python3 +""" +Query 1 — Sentiment by Publication (Palimpzest) + +Group by publicatioName and compute the fraction of positive reviews. + +Pipeline: + 1. sem_groupby – Semantically groups the records by `publicatioName` + (the LLM normalises slight variations in publication + names) and collects the scoreSentiment values into a + list per group. + 2. Post-processing – computes frac_positive from the collected lists. 
+""" + +import argparse +import json +import os +import sys +import time +from pathlib import Path + +import pandas as pd +from dotenv import load_dotenv + +# Add the src directory to the path +repo_root = Path(__file__).resolve().parents[4] +sys.path.insert(0, str(repo_root / "src")) + +import palimpzest as pz + +load_dotenv() + + +def compute_frac_positive(sentiments): + """Compute fraction of positive sentiments from a collected list.""" + num_pos = sum(1 for s in sentiments if s and str(s).upper() == "POSITIVE") + total = len(sentiments) + return num_pos / total if total > 0 else 0.0 + + +def main(): + parser = argparse.ArgumentParser(description="Query 1: Sentiment by Publication") + parser.add_argument("--verbose", default=False, action="store_true") + parser.add_argument("--policy", type=str, default="maxquality", + help="One of 'mincost', 'mintime', 'maxquality'") + parser.add_argument("--output", type=str, default="query1_pz_output.csv") + parser.add_argument("--stats-output", type=str, default=None, + help="Optional path to write execution stats JSON") + parser.add_argument( + "--execution-strategy", + type=str, + default="sequential", + help="One of 'sequential', 'pipelined', 'parallel'", + ) + args = parser.parse_args() + + # Set policy + policy_map = { + "mincost": pz.MinCost(), + "mintime": pz.MinTime(), + "maxquality": pz.MaxQuality() + } + policy = policy_map.get(args.policy, pz.MaxQuality()) + + # Load data + script_dir = Path(__file__).parent + csv_path = script_dir / "../movie_reviews.csv" + print(f"Loading reviews from: {csv_path}") + + csv_df = pd.read_csv(csv_path).head(500) + print(f"Loaded {len(csv_df)} reviews") + + # ── Ingest the DataFrame ───────────────────────────────────────── + # MemoryDataset automatically creates a schema from the DataFrame. + # The CSV already contains: publicatioName, reviewText, + # scoreSentiment, etc. 
+ reviews = pz.MemoryDataset(id="reviews", vals=csv_df) + + # ── sem_groupby – semantically group by publication name ───────── + # The LLM normalises publication names (e.g. "NY Times" vs + # "The New York Times") and groups the records accordingly. + # We collect the existing scoreSentiment values into a list per + # group so we can compute the fraction of positive reviews. + grouped = reviews.sem_groupby( + gby_fields=["publicatioName"], + agg_fields=["scoreSentiment"], + agg_funcs=["list"], + ) + + # ── Execute ─────────────────────────────────────────────────────── + start_time = time.time() + config = pz.QueryProcessorConfig( + policy=policy, + verbose=args.verbose, + execution_strategy=args.execution_strategy, + ) + data_record_collection = grouped.run(config) + exec_time = time.time() - start_time + + # ── Post-process – compute frac_positive per group ──────────────── + result_df = pd.DataFrame([ + { + "publicatioName": r.publicatioName, + "frac_positive": compute_frac_positive( + getattr(r, "scoreSentiment", []) or [] + ), + } + for r in data_record_collection + ]) + os.makedirs(os.path.dirname(args.output) or ".", exist_ok=True) + result_df.to_csv(args.output, index=False) + + if args.stats_output is not None: + os.makedirs(os.path.dirname(args.stats_output) or ".", exist_ok=True) + with open(args.stats_output, "w") as f: + json.dump(data_record_collection.execution_stats.to_json(), f, indent=2) + + print(f"\nExecution time: {exec_time:.2f}s") + print(f"Results saved to: {args.output}") + if args.stats_output is not None: + print(f"Execution stats saved to: {args.stats_output}") + print(f"Generated {len(result_df)} publication groups") + + +if __name__ == "__main__": + main() diff --git a/tests/semantic groupBy tests/movies/pz-programs/query_2_pz.py b/tests/semantic groupBy tests/movies/pz-programs/query_2_pz.py new file mode 100644 index 000000000..300a06c8f --- /dev/null +++ b/tests/semantic groupBy tests/movies/pz-programs/query_2_pz.py @@ -0,0 +1,103 
@@ +#!/usr/bin/env python3 +""" +Query 2 — Critic Volume by Inferred Era (Palimpzest) + +Group reviews by movie era (pre-2000, 2000s, 2010s, 2020s) and count reviews. +The LLM semantically infers the era from the releaseDateTheaters column. + +Pipeline: + 1. Join movie_reviews with movies to get releaseDateTheaters. + 2. sem_groupby – LLM reads releaseDateTheaters and groups into era buckets; + counts reviewId per group. +""" + +import argparse +import json +import os +import sys +import time +from pathlib import Path + +import pandas as pd +from dotenv import load_dotenv + +repo_root = Path(__file__).resolve().parents[4] +sys.path.insert(0, str(repo_root / "src")) + +import palimpzest as pz + +load_dotenv() + + +def main(): + parser = argparse.ArgumentParser(description="Query 2: Reviews by Era") + parser.add_argument("--verbose", default=False, action="store_true") + parser.add_argument("--policy", type=str, default="maxquality") + parser.add_argument("--output", type=str, default="query2_pz_output.csv") + parser.add_argument("--stats-output", type=str, default=None, + help="Optional path to write execution stats JSON") + parser.add_argument( + "--execution-strategy", type=str, default="sequential", + help="One of 'sequential', 'pipelined', 'parallel'", + ) + args = parser.parse_args() + + policy_map = { + "mincost": pz.MinCost(), + "mintime": pz.MinTime(), + "maxquality": pz.MaxQuality(), + } + policy = policy_map.get(args.policy, pz.MaxQuality()) + + script_dir = Path(__file__).parent + + # Load and join data + reviews_df = pd.read_csv(script_dir / "../movie_reviews.csv").head(500) + movies_df = pd.read_csv(script_dir / "../movies.csv")[["id", "releaseDateTheaters"]] + merged_df = reviews_df.merge(movies_df, on="id", how="left") + print(f"Loaded {len(merged_df)} reviews") + + reviews = pz.MemoryDataset(id="reviews", vals=merged_df) + + # sem_groupby: LLM infers era from releaseDateTheaters, count reviewId per era + grouped = reviews.sem_groupby( + 
gby_fields=["releaseDateTheaters"], + agg_fields=["reviewId"], + agg_funcs=["count"], + ) + + # Execute + start_time = time.time() + config = pz.QueryProcessorConfig( + policy=policy, + verbose=args.verbose, + execution_strategy=args.execution_strategy, + ) + data_record_collection = grouped.run(config) + exec_time = time.time() - start_time + + # Post-process: rename the semantic group key to "era" + result_df = pd.DataFrame([ + { + "era": r.releaseDateTheaters, + "review_count": r.reviewId, + } + for r in data_record_collection + ]) + os.makedirs(os.path.dirname(args.output) or ".", exist_ok=True) + result_df.to_csv(args.output, index=False) + + if args.stats_output is not None: + os.makedirs(os.path.dirname(args.stats_output) or ".", exist_ok=True) + with open(args.stats_output, "w") as f: + json.dump(data_record_collection.execution_stats.to_json(), f, indent=2) + + print(f"\nExecution time: {exec_time:.2f}s") + print(f"Results saved to: {args.output}") + if args.stats_output is not None: + print(f"Execution stats saved to: {args.stats_output}") + print(f"Generated {len(result_df)} era groups") + + +if __name__ == "__main__": + main() diff --git a/tests/semantic groupBy tests/movies/pz-programs/query_3_pz.py b/tests/semantic groupBy tests/movies/pz-programs/query_3_pz.py new file mode 100644 index 000000000..d904838c5 --- /dev/null +++ b/tests/semantic groupBy tests/movies/pz-programs/query_3_pz.py @@ -0,0 +1,120 @@ +#!/usr/bin/env python3 +""" +Query 3 — Fraction Positive per Audience Type (Palimpzest) + +For a specific director, group reviews by MPAA-inferred audience type +and compute fraction positive. + +Pipeline: + 1. Join movie_reviews with movies filtered by director to get rating. + 2. sem_groupby – LLM semantically normalises the MPAA rating into + audience-type buckets (Children, Teen, Adult, Unrated); lists + scoreSentiment per group. + 3. Post-process list → frac_positive. 
+""" + +import argparse +import json +import os +import sys +import time +from pathlib import Path + +import pandas as pd +from dotenv import load_dotenv + +repo_root = Path(__file__).resolve().parents[4] +sys.path.insert(0, str(repo_root / "src")) + +import palimpzest as pz + +load_dotenv() + + +def main(): + parser = argparse.ArgumentParser(description="Query 3: Sentiment by Audience Type") + parser.add_argument("--director", type=str, default="Christopher Nolan", + help="Director name to filter by") + parser.add_argument("--verbose", default=False, action="store_true") + parser.add_argument("--policy", type=str, default="maxquality") + parser.add_argument("--output", type=str, default="query3_pz_output.csv") + parser.add_argument("--stats-output", type=str, default=None, + help="Optional path to write execution stats JSON") + parser.add_argument( + "--execution-strategy", type=str, default="sequential", + help="One of 'sequential', 'pipelined', 'parallel'", + ) + args = parser.parse_args() + + policy_map = { + "mincost": pz.MinCost(), + "mintime": pz.MinTime(), + "maxquality": pz.MaxQuality(), + } + policy = policy_map.get(args.policy, pz.MaxQuality()) + + script_dir = Path(__file__).parent + + # Load and filter data + reviews_df = pd.read_csv(script_dir / "../movie_reviews.csv").head(500) + movies_df = pd.read_csv(script_dir / "../movies.csv") + + # Filter for director's movies and keep the rating column + director_movies = movies_df[ + movies_df["director"].str.contains(args.director, na=False, case=False) + ][["id", "rating"]] + + merged_df = reviews_df.merge(director_movies, on="id", how="inner") + print(f"Loaded {len(merged_df)} reviews for {args.director}") + + reviews = pz.MemoryDataset(id="reviews", vals=merged_df) + + # sem_groupby: LLM maps MPAA rating → audience type bucket, list scoreSentiment + grouped = reviews.sem_groupby( + gby_fields=["rating"], + agg_fields=["scoreSentiment"], + agg_funcs=["list"], + ) + + # Execute + start_time = time.time() + 
config = pz.QueryProcessorConfig( + policy=policy, + verbose=args.verbose, + execution_strategy=args.execution_strategy, + ) + data_record_collection = grouped.run(config) + exec_time = time.time() - start_time + + # Post-process: compute frac_positive from the sentiment lists + result_df = pd.DataFrame([ + { + "audienceType": r.rating, + "frac_positive": ( + sum(1 for s in r.scoreSentiment if str(s).upper() == "POSITIVE") + / len(r.scoreSentiment) + if len(r.scoreSentiment) > 0 + else 0.0 + ), + "review_count": len(r.scoreSentiment), + "director": args.director, + } + for r in data_record_collection + ]) + os.makedirs(os.path.dirname(args.output) or ".", exist_ok=True) + result_df.to_csv(args.output, index=False) + + if args.stats_output is not None: + os.makedirs(os.path.dirname(args.stats_output) or ".", exist_ok=True) + with open(args.stats_output, "w") as f: + json.dump(data_record_collection.execution_stats.to_json(), f, indent=2) + + print(f"\nExecution time: {exec_time:.2f}s") + print(f"Results saved to: {args.output}") + if args.stats_output is not None: + print(f"Execution stats saved to: {args.stats_output}") + print(f"Generated {len(result_df)} audience type groups") + + +if __name__ == "__main__": + main() diff --git a/tests/semantic groupBy tests/movies/pz-programs/query_4_pz.py b/tests/semantic groupBy tests/movies/pz-programs/query_4_pz.py new file mode 100644 index 000000000..76edf0860 --- /dev/null +++ b/tests/semantic groupBy tests/movies/pz-programs/query_4_pz.py @@ -0,0 +1,110 @@ +#!/usr/bin/env python3 +""" +Query 4 — Sentiment and Top Critic Bias by Genre (Palimpzest) + +Hard query: genre must be inferred from review text itself (not available +in reviews table). Both group key and aggregation value are semantic. + +Pipeline: + 1. Load movie_reviews. + 2. sem_groupby – LLM infers primaryGenre from reviewText and groups by + [primaryGenre, isTopCritic]; lists scoreSentiment per group. + 3. Post-process list → frac_positive. 
+""" + +import argparse +import json +import os +import sys +import time +from pathlib import Path + +import pandas as pd +from dotenv import load_dotenv + +repo_root = Path(__file__).resolve().parents[4] +sys.path.insert(0, str(repo_root / "src")) + +import palimpzest as pz + +load_dotenv() + + +def main(): + parser = argparse.ArgumentParser(description="Query 4: Sentiment by Inferred Genre") + parser.add_argument("--verbose", default=False, action="store_true") + parser.add_argument("--policy", type=str, default="maxquality") + parser.add_argument("--output", type=str, default="query4_pz_output.csv") + parser.add_argument("--stats-output", type=str, default=None, + help="Optional path to write execution stats JSON") + parser.add_argument( + "--execution-strategy", type=str, default="sequential", + help="One of 'sequential', 'pipelined', 'parallel'", + ) + args = parser.parse_args() + + policy_map = { + "mincost": pz.MinCost(), + "mintime": pz.MinTime(), + "maxquality": pz.MaxQuality(), + } + policy = policy_map.get(args.policy, pz.MaxQuality()) + + script_dir = Path(__file__).parent + + reviews_df = pd.read_csv(script_dir / "../movie_reviews.csv").head(500) + print(f"Loaded {len(reviews_df)} reviews") + + reviews = pz.MemoryDataset(id="reviews", vals=reviews_df) + + # sem_groupby: LLM infers primaryGenre from reviewText, + # groups by [reviewText (→ genre), isTopCritic], + # lists scoreSentiment per group. 
+ grouped = reviews.sem_groupby( + gby_fields=["reviewText", "isTopCritic"], + agg_fields=["scoreSentiment"], + agg_funcs=["list"], + ) + + # Execute + start_time = time.time() + config = pz.QueryProcessorConfig( + policy=policy, + verbose=args.verbose, + execution_strategy=args.execution_strategy, + ) + data_record_collection = grouped.run(config) + exec_time = time.time() - start_time + + # Post-process: compute frac_positive from the sentiment lists + result_df = pd.DataFrame([ + { + "primaryGenre": r.reviewText, + "isTopCritic": r.isTopCritic, + "frac_positive": ( + sum(1 for s in r.scoreSentiment if str(s).upper() == "POSITIVE") + / len(r.scoreSentiment) + if len(r.scoreSentiment) > 0 + else 0.0 + ), + "review_count": len(r.scoreSentiment), + } + for r in data_record_collection + ]) + os.makedirs(os.path.dirname(args.output) or ".", exist_ok=True) + result_df.to_csv(args.output, index=False) + + if args.stats_output is not None: + os.makedirs(os.path.dirname(args.stats_output) or ".", exist_ok=True) + with open(args.stats_output, "w") as f: + json.dump(data_record_collection.execution_stats.to_json(), f, indent=2) + + print(f"\nExecution time: {exec_time:.2f}s") + print(f"Results saved to: {args.output}") + if args.stats_output is not None: + print(f"Execution stats saved to: {args.stats_output}") + print(f"Generated {len(result_df)} genre-topcritic groups") + + +if __name__ == "__main__": + main() diff --git a/tests/semantic groupBy tests/movies/pz-programs/query_5_pz.py b/tests/semantic groupBy tests/movies/pz-programs/query_5_pz.py new file mode 100644 index 000000000..ceab5f651 --- /dev/null +++ b/tests/semantic groupBy tests/movies/pz-programs/query_5_pz.py @@ -0,0 +1,112 @@ +#!/usr/bin/env python3 +""" +Query 5 — Emotional Tone by Director and Genre (Palimpzest) + +Finer-grained emotional tone classification beyond binary sentiment. + +Pipeline: + 1. Join movie_reviews with movies filtered by director + genre. + 2. 
sem_groupby – LLM reads reviewText and groups by emotional tone + (Enthusiastic, Measured, Disappointed); counts reviewId per group. +""" + +import argparse +import json +import os +import sys +import time +from pathlib import Path + +import pandas as pd +from dotenv import load_dotenv + +repo_root = Path(__file__).resolve().parents[4] +sys.path.insert(0, str(repo_root / "src")) + +import palimpzest as pz + +load_dotenv() + + +def main(): + parser = argparse.ArgumentParser(description="Query 5: Emotional Tone by Director and Genre") + parser.add_argument("--director", type=str, default="Steven Spielberg") + parser.add_argument("--genre", type=str, default="Adventure") + parser.add_argument("--verbose", default=False, action="store_true") + parser.add_argument("--policy", type=str, default="maxquality") + parser.add_argument("--output", type=str, default="query5_pz_output.csv") + parser.add_argument("--stats-output", type=str, default=None, + help="Optional path to write execution stats JSON") + parser.add_argument( + "--execution-strategy", type=str, default="sequential", + help="One of 'sequential', 'pipelined', 'parallel'", + ) + args = parser.parse_args() + + policy_map = { + "mincost": pz.MinCost(), + "mintime": pz.MinTime(), + "maxquality": pz.MaxQuality(), + } + policy = policy_map.get(args.policy, pz.MaxQuality()) + + script_dir = Path(__file__).parent + + # Load and filter data + reviews_df = pd.read_csv(script_dir / "../movie_reviews.csv").head(500) + movies_df = pd.read_csv(script_dir / "../movies.csv") + + filtered_movies = movies_df[ + movies_df["director"].str.contains(args.director, na=False, case=False) + & movies_df["genre"].str.contains(args.genre, na=False, case=False) + ][["id"]] + + merged_df = reviews_df.merge(filtered_movies, on="id", how="inner") + print(f"Loaded {len(merged_df)} reviews for {args.director} in {args.genre}") + + reviews = pz.MemoryDataset(id="reviews", vals=merged_df) + + # sem_groupby: LLM reads reviewText and groups by 
emotional tone, count reviewId + grouped = reviews.sem_groupby( + gby_fields=["reviewText"], + agg_fields=["reviewId"], + agg_funcs=["count"], + ) + + # Execute + start_time = time.time() + config = pz.QueryProcessorConfig( + policy=policy, + verbose=args.verbose, + execution_strategy=args.execution_strategy, + ) + data_record_collection = grouped.run(config) + exec_time = time.time() - start_time + + # Post-process: rename the semantic group key to "emotionalTone" + result_df = pd.DataFrame([ + { + "emotionalTone": r.reviewText, + "review_count": r.reviewId, + "director": args.director, + "genre": args.genre, + } + for r in data_record_collection + ]) + os.makedirs(os.path.dirname(args.output) or ".", exist_ok=True) + result_df.to_csv(args.output, index=False) + + if args.stats_output is not None: + os.makedirs(os.path.dirname(args.stats_output) or ".", exist_ok=True) + with open(args.stats_output, "w") as f: + json.dump(data_record_collection.execution_stats.to_json(), f, indent=2) + + print(f"\nExecution time: {exec_time:.2f}s") + print(f"Results saved to: {args.output}") + if args.stats_output is not None: + print(f"Execution stats saved to: {args.stats_output}") + print(f"Generated {len(result_df)} tone groups") + + +if __name__ == "__main__": + main() diff --git a/tests/semantic groupBy tests/movies/pz-programs/query_6_pz.py b/tests/semantic groupBy tests/movies/pz-programs/query_6_pz.py new file mode 100644 index 000000000..e69de29bb From 72a5024e855f734158652ade67aae832e94f74bc Mon Sep 17 00:00:00 2001 From: kepler11c Date: Tue, 3 Mar 2026 10:40:32 -0500 Subject: [PATCH 25/28] Queries 1 through 5 --- .../movies/queries/query1_ground_truth.csv | 271 ++++++++++++++++++ .../movies/queries/query2_ground_truth.csv | 6 + .../movies/queries/query3_ground_truth.csv | 4 + .../movies/queries/query4_ground_truth.csv | 22 ++ .../movies/queries/query5_ground_truth.csv | 4 + .../movies/queries/query_1.py | 30 ++ .../movies/queries/query_2.py | 42 +++ 
.../movies/queries/query_3.py | 60 ++++ .../movies/queries/query_4.py | 38 +++ .../movies/queries/query_5.py | 59 ++++ .../movies/queries/query_6.py | 0 11 files changed, 536 insertions(+) create mode 100644 tests/semantic groupBy tests/movies/queries/query1_ground_truth.csv create mode 100644 tests/semantic groupBy tests/movies/queries/query2_ground_truth.csv create mode 100644 tests/semantic groupBy tests/movies/queries/query3_ground_truth.csv create mode 100644 tests/semantic groupBy tests/movies/queries/query4_ground_truth.csv create mode 100644 tests/semantic groupBy tests/movies/queries/query5_ground_truth.csv create mode 100644 tests/semantic groupBy tests/movies/queries/query_1.py create mode 100644 tests/semantic groupBy tests/movies/queries/query_2.py create mode 100644 tests/semantic groupBy tests/movies/queries/query_3.py create mode 100644 tests/semantic groupBy tests/movies/queries/query_4.py create mode 100644 tests/semantic groupBy tests/movies/queries/query_5.py create mode 100644 tests/semantic groupBy tests/movies/queries/query_6.py diff --git a/tests/semantic groupBy tests/movies/queries/query1_ground_truth.csv b/tests/semantic groupBy tests/movies/queries/query1_ground_truth.csv new file mode 100644 index 000000000..2bfea922f --- /dev/null +++ b/tests/semantic groupBy tests/movies/queries/query1_ground_truth.csv @@ -0,0 +1,271 @@ +publicatioName,frac_positive +3AW,1.0 +48 Hills,0.0 +ABC News Radio,1.0 +ABC Radio (Australia),0.5 +AIPT,1.0 +AV Club,0.8 +Antagony & Ecstasy,1.0 +Apollo Guide,0.3333333333333333 +Arkansas Democrat-Gazette,1.0 +Asian Movie Pulse,1.0 +Associated Press,1.0 +Atlantic City Weekly,1.0 +Austin Chronicle,0.6 +AwardsCircuit.com,0.0 +BBC.com,0.5 +Baret News,1.0 +Beach Reporter (Southern California),1.0 +BlackFilm.com,0.0 +Bleeding Cool,1.0 +Blu-ray.com,0.5 +Boston Globe,0.0 +Boston Herald,0.0 +Boulder Weekly,1.0 +Bowling Green Daily News,1.0 +Boxoffice Magazine,0.0 +BrianOrndorf.com,0.0 +Bust Magazine,1.0 +But Why Tho? 
A Geek Community,1.0 +CBR,1.0 +CNN.com,1.0 +"Capital Times (Madison, WI)",1.0 +Chicago Reader,1.0 +Chicago Sun-Times,0.5 +Chicago Tribune,1.0 +Cinapse,1.0 +CinePassion,1.0 +Cinema Crazed,1.0 +Cinema Signals,1.0 +Cinemalogue,0.0 +Cinemanía (Spain),0.5 +Clarín,1.0 +Cleveland Press,0.0 +Close Up,1.0 +Combustible Celluloid,1.0 +"Commercial Appeal (Memphis, TN)",1.0 +Common Sense Media,1.0 +Compuserve,1.0 +Consequence,0.0 +Contactmusic.com,1.0 +DCist,1.0 +DVDTalk.com,0.0 +Daily Express (UK),0.0 +Daily Star (UK),1.0 +Daily Telegraph (UK),0.0 +Dennis Schwartz Movie Reviews,0.75 +Deseret News (Salt Lake City),0.6666666666666666 +Digital Spy,1.0 +Dread Central,0.0 +El Mundo (Spain),0.0 +El Pais (Spain),1.0 +El antepenúltimo mohicano,0.0 +EmanuelLevy.Com,0.8 +Empire Magazine,0.6666666666666666 +Entertainment Weekly,1.0 +Epoch Times,1.0 +Espinof,1.0 +Esquire Magazine,0.0 +Eye for Film,1.0 +"F5 (Wichita, KS)",1.0 +FILMINK (Australia),1.0 +Film Blather,1.0 +Film Comment Magazine,1.0 +Film Freak Central,1.0 +Film Frenzy,1.0 +Film Inquiry,1.0 +Film Journal International,0.3333333333333333 +Film Threat,0.7142857142857143 +Film4,0.75 +Filmcritic.com,0.25 +Filmfare,1.0 +Filmmaker Magazine,1.0 +Financial Times,0.5 +Flick Filosopher,1.0 +Floating World,1.0 +Fotogramas,1.0 +Fresh Fiction,1.0 +Future Movies UK,0.0 +GeekNation,0.0 +Globe and Mail,0.6666666666666666 +Gone With The Twins,1.0 +Grantland,0.0 +Groucho Reviews,1.0 +Guardian,0.5 +HanCinema,1.0 +Herald Sun (Australia),1.0 +"HeraldNet (Everett, WA)",1.0 +HeyUGuys,1.0 +Hindustan Times,0.0 +Hollywood Reporter,0.36363636363636365 +Houston Chronicle,1.0 +Houston Press,1.0 +IONCINEMA.com,0.5 +In Film Australia,1.0 +In Review Online,1.0 +Independent (UK),0.5 +Independent Online (South Africa),1.0 +Internet Reviews,1.0 +Irish Times,1.0 +Japan Times,0.5 +JoBlo's Movie Network,0.0 +"Journal and Courier (Lafayette, IN)",1.0 +KPBS.org,1.0 +Kalamazoo Gazette,1.0 +Kansas City Kansan,1.0 +Keith & the Movies,1.0 +Killer Movie Reviews,1.0 +L.A. 
Weekly,1.0 +La Movie Boeuf,1.0 +La Nación (Argentina),1.0 +"Lagniappe (Mobile, AL)",0.0 +Las Vegas Mercury,1.0 +Las Vegas Review-Journal,1.0 +Lawrence.com,1.0 +Lessons of Darkness,1.0 +Little White Lies,0.3333333333333333 +Livemint,0.5 +Los Angeles Free Press,0.0 +Los Angeles Times,0.5 +Lybarger Links,1.0 +Maclean's Magazine,1.0 +Manhattan Movie Magazine,0.0 +Mark Reviews Movies,0.0 +Matt's Movie Reviews,1.0 +"Mountain Xpress (Asheville, NC)",1.0 +Movie Bitches,1.0 +Movie Chambers,1.0 +Movie Dearest,1.0 +Movie Metropolis,1.0 +Movie Mom,1.0 +Movie Nation,0.3333333333333333 +Movie Talk,1.0 +MovieMartyr.com,1.0 +Movieline,1.0 +NME,1.0 +NOW Toronto,0.5 +NPR,1.0 +NYC Movie Guru,0.6666666666666666 +National Post,0.0 +New York Magazine/Vulture,1.0 +New York Times,0.7692307692307693 +New Yorker,0.75 +Newark Star-Ledger,1.0 +Newcity,1.0 +Newhouse News Service,0.0 +Nolan's Pop Culture Review,1.0 +North Shore Movies,1.0 +"Northwest Herald (Crystal Lake, IL)",1.0 +Observer (UK),1.0 +Offoffoff,1.0 +Old School Reviews,1.0 +One Room With A View,1.0 +Oregonian,1.0 +Outlook,0.0 +Paste Magazine,1.0 +Philadelphia Inquirer,0.0 +Planet S Magazine,0.0 +Projected Figures,1.0 +Q Network Film Desk,1.0 +"Quad City Times (Davenport, IA)",0.0 +Radio Times,0.6666666666666666 +Rediff.com,0.0 +Reel Film Reviews,0.3333333333333333 +ReelTalk Movie Reviews,1.0 +Reeling Reviews,0.0 +Remezcla,1.0 +Reuters,1.0 +Richard Crouse,1.0 +RogerEbert.com,0.3333333333333333 +SSG Syndicate,1.0 +Sacramento News & Review,1.0 +Salt Lake Tribune,1.0 +San Francisco Chronicle,1.0 +San Francisco Examiner,1.0 +Scotsman,0.6666666666666666 +Screen International,0.6666666666666666 +Screen It!,1.0 +Screen Rant,1.0 +Screen-Space,0.0 +ScreenAnarchy,1.0 +Screenwize,1.0 +Scroll.in,1.0 +Seanax.com,1.0 +Seattle Film Blog,1.0 +Seattle Times,0.5 +Shadows on the Wall,1.0 +Showbiz Junkies,1.0 +Sight & Sound,0.0 +Sky Cinema,0.0 +Slant Magazine,0.8 +Spectrum Culture,0.0 +Spirituality & Practice,1.0 +Sunday Times (UK),0.5 +TAKE ONE 
Magazine,1.0 +THN,0.0 +TIME Magazine,1.0 +TV Guide,1.0 +Talking Pictures (U.S.),1.0 +The Age (Australia),0.5 +The Australian,1.0 +The Baffler,1.0 +The Daily Dot,1.0 +The Daily Review/Crikey,1.0 +The Daily Times (Tennessee),1.0 +The Dissolve,1.0 +The Film Experience,0.0 +The Film Stage,0.6666666666666666 +The Indian Express,0.0 +The List,1.0 +The MacGuffin,1.0 +"The Monitor (McAllen, TX)",1.0 +The Nation,0.0 +The National (UAE),1.0 +The New Beverly,1.0 +The Pink Lens,0.0 +The Playlist,0.6666666666666666 +The Retro Set,1.0 +The Skinny,1.0 +The Spectator,1.0 +The Spool,1.0 +"The Stranger (Seattle, WA)",1.0 +The Sun (UK),1.0 +The Times of India,1.0 +The Upcoming,0.0 +The Victoria Advocate,1.0 +The Virginian-Pilot,1.0 +The Young Folks,1.0 +TheIndependentCritic.com,1.0 +TheWrap,0.5 +This is Film,1.0 +Tilt Magazine,0.0 +Time Out,0.5555555555555556 +Times (UK),0.6666666666666666 +Times-Picayune,0.0 +Toronto Star,0.6666666666666666 +Total Film,0.0 +Trespass,1.0 +Tri-City Herald,1.0 +Tyler Morning Telegraph (Texas),1.0 +Under the Radar,0.0 +Urban Tulsa Weekly,1.0 +Vanity Fair,0.0 +Variety,0.7692307692307693 +ViewLondon,0.0 +Village Voice,1.0 +Vogue,1.0 +Vox,1.0 +"WBGR-FM (93.7 FM - Monroe, WI)",1.0 +WORLD,1.0 +Washington Post,0.6666666666666666 +Way Too Indie,1.0 +We Got This Covered,1.0 +Willamette Week,0.0 +Winnipeg Free Press,0.0 +eFilmCritic.com,0.5 +easternKicks.com,1.0 +film-authority.com,1.0 +jackiekcooper.com,1.0 +rachelsreviews.net,0.0 +rec.arts.movies.reviews,0.5 +sbs.com.au,1.0 diff --git a/tests/semantic groupBy tests/movies/queries/query2_ground_truth.csv b/tests/semantic groupBy tests/movies/queries/query2_ground_truth.csv new file mode 100644 index 000000000..7d351b22c --- /dev/null +++ b/tests/semantic groupBy tests/movies/queries/query2_ground_truth.csv @@ -0,0 +1,6 @@ +era,review_count +2000s,24 +2010s,147 +2020s,27 +Unknown,203 +pre-2000,99 diff --git a/tests/semantic groupBy tests/movies/queries/query3_ground_truth.csv b/tests/semantic groupBy 
tests/movies/queries/query3_ground_truth.csv new file mode 100644 index 000000000..c6f5cf8f2 --- /dev/null +++ b/tests/semantic groupBy tests/movies/queries/query3_ground_truth.csv @@ -0,0 +1,4 @@ +audienceType,frac_positive,review_count,director +Adult,0.9166666666666666,408,Christopher Nolan +Teen,0.8308709175738724,2572,Christopher Nolan +Unrated,0.75,7,Christopher Nolan diff --git a/tests/semantic groupBy tests/movies/queries/query4_ground_truth.csv b/tests/semantic groupBy tests/movies/queries/query4_ground_truth.csv new file mode 100644 index 000000000..4b92f8fa0 --- /dev/null +++ b/tests/semantic groupBy tests/movies/queries/query4_ground_truth.csv @@ -0,0 +1,22 @@ +primaryGenre,isTopCritic,frac_positive,review_count +Action,False,0.38461538461538464,13 +Action,True,0.0,4 +Adventure,False,0.5,6 +Adventure,True,1.0,1 +Comedy,False,0.3888888888888889,36 +Comedy,True,0.2777777777777778,18 +Crime,False,0.8,5 +Crime,True,1.0,2 +Documentary,False,0.8823529411764706,34 +Documentary,True,0.8571428571428571,21 +Drama,False,0.8072289156626506,83 +Drama,True,0.7017543859649122,57 +History,False,0.8181818181818182,11 +History,True,0.6,5 +Mystery & thriller,False,0.8333333333333334,42 +Mystery & thriller,True,0.6,20 +Romance,False,0.6493506493506493,77 +Romance,True,0.6296296296296297,27 +Sci-fi,False,1.0,2 +Sci-fi,True,1.0,1 +War,False,1.0,1 diff --git a/tests/semantic groupBy tests/movies/queries/query5_ground_truth.csv b/tests/semantic groupBy tests/movies/queries/query5_ground_truth.csv new file mode 100644 index 000000000..9d6fd0494 --- /dev/null +++ b/tests/semantic groupBy tests/movies/queries/query5_ground_truth.csv @@ -0,0 +1,4 @@ +emotionalTone,review_count,director,genre +Disappointed,495,Steven Spielberg,Adventure +Enthusiastic,586,Steven Spielberg,Adventure +Measured,1009,Steven Spielberg,Adventure diff --git a/tests/semantic groupBy tests/movies/queries/query_1.py b/tests/semantic groupBy tests/movies/queries/query_1.py new file mode 100644 index 
000000000..10202d26a --- /dev/null +++ b/tests/semantic groupBy tests/movies/queries/query_1.py @@ -0,0 +1,30 @@ +""" +Query 1 — Sentiment by Publication (Single Col, Semantic Agg) + +Query NL: "Group by publicationName and compute the fraction of positive reviews" +- group_cols: ["publicationName"] +- agg_cols: [LLM("reviewText") for POSITIVE/NEGATIVE] +- semantic group: no +- semantic agg: yes + +Ground truth from scoreSentiment column. +""" + +import pandas as pd + +def frac_positive(series): + num_pos = (series == "POSITIVE").sum() + return num_pos / len(series) if len(series) > 0 else 0.0 + +reviews = pd.read_csv("../movie_reviews.csv").head(500) + +result = ( + reviews + .groupby("publicatioName") + .agg(frac_positive_sentiment=("scoreSentiment", frac_positive)) + .reset_index() + .rename(columns={"frac_positive_sentiment": "frac_positive"}) +) + +result.to_csv("query1_ground_truth.csv", index=False) +print(f"Generated ground truth with {len(result)} publication groups") diff --git a/tests/semantic groupBy tests/movies/queries/query_2.py b/tests/semantic groupBy tests/movies/queries/query_2.py new file mode 100644 index 000000000..b8cdbb50d --- /dev/null +++ b/tests/semantic groupBy tests/movies/queries/query_2.py @@ -0,0 +1,42 @@ +""" +Query 2 — Critic Volume by Inferred Era (Single Col, Semantic Group) + +Query NL: "Group reviews by the era of the movie they reviewed (pre-2000, 2000s, 2010s, 2020s) + and count the number of reviews per era" +- group_cols: [LLM("reviewDate")] +- agg_cols: ["reviewId" (count)] +- semantic group: yes +- semantic agg: no + +Ground truth uses date parsing and rule-based era bucketing. 
+""" + +import pandas as pd + +reviews = pd.read_csv("../movie_reviews.csv").head(500) +movies = pd.read_csv("../movies.csv")[["id", "releaseDateTheaters"]] + +# Join to get the movie's release year +merged = reviews.merge(movies, on="id", how="left") +merged["releaseYear"] = pd.to_datetime( + merged["releaseDateTheaters"], errors="coerce" +).dt.year + +def era_bucket(year): + if pd.isna(year): return "Unknown" + if year < 2000: return "pre-2000" + if year < 2010: return "2000s" + if year < 2020: return "2010s" + return "2020s" + +merged["era"] = merged["releaseYear"].apply(era_bucket) + +result = ( + merged + .groupby("era") + .agg(review_count=("reviewId", "count")) + .reset_index() +) + +result.to_csv("query2_ground_truth.csv", index=False) +print(f"Generated ground truth with {len(result)} era groups") diff --git a/tests/semantic groupBy tests/movies/queries/query_3.py b/tests/semantic groupBy tests/movies/queries/query_3.py new file mode 100644 index 000000000..937753c70 --- /dev/null +++ b/tests/semantic groupBy tests/movies/queries/query_3.py @@ -0,0 +1,60 @@ +""" +Query 3 — Fraction Positive per Audience Type (Templatable, Semantic Group) + +Query NL: "For movies directed by {director}, group reviews by the audience type targeted + by the movie's MPAA rating (Children, Teen, Adult, Unrated) and compute the + fraction of positive reviews per audience type" +- group_cols: [LLM("rating") → audience type] +- agg_cols: [LLM("reviewText") → POSITIVE/NEGATIVE, frac_positive] +- semantic group: yes +- semantic agg: yes + +Ground truth uses MPAA rating mapping and scoreSentiment column. 
+""" + +import pandas as pd +import sys + +DIRECTOR = sys.argv[1] if len(sys.argv) > 1 else "Christopher Nolan" + +RATING_TO_AUDIENCE = { + "G": "Children", "PG": "Children", + "PG-13": "Teen", + "R": "Adult", "NC-17": "Adult", + "NR": "Unrated", "": "Unrated", +} + +def frac_positive(series): + return (series == "POSITIVE").sum() / len(series) if len(series) > 0 else 0.0 + +movies = pd.read_csv("../movies.csv") +reviews = pd.read_csv("../movie_reviews.csv") + +# Filter for director's movies +director_movies = movies[movies["director"].str.contains(DIRECTOR, na=False, case=False)][["id", "rating"]] +director_movies["audienceType"] = director_movies["rating"].map( + lambda r: RATING_TO_AUDIENCE.get(str(r).strip(), "Unrated") +) + +# merged = reviews.merge(director_movies, on="id", how="inner") + +print("director_movies shape:", director_movies.shape) +print(director_movies.head()) + +merged = director_movies.merge(reviews, on="id", how="left") +print("merged shape:", merged.shape) +print(merged.head()) + +result = ( + merged + .groupby("audienceType") + .agg( + frac_positive=("scoreSentiment", frac_positive), + review_count=("scoreSentiment", "count"), + ) + .reset_index() +) +result["director"] = DIRECTOR + +result.to_csv("query3_ground_truth.csv", index=False) +print(f"Generated ground truth for {DIRECTOR}: {len(result)} audience type groups") diff --git a/tests/semantic groupBy tests/movies/queries/query_4.py b/tests/semantic groupBy tests/movies/queries/query_4.py new file mode 100644 index 000000000..4ce1022de --- /dev/null +++ b/tests/semantic groupBy tests/movies/queries/query_4.py @@ -0,0 +1,38 @@ +""" +Query 6 — Sentiment and Top Critic Bias by Genre (Multi-Col, Semantic Group + Agg) + +Query NL: "Group reviews by inferred genre of the movie and whether the reviewer is a top critic, + and compute the fraction of positive reviews" +- group_cols: [LLM("reviewText") for the genre, "isTopCritic"] +- agg_cols: [LLM("reviewText") for POSITIVE/NEGATIVE, 
 frac_positive] +- semantic group: yes (genre inferred from review text) +- semantic agg: yes (sentiment inferred from reviewText) + +Ground truth obtained by joining to movies table for genre. +""" + +import pandas as pd + +def frac_positive(series): + return (series == "POSITIVE").sum() / len(series) if len(series) > 0 else 0.0 + +movies = pd.read_csv("../movies.csv")[["id", "genre"]] +reviews = pd.read_csv("../movie_reviews.csv").head(500) + +merged = reviews.merge(movies, on="id", how="left") +# Coarsen multi-genre entries to primary genre +merged["primaryGenre"] = merged["genre"].str.split(",").str[0].str.strip() + +result = ( + merged + .dropna(subset=["primaryGenre", "isTopCritic"]) + .groupby(["primaryGenre", "isTopCritic"]) + .agg( + frac_positive=("scoreSentiment", frac_positive), + review_count=("scoreSentiment", "count"), + ) + .reset_index() +) + +result.to_csv("query4_ground_truth.csv", index=False) +print(f"Generated ground truth with {len(result)} genre-topcritic groups") diff --git a/tests/semantic groupBy tests/movies/queries/query_5.py b/tests/semantic groupBy tests/movies/queries/query_5.py new file mode 100644 index 000000000..20d155979 --- /dev/null +++ b/tests/semantic groupBy tests/movies/queries/query_5.py @@ -0,0 +1,59 @@ +""" +Query 5 — Sentiment by Director and Genre (Templatable, Mixed Group + Semantic Agg) + +Query NL: "For movies directed by {director} in the {genre} genre, group reviews by + the emotional tone of the review (Enthusiastic, Measured, Disappointed) and + count the number of reviews per tone" +- group_cols: ["director" (literal, filtered), "genre" (literal, filtered), + LLM("reviewText") → emotional tone] +- agg_cols: ["reviewId" (count)] +- semantic group: mixed (director and genre are filter/literal; tone is semantic) +- semantic agg: no + +Ground truth approximation: map scoreSentiment + originalScore to ternary label. 
+""" + +import pandas as pd +import sys + +DIRECTOR = sys.argv[1] if len(sys.argv) > 1 else "Steven Spielberg" +GENRE = sys.argv[2] if len(sys.argv) > 2 else "Adventure" + +def approx_tone(row): + sentiment = row["scoreSentiment"] + score_str = str(row["originalScore"]) + # Parse scores like "4/5", "3.5/4", "A", "B+" — use sentiment as fallback + if sentiment == "NEGATIVE": + return "Disappointed" + # Try to parse numeric score to detect Enthusiastic vs Measured + try: + parts = score_str.split("/") + if len(parts) == 2: + ratio = float(parts[0]) / float(parts[1]) + return "Enthusiastic" if ratio >= 0.8 else "Measured" + except Exception: + pass + return "Measured" # default for POSITIVE without parseable score + +movies = pd.read_csv("../movies.csv") +reviews = pd.read_csv("../movie_reviews.csv") + +filtered_movies = movies[ + movies["director"].str.contains(DIRECTOR, na=False, case=False) & + movies["genre"].str.contains(GENRE, na=False, case=False) +][["id"]] + +merged = reviews.merge(filtered_movies, on="id", how="inner") +merged["emotionalTone"] = merged.apply(approx_tone, axis=1) + +result = ( + merged + .groupby("emotionalTone") + .agg(review_count=("reviewId", "count")) + .reset_index() +) +result["director"] = DIRECTOR +result["genre"] = GENRE + +result.to_csv("query5_ground_truth.csv", index=False) +print(f"Generated ground truth for {DIRECTOR} in {GENRE}: {len(result)} tone groups") diff --git a/tests/semantic groupBy tests/movies/queries/query_6.py b/tests/semantic groupBy tests/movies/queries/query_6.py new file mode 100644 index 000000000..e69de29bb From 05600389b54125f91184cc6fa0cb073e07e5548e Mon Sep 17 00:00:00 2001 From: kepler11c Date: Tue, 10 Mar 2026 10:35:23 -0400 Subject: [PATCH 26/28] checking in sem gby changes before refactor --- a.txt | 0 src/palimpzest/core/data/dataset.py | 155 +++++++++- src/palimpzest/query/generators/generators.py | 3 + src/palimpzest/query/operators/aggregate.py | 139 ++++++++- .../amazon reviews/amazon_1.py | 26 -- 
.../amazon reviews/amazon_2.py | 30 -- .../ecommerce/ecommerce_1.py | 28 -- .../ecommerce/ecommerce_2.py | 37 --- .../wildlife/wildlife_1.py | 28 -- .../wildlife/wildlife_2.py | 34 --- .../wildlife/wildlife_3.py | 29 -- .../movies/movies_1.py | 0 .../movies/movies_1_pz.py | 2 +- .../movies/movies_2.py | 0 .../pz-programs/compare_query6_results.py | 224 +++++++++++++++ .../movies/pz-programs/query_2_pz.py | 21 +- .../movies/pz-programs/query_3_pz.py | 20 +- .../movies/pz-programs/query_4_pz.py | 104 +++++-- .../movies/pz-programs/query_5_pz.py | 24 +- .../movies/pz-programs/query_6_pz.py | 190 +++++++++++++ .../movies/queries/query_6.py | 78 +++++ .../movies/rerun_comparison.py | 241 ++++++++++++++++ .../movies/run_baseline_tests.py | 267 ++++++++++++++++++ .../movies/run_groupby_tests.py | 254 +++++++++++++++++ 24 files changed, 1673 insertions(+), 261 deletions(-) delete mode 100644 a.txt delete mode 100644 tests/semantic groupBy tests/ground truth results/amazon reviews/amazon_1.py delete mode 100644 tests/semantic groupBy tests/ground truth results/amazon reviews/amazon_2.py delete mode 100644 tests/semantic groupBy tests/ground truth results/ecommerce/ecommerce_1.py delete mode 100644 tests/semantic groupBy tests/ground truth results/ecommerce/ecommerce_2.py delete mode 100644 tests/semantic groupBy tests/ground truth results/wildlife/wildlife_1.py delete mode 100644 tests/semantic groupBy tests/ground truth results/wildlife/wildlife_2.py delete mode 100644 tests/semantic groupBy tests/ground truth results/wildlife/wildlife_3.py rename tests/semantic groupBy tests/{ground truth results => }/movies/movies_1.py (100%) rename tests/semantic groupBy tests/{ground truth results => }/movies/movies_1_pz.py (98%) rename tests/semantic groupBy tests/{ground truth results => }/movies/movies_2.py (100%) create mode 100644 tests/semantic groupBy tests/movies/pz-programs/compare_query6_results.py create mode 100644 tests/semantic groupBy tests/movies/rerun_comparison.py 
create mode 100644 tests/semantic groupBy tests/movies/run_baseline_tests.py create mode 100644 tests/semantic groupBy tests/movies/run_groupby_tests.py diff --git a/a.txt b/a.txt deleted file mode 100644 index e69de29bb..000000000 diff --git a/src/palimpzest/core/data/dataset.py b/src/palimpzest/core/data/dataset.py index a37ccc71c..078b28336 100644 --- a/src/palimpzest/core/data/dataset.py +++ b/src/palimpzest/core/data/dataset.py @@ -708,9 +708,162 @@ def sem_groupby(self, gby_fields: list[str] | list[dict], agg_fields: list[str] agg_fields=normalized_agg_fields, agg_funcs=agg_funcs ) - + return Dataset(sources=[self], operator=operator, schema=output_schema) + def hierarchical_groupby( + self, + groupby_fields: list[list[str]], + agg_fields: list[list[str]], + agg_funcs: list[list[str]], + ) -> dict: + """ + Perform hierarchical (nested) exact groupby operations across multiple levels. + + At each level except the last, records are partitioned by the groupby fields + without aggregation; the last level applies full aggregation. + + Args: + groupby_fields: List of lists of field names to group by at each level. + agg_fields: List of lists of field names to aggregate at each level. + agg_funcs: List of lists of aggregation function names at each level. + + Returns: + A DataRecordSet for a single level, or a nested dict + ``{group_key: }`` for multiple levels. 
+ """ + from palimpzest.core.lib.schemas import create_groupby_schema_from_fields + from palimpzest.query.operators.aggregate import ApplyGroupByOp + + assert len(groupby_fields) == len(agg_fields) == len(agg_funcs), \ + "groupby_fields, agg_fields, and agg_funcs must all have the same length" + + result = self.run() + candidates = result.data_records + + def run_level(candidates, level): + gby_names = groupby_fields[level] + agg_names = agg_fields[level] + funcs = agg_funcs[level] + output_schema = create_groupby_schema_from_fields(gby_names, agg_names) + op = ApplyGroupByOp( + gby_fields=gby_names, + agg_fields=agg_names, + agg_funcs=funcs, + output_schema=output_schema, + input_schema=self.schema, + ) + if level == len(groupby_fields) - 1: + return op(candidates) + # Intermediate level: partition candidates by exact field values + outer_groups = {} + for candidate in candidates: + key = tuple(getattr(candidate, f, None) for f in gby_names) + outer_groups.setdefault(key, []).append(candidate) + return {key: run_level(grp, level + 1) for key, grp in outer_groups.items()} + + return run_level(candidates, 0) + + def hierarchical_sem_groupby( + self, + groupby_fields: list[list[str | dict]], + agg_fields: list[list[str | dict]], + agg_funcs: list[list[str]], + model=None, + prompt_strategy=None, + reasoning_effort=None, + ) -> dict: + """ + Perform hierarchical (nested) semantic groupby operations using LLMs. + + At each intermediate level the LLM assigns group labels to the original records + (without aggregation) so that inner levels can operate on the same raw records. + The final level runs a full semantic groupby with aggregation. + + Args: + groupby_fields: List of lists of field specs (str or dict with name/desc/type) per level. + agg_fields: List of lists of field specs to aggregate per level. + agg_funcs: List of lists of aggregation function names per level. + model: Optional LLM model override. + prompt_strategy: Optional prompt strategy override. 
+ reasoning_effort: Optional reasoning effort override. + + Returns: + A DataRecordSet for a single level, or a nested dict + ``{group_key: }`` for multiple levels. + """ + from palimpzest.constants import Model, PromptStrategy + from palimpzest.core.lib.schemas import create_groupby_schema_from_fields + from palimpzest.query.operators.aggregate import SemanticGroupByOp + + assert len(groupby_fields) == len(agg_fields) == len(agg_funcs), \ + "groupby_fields, agg_fields, and agg_funcs must all have the same length" + + # Default to GPT-4o if no model specified; sem_groupby requires an explicit model + # because hierarchical_sem_groupby bypasses the query optimizer / policy system. + _model = model if model is not None else Model.GPT_4o + _prompt_strategy = prompt_strategy if prompt_strategy is not None else PromptStrategy.AGG + + from palimpzest.core.models import GenerationStats + + result = self.run() + candidates = result.data_records + + # Accumulate GenerationStats across all levels so callers can track + # total cost / token usage for the entire hierarchical operation. 
+ accumulated_stats = GenerationStats() + + def normalize_fields(fields): + out = [] + for f in fields: + if isinstance(f, str): + out.append({'name': f, 'desc': f'Group by {f}', 'type': str}) + else: + out.append(f) + return out + + def run_level(candidates, level): + nonlocal accumulated_stats + gby_specs = normalize_fields(groupby_fields[level]) + agg_specs = normalize_fields(agg_fields[level]) + funcs = agg_funcs[level] + gby_names = [s['name'] for s in gby_specs] + agg_names = [s['name'] for s in agg_specs] + output_schema = create_groupby_schema_from_fields(gby_names, agg_names) + op = SemanticGroupByOp( + gby_fields=gby_specs, + agg_fields=agg_specs, + agg_funcs=funcs, + model=_model, + prompt_strategy=_prompt_strategy, + reasoning_effort=reasoning_effort, + output_schema=output_schema, + input_schema=self.schema, + ) + if level == len(groupby_fields) - 1: + # Final level: full groupby with aggregation. + # Extract per-group RecordOpStats and fold into accumulated_stats. + dataset_result = op(candidates) + for ros in dataset_result.record_op_stats: + accumulated_stats.total_input_tokens += ros.total_input_tokens + accumulated_stats.total_output_tokens += ros.total_output_tokens + accumulated_stats.total_input_cost += ros.total_input_cost + accumulated_stats.total_output_cost += ros.total_output_cost + accumulated_stats.llm_call_duration_secs += ros.llm_call_duration_secs + return dataset_result + # Intermediate level: LLM assigns group labels without aggregation. + # Capture and accumulate the GenerationStats that were previously discarded. 
+ group_labels, gen_stats = op._assign_groups_llm(candidates) + accumulated_stats += gen_stats + outer_groups = {} + for candidate, label in zip(candidates, group_labels): + key = (label,) if not isinstance(label, tuple) else label + outer_groups.setdefault(key, []).append(candidate) + return {key: run_level(grp, level + 1) for key, grp in outer_groups.items()} + + nested_result = run_level(candidates, 0) + return nested_result, accumulated_stats + def sem_agg(self, col: dict | type[BaseModel], agg: str, depends_on: str | list[str] | None = None) -> Dataset: """ Apply a semantic aggregation to this set. The `agg` string will be applied using an LLM diff --git a/src/palimpzest/query/generators/generators.py b/src/palimpzest/query/generators/generators.py index a61839fe3..dfe36e3b8 100644 --- a/src/palimpzest/query/generators/generators.py +++ b/src/palimpzest/query/generators/generators.py @@ -419,6 +419,9 @@ def __call__(self, candidate: DataRecord | list[DataRecord], fields: dict[str, F logger.debug(f"PROMPT:\n{prompt}") logger.debug(Fore.GREEN + f"{completion_text}\n" + Style.RESET_ALL) + print(f"PROMPT:\n{prompt}") + print(Fore.GREEN + f"{completion_text}\n" + Style.RESET_ALL) + # parse reasoning reasoning = None try: diff --git a/src/palimpzest/query/operators/aggregate.py b/src/palimpzest/query/operators/aggregate.py index aee9d606b..cffe3d99c 100644 --- a/src/palimpzest/query/operators/aggregate.py +++ b/src/palimpzest/query/operators/aggregate.py @@ -209,6 +209,55 @@ def __call__(self, candidates: list[DataRecord]) -> DataRecordSet: # construct and return DataRecordSet return DataRecordSet(drs, record_op_stats_lst) + def hierarchical_groupby( + self, + candidates: list[DataRecord], + groupby_fields: list[list[str]], + agg_fields: list[list[str]], + agg_funcs: list[list[str]], + ) -> dict: + """ + Perform hierarchical (nested) exact groupby operations across multiple levels. 
+ + At each intermediate level records are partitioned by exact field values without + aggregation; the final level applies full aggregation via ApplyGroupByOp.__call__. + + Args: + candidates: Input DataRecords. + groupby_fields: List of lists of field names per level. + agg_fields: List of lists of aggregate field names per level. + agg_funcs: List of lists of aggregation function names per level. + + Returns: + A DataRecordSet for a single level, or a nested dict for multiple levels. + """ + from palimpzest.core.lib.schemas import create_groupby_schema_from_fields + + assert len(groupby_fields) == len(agg_fields) == len(agg_funcs), \ + "groupby_fields, agg_fields, and agg_funcs must all have the same length" + + def run_level(candidates, level): + gby_names = groupby_fields[level] + agg_names = agg_fields[level] + funcs = agg_funcs[level] + output_schema = create_groupby_schema_from_fields(gby_names, agg_names) + op = ApplyGroupByOp( + gby_fields=gby_names, + agg_fields=agg_names, + agg_funcs=funcs, + output_schema=output_schema, + input_schema=self.input_schema, + ) + if level == len(groupby_fields) - 1: + return op(candidates) + outer_groups = {} + for candidate in candidates: + key = tuple(getattr(candidate, f, None) for f in gby_names) + outer_groups.setdefault(key, []).append(candidate) + return {key: run_level(grp, level + 1) for key, grp in outer_groups.items()} + + return run_level(candidates, 0) + class AverageAggregateOp(AggregateOp): # NOTE: we don't actually need / use agg_func here (yet) @@ -889,6 +938,7 @@ def __call__(self, candidates: list[DataRecord]) -> DataRecordSet: llm_call_duration_secs=gen_stats.llm_call_duration_secs, fn_call_duration_secs=gen_stats.fn_call_duration_secs, total_llm_calls=gen_stats.total_llm_calls, + total_embedding_llm_calls=gen_stats.total_embedding_llm_calls, op_details={k: str(v) for k, v in self.get_id_params().items()}, ) record_op_stats_lst.append(record_op_stats) @@ -911,7 +961,7 @@ def _assign_groups_llm(self, 
candidates: list[DataRecord]) -> tuple[list[str], a first_gby_spec = self.gby_fields_spec[0] if isinstance(first_gby_spec, dict): - field_desc = first_gby_spec.get('desc', f"The semantic category for {first_gby_spec['name']}") + field_desc = first_gby_spec["desc"] field_name = first_gby_spec['name'] field_type = first_gby_spec.get('type', str) else: @@ -934,7 +984,7 @@ def _assign_groups_llm(self, candidates: list[DataRecord]) -> tuple[list[str], a fields = {self.gby_fields[0]: str} - # Build the aggregation instruction that includes the field description + # Build the aggregation instruction that includes the field descriptions from field spec # This tells the LLM HOW to categorize/group the values semantically agg_instruction = f"Categorize this record into a semantic group based on the field '{field_name}' Return the category name (one of those specified in '{field_desc}'s)" @@ -955,14 +1005,91 @@ def _assign_groups_llm(self, candidates: list[DataRecord]) -> tuple[list[str], a field_answers, _, gen_stats, _ = self.generator(candidate, fields, **gen_kwargs) # Extract the group label - field_answers returns dict with field->list mapping - group_label = field_answers.get(self.gby_fields[0], [None])[0] - if group_label is None: - # Fallback: use a default group + field_answer = field_answers.get(self.gby_fields[0]) + if field_answer is None or not isinstance(field_answer, list) or len(field_answer) == 0: group_label = "unknown" + else: + group_label = field_answer[0] group_labels.append(group_label) # Accumulate stats total_stats += gen_stats print(f" Completed! 
Found {len(set(group_labels))} unique groups from {len(candidates)} records") - return group_labels, total_stats \ No newline at end of file + return group_labels, total_stats + + def hierarchical_groupby( + self, + candidates: list[DataRecord], + groupby_fields: list[list[str | dict]], + agg_fields: list[list[str | dict]], + agg_funcs: list[list[str]], + model: Model = None, + prompt_strategy: PromptStrategy = PromptStrategy.AGG, + reasoning_effort: str | None = None, + ) -> dict: + """ + Perform hierarchical (nested) semantic groupby operations using LLMs. + + At each intermediate level the LLM assigns group labels to the original records + (without aggregation) so that inner levels operate on the same raw records. + The final level runs a full semantic groupby with aggregation. + + Args: + candidates: Input DataRecords. + groupby_fields: List of lists of field specs per level. + agg_fields: List of lists of aggregate field specs per level. + agg_funcs: List of lists of aggregation function names per level. + model: Optional LLM model override (falls back to self.model). + prompt_strategy: Prompt strategy (defaults to AGG). + reasoning_effort: Optional reasoning effort override. + + Returns: + A DataRecordSet for a single level, or a nested dict for multiple levels. 
+ """ + from palimpzest.core.lib.schemas import create_groupby_schema_from_fields + + assert len(groupby_fields) == len(agg_fields) == len(agg_funcs), \ + "groupby_fields, agg_fields, and agg_funcs must all have the same length" + + def normalize_fields(fields): + out = [] + for f in fields: + if isinstance(f, str): + out.append({'name': f, 'desc': f'Group by {f}', 'type': str}) + else: + out.append(f) + return out + + _model = model or self.model + _prompt_strategy = prompt_strategy or self.prompt_strategy + _reasoning_effort = reasoning_effort or self.reasoning_effort + + def run_level(candidates, level): + gby_specs = normalize_fields(groupby_fields[level]) + agg_specs = normalize_fields(agg_fields[level]) + funcs = agg_funcs[level] + gby_names = [s['name'] for s in gby_specs] + agg_names = [s['name'] for s in agg_specs] + output_schema = create_groupby_schema_from_fields(gby_names, agg_names) + op = SemanticGroupByOp( + gby_fields=gby_specs, + agg_fields=agg_specs, + agg_funcs=funcs, + model=_model, + prompt_strategy=_prompt_strategy, + reasoning_effort=_reasoning_effort, + output_schema=output_schema, + input_schema=self.input_schema, + ) + if level == len(groupby_fields) - 1: + return op(candidates) + # Intermediate: LLM assigns labels, original records are forwarded + group_labels, _ = op._assign_groups_llm(candidates) + outer_groups = {} + for candidate, label in zip(candidates, group_labels): + key = (label,) if not isinstance(label, tuple) else label + outer_groups.setdefault(key, []).append(candidate) + return {key: run_level(grp, level + 1) for key, grp in outer_groups.items()} + + return run_level(candidates, 0) \ No newline at end of file diff --git a/tests/semantic groupBy tests/ground truth results/amazon reviews/amazon_1.py b/tests/semantic groupBy tests/ground truth results/amazon reviews/amazon_1.py deleted file mode 100644 index 979936355..000000000 --- a/tests/semantic groupBy tests/ground truth results/amazon reviews/amazon_1.py +++ /dev/null 
@@ -1,26 +0,0 @@ -""" -Amazon Sales — Review Analysis - -Query NL: "Group by review type and return average cost of the products" - -group_cols: [LLM("reviewText")] -agg_cols: ["price"] -semantic group: yes (review type/sentiment inferred from review text) -semantic agg: no (average is a standard aggregate) -""" - -import pandas as pd - -df = pd.read_csv("amazon.csv") -# assume columns: productID, reviewText, price, reviewType (LLM inferred: positive/negative/neutral) - -# Group by review type and compute average price -result = ( - df - .groupby("reviewType") - .agg({"price": "mean"}) - .reset_index() - .rename(columns={"price": "avg_price"}) -) - -result.to_csv("amazon-review-type-avg-price.csv", index=False) \ No newline at end of file diff --git a/tests/semantic groupBy tests/ground truth results/amazon reviews/amazon_2.py b/tests/semantic groupBy tests/ground truth results/amazon reviews/amazon_2.py deleted file mode 100644 index 44cfea61e..000000000 --- a/tests/semantic groupBy tests/ground truth results/amazon reviews/amazon_2.py +++ /dev/null @@ -1,30 +0,0 @@ -""" -Amazon Sales — Product Sentiment - -Query NL: "Group by user product review title" -Categories: -- Good overall -- Neutral -- Bad overall - -group_cols: [LLM("reviewTitle")] -agg_cols: ["productID"] -semantic group: yes (sentiment category inferred from review title) -semantic agg: no -""" - -import pandas as pd - -df = pd.read_csv("amazon_sales.csv") -# assume columns: productID, reviewTitle, sentimentCategory (LLM inferred: good_overall/good_with_negatives/bad_with_positives/bad_overall) - -# Group by sentiment category and count products -result = ( - df - .groupby("sentimentCategory") - .agg({"productID": "count"}) - .reset_index() - .rename(columns={"productID": "product_count"}) -) - -result.to_csv("amazon-sentiment-category-count.csv", index=False) diff --git a/tests/semantic groupBy tests/ground truth results/ecommerce/ecommerce_1.py b/tests/semantic groupBy tests/ground truth 
results/ecommerce/ecommerce_1.py deleted file mode 100644 index 7fa462024..000000000 --- a/tests/semantic groupBy tests/ground truth results/ecommerce/ecommerce_1.py +++ /dev/null @@ -1,28 +0,0 @@ -""" -E-Commerce — Color Analysis - -Query NL: "Group by color of images and return the count" - -group_cols: [LLM("imageFile")] -agg_cols: ["productID"] -semantic group: yes (color inferred from product image) -semantic agg: no -""" - -import pandas as pd - -df = pd.read_csv("ecommerce_products.csv") -# assume columns: productID, imageFile, productColor (LLM inferred from image) - -# Group by color and count products -result = ( - df - .groupby("baseColour") - .agg({"productID": "count"}) - .reset_index() - .rename(columns={"productID": "product_count"}) -) - -result.to_csv("ecommerce_1.csv", index=False) - -#TODO: join images.csv and styles.csv by productID to get imageFile and productColor \ No newline at end of file diff --git a/tests/semantic groupBy tests/ground truth results/ecommerce/ecommerce_2.py b/tests/semantic groupBy tests/ground truth results/ecommerce/ecommerce_2.py deleted file mode 100644 index 253a685fa..000000000 --- a/tests/semantic groupBy tests/ground truth results/ecommerce/ecommerce_2.py +++ /dev/null @@ -1,37 +0,0 @@ -""" -E-Commerce — Brand Grouping - -Query NL: "Group by brand and by color return the ratio between topwear - (apparel and accessories that are worn above the waist) and - bottomwear (worn at and below the waist)" - -group_cols: ["color", LLM("productDisplayName, imageFile")] -agg_cols: [LLM("productDisplayName")] -semantic group: mixed (color is direct, brand inferred from display name and image) -semantic agg: yes (clothing category inferred from product name/image) -""" - -import pandas as pd - -def topwear_bottomwear_ratio(series): - topwear_count = (series == "topwear").sum() - bottomwear_count = (series == "bottomwear").sum() - if bottomwear_count == 0: - return float('inf') if topwear_count > 0 else 0 - return topwear_count / 
bottomwear_count - -df = pd.read_csv("ecommerce_products.csv") -# assume columns: productID, brand, productDisplayName, productColor (LLM inferred), clothingCategory (LLM inferred: topwear/bottomwear) - -# Group by brand and color, compute ratio -result = ( - df - .groupby(["brand", "baseColour"]) - .agg({"subCategory": topwear_bottomwear_ratio}) - .reset_index() - .rename(columns={"subCategory": "topwear_bottomwear_ratio"}) -) - -result.to_csv("ecommerce_2.csv", index=False) - -#TODO: augmenting the brand to styles.csv \ No newline at end of file diff --git a/tests/semantic groupBy tests/ground truth results/wildlife/wildlife_1.py b/tests/semantic groupBy tests/ground truth results/wildlife/wildlife_1.py deleted file mode 100644 index 8479a8faa..000000000 --- a/tests/semantic groupBy tests/ground truth results/wildlife/wildlife_1.py +++ /dev/null @@ -1,28 +0,0 @@ -""" -Wildlife — Audio-to-Logic - -Query NL: "Group by animals that are carnivorous (from audio) and return the count for all such animals." 
- -group_cols: [LLM("audioFile")] -agg_cols: ["animalID"] -semantic group: yes (diet type inferred from audio) -semantic agg: no -""" - -import pandas as pd - -df = pd.read_csv("wildlife_audio.csv") -# assume columns: animalID, animalName, audioFile, dietType (LLM inferred from audio) - -# Filter by carnivorous animals (LLM output already materialized) -carnivorous_df = df[df["dietType"] == "carnivorous"] - -# Count the number of carnivorous animals -result = pd.DataFrame({ - "dietType": ["carnivorous"], - "animal_count": [len(carnivorous_df)] -}) - -result.to_csv("wildlife_1.csv", index=False) - -#TODO: Augment dietType to the dataset \ No newline at end of file diff --git a/tests/semantic groupBy tests/ground truth results/wildlife/wildlife_2.py b/tests/semantic groupBy tests/ground truth results/wildlife/wildlife_2.py deleted file mode 100644 index 7bcf63d80..000000000 --- a/tests/semantic groupBy tests/ground truth results/wildlife/wildlife_2.py +++ /dev/null @@ -1,34 +0,0 @@ -""" -Wildlife — Lat/Long Extraction - -Query NL: "Group by country (from the longitude and latitude). - Compute the count of {animal} for every country." 
- -group_cols: [LLM("latitude", "longitude")] -agg_cols: [LLM("imageFile")] -semantic group: yes (country inferred from coordinates) -semantic agg: yes (animal type inferred from image) -""" - -import pandas as pd - -df = pd.read_csv("wildlife_location.csv") -# assume columns: animalID, latitude, longitude, imageFile, country (LLM inferred), animalType (LLM inferred from image) - -ANIMAL_TYPE = "lion" - -# Filter by animal type -filtered_df = df[df["animalType"] == ANIMAL_TYPE] - -# Group by country and animal type, count animals -result = ( - filtered_df - .groupby(["country", "animalType"]) - .agg({"animalID": "count"}) - .reset_index() - .rename(columns={"animalID": "animal_count"}) -) - -result.to_csv("wildlife_2.csv", index=False) - -#TODO: Augment country to the dataset \ No newline at end of file diff --git a/tests/semantic groupBy tests/ground truth results/wildlife/wildlife_3.py b/tests/semantic groupBy tests/ground truth results/wildlife/wildlife_3.py deleted file mode 100644 index 1bdb14b5a..000000000 --- a/tests/semantic groupBy tests/ground truth results/wildlife/wildlife_3.py +++ /dev/null @@ -1,29 +0,0 @@ -""" -Wildlife — Average Age - -Query NL: "Group by small animals (from image) and return their average age." 
-Note: Small = an animal that weighs less than 50kg and has dimensions less than 1m - -group_cols: [LLM("imageFile")] -agg_cols: ["age"] -semantic group: yes (size category inferred from image, weight and dimensions) -semantic agg: no -""" - -import pandas as pd - -df = pd.read_csv("wildlife_detailed.csv") -# assume columns: animalID, imageFile, age, weight_kg, max_dimension_m, isSmall (LLM inferred: weight < 50kg AND dimension < 1m) - -# Filter by small animals (LLM output already materialized) -small_animals_df = df[df["isSmall"] == True] - -# Calculate average age -result = pd.DataFrame({ - "size_category": ["small"], - "avg_age": [small_animals_df["age"].mean()] -}) - -result.to_csv("wildlife_3.csv", index=False) - -# TODO: Augment size_category to the dataset \ No newline at end of file diff --git a/tests/semantic groupBy tests/ground truth results/movies/movies_1.py b/tests/semantic groupBy tests/movies/movies_1.py similarity index 100% rename from tests/semantic groupBy tests/ground truth results/movies/movies_1.py rename to tests/semantic groupBy tests/movies/movies_1.py diff --git a/tests/semantic groupBy tests/ground truth results/movies/movies_1_pz.py b/tests/semantic groupBy tests/movies/movies_1_pz.py similarity index 98% rename from tests/semantic groupBy tests/ground truth results/movies/movies_1_pz.py rename to tests/semantic groupBy tests/movies/movies_1_pz.py index 4cf5391bf..97181f31d 100644 --- a/tests/semantic groupBy tests/ground truth results/movies/movies_1_pz.py +++ b/tests/semantic groupBy tests/movies/movies_1_pz.py @@ -103,7 +103,7 @@ def main(): agg_fields = ["scoreSentiment"] agg_funcs = ["count"] # We'll use count initially to demonstrate grouping - grouped_reviews = reviews.groupby(gby_fields, agg_fields, agg_funcs) + grouped_reviews = reviews.sem_groupby(gby_fields, agg_fields, agg_funcs) # Configure and run the query config = pz.QueryProcessorConfig( diff --git a/tests/semantic groupBy tests/ground truth results/movies/movies_2.py 
b/tests/semantic groupBy tests/movies/movies_2.py similarity index 100% rename from tests/semantic groupBy tests/ground truth results/movies/movies_2.py rename to tests/semantic groupBy tests/movies/movies_2.py diff --git a/tests/semantic groupBy tests/movies/pz-programs/compare_query6_results.py b/tests/semantic groupBy tests/movies/pz-programs/compare_query6_results.py new file mode 100644 index 000000000..38c6d8fb9 --- /dev/null +++ b/tests/semantic groupBy tests/movies/pz-programs/compare_query6_results.py @@ -0,0 +1,224 @@ +#!/usr/bin/env python3 +""" +Compare Query 6 results from PZ with ground truth. +Generates a styled summary table image similar to the analysis summary_table.png. +""" + +import math +from pathlib import Path + +import matplotlib +matplotlib.use("Agg") +import matplotlib.pyplot as plt +import pandas as pd + + +# ─── Styling (matches analyze.py) ───────────────────────────────────────────── + +HEADER_COLOR = "#1E3A5F" +ROW_ALT_COLOR = "#F7F9FC" +ROW_LABEL_COLOR = "#E8EDF5" +EDGE_COLOR = "#CCCCCC" + + +def style_table(tbl, n_data_rows: int): + """Apply the shared header/row styling.""" + tbl.auto_set_font_size(False) + tbl.set_fontsize(9) + tbl.scale(1.2, 1.6) + for (r, c), cell in tbl.get_celld().items(): + if r == 0: + cell.set_facecolor(HEADER_COLOR) + cell.set_text_props(color="white", fontweight="bold") + elif c == -1: + cell.set_facecolor(ROW_LABEL_COLOR) + cell.set_text_props(fontweight="bold") + else: + cell.set_facecolor(ROW_ALT_COLOR if r % 2 == 0 else "white") + cell.set_edgecolor(EDGE_COLOR) + + +def make_stats_subtable(ax, comparison: pd.DataFrame): + """Left sub-table: summary statistics.""" + ax.axis("off") + + exact = (comparison["Difference"] < 1e-9).sum() + close = (comparison["Difference"] <= 0.1).sum() + n = len(comparison) + + rows = [ + ["Directors compared", str(n)], + ["Exact matches", f"{exact} ({100*exact/n:.1f}%)"], + ["Within ±0.1", f"{close} ({100*close/n:.1f}%)"], + ["Mean |difference|", 
f"{comparison['Difference'].mean():.4f}"], + ["Std deviation", f"{comparison['Difference'].std():.4f}"], + ["Min difference", f"{comparison['Difference'].min():.4f}"], + ["Max difference", f"{comparison['Difference'].max():.4f}"], + ] + + tbl = ax.table( + cellText=[[r[1]] for r in rows], + rowLabels=[r[0] for r in rows], + colLabels=["Value"], + cellLoc="center", + loc="center", + ) + style_table(tbl, len(rows)) + # ax.set_title("Summary Statistics", fontsize=11, fontweight="bold", pad=10) + + +def make_score_subtable(ax, comparison: pd.DataFrame): + """Middle sub-table: distribution of differences by bucket.""" + ax.axis("off") + + diff = comparison["Difference"] + buckets = [ + ("= 0.0", diff < 1e-9), + ("0.0 – 0.1", (diff >= 1e-9) & (diff <= 0.1)), + ("0.1 – 0.2", (diff > 0.1) & (diff <= 0.2)), + ("0.2 – 0.3", (diff > 0.2) & (diff <= 0.3)), + ("> 0.3", diff > 0.3), + ] + n = len(comparison) + cell_data = [[str(mask.sum()), f"{100*mask.sum()/n:.1f}%"] for _, mask in buckets] + row_labels = [label for label, _ in buckets] + + tbl = ax.table( + cellText=cell_data, + rowLabels=row_labels, + colLabels=["Count", "% of Total"], + cellLoc="center", + loc="center", + ) + style_table(tbl, len(buckets)) + # ax.set_title("Difference Distribution", fontsize=11, fontweight="bold", pad=10) + + +def make_all_directors_subtable(ax, comparison: pd.DataFrame): + """Full directors table showing all rows.""" + ax.axis("off") + + cell_data = [ + [row["Director"], f"{row['PZ Score']:.3f}", f"{row['Ground Truth Score']:.3f}", f"{row['Difference']:.3f}"] + for _, row in comparison.iterrows() + ] + + tbl = ax.table( + cellText=cell_data, + colLabels=["Director", "PZ", "GT", "Diff"], + cellLoc="center", + loc="center", + ) + tbl.auto_set_font_size(False) + tbl.set_fontsize(8) + tbl.scale(1.0, 1.4) + for (r, c), cell in tbl.get_celld().items(): + if r == 0: + cell.set_facecolor(HEADER_COLOR) + cell.set_text_props(color="white", fontweight="bold") + else: + cell.set_facecolor(ROW_ALT_COLOR 
if r % 2 == 0 else "white") + cell.set_edgecolor(EDGE_COLOR) + + # ax.set_title("All Directors", fontsize=11, fontweight="bold", pad=10) + + +def save_summary_figure(comparison_table: pd.DataFrame, output_path: Path): + plt.rcParams.update({ + "font.family": "sans-serif", + "font.size": 11, + "axes.spines.top": False, + "axes.spines.right": False, + }) + + n = len(comparison_table) + # Top section: stats + distribution side by side + # Bottom section: full directors table spanning full width + fig = plt.figure(figsize=(14, 4.5 + n * 0.28)) + fig.suptitle( + "Query 6 — PZ vs Ground Truth", + fontsize=13, fontweight="bold", + ) + + import matplotlib.gridspec as gridspec + gs = gridspec.GridSpec(2, 2, figure=fig, height_ratios=[1, n * 0.28 / 4.5], width_ratios=[1, 1]) + + ax_stats = fig.add_subplot(gs[0, 0]) + ax_dist = fig.add_subplot(gs[0, 1]) + ax_all = fig.add_subplot(gs[1, :]) + + make_stats_subtable(ax_stats, comparison_table) + make_score_subtable(ax_dist, comparison_table) + make_all_directors_subtable(ax_all, comparison_table) + + fig.tight_layout() + fig.savefig(output_path, dpi=150, bbox_inches="tight") + plt.close(fig) + print(f" Summary figure saved to: {output_path}") + + +# ─── Main ───────────────────────────────────────────────────────────────────── + +def main(): + script_dir = Path(__file__).parent + + pz_results = pd.read_csv(script_dir / "query6_pz_output.csv") + ground_truth = pd.read_csv(script_dir / "../queries/query6_ground_truth.csv") + + comparison = pz_results.merge( + ground_truth[["director", "normalizedScore"]], + on="director", + how="inner", + suffixes=("_pz", "_gt"), + ) + comparison["difference"] = abs(comparison["normalizedScore_pz"] - comparison["normalizedScore_gt"]) + + comparison_table = comparison[["director", "normalizedScore_pz", "normalizedScore_gt", "difference"]].copy() + comparison_table.columns = ["Director", "PZ Score", "Ground Truth Score", "Difference"] + + avg_difference = comparison_table["Difference"].mean() + 
variance_difference = comparison_table["Difference"].var() + std_difference = comparison_table["Difference"].std() + + print("\n" + "="*80) + print("QUERY 6 COMPARISON: PZ vs Ground Truth") + print("="*80 + "\n") + display_table = comparison_table.copy() + display_table["PZ Score"] = display_table["PZ Score"].map(lambda x: f"{x:.3f}") + display_table["Ground Truth Score"] = display_table["Ground Truth Score"].map(lambda x: f"{x:.3f}") + display_table["Difference"] = display_table["Difference"].map(lambda x: f"{x:.3f}") + print(display_table.to_string(index=False)) + print("\n" + "="*80) + print("STATISTICS") + print("="*80) + print(f"Number of directors compared: {len(comparison_table)}") + print(f"Average difference: {avg_difference:.4f}") + print(f"Variance of difference: {variance_difference:.4f}") + print(f"Standard deviation: {std_difference:.4f}") + print(f"Min difference: {comparison_table['Difference'].min():.4f}") + print(f"Max difference: {comparison_table['Difference'].max():.4f}") + print("="*80 + "\n") + + output_file = script_dir / "query6_comparison.csv" + comparison_table.to_csv(output_file, index=False) + + stats_file = script_dir / "query6_comparison_stats.txt" + with open(stats_file, "w") as f: + f.write("QUERY 6 COMPARISON STATISTICS\n") + f.write("="*50 + "\n") + f.write(f"Number of directors compared: {len(comparison_table)}\n") + f.write(f"Average difference: {avg_difference:.4f}\n") + f.write(f"Variance of difference: {variance_difference:.4f}\n") + f.write(f"Standard deviation: {std_difference:.4f}\n") + f.write(f"Min difference: {comparison_table['Difference'].min():.4f}\n") + f.write(f"Max difference: {comparison_table['Difference'].max():.4f}\n") + + print(f"Comparison table saved to: {output_file}") + print(f"Statistics saved to: {stats_file}") + + figure_file = script_dir / "query6_summary_table.png" + save_summary_figure(comparison_table, figure_file) + + +if __name__ == "__main__": + main() diff --git a/tests/semantic groupBy 
tests/movies/pz-programs/query_2_pz.py b/tests/semantic groupBy tests/movies/pz-programs/query_2_pz.py index 300a06c8f..c8ae1dccb 100644 --- a/tests/semantic groupBy tests/movies/pz-programs/query_2_pz.py +++ b/tests/semantic groupBy tests/movies/pz-programs/query_2_pz.py @@ -30,7 +30,7 @@ def main(): - parser = argparse.ArgumentParser(description="Query 2: Reviews by Era") + parser = argparse.ArgumentParser(description="Reviews can be categorized into pre-2000, 2000s, 2010s, 2020s, or unknown. Return which era category the review falls into") parser.add_argument("--verbose", default=False, action="store_true") parser.add_argument("--policy", type=str, default="maxquality") parser.add_argument("--output", type=str, default="query2_pz_output.csv") @@ -61,8 +61,20 @@ def main(): # sem_groupby: LLM infers era from releaseDateTheaters, count reviewId per era grouped = reviews.sem_groupby( - gby_fields=["releaseDateTheaters"], - agg_fields=["reviewId"], + gby_fields=[ + { + "name": "releaseDateTheaters", + "type": str, + "desc": "Reviews can be categorized into pre-2000, 2000s, 2010s, 2020s, or unknown. Return which era category the review falls into)", + } + ], + agg_fields=[ + { + "name": "reviewId", + "type": int, + "desc": "Identifier of the review", + } + ], agg_funcs=["count"], ) @@ -71,7 +83,8 @@ def main(): config = pz.QueryProcessorConfig( policy=policy, verbose=args.verbose, - execution_strategy=args.execution_strategy, + execution_strategy="sequential", + available_models=[pz.Model.GPT_5], ) data_record_collection = grouped.run(config) exec_time = time.time() - start_time diff --git a/tests/semantic groupBy tests/movies/pz-programs/query_3_pz.py b/tests/semantic groupBy tests/movies/pz-programs/query_3_pz.py index d904838c5..23af681ab 100644 --- a/tests/semantic groupBy tests/movies/pz-programs/query_3_pz.py +++ b/tests/semantic groupBy tests/movies/pz-programs/query_3_pz.py @@ -10,7 +10,7 @@ 2. 
sem_groupby – LLM semantically normalises the MPAA rating into audience-type buckets (Children, Teen, Adult, Unrated); lists scoreSentiment per group. - 3. Post-process list → frac_positive. + 3. Post-process list for frac_positive. """ import argparse @@ -56,7 +56,7 @@ def main(): script_dir = Path(__file__).parent # Load and filter data - reviews_df = pd.read_csv(script_dir / "../movie_reviews.csv").head(500) + reviews_df = pd.read_csv(script_dir / "../movie_reviews.csv") movies_df = pd.read_csv(script_dir / "../movies.csv") # Filter for director's movies and keep the rating column @@ -71,8 +71,20 @@ def main(): # sem_groupby: LLM maps MPAA rating → audience type bucket, list scoreSentiment grouped = reviews.sem_groupby( - gby_fields=["rating"], - agg_fields=["scoreSentiment"], + gby_fields=[ + { + "name": "rating", + "type": str, + "desc": "MPAA rating string (e.g., 'Adult', 'Teen', 'Children', 'Unrated')", + } + ], + agg_fields=[ + { + "name": "scoreSentiment", + "type": str, + "desc": "Sentiment label for the review (e.g., 'POSITIVE'/'NEGATIVE')", + } + ], agg_funcs=["list"], ) diff --git a/tests/semantic groupBy tests/movies/pz-programs/query_4_pz.py b/tests/semantic groupBy tests/movies/pz-programs/query_4_pz.py index 76edf0860..3d345b90a 100644 --- a/tests/semantic groupBy tests/movies/pz-programs/query_4_pz.py +++ b/tests/semantic groupBy tests/movies/pz-programs/query_4_pz.py @@ -57,49 +57,93 @@ def main(): reviews = pz.MemoryDataset(id="reviews", vals=reviews_df) - # sem_groupby: LLM infers primaryGenre from reviewText, - # groups by [reviewText (→ genre), isTopCritic], - # lists scoreSentiment per group. 
- grouped = reviews.sem_groupby( - gby_fields=["reviewText", "isTopCritic"], - agg_fields=["scoreSentiment"], - agg_funcs=["list"], - ) + # Hierarchical semantic groupby: + # Level 0 — infer primary movie genre from reviewText (constrained to 11 values) + # Level 1 — group by the existing isTopCritic boolean field + groupby_fields = [ + [ + { + "name": "reviewText", + "type": str, + "desc": ( + "The primary genre of the movie being reviewed, inferred from the review text. " + "Must be exactly one of these values — no other labels are allowed: " + "'Action', 'Adventure', 'Comedy', 'Crime', 'Documentary', " + "'Drama', 'History', 'Mystery & thriller', 'Romance', 'Sci-fi', 'War'." + ), + } + ], + [ + { + "name": "isTopCritic", + "type": str, + "desc": ( + "Whether the reviewer is a top critic. " + "Use the existing isTopCritic field value directly — " + "True maps to 'Top Critic', False maps to 'Not Top Critic'. " + "Do not use any other labels." + ), + } + ], + ] + agg_fields = [ + [{"name": "scoreSentiment", "type": str, "desc": "Sentiment label for the review"}], + [{"name": "scoreSentiment", "type": str, "desc": "Sentiment label for the review"}], + ] + agg_funcs = [ + ["list"], + ["list"] + ] - # Execute start_time = time.time() - config = pz.QueryProcessorConfig( - policy=policy, - verbose=args.verbose, - execution_strategy=args.execution_strategy, + # hierarchical_sem_groupby now returns (nested_result, accumulated_gen_stats) + nested_result, gen_stats = reviews.hierarchical_sem_groupby( + groupby_fields=groupby_fields, + agg_fields=agg_fields, + agg_funcs=agg_funcs ) - data_record_collection = grouped.run(config) exec_time = time.time() - start_time - # Post-process: compute frac_positive from the sentiment lists - result_df = pd.DataFrame([ - { - "primaryGenre": r.reviewText, - "isTopCritic": r.isTopCritic, - "frac_positive": ( - sum(1 for s in r.scoreSentiment if str(s).upper() == "POSITIVE") - / len(r.scoreSentiment) - if len(r.scoreSentiment) > 0 - else 
0.0 - ), - "review_count": len(r.scoreSentiment), - } - for r in data_record_collection - ]) + # Flatten nested results and compute frac_positive + rows = [] + for genre_key, inner_result in nested_result.items(): + genre = genre_key[0] if isinstance(genre_key, tuple) else genre_key + for r in inner_result.data_records: + # Normalize LLM string → boolean to match the GT's isTopCritic format + raw_itc = str(r.isTopCritic).strip().lower() + is_top_critic = raw_itc in ("top critic", "true", "yes", "1") + sentiments = r.scoreSentiment + frac_pos = ( + sum(1 for s in sentiments if str(s).upper() == "POSITIVE") / len(sentiments) + if len(sentiments) > 0 else 0.0 + ) + rows.append({ + "primaryGenre": genre, + "isTopCritic": is_top_critic, + "frac_positive": frac_pos, + "review_count": len(sentiments) + }) + result_df = pd.DataFrame(rows) os.makedirs(os.path.dirname(args.output) or ".", exist_ok=True) result_df.to_csv(args.output, index=False) if args.stats_output is not None: os.makedirs(os.path.dirname(args.stats_output) or ".", exist_ok=True) + total_cost = gen_stats.total_input_cost + gen_stats.total_output_cost + total_tokens = int(gen_stats.total_input_tokens + gen_stats.total_output_tokens) + stats = { + "total_execution_time": exec_time, + "total_execution_cost": total_cost, + "total_tokens": total_tokens, + "optimization_time": 0.0, + "plan_execution_time": exec_time, + } with open(args.stats_output, "w") as f: - json.dump(data_record_collection.execution_stats.to_json(), f, indent=2) + json.dump(stats, f, indent=2) print(f"\nExecution time: {exec_time:.2f}s") + print(f"Total cost: ${gen_stats.total_input_cost + gen_stats.total_output_cost:.4f}") + print(f"Total tokens: {int(gen_stats.total_input_tokens + gen_stats.total_output_tokens):,}") print(f"Results saved to: {args.output}") if args.stats_output is not None: print(f"Execution stats saved to: {args.stats_output}") diff --git a/tests/semantic groupBy tests/movies/pz-programs/query_5_pz.py b/tests/semantic 
groupBy tests/movies/pz-programs/query_5_pz.py index ceab5f651..8f86b6e52 100644 --- a/tests/semantic groupBy tests/movies/pz-programs/query_5_pz.py +++ b/tests/semantic groupBy tests/movies/pz-programs/query_5_pz.py @@ -53,7 +53,7 @@ def main(): script_dir = Path(__file__).parent # Load and filter data - reviews_df = pd.read_csv(script_dir / "../movie_reviews.csv").head(500) + reviews_df = pd.read_csv(script_dir / "../movie_reviews.csv") movies_df = pd.read_csv(script_dir / "../movies.csv") filtered_movies = movies_df[ @@ -68,8 +68,26 @@ def main(): # sem_groupby: LLM reads reviewText and groups by emotional tone, count reviewId grouped = reviews.sem_groupby( - gby_fields=["reviewText"], - agg_fields=["reviewId"], + gby_fields=[ + { + "name": "reviewText", + "type": str, + "desc": ( + "The emotional tone of the review. " + "Must be exactly one of these three values — no other labels are allowed: " + "'Enthusiastic', " + "'Measured', " + "'Disappointed'." + ), + } + ], + agg_fields=[ + { + "name": "reviewId", + "type": int, + "desc": "Identifier of the review", + } + ], agg_funcs=["count"], ) diff --git a/tests/semantic groupBy tests/movies/pz-programs/query_6_pz.py b/tests/semantic groupBy tests/movies/pz-programs/query_6_pz.py index e69de29bb..5b1c4e8e1 100644 --- a/tests/semantic groupBy tests/movies/pz-programs/query_6_pz.py +++ b/tests/semantic groupBy tests/movies/pz-programs/query_6_pz.py @@ -0,0 +1,190 @@ +#!/usr/bin/env python3 +""" +Query 6 — Most Positive Review by Director (Palimpzest) + +Pipeline: + 1. Join movie_reviews with movies to get director per review. + 2. Drop records with missing or unparseable originalScore; normalise to [0, 1]. + 3. Python groupby("director") — exact, non-semantic. + 4. For each director group: sem_map to score each review's positivity. + 5. Find the review with the highest positivity score. + +Comparison metric: |ground_truth_normalizedScore − pz_normalizedScore| per director. 
+""" + +import argparse +import json +import os +import sys +import time +from pathlib import Path + +import pandas as pd +from dotenv import load_dotenv + +repo_root = Path(__file__).resolve().parents[4] +sys.path.insert(0, str(repo_root / "src")) + +import palimpzest as pz + +load_dotenv() + + +def parse_score(score_str) -> float | None: + """Parse "3.5/4", "4/5", etc. into a float in [0, 1]. Returns None if unparseable.""" + if pd.isna(score_str) or str(score_str).strip() == "": + return None + parts = str(score_str).strip().split("/") + if len(parts) == 2: + try: + num, den = float(parts[0]), float(parts[1]) + return num / den if den != 0 else None + except ValueError: + return None + return None + + +def main(): + parser = argparse.ArgumentParser(description="Query 6: Most Positive Review by Director") + parser.add_argument("--verbose", default=False, action="store_true") + parser.add_argument("--policy", type=str, default="maxquality", + help="One of 'mincost', 'mintime', 'maxquality'") + parser.add_argument("--output", type=str, default="query6_pz_output.csv") + parser.add_argument("--stats-output", type=str, default=None, + help="Optional path to write execution stats JSON") + parser.add_argument( + "--execution-strategy", type=str, default="sequential", + help="One of 'sequential', 'pipelined', 'parallel'", + ) + args = parser.parse_args() + + policy_map = { + "mincost": pz.MinCost(), + "mintime": pz.MinTime(), + "maxquality": pz.MaxQuality(), + } + policy = policy_map.get(args.policy, pz.MaxQuality()) + + script_dir = Path(__file__).parent + + # ── Load and prepare data ───────────────────────────────────────── + reviews_df = pd.read_csv(script_dir / "../movie_reviews.csv") + movies_df = pd.read_csv(script_dir / "../movies.csv")[["id", "director"]] + + merged_df = reviews_df.merge(movies_df, on="id", how="left") + merged_df = merged_df.dropna(subset=["originalScore"]) + merged_df = merged_df[merged_df["originalScore"].str.strip() != ""] + 
merged_df["normalizedScore"] = merged_df["originalScore"].apply(parse_score) + merged_df = merged_df.dropna(subset=["normalizedScore", "director"]) + + directors = merged_df["director"].unique() + print(f"Loaded {len(merged_df)} reviews across {len(directors)} directors") + + # ── Non-semantic groupby + sem_agg per director ─────────────────── + rows = [] + # Accumulated execution stats across all sem_agg calls + acc_input_tokens = 0 + acc_output_tokens = 0 + acc_exec_cost = 0.0 + acc_opt_time = 0.0 + acc_plan_time = 0.0 + + wall_start = time.time() + + count = 0 + for director, group_df in merged_df.groupby("director"): + if count >= 40: + break + + count += 1 + # Keep the full group_df for lookup later + full_group_df = group_df[["reviewText", "normalizedScore"]].reset_index(drop=True) + + # Build a PZ dataset with reviewText and normalizedScore + ds = pz.MemoryDataset(id="reviews", vals=full_group_df) + + # Use sem_map to score each review's positivity (0-10 scale) + scored_ds = ds.sem_map( + cols=[{ + "name": "positivityScore", + "type": float, + "desc": "A score from 0 to 10 indicating how positive this review is, where 10 is extremely positive and 0 is very negative.", + }], + depends_on="reviewText", + ) + + # Create fresh config for each director group + config = pz.QueryProcessorConfig( + policy=policy, + verbose=args.verbose, + execution_strategy=args.execution_strategy, + ) + + result_collection = scored_ds.run(config) + + # Find the review with the highest positivity score + max_score_idx = -1 + max_positivity = -1 + scored_reviews = [] + for idx, r in enumerate(result_collection): + scored_reviews.append(r) + if r.positivityScore > max_positivity: + max_positivity = r.positivityScore + max_score_idx = idx + + # Get the most positive review + most_positive = None + norm_score = None + if max_score_idx >= 0: + best_review = scored_reviews[max_score_idx] + most_positive = best_review.reviewText + norm_score = float(best_review.normalizedScore) + + 
rows.append({ + "director": director, + "mostPositiveReview": most_positive, + "normalizedScore": norm_score, + }) + + # Accumulate execution stats from this director's run + es = result_collection.execution_stats + acc_input_tokens += es.total_input_tokens + acc_output_tokens += es.total_output_tokens + acc_exec_cost += es.total_execution_cost + acc_opt_time += es.optimization_time + acc_plan_time += es.plan_execution_time + + wall_time = time.time() - wall_start + + # ── Save results ────────────────────────────────────────────────── + result_df = pd.DataFrame(rows).sort_values("director").reset_index(drop=True) + os.makedirs(os.path.dirname(args.output) or ".", exist_ok=True) + result_df.to_csv(args.output, index=False) + + # ── Save execution stats ────────────────────────────────────────── + if args.stats_output is not None: + stats = { + "total_execution_time": wall_time, + "total_optimization_time": acc_opt_time, + "plan_execution_time": acc_plan_time, + "total_input_tokens": acc_input_tokens, + "total_output_tokens": acc_output_tokens, + "total_tokens": acc_input_tokens + acc_output_tokens, + "total_execution_cost": acc_exec_cost, + "num_directors": len(rows), + } + os.makedirs(os.path.dirname(args.stats_output) or ".", exist_ok=True) + with open(args.stats_output, "w") as f: + json.dump(stats, f, indent=2) + + print(f"\nExecution time: {wall_time:.2f}s") + print(f"Total tokens: {acc_input_tokens + acc_output_tokens:,}") + print(f"Total cost: ${acc_exec_cost:.4f}") + print(f"Results saved to: {args.output}") + if args.stats_output is not None: + print(f"Execution stats saved to: {args.stats_output}") + print(f"Generated {len(result_df)} director groups") + + +if __name__ == "__main__": + main() diff --git a/tests/semantic groupBy tests/movies/queries/query_6.py b/tests/semantic groupBy tests/movies/queries/query_6.py index e69de29bb..e53951944 100644 --- a/tests/semantic groupBy tests/movies/queries/query_6.py +++ b/tests/semantic groupBy 
tests/movies/queries/query_6.py @@ -0,0 +1,78 @@ +""" +Query 6 — Most Positive Review by Director (Semantic GroupBy + Numeric Agg) + +Query NL: "Group by director and find the most positive review per director" +- group_cols: ["director" (literal, from movies table)] +- agg_cols: [max(normalizedScore) from originalScore] +- semantic group: no (director is a literal column) +- semantic agg: yes (LLM("reviewText") used in PZ to score sentiment) + +Ground truth: + 1. Join movie_reviews with movies on id to get director per review. + 2. Drop records where originalScore is missing or unparseable. + 3. Normalize originalScore ("3.5/4", "4/5", etc.) to [0, 1]. + 4. For each director, select the review with the highest normalized score. + +do it for each director and compute the distance between the score of the most positive +review using sem_groupBy (LLM(reviewText)) to actual output from python (ground truth). + +doing directionally better. (don't worry about the exact numbers, just want to see if +it's improving or not). Show that these optimisations can get better performance and +then bake it into the query optimiser. (put it into the PZ and show the optimiser +can pick the best one) +""" + +import pandas as pd + + +def parse_score(score_str): + """ + Parse scores like "3.5/4", "4/5", "1/10" into a float in [0, 1]. + Returns None if the string is missing or unparseable. 
+ """ + if pd.isna(score_str) or str(score_str).strip() == "": + return None + parts = str(score_str).strip().split("/") + if len(parts) == 2: + try: + numerator = float(parts[0]) + denominator = float(parts[1]) + if denominator == 0: + return None + return numerator / denominator + except ValueError: + return None + return None + + +reviews = pd.read_csv("../movie_reviews.csv") +movies = pd.read_csv("../movies.csv")[["id", "director"]] + +# Join to get director for each review +merged = reviews.merge(movies, on="id", how="left") + +# Drop records with missing originalScore +merged = merged.dropna(subset=["originalScore"]) +merged = merged[merged["originalScore"].str.strip() != ""] + +# Normalize originalScore to [0, 1] +merged["normalizedScore"] = merged["originalScore"].apply(parse_score) + +# Drop records where score could not be parsed +merged = merged.dropna(subset=["normalizedScore"]) + +# Drop records with missing director +merged = merged.dropna(subset=["director"]) + +# For each director, pick the review with the highest normalized score +result = ( + merged + .sort_values("normalizedScore", ascending=False) + .groupby("director", as_index=False) + .first()[["director", "normalizedScore", "reviewText", "originalScore"]] +) + +result = result.sort_values("director").reset_index(drop=True) + +result.to_csv("query6_ground_truth.csv", index=False) +print(f"Generated ground truth with {len(result)} directors") \ No newline at end of file diff --git a/tests/semantic groupBy tests/movies/rerun_comparison.py b/tests/semantic groupBy tests/movies/rerun_comparison.py new file mode 100644 index 000000000..ca26dc949 --- /dev/null +++ b/tests/semantic groupBy tests/movies/rerun_comparison.py @@ -0,0 +1,241 @@ +#!/usr/bin/env python3 +""" +Rerun comparisons against already-generated PZ / baseline outputs. + +What this script does: + 1. Regenerates the ground-truth CSVs for Q3 and Q5 (fixing input inconsistencies). + 2. 
Recomputes comparison metrics (using normalized MAE quality) for every query + and system (pz / baseline) using the *existing* output CSVs — the PZ and + baseline programs themselves are NOT re-run. + 3. Writes updated comparison JSONs. + 4. Calls analyze.py to regenerate all figures and tables. + +Usage: + python rerun_comparison.py [--policies maxquality] [--ids 3,5] +""" + +from __future__ import annotations + +import argparse +import json +import subprocess +import sys +from pathlib import Path +from typing import Any + +import pandas as pd +from pandas.api.types import is_numeric_dtype + +BASE_DIR = Path(__file__).resolve().parent +QUERIES_DIR = BASE_DIR / "queries" +RESULTS_DIR = BASE_DIR / "results" +ANALYZE_SCRIPT = RESULTS_DIR / "analyze.py" + + +# ─── Quality metric (normalized MAE) ────────────────────────────────────────── + +def _compare_outputs( + gt_df: pd.DataFrame, + pred_df: pd.DataFrame, + pred_suffix: str, # "pz" or "baseline" + tol: float, +) -> dict[str, Any]: + common_cols = sorted(set(gt_df.columns).intersection(pred_df.columns)) + if not common_cols: + return { + "pass": False, + "reason": "no_common_columns", + "num_rows_gt": len(gt_df), + f"num_rows_{pred_suffix}": len(pred_df), + } + + key_cols, numeric_cols = [], [] + for col in common_cols: + if is_numeric_dtype(gt_df[col]) and is_numeric_dtype(pred_df[col]): + numeric_cols.append(col) + else: + key_cols.append(col) + + if key_cols: + merged = gt_df.merge( + pred_df, + on=key_cols, + how="outer", + suffixes=("_gt", f"_{pred_suffix}"), + indicator=True, + ) + missing_in_pred = int((merged["_merge"] == "left_only").sum()) + missing_in_gt = int((merged["_merge"] == "right_only").sum()) + compare_rows = merged[merged["_merge"] == "both"] + else: + gt_s = gt_df.sort_values(by=common_cols).reset_index(drop=True) + pred_s = pred_df.sort_values(by=common_cols).reset_index(drop=True) + n = min(len(gt_s), len(pred_s)) + compare_rows = pd.concat( + [gt_s.iloc[:n].add_suffix("_gt"), 
pred_s.iloc[:n].add_suffix(f"_{pred_suffix}")], + axis=1, + ) + missing_in_pred = max(0, len(gt_s) - len(pred_s)) + missing_in_gt = max(0, len(pred_s) - len(gt_s)) + + metrics: dict[str, Any] = { + "num_rows_gt": len(gt_df), + f"num_rows_{pred_suffix}": len(pred_df), + f"missing_in_{pred_suffix}": missing_in_pred, + "missing_in_gt": missing_in_gt, + "num_compared": len(compare_rows), + } + + max_abs_error = mean_abs_error = mean_norm_error = None + mismatched_rows = 0 + + if numeric_cols and len(compare_rows) > 0: + abs_errors, norm_errors = [], [] + for col in numeric_cols: + gt_col = f"{col}_gt" + pred_col = f"{col}_{pred_suffix}" + if gt_col not in compare_rows or pred_col not in compare_rows: + continue + diff = (compare_rows[gt_col] - compare_rows[pred_col]).abs() + abs_errors.append(diff) + gt_mean = compare_rows[gt_col].abs().mean() + norm_errors.append(diff / gt_mean if gt_mean > 0 else diff) + + if abs_errors: + all_abs = pd.concat(abs_errors, axis=1) + all_norm = pd.concat(norm_errors, axis=1) + max_abs_error = float(all_abs.max().max()) + mean_abs_error = float(all_abs.mean().mean()) + mean_norm_error = float(all_norm.mean().mean()) + mismatched_rows = int((all_abs.max(axis=1) > tol).sum()) + + metrics.update({ + "max_abs_error": max_abs_error, + "mean_abs_error": mean_abs_error, + "mismatched_rows": mismatched_rows, + }) + + passed = ( + missing_in_pred == 0 + and missing_in_gt == 0 + and (max_abs_error is None or max_abs_error <= tol) + and mismatched_rows == 0 + ) + metrics["pass"] = bool(passed) + + if mean_norm_error is not None: + metrics["quality_score"] = float(max(0.0, 1.0 - mean_norm_error)) + + return metrics + + +# ─── Ground-truth regeneration ──────────────────────────────────────────────── + +def _regen_ground_truth(query_ids: list[int]) -> None: + """Re-run the GT scripts for the given query IDs.""" + for qid in query_ids: + script = QUERIES_DIR / f"query_{qid}.py" + if not script.exists(): + print(f" [GT] query_{qid}.py not found — 
skipping") + continue + print(f" [GT] Regenerating ground truth for query {qid}...") + subprocess.run( + [sys.executable, str(script)], + cwd=str(QUERIES_DIR), + check=True, + ) + + +# ─── Main ───────────────────────────────────────────────────────────────────── + +def main() -> None: + parser = argparse.ArgumentParser(description="Rerun comparison (no PZ re-execution)") + parser.add_argument("--policies", default="maxquality", + help="Comma-separated policies (default: maxquality)") + parser.add_argument("--ids", default="", + help="Comma-separated query IDs to update (default: all found)") + parser.add_argument("--regen-gt-ids", default="5", + help="Comma-separated IDs whose GT CSV should be regenerated " + "(default: 5, which had the approxTone→emotionalTone fix)") + parser.add_argument("--tolerance", type=float, default=1e-6) + parser.add_argument("--skip-analyze", action="store_true", + help="Skip calling analyze.py at the end") + args = parser.parse_args() + + policies = [p.strip() for p in args.policies.split(",") if p.strip()] + regen_gt_ids = [int(x) for x in args.regen_gt_ids.split(",") if x.strip().isdigit()] + requested_ids = {int(x) for x in args.ids.split(",") if x.strip().isdigit()} + + # Step 1 – regenerate ground-truth CSVs for the fixed queries + if regen_gt_ids: + print("\n── Regenerating ground-truth CSVs ──") + _regen_ground_truth(regen_gt_ids) + + # Step 2 – find all queries that have a ground-truth CSV + gt_paths = { + int(p.stem.replace("query", "").replace("_ground_truth", "")): p + for p in QUERIES_DIR.glob("query*_ground_truth.csv") + } + if requested_ids: + gt_paths = {k: v for k, v in gt_paths.items() if k in requested_ids} + + print(f"\n── Recomputing comparisons for queries: {sorted(gt_paths)} ──") + + for policy in policies: + policy_dir = RESULTS_DIR / policy + if not policy_dir.exists(): + print(f" Policy dir not found: {policy_dir} — skipping") + continue + + for qid, gt_path in sorted(gt_paths.items()): + gt_df = 
pd.read_csv(gt_path) + + for pred_suffix, json_name, csv_name in [ + ("pz", f"query{qid}_comparison.json", f"query{qid}_pz_output.csv"), + ("baseline", f"query{qid}_baseline_comparison.json", f"query{qid}_baseline_output.csv"), + ]: + pred_csv = policy_dir / csv_name + json_path = policy_dir / json_name + + if not pred_csv.exists(): + print(f" [Q{qid}][{policy}][{pred_suffix}] output CSV missing — skipping") + continue + + pred_df = pd.read_csv(pred_csv) + compare = _compare_outputs(gt_df, pred_df, pred_suffix, args.tolerance) + + # Preserve execution stats from the existing JSON + exec_stats: dict[str, Any] = {} + if json_path.exists(): + with open(json_path) as f: + old = json.load(f) + for key in ("total_execution_time", "total_execution_cost", + "total_tokens", "optimization_time", "plan_execution_time"): + exec_stats[key] = old.get(key) + + row = { + "test_id": qid, + "policy": policy, + **exec_stats, + **compare, + } + with open(json_path, "w") as f: + json.dump(row, f, indent=2) + + q_score = compare.get("quality_score", "n/a") + status = "PASS" if compare.get("pass") else "FAIL" + print(f" [Q{qid}][{policy}][{pred_suffix}] {status} quality={q_score:.4f}" if isinstance(q_score, float) else f" [Q{qid}][{policy}][{pred_suffix}] {status} quality={q_score}") + + # Step 3 – regenerate figures + if not args.skip_analyze: + if ANALYZE_SCRIPT.exists(): + print(f"\n── Regenerating figures ({ANALYZE_SCRIPT.name}) ──") + subprocess.run([sys.executable, str(ANALYZE_SCRIPT)], check=True) + else: + print(f"\nanalyze.py not found at {ANALYZE_SCRIPT} — skipping figure generation") + + print("\nDone.") + + +if __name__ == "__main__": + main() diff --git a/tests/semantic groupBy tests/movies/run_baseline_tests.py b/tests/semantic groupBy tests/movies/run_baseline_tests.py new file mode 100644 index 000000000..31269d01c --- /dev/null +++ b/tests/semantic groupBy tests/movies/run_baseline_tests.py @@ -0,0 +1,267 @@ +#!/usr/bin/env python3 +""" +Run baseline group-by tests: 
execute baseline implementations (sem_map + groupby), +compare outputs against ground truth, and log performance metrics. + +Results are written to the same results/ folder as the sem_groupby tests, +with '_baseline' suffixed filenames so both approaches can be compared side-by-side. +""" + +from __future__ import annotations + +import argparse +import json +import subprocess +import sys +from pathlib import Path +from typing import Any + +import pandas as pd +from pandas.api.types import is_numeric_dtype + + +BASE_DIR = Path(__file__).resolve().parent +QUERIES_DIR = BASE_DIR / "queries" +BASELINE_DIR = BASE_DIR / "pz-baseline" +RESULTS_DIR = BASE_DIR / "results" + + +def _discover_tests() -> list[dict[str, Path]]: + """Find matching pairs of ground-truth query scripts and baseline scripts.""" + query_files = {} + for path in QUERIES_DIR.glob("query_*.py"): + parts = path.stem.split("_") + if len(parts) == 2 and parts[1].isdigit(): + query_files[int(parts[1])] = path + + baseline_files = {} + for path in BASELINE_DIR.glob("query_*_baseline.py"): + parts = path.stem.split("_") + if len(parts) == 3 and parts[1].isdigit() and parts[2] == "baseline": + baseline_files[int(parts[1])] = path + + test_ids = sorted(set(query_files).intersection(baseline_files)) + tests = [] + for test_id in test_ids: + tests.append({ + "id": test_id, + "query_script": query_files[test_id], + "baseline_script": baseline_files[test_id], + }) + return tests + + +def _run_script(script_path: Path, cwd: Path, args: list[str]) -> None: + cmd = [sys.executable, str(script_path), *args] + subprocess.run(cmd, cwd=str(cwd), check=True) + + +def _ground_truth_output_path(test_id: int) -> Path: + return QUERIES_DIR / f"query{test_id}_ground_truth.csv" + + +def _compare_outputs(gt_df: pd.DataFrame, baseline_df: pd.DataFrame, tol: float) -> dict[str, Any]: + common_cols = sorted(set(gt_df.columns).intersection(baseline_df.columns)) + if not common_cols: + return { + "pass": False, + "reason": 
"no_common_columns", + "num_rows_gt": len(gt_df), + "num_rows_baseline": len(baseline_df), + } + + key_cols = [] + numeric_cols = [] + for col in common_cols: + gt_is_num = is_numeric_dtype(gt_df[col]) + bl_is_num = is_numeric_dtype(baseline_df[col]) + if gt_is_num and bl_is_num: + numeric_cols.append(col) + else: + key_cols.append(col) + + if key_cols: + merged = gt_df.merge( + baseline_df, + on=key_cols, + how="outer", + suffixes=("_gt", "_baseline"), + indicator=True, + ) + missing_in_baseline = (merged["_merge"] == "left_only").sum() + missing_in_gt = (merged["_merge"] == "right_only").sum() + compare_rows = merged[merged["_merge"] == "both"] + else: + gt_sorted = gt_df.sort_values(by=common_cols).reset_index(drop=True) + bl_sorted = baseline_df.sort_values(by=common_cols).reset_index(drop=True) + min_len = min(len(gt_sorted), len(bl_sorted)) + compare_rows = pd.concat( + [ + gt_sorted.iloc[:min_len].add_suffix("_gt"), + bl_sorted.iloc[:min_len].add_suffix("_baseline"), + ], + axis=1, + ) + missing_in_baseline = max(0, len(gt_sorted) - len(bl_sorted)) + missing_in_gt = max(0, len(bl_sorted) - len(gt_sorted)) + + metrics: dict[str, Any] = { + "num_rows_gt": len(gt_df), + "num_rows_baseline": len(baseline_df), + "missing_in_baseline": int(missing_in_baseline), + "missing_in_gt": int(missing_in_gt), + "num_compared": int(len(compare_rows)), + } + + max_abs_error = None + mean_abs_error = None + mismatched_rows = 0 + + if numeric_cols: + abs_errors = [] + norm_errors = [] + for col in numeric_cols: + gt_col = f"{col}_gt" + bl_col = f"{col}_baseline" + if gt_col not in compare_rows or bl_col not in compare_rows: + continue + diff = (compare_rows[gt_col] - compare_rows[bl_col]).abs() + abs_errors.append(diff) + # Normalize by GT column mean so different-scale metrics contribute equally + gt_mean = compare_rows[gt_col].abs().mean() + norm_diff = diff / gt_mean if gt_mean > 0 else diff + norm_errors.append(norm_diff) + + if abs_errors: + all_errors = 
pd.concat(abs_errors, axis=1) + all_norm_errors = pd.concat(norm_errors, axis=1) + max_abs_error = float(all_errors.max().max()) + mean_abs_error = float(all_errors.mean().mean()) + mean_norm_error = float(all_norm_errors.mean().mean()) + mismatched_rows = int((all_errors.max(axis=1) > tol).sum()) + + metrics.update({ + "max_abs_error": max_abs_error, + "mean_abs_error": mean_abs_error, + "mismatched_rows": mismatched_rows, + }) + + passed = ( + missing_in_baseline == 0 + and missing_in_gt == 0 + and (max_abs_error is None or max_abs_error <= tol) + and mismatched_rows == 0 + ) + + metrics["pass"] = bool(passed) + if mean_abs_error is not None: + # Normalized MAE: each column's errors are scaled by its GT mean, + # so large-magnitude metrics (e.g. review_count) don't drown out + # small-magnitude ones (e.g. frac_positive). + metrics["quality_score"] = float(max(0.0, 1.0 - mean_norm_error)) + + return metrics + + +def main() -> None: + parser = argparse.ArgumentParser(description="Run baseline group-by tests") + parser.add_argument("--policies", type=str, default="maxquality,mincost,mintime", + help="Comma-separated list of policies to run") + parser.add_argument("--execution-strategy", type=str, default="sequential", + help="One of 'sequential', 'pipelined', 'parallel'") + parser.add_argument("--tolerance", type=float, default=1e-6) + parser.add_argument("--regen-ground-truth", action="store_true", + help="Re-run ground truth scripts even if output already exists") + parser.add_argument("--ids", type=str, default="", + help="Comma-separated test ids to run (e.g., '1,2,3')") + args = parser.parse_args() + + policies = [p.strip() for p in args.policies.split(",") if p.strip()] + requested_ids = {int(x) for x in args.ids.split(",") if x.strip().isdigit()} + + tests = _discover_tests() + if requested_ids: + tests = [t for t in tests if t["id"] in requested_ids] + + if not tests: + print("No baseline tests found.") + return + + print(f"Found {len(tests)} test(s): 
{[t['id'] for t in tests]}") + print(f"Policies: {policies}") + print(f"Execution strategy: {args.execution_strategy}\n") + + RESULTS_DIR.mkdir(parents=True, exist_ok=True) + summary_rows = [] + + for test in tests: + test_id = test["id"] + + # Generate / load ground truth + gt_output = _ground_truth_output_path(test_id) + if args.regen_ground_truth or not gt_output.exists(): + print(f"[query {test_id}] Generating ground truth...") + _run_script(test["query_script"], QUERIES_DIR, []) + + if not gt_output.exists(): + print(f"[query {test_id}] Ground truth missing: {gt_output} — skipping") + continue + + gt_df = pd.read_csv(gt_output) + + for policy in policies: + policy_dir = RESULTS_DIR / policy + policy_dir.mkdir(parents=True, exist_ok=True) + + baseline_output = policy_dir / f"query{test_id}_baseline_output.csv" + stats_output = policy_dir / f"query{test_id}_baseline_stats.json" + + print(f"[query {test_id}][{policy}] Running baseline...") + _run_script( + test["baseline_script"], + BASELINE_DIR, + [ + "--policy", policy, + "--execution-strategy", args.execution_strategy, + "--output", str(baseline_output), + "--stats-output", str(stats_output), + ], + ) + + baseline_df = pd.read_csv(baseline_output) if baseline_output.exists() else pd.DataFrame() + compare_metrics = _compare_outputs(gt_df, baseline_df, args.tolerance) + + exec_metrics: dict[str, Any] = {} + if stats_output.exists(): + with open(stats_output) as f: + stats = json.load(f) + exec_metrics = { + "total_execution_time": stats.get("total_execution_time"), + "total_execution_cost": stats.get("total_execution_cost"), + "total_tokens": stats.get("total_tokens"), + "optimization_time": stats.get("optimization_time"), + "plan_execution_time": stats.get("plan_execution_time"), + } + + row = { + "test_id": test_id, + "policy": policy, + **exec_metrics, + **compare_metrics, + } + summary_rows.append(row) + + result_json = policy_dir / f"query{test_id}_baseline_comparison.json" + with open(result_json, "w") as 
f: + json.dump(row, f, indent=2) + + status = "PASS" if compare_metrics.get("pass") else "FAIL" + print(f"[query {test_id}][{policy}] {status}") + + summary_path = RESULTS_DIR / "baseline_summary.csv" + pd.DataFrame(summary_rows).to_csv(summary_path, index=False) + print(f"\nSummary written to: {summary_path}") + + +if __name__ == "__main__": + main() diff --git a/tests/semantic groupBy tests/movies/run_groupby_tests.py b/tests/semantic groupBy tests/movies/run_groupby_tests.py new file mode 100644 index 000000000..1cf5c7dab --- /dev/null +++ b/tests/semantic groupBy tests/movies/run_groupby_tests.py @@ -0,0 +1,254 @@ +#!/usr/bin/env python3 +""" +Run semantic group-by tests: generate ground truth, execute PZ programs, +compare outputs, and log performance metrics. +""" + +from __future__ import annotations + +import argparse +import json +import subprocess +import sys +from pathlib import Path +from typing import Any + +import pandas as pd +from pandas.api.types import is_numeric_dtype + + +BASE_DIR = Path(__file__).resolve().parent +QUERIES_DIR = BASE_DIR / "queries" +PZ_DIR = BASE_DIR / "pz-programs" +RESULTS_DIR = BASE_DIR / "results" + + +def _discover_tests() -> list[dict[str, Path]]: + query_files = {} + for path in QUERIES_DIR.glob("query_*.py"): + parts = path.stem.split("_") + if len(parts) == 2 and parts[1].isdigit(): + query_files[int(parts[1])] = path + + pz_files = {} + for path in PZ_DIR.glob("query_*_pz.py"): + parts = path.stem.split("_") + if len(parts) == 3 and parts[1].isdigit() and parts[2] == "pz": + pz_files[int(parts[1])] = path + + test_ids = sorted(set(query_files).intersection(pz_files)) + tests = [] + for test_id in test_ids: + tests.append({ + "id": test_id, + "query_script": query_files[test_id], + "pz_script": pz_files[test_id], + }) + return tests + + +def _run_script(script_path: Path, cwd: Path, args: list[str]) -> None: + cmd = [sys.executable, str(script_path), *args] + subprocess.run(cmd, cwd=str(cwd), check=True) + + +def 
_ground_truth_output_path(test_id: int) -> Path: + return QUERIES_DIR / f"query{test_id}_ground_truth.csv" + + +def _compare_outputs(gt_df: pd.DataFrame, pz_df: pd.DataFrame, tol: float) -> dict[str, Any]: + common_cols = sorted(set(gt_df.columns).intersection(pz_df.columns)) + if not common_cols: + return { + "pass": False, + "reason": "no_common_columns", + "num_rows_gt": len(gt_df), + "num_rows_pz": len(pz_df), + } + + key_cols = [] + numeric_cols = [] + for col in common_cols: + gt_is_num = is_numeric_dtype(gt_df[col]) + pz_is_num = is_numeric_dtype(pz_df[col]) + if gt_is_num and pz_is_num: + numeric_cols.append(col) + else: + key_cols.append(col) + + if key_cols: + merged = gt_df.merge( + pz_df, + on=key_cols, + how="outer", + suffixes=("_gt", "_pz"), + indicator=True, + ) + missing_in_pz = (merged["_merge"] == "left_only").sum() + missing_in_gt = (merged["_merge"] == "right_only").sum() + compare_rows = merged[merged["_merge"] == "both"] + else: + gt_sorted = gt_df.sort_values(by=common_cols).reset_index(drop=True) + pz_sorted = pz_df.sort_values(by=common_cols).reset_index(drop=True) + min_len = min(len(gt_sorted), len(pz_sorted)) + compare_rows = pd.concat( + [ + gt_sorted.iloc[:min_len].add_suffix("_gt"), + pz_sorted.iloc[:min_len].add_suffix("_pz"), + ], + axis=1, + ) + missing_in_pz = max(0, len(gt_sorted) - len(pz_sorted)) + missing_in_gt = max(0, len(pz_sorted) - len(gt_sorted)) + + metrics: dict[str, Any] = { + "num_rows_gt": len(gt_df), + "num_rows_pz": len(pz_df), + "missing_in_pz": int(missing_in_pz), + "missing_in_gt": int(missing_in_gt), + "num_compared": int(len(compare_rows)), + } + + max_abs_error = None + mean_abs_error = None + mismatched_rows = 0 + + if numeric_cols: + abs_errors = [] + norm_errors = [] + for col in numeric_cols: + gt_col = f"{col}_gt" + pz_col = f"{col}_pz" + if gt_col not in compare_rows or pz_col not in compare_rows: + continue + diff = (compare_rows[gt_col] - compare_rows[pz_col]).abs() + abs_errors.append(diff) + # 
Normalize by GT column mean so different-scale metrics contribute equally + gt_mean = compare_rows[gt_col].abs().mean() + norm_diff = diff / gt_mean if gt_mean > 0 else diff + norm_errors.append(norm_diff) + + if abs_errors: + all_errors = pd.concat(abs_errors, axis=1) + all_norm_errors = pd.concat(norm_errors, axis=1) + max_abs_error = float(all_errors.max().max()) + mean_abs_error = float(all_errors.mean().mean()) + mean_norm_error = float(all_norm_errors.mean().mean()) + mismatched_rows = int((all_errors.max(axis=1) > tol).sum()) + + metrics.update({ + "max_abs_error": max_abs_error, + "mean_abs_error": mean_abs_error, + "mismatched_rows": mismatched_rows, + }) + + passed = ( + missing_in_pz == 0 + and missing_in_gt == 0 + and (max_abs_error is None or max_abs_error <= tol) + and mismatched_rows == 0 + ) + + metrics["pass"] = bool(passed) + if mean_abs_error is not None: + # Normalized MAE: each column's errors are scaled by its GT mean, + # so large-magnitude metrics (e.g. review_count) don't drown out + # small-magnitude ones (e.g. frac_positive). 
+ quality_score = max(0.0, 1.0 - mean_norm_error) + metrics["quality_score"] = float(quality_score) + + return metrics + + +def main() -> None: + parser = argparse.ArgumentParser(description="Run semantic group-by tests") + parser.add_argument("--policies", type=str, default="maxquality,mincost,mintime", + help="Comma-separated list of policies to run") + parser.add_argument("--execution-strategy", type=str, default="sequential", + help="One of 'sequential', 'pipelined', 'parallel'") + parser.add_argument("--tolerance", type=float, default=1e-6) + parser.add_argument("--regen-ground-truth", action="store_true") + parser.add_argument("--ids", type=str, default="", + help="Comma-separated test ids to run (e.g., '1,2,3')") + args = parser.parse_args() + + policies = [p.strip() for p in args.policies.split(",") if p.strip()] + requested_ids = {int(x) for x in args.ids.split(",") if x.strip().isdigit()} + + tests = _discover_tests() + if requested_ids: + tests = [t for t in tests if t["id"] in requested_ids] + + if not tests: + print("No tests found.") + return + + RESULTS_DIR.mkdir(parents=True, exist_ok=True) + summary_rows = [] + + for test in tests: + test_id = test["id"] + gt_output = _ground_truth_output_path(test_id) + if args.regen_ground_truth or not gt_output.exists(): + _run_script(test["query_script"], QUERIES_DIR, []) + + if not gt_output.exists(): + print(f"Ground truth missing for query {test_id}: {gt_output}") + continue + + gt_df = pd.read_csv(gt_output) + + for policy in policies: + policy_dir = RESULTS_DIR / policy + policy_dir.mkdir(parents=True, exist_ok=True) + pz_output = policy_dir / f"query{test_id}_pz_output.csv" + stats_output = policy_dir / f"query{test_id}_pz_stats.json" + + _run_script( + test["pz_script"], + PZ_DIR, + [ + "--policy", policy, + "--execution-strategy", args.execution_strategy, + "--output", str(pz_output), + "--stats-output", str(stats_output), + ], + ) + + pz_df = pd.read_csv(pz_output) if pz_output.exists() else 
pd.DataFrame() + compare_metrics = _compare_outputs(gt_df, pz_df, args.tolerance) + + exec_metrics: dict[str, Any] = {} + if stats_output.exists(): + with open(stats_output, "r") as f: + stats = json.load(f) + exec_metrics = { + "total_execution_time": stats.get("total_execution_time"), + "total_execution_cost": stats.get("total_execution_cost"), + "total_tokens": stats.get("total_tokens"), + "optimization_time": stats.get("optimization_time"), + "plan_execution_time": stats.get("plan_execution_time"), + } + + row = { + "test_id": test_id, + "policy": policy, + **exec_metrics, + **compare_metrics, + } + summary_rows.append(row) + + result_json = policy_dir / f"query{test_id}_comparison.json" + with open(result_json, "w") as f: + json.dump(row, f, indent=2) + + status = "PASS" if compare_metrics.get("pass") else "FAIL" + print(f"[query {test_id}][{policy}] {status}") + + summary_path = RESULTS_DIR / "summary.csv" + pd.DataFrame(summary_rows).to_csv(summary_path, index=False) + print(f"\nSummary written to: {summary_path}") + + +if __name__ == "__main__": + main() From e1478e62e1fe67afd2e2ab136f28d1fbbb8aa63f Mon Sep 17 00:00:00 2001 From: kepler11c Date: Sat, 14 Mar 2026 22:12:25 -0400 Subject: [PATCH 27/28] updated __call__ structure for SemanticGroupByOp --- src/palimpzest/query/operators/aggregate.py | 44 +++++++++++++++++++++ 1 file changed, 44 insertions(+) diff --git a/src/palimpzest/query/operators/aggregate.py b/src/palimpzest/query/operators/aggregate.py index 35f5fee81..704ea5b97 100644 --- a/src/palimpzest/query/operators/aggregate.py +++ b/src/palimpzest/query/operators/aggregate.py @@ -732,6 +732,9 @@ def __call__(self, candidates: list[DataRecord]) -> DataRecordSet: return DataRecordSet([dr], [record_op_stats]) +# group by and aggregate functions must follow a prespecified spec + # how do I enforce this + class SemanticGroupByOp(AggregateOp): """ Implementation of a semantic GroupBy operator using LLMs. 
This operator groups records by a set @@ -821,6 +824,47 @@ def naive_cost_estimates(self, source_op_cost_estimates: OperatorCostEstimates) quality=quality, ) + def __updated_call__(self, candidates: list[DataRecord]) -> DataRecordSet: + """ + Update: Group By now handles the following: + 1. multi-col groupBys (doesn't check semantic or not, but instead makes one LLM over the groups) + 2. differentiates between semantic and non-semantic group bys and aggregates. + + The groupBy call specifies the group by field as well as the description of the type of grouping + to be performed on the field. For example, if the field is "product name", the description might be "group products by their category". + + Args: + candidates: List of DataRecords to group and aggregate + + Returns: + DataRecordSet containing one DataRecord per group with aggregated values + """ + start_time = time.time() + + # Handle empty input + if len(candidates) == 0: + return DataRecordSet([], []) + + # Check if there are any semantic group by fields + is_semantic_gby = any(isinstance(f, dict) for f in self.gby_fields_spec) + + # Check if there are any semantic aggregation functions + is_semantic_agg = any(f not in ["avg", "count", "sum", "min", "max", "list", "set"] for f in self.agg_funcs) + + # Phase 1: Perform grouping (semantic or non-semantic) + group_assignments, groupby_stats = self._perform_groupby(candidates, is_semantic_gby) + + # Phase 2: Perform aggregation for each group (semantic or non-semantic) + grouped_records = self._group_candidates_by_assignment(candidates, group_assignments) + + # Phase 3: Apply aggregation functions to each group + drs, agg_stats_list = self._perform_aggregation( + grouped_records, is_semantic_agg, groupby_stats, start_time + ) + + return DataRecordSet(drs, agg_stats_list) + + def __call__(self, candidates: list[DataRecord]) -> DataRecordSet: """ Execute the semantic group by operation on the given candidates using a two-phase approach: From 
509b4dda802d7c144947e2986d02025e38e0691f Mon Sep 17 00:00:00 2001 From: kepler11c Date: Tue, 17 Mar 2026 13:12:51 -0400 Subject: [PATCH 28/28] WIP: updated Semantic group-by implementation --- src/palimpzest/query/operators/aggregate.py | 969 ++++++++++++++------ 1 file changed, 705 insertions(+), 264 deletions(-) diff --git a/src/palimpzest/query/operators/aggregate.py b/src/palimpzest/query/operators/aggregate.py index 704ea5b97..33e66a9ab 100644 --- a/src/palimpzest/query/operators/aggregate.py +++ b/src/palimpzest/query/operators/aggregate.py @@ -1,7 +1,10 @@ from __future__ import annotations import contextlib +import logging +import threading import time +from concurrent.futures import ThreadPoolExecutor, as_completed from typing import Any from palimpzest.constants import ( @@ -14,10 +17,12 @@ ) from palimpzest.core.elements.records import DataRecord, DataRecordSet from palimpzest.core.lib.schemas import Average, Count, Max, Min, Sum -from palimpzest.core.models import OperatorCostEstimates, RecordOpStats, GenerationStats +from palimpzest.core.models import GenerationStats, OperatorCostEstimates, RecordOpStats from palimpzest.query.generators.generators import Generator from palimpzest.query.operators.physical import PhysicalOperator +logger = logging.getLogger(__name__) + class AggregateOp(PhysicalOperator): """ @@ -731,215 +736,643 @@ def __call__(self, candidates: list[DataRecord]) -> DataRecordSet: ) return DataRecordSet([dr], [record_op_stats]) - -# group by and aggregate functions must follow a prespecified spec - # how do I enforce this - + + +# --------------------------------------------------------------------------- +# Constants for batching / parallelism defaults +# --------------------------------------------------------------------------- +DEFAULT_GROUPBY_BATCH_SIZE = 10 +"""Default number of records to send in a single LLM call for group assignment.""" + +DEFAULT_GROUPBY_PARALLELISM = 8 +"""Default number of concurrent threads for LLM calls in 
semantic groupby.""" + +DEFAULT_AGG_PARALLELISM = 4 +"""Default number of concurrent threads for semantic aggregation across groups.""" + +# Standard (non-semantic) aggregation function names recognised by the operator. +STANDARD_AGG_FUNCS = frozenset({"avg", "average", "count", "sum", "min", "max", "list", "set"}) + + class SemanticGroupByOp(AggregateOp): + """Semantic GroupBy operator backed by LLM calls. + + This operator supports: + * **Semantic grouping** -- the LLM determines which group each record belongs + to based on a natural-language description. + * **Exact grouping** -- records are partitioned by literal field values (no LLM + needed for the grouping phase). + * **Standard aggregation** -- count / sum / avg / min / max / list / set applied + per-group without an LLM. + * **Semantic aggregation** -- an LLM-based aggregation function (e.g. "summarise + the most positive review") applied per-group. + + Optimisation knobs + ------------------ + ``batch_size`` + Number of records to include in a *single* LLM prompt when assigning + groups (Phase 1). Larger batches amortise prompt overhead but increase + context length and risk of the model losing track of records. Set to 1 + to fall back to one-record-at-a-time mode. + + ``groupby_parallelism`` + Number of concurrent ``ThreadPoolExecutor`` workers for the LLM calls in + the grouping phase. Each worker processes one batch. This is modelled + after ``join_parallelism`` in ``NestedLoopsJoin``. + + ``agg_parallelism`` + Number of concurrent workers for semantic aggregation calls (one call per + group x semantic-agg-field combination). """ - Implementation of a semantic GroupBy operator using LLMs. This operator groups records by a set - of fields and applies aggregation functions to each group using an LLM to determine the groups. 
- """ - def __init__(self, gby_fields: list[str] | list[dict], agg_fields: list[str] | list[dict], agg_funcs: list[str], - model: Model | None = None, prompt_strategy: PromptStrategy = PromptStrategy.AGG, - reasoning_effort: str | None = None, *args, **kwargs): + + def __init__( + self, + gby_fields: list[str] | list[dict], + agg_fields: list[str] | list[dict], + agg_funcs: list[str], + model: Model | None = None, + prompt_strategy: PromptStrategy = PromptStrategy.AGG, + reasoning_effort: str | None = None, + batch_size: int = DEFAULT_GROUPBY_BATCH_SIZE, + groupby_parallelism: int = DEFAULT_GROUPBY_PARALLELISM, + agg_parallelism: int = DEFAULT_AGG_PARALLELISM, + *args, + **kwargs, + ): super().__init__(*args, **kwargs) - - # Store original field specifications (may be dicts or strings) + + # -- field specs ------------------------------------------------- self.gby_fields_spec = gby_fields self.agg_fields_spec = agg_fields - - # Extract field names for backward compatibility - self.gby_fields = [f['name'] if isinstance(f, dict) else f for f in gby_fields] - self.agg_fields = [f['name'] if isinstance(f, dict) else f for f in agg_fields] - + + # Extract plain field names for backward compatibility / quick access + self.gby_fields = [f["name"] if isinstance(f, dict) else f for f in gby_fields] + self.agg_fields = [f["name"] if isinstance(f, dict) else f for f in agg_fields] + self.agg_funcs = agg_funcs self.model = model self.prompt_strategy = prompt_strategy self.reasoning_effort = reasoning_effort - - # Initialize the generator for LLM calls - self.generator = Generator(self.model, self.prompt_strategy, self.reasoning_effort, self.api_base) + # -- optimisation knobs ------------------------------------------ + self.batch_size = max(1, batch_size) + self.groupby_parallelism = max(1, groupby_parallelism) + self.agg_parallelism = max(1, agg_parallelism) + + # -- generator (lazily initialised for exact-only operators) ----- + self._generator: Generator | None = None + if 
self.model is not None: + self._generator = Generator( + self.model, + self.prompt_strategy, + self.reasoning_effort, + ) + + # Thread-safety lock for stats accumulation + self._stats_lock = threading.Lock() + + # ------------------------------------------------------------------ + # Properties / accessors + # ------------------------------------------------------------------ + @property + def generator(self) -> Generator: + """Return the generator, raising if not initialised.""" + if self._generator is None: + raise RuntimeError( + "SemanticGroupByOp.generator accessed but no model was provided. " + "Semantic operations require a model." + ) + return self._generator + + def get_model_name(self) -> str | None: + return self.model.value if self.model is not None else None + + # ------------------------------------------------------------------ + # Repr helpers + # ------------------------------------------------------------------ def __str__(self): op = super().__str__() op += f" Group-by Fields: {self.gby_fields}\n" op += f" Agg. Fields: {self.agg_fields}\n" op += f" Agg. 
Funcs: {self.agg_funcs}\n" - op += f" Model: {self.model.value}\n" + if self.model is not None: + op += f" Model: {self.model.value}\n" op += f" Prompt Strategy: {self.prompt_strategy}\n" + op += f" Batch Size: {self.batch_size}\n" + op += f" GroupBy Parallelism: {self.groupby_parallelism}\n" + op += f" Agg Parallelism: {self.agg_parallelism}\n" return op def get_id_params(self): id_params = super().get_id_params() return { - "gby_fields": self.gby_fields, - "agg_fields": self.agg_fields, + "gby_fields": self.gby_fields, + "agg_fields": self.agg_fields, "agg_funcs": self.agg_funcs, - "model": self.model.value, - "prompt_strategy": self.prompt_strategy.value, + "model": self.model.value if self.model else None, + "prompt_strategy": self.prompt_strategy.value if self.prompt_strategy else None, "reasoning_effort": self.reasoning_effort, - **id_params + "batch_size": self.batch_size, + **id_params, } def get_op_params(self): op_params = super().get_op_params() return { - "gby_fields": self.gby_fields, - "agg_fields": self.agg_fields, + "gby_fields": self.gby_fields_spec, + "agg_fields": self.agg_fields_spec, "agg_funcs": self.agg_funcs, "model": self.model, "prompt_strategy": self.prompt_strategy, "reasoning_effort": self.reasoning_effort, - **op_params + "batch_size": self.batch_size, + "groupby_parallelism": self.groupby_parallelism, + "agg_parallelism": self.agg_parallelism, + **op_params, } - - def get_model_name(self) -> str: - return self.model.value + # ------------------------------------------------------------------ + # Cost estimation + # ------------------------------------------------------------------ def naive_cost_estimates(self, source_op_cost_estimates: OperatorCostEstimates) -> OperatorCostEstimates: - """ - Compute naive cost estimates for the semantic group by operation using an LLM. 
- """ - # estimate number of input and output tokens + """Naive cost estimate -- follows the same pattern as ``SemanticAggregate``.""" est_num_input_tokens = NAIVE_EST_NUM_INPUT_TOKENS * source_op_cost_estimates.cardinality est_num_output_tokens = NAIVE_EST_NUM_OUTPUT_TOKENS * NAIVE_EST_NUM_GROUPS - # get est. of conversion time per record from model card - model_name = self.model.value - model_conversion_time_per_record = MODEL_CARDS[model_name]["seconds_per_output_token"] * est_num_output_tokens + if self.model is None: + # Exact-only groupby: negligible cost + return OperatorCostEstimates( + cardinality=NAIVE_EST_NUM_GROUPS, + time_per_record=0, + cost_per_record=0, + quality=1.0, + ) + + time_per_record = self.model.get_seconds_per_output_token() * est_num_output_tokens - # get est. of conversion cost (in USD) per record from model card usd_per_input_token = self.model.get_usd_per_input_token() if getattr(self, "prompt_strategy", None) is not None and self.is_audio_op(): usd_per_input_token = self.model.get_usd_per_audio_input_token() - # estimate quality of output based on the strength of the model being used - quality = (MODEL_CARDS[model_name]["overall"] / 100.0) + cost_per_record = ( + usd_per_input_token * est_num_input_tokens + + self.model.get_usd_per_output_token() * est_num_output_tokens + ) + + quality = self.model.get_overall_score() / 100.0 return OperatorCostEstimates( cardinality=NAIVE_EST_NUM_GROUPS, - time_per_record=model_conversion_time_per_record, - cost_per_record=model_conversion_usd_per_record, + time_per_record=time_per_record, + cost_per_record=cost_per_record, quality=quality, ) - def __updated_call__(self, candidates: list[DataRecord]) -> DataRecordSet: - """ - Update: Group By now handles the following: - 1. multi-col groupBys (doesn't check semantic or not, but instead makes one LLM over the groups) - 2. differentiates between semantic and non-semantic group bys and aggregates. 
+ # ================================================================== + # MAIN ENTRY POINT + # ================================================================== + def __call__(self, candidates: list[DataRecord]) -> DataRecordSet: + """Execute the semantic group-by operation. - The groupBy call specifies the group by field as well as the description of the type of grouping - to be performed on the field. For example, if the field is "product name", the description might be "group products by their category". + The pipeline has three phases: - Args: - candidates: List of DataRecords to group and aggregate - - Returns: - DataRecordSet containing one DataRecord per group with aggregated values + 1. **Grouping** -- assign each record to a group key (semantic or exact). + 2. **Partitioning** -- bucket records by their group key. + 3. **Aggregation** -- compute each agg function per group (semantic or + standard). + + Batching and parallelism are applied in Phase 1 and Phase 3. """ start_time = time.time() - - # Handle empty input + if len(candidates) == 0: return DataRecordSet([], []) - - # Check if there are any semantic group by fields - is_semantic_gby = any(isinstance(f, dict) for f in self.gby_fields_spec) - - # Check if there are any semantic aggregation functions - is_semantic_agg = any(f not in ["avg", "count", "sum", "min", "max", "list", "set"] for f in self.agg_funcs) - - # Phase 1: Perform grouping (semantic or non-semantic) + + # Detect modes + # A field is semantic if it was user-provided as a dict (needs LLM inference). + # Fields derived from plain column names have 'semantic': False. 
+ is_semantic_gby = any( + (isinstance(f, dict) and f.get('semantic', True)) + for f in self.gby_fields_spec + ) + is_semantic_agg = any(f.lower() not in STANDARD_AGG_FUNCS for f in self.agg_funcs) + + # Phase 1 -- grouping group_assignments, groupby_stats = self._perform_groupby(candidates, is_semantic_gby) - - # Phase 2: Perform aggregation for each group (semantic or non-semantic) - grouped_records = self._group_candidates_by_assignment(candidates, group_assignments) - - # Phase 3: Apply aggregation functions to each group - drs, agg_stats_list = self._perform_aggregation( - grouped_records, is_semantic_agg, groupby_stats, start_time + + # Phase 2 -- partition + grouped_records = self._partition_by_group(candidates, group_assignments) + + # Phase 3 -- aggregation + drs, stats_lst = self._perform_aggregation( + grouped_records, is_semantic_agg, groupby_stats, candidates, start_time, ) - - return DataRecordSet(drs, agg_stats_list) + return DataRecordSet(drs, stats_lst) - def __call__(self, candidates: list[DataRecord]) -> DataRecordSet: - """ - Execute the semantic group by operation on the given candidates using a two-phase approach: - Phase 1: LLM assigns each record to a group (MAP) - Phase 2: Apply aggregation functions to each group (REDUCE) - - Args: - candidates: List of DataRecords to group and aggregate - - Returns: - DataRecordSet containing one DataRecord per group with aggregated values + # ================================================================== + # PHASE 1: GROUPING + # ================================================================== + def _perform_groupby( + self, + candidates: list[DataRecord], + is_semantic: bool, + ) -> tuple[list[tuple], GenerationStats]: + """Route to semantic or exact grouping.""" + if is_semantic: + return self._perform_semantic_groupby(candidates) + return self._perform_exact_groupby(candidates) + + # -- exact groupby ------------------------------------------------- + def _perform_exact_groupby( + self, + 
candidates: list[DataRecord], + ) -> tuple[list[tuple], GenerationStats]: + """Group records by literal field values -- no LLM needed.""" + assignments: list[tuple] = [] + for candidate in candidates: + key = tuple(getattr(candidate, f, None) for f in self.gby_fields) + assignments.append(key) + return assignments, GenerationStats() + + # -- semantic groupby (batched + parallel) ------------------------- + def _perform_semantic_groupby( + self, + candidates: list[DataRecord], + ) -> tuple[list[tuple], GenerationStats]: + """Assign records to groups via LLM, with batching & parallelism. + + Records are split into batches of ``self.batch_size`` and submitted + to a ``ThreadPoolExecutor`` with ``self.groupby_parallelism`` workers. """ - start_time = time.time() - - # Handle empty input - if len(candidates) == 0: - return DataRecordSet([], []) - - # Use LLM to assign each record to a semantic group - group_assignments, gen_stats = self._assign_groups_llm(candidates) - - # Group candidates by their assigned group labels and compute aggregations - # Using the same approach as ApplyGroupByOp but with LLM-determined groups - agg_state = {} - for candidate, group_label in zip(candidates, group_assignments): - # Use group_label as the group key (tuple with single element) - group = (group_label,) - - # Initialize aggregation state for new groups - if group not in agg_state: - state = [] - for fun in self.agg_funcs: - state.append(ApplyGroupByOp.agg_init(fun)) + from palimpzest.core.lib.schemas import create_schema_from_fields + + # Build a tiny schema that the LLM fills in for each record + gby_schema_fields = [] + for spec in self.gby_fields_spec: + if isinstance(spec, dict): + gby_schema_fields.append({ + "name": spec["name"], + "type": spec.get("type", str), + "desc": spec.get("desc", f"Semantic group for {spec['name']}"), + }) else: - state = agg_state[group] - - # Merge values from this candidate into the aggregation state - for i in range(0, len(self.agg_funcs)): - fun 
= self.agg_funcs[i] - if not hasattr(candidate, self.agg_fields[i]): - raise TypeError(f"SemanticGroupByOp record missing expected field {self.agg_fields[i]}") - field = getattr(candidate, self.agg_fields[i]) - state[i] = ApplyGroupByOp.agg_merge(fun, state[i], field) - - agg_state[group] = state - - # Create output DataRecords (one per group) - drs = [] - record_op_stats_lst = [] - - # Get the output field names from the output schema - output_field_names = [f for f in self.output_schema.model_fields if f not in self.gby_fields] - - for group_key in agg_state: - # Build aggregated data item for this group - data_item = {} - - # Add group-by field value (extract from tuple) - data_item[self.gby_fields[0]] = group_key[0] - - # Add aggregation results (using agg_final to compute final values) - vals = agg_state[group_key] - for i in range(0, len(vals)): - agg_func = self.agg_funcs[i] - output_field_name = output_field_names[i] - v = ApplyGroupByOp.agg_final(agg_func, vals[i]) - data_item[output_field_name] = v - - # Create the DataRecord for this group - data_item_obj = self.output_schema(**data_item) - dr = DataRecord.from_agg_parents(data_item_obj, parent_records=candidates) + gby_schema_fields.append({ + "name": spec, + "type": str, + "desc": f"The semantic category for {spec}", + }) + groupby_schema = create_schema_from_fields(gby_schema_fields) + + # Natural-language instruction for the LLM + field_descs = "; ".join( + f"'{s['name']}': {s.get('desc', s['name'])}" + for s in ( + self.gby_fields_spec + if all(isinstance(s, dict) for s in self.gby_fields_spec) + else gby_schema_fields + ) + ) + agg_instruction = ( + f"Categorise each input record into a semantic group. " + f"The grouping fields and their descriptions are: {field_descs}. " + f"Return the group label(s) for each record." 
+ ) + + # Split candidates into batches + batches: list[list[DataRecord]] = [ + candidates[i : i + self.batch_size] + for i in range(0, len(candidates), self.batch_size) + ] + + # Prepare output containers (order-preserving) + all_labels: list[list[str | tuple] | None] = [None] * len(batches) + accumulated_stats = GenerationStats() + + logger.info( + "SemanticGroupByOp: assigning %d records across %d batches " + "(batch_size=%d, parallelism=%d)", + len(candidates), len(batches), self.batch_size, self.groupby_parallelism, + ) + + def _process_batch( + batch_idx: int, batch: list[DataRecord], + ) -> tuple[int, list[str | tuple], GenerationStats]: + """Process a single batch of records through the LLM.""" + batch_labels: list[str | tuple] = [] + batch_stats = GenerationStats() + + input_fields = list(self.gby_fields) + fields = {f: str for f in self.gby_fields} + + gen_kwargs = { + "project_cols": input_fields, + "output_schema": groupby_schema, + "agg_instruction": agg_instruction, + } + + if len(batch) == 1: + # Single-record batch -- call generator directly + field_answers, _, gen_stats, _ = self.generator( + batch[0], fields, **gen_kwargs, + ) + label = self._extract_group_label(field_answers) + batch_labels.append(label) + if gen_stats is not None: + batch_stats += gen_stats + else: + # Multi-record batch -- pass list of candidates + field_answers, _, gen_stats, _ = self.generator( + batch, fields, **gen_kwargs, + ) + if gen_stats is not None: + batch_stats += gen_stats + + # The generator may return a list per field or a single value + # depending on cardinality; normalise to one label per record + batch_labels = self._extract_batch_group_labels( + field_answers, len(batch), + ) + + return batch_idx, batch_labels, batch_stats + + # Execute batches in parallel + with ThreadPoolExecutor(max_workers=self.groupby_parallelism) as executor: + futures = { + executor.submit(_process_batch, idx, batch): idx + for idx, batch in enumerate(batches) + } + for future in 
as_completed(futures): + batch_idx, labels, stats = future.result() + all_labels[batch_idx] = labels + with self._stats_lock: + accumulated_stats += stats + + # Flatten ordered labels -> one tuple per candidate + group_assignments: list[tuple] = [] + for batch_labels in all_labels: + for label in batch_labels: + if isinstance(label, tuple): + group_assignments.append(label) + else: + group_assignments.append((label,)) + + logger.info( + "SemanticGroupByOp: found %d unique groups from %d records", + len(set(group_assignments)), len(candidates), + ) + + return group_assignments, accumulated_stats + + # -- label extraction helpers -------------------------------------- + @staticmethod + def _coerce_to_str(val) -> str: + """Unwrap nested lists and coerce to a hashable string.""" + while isinstance(val, list): + val = val[0] if len(val) > 0 else None + if val is None: + return "unknown" + return str(val) + + def _extract_group_label(self, field_answers: dict) -> str | tuple: + """Extract a single group label from generator output.""" + if len(self.gby_fields) == 1: + val = field_answers.get(self.gby_fields[0]) + return self._coerce_to_str(val) + + # Multi-column groupby -> tuple + parts = [] + for f in self.gby_fields: + val = field_answers.get(f) + parts.append(self._coerce_to_str(val)) + return tuple(parts) + + @staticmethod + def _unwrap_generator_list(vals: list) -> list: + """Unwrap the extra nesting added by Generator._prepare_field_answers. + + The Generator with ONE_TO_ONE cardinality wraps every field value in a + list, so ``["a", "b", "c"]`` becomes ``[["a", "b", "c"]]``. For + batch group-label extraction we need the inner flat list. 
+ """ + if len(vals) == 1 and isinstance(vals[0], list): + return vals[0] + return vals + + def _extract_batch_group_labels( + self, field_answers: dict, batch_size: int, + ) -> list[str | tuple]: + """Extract per-record group labels from a batched generator response.""" + labels: list[str | tuple] = [] + + if len(self.gby_fields) == 1: + field = self.gby_fields[0] + vals = field_answers.get(field, []) + if not isinstance(vals, list): + vals = [vals] + + # Unwrap double-nesting from Generator._prepare_field_answers + vals = self._unwrap_generator_list(vals) + + # Pad / truncate to batch_size + while len(vals) < batch_size: + vals.append("unknown") + for v in vals[:batch_size]: + labels.append(self._coerce_to_str(v)) + else: + # Multi-column: zip columns together + columns = [] + for f in self.gby_fields: + col_vals = field_answers.get(f, []) + if not isinstance(col_vals, list): + col_vals = [col_vals] + + # Unwrap double-nesting from Generator._prepare_field_answers + col_vals = self._unwrap_generator_list(col_vals) + + while len(col_vals) < batch_size: + col_vals.append("unknown") + columns.append(col_vals[:batch_size]) + + for row_vals in zip(*columns): + labels.append( + tuple(self._coerce_to_str(v) for v in row_vals), + ) + + return labels + + # ================================================================== + # PHASE 2: PARTITION + # ================================================================== + @staticmethod + def _partition_by_group( + candidates: list[DataRecord], + group_assignments: list[tuple], + ) -> dict[tuple, list[DataRecord]]: + """Bucket candidates into a dict keyed by their group assignment.""" + grouped: dict[tuple, list[DataRecord]] = {} + for candidate, key in zip(candidates, group_assignments): + grouped.setdefault(key, []).append(candidate) + return grouped + + # ================================================================== + # PHASE 3: AGGREGATION + # ================================================================== + def 
_perform_aggregation( + self, + grouped_records: dict[tuple, list[DataRecord]], + is_semantic_agg: bool, + groupby_stats: GenerationStats, + all_candidates: list[DataRecord], + start_time: float, + ) -> tuple[list[DataRecord], list[RecordOpStats]]: + """Dispatch to exact or semantic aggregation.""" + if is_semantic_agg: + return self._aggregate_semantic( + grouped_records, groupby_stats, all_candidates, start_time, + ) + return self._aggregate_exact( + grouped_records, groupby_stats, all_candidates, start_time, + ) + + # -- exact aggregation --------------------------------------------- + def _aggregate_exact( + self, + grouped_records: dict[tuple, list[DataRecord]], + groupby_stats: GenerationStats, + all_candidates: list[DataRecord], + start_time: float, + ) -> tuple[list[DataRecord], list[RecordOpStats]]: + """Apply standard agg functions (count/sum/...) per group -- no LLM.""" + drs: list[DataRecord] = [] + stats_lst: list[RecordOpStats] = [] + output_field_names = [ + f for f in self.output_schema.model_fields if f not in self.gby_fields + ] + num_groups = len(grouped_records) + + for group_key, group_candidates in grouped_records.items(): + # Initialise & merge aggregation state + state = [ApplyGroupByOp.agg_init(fun) for fun in self.agg_funcs] + for candidate in group_candidates: + for i, (fun, agg_field) in enumerate( + zip(self.agg_funcs, self.agg_fields), + ): + if not hasattr(candidate, agg_field): + raise TypeError( + f"SemanticGroupByOp record missing expected field {agg_field}" + ) + state[i] = ApplyGroupByOp.agg_merge( + fun, state[i], getattr(candidate, agg_field), + ) + + # Build output data item + data_item: dict[str, Any] = {} + for i, gby_field in enumerate(self.gby_fields): + data_item[gby_field] = group_key[i] + for i, agg_func in enumerate(self.agg_funcs): + data_item[output_field_names[i]] = ApplyGroupByOp.agg_final( + agg_func, state[i], + ) + + dr = DataRecord.from_agg_parents( + self.output_schema(**data_item), 
parent_records=all_candidates, + ) drs.append(dr) - - # Create RecordOpStats for this group - # Cost is from LLM group assignment only (aggregation is free) + + cost = ( + groupby_stats.cost_per_record / num_groups + if groupby_stats.cost_per_record > 0 + else 0.0 + ) + stats_lst.append( + RecordOpStats( + record_id=dr._id, + record_parent_ids=dr._parent_ids, + record_source_indices=dr._source_indices, + record_state=dr.to_dict(include_bytes=False), + full_op_id=self.get_full_op_id(), + logical_op_id=self.logical_op_id or "semantic-groupby", + op_name=self.op_name(), + time_per_record=(time.time() - start_time) / num_groups, + cost_per_record=cost, + model_name=self.get_model_name(), + input_fields=self.get_input_fields(), + generated_fields=list(self.output_schema.model_fields.keys()), + input_text_tokens=groupby_stats.input_text_tokens / num_groups, + output_text_tokens=groupby_stats.output_text_tokens / num_groups, + llm_call_duration_secs=groupby_stats.llm_call_duration_secs / num_groups, + total_llm_calls=groupby_stats.total_llm_calls / num_groups, + op_details={k: str(v) for k, v in self.get_id_params().items()}, + ) + ) + + return drs, stats_lst + + # -- semantic aggregation (parallel across groups) ----------------- + def _aggregate_semantic( + self, + grouped_records: dict[tuple, list[DataRecord]], + groupby_stats: GenerationStats, + all_candidates: list[DataRecord], + start_time: float, + ) -> tuple[list[DataRecord], list[RecordOpStats]]: + """Apply aggregation per group; semantic agg functions use the LLM. + + Groups are processed in parallel with ``self.agg_parallelism`` workers. 
+ """ + num_groups = len(grouped_records) + output_field_names = [ + f for f in self.output_schema.model_fields if f not in self.gby_fields + ] + + # Container for ordered results + ordered_keys = list(grouped_records.keys()) + results: list[tuple[DataRecord, RecordOpStats] | None] = [None] * num_groups + + def _aggregate_one_group( + idx: int, group_key: tuple, + ) -> tuple[int, DataRecord, RecordOpStats]: + """Aggregate a single group (may involve LLM calls).""" + group_candidates = grouped_records[group_key] + data_item: dict[str, Any] = {} + group_agg_stats = GenerationStats() + + # Group-by field values + for i, gby_field in enumerate(self.gby_fields): + data_item[gby_field] = group_key[i] + + # Aggregate each field + for i, (agg_func, agg_field) in enumerate( + zip(self.agg_funcs, self.agg_fields), + ): + if agg_func.lower() not in STANDARD_AGG_FUNCS: + # Semantic aggregation via LLM + value, gen_stats = self._apply_semantic_agg_llm( + group_candidates, agg_field, agg_func, + ) + group_agg_stats += gen_stats + else: + # Standard aggregation + state = ApplyGroupByOp.agg_init(agg_func) + for candidate in group_candidates: + if not hasattr(candidate, agg_field): + raise TypeError( + f"SemanticGroupByOp record missing expected field " + f"{agg_field}" + ) + state = ApplyGroupByOp.agg_merge( + agg_func, state, getattr(candidate, agg_field), + ) + value = ApplyGroupByOp.agg_final(agg_func, state) + + data_item[output_field_names[i]] = value + + dr = DataRecord.from_agg_parents( + self.output_schema(**data_item), parent_records=all_candidates, + ) + + combined = groupby_stats + group_agg_stats record_op_stats = RecordOpStats( record_id=dr._id, record_parent_ids=dr._parent_ids, @@ -948,170 +1381,178 @@ def __call__(self, candidates: list[DataRecord]) -> DataRecordSet: full_op_id=self.get_full_op_id(), logical_op_id=self.logical_op_id or "semantic-groupby", op_name=self.op_name(), - time_per_record=(time.time() - start_time) / len(agg_state), - 
cost_per_record=gen_stats.cost_per_record / len(agg_state), + time_per_record=(time.time() - start_time) / num_groups, + cost_per_record=combined.cost_per_record / num_groups, model_name=self.get_model_name(), input_fields=self.get_input_fields(), generated_fields=list(self.output_schema.model_fields.keys()), - total_input_tokens=gen_stats.total_input_tokens, - total_output_tokens=gen_stats.total_output_tokens, - total_input_cost=gen_stats.total_input_cost, - total_output_cost=gen_stats.total_output_cost, - llm_call_duration_secs=gen_stats.llm_call_duration_secs, - fn_call_duration_secs=gen_stats.fn_call_duration_secs, - total_llm_calls=gen_stats.total_llm_calls, - total_embedding_llm_calls=gen_stats.total_embedding_llm_calls, + input_text_tokens=combined.input_text_tokens / num_groups, + output_text_tokens=combined.output_text_tokens / num_groups, + llm_call_duration_secs=combined.llm_call_duration_secs / num_groups, + total_llm_calls=combined.total_llm_calls / num_groups, op_details={k: str(v) for k, v in self.get_id_params().items()}, ) - record_op_stats_lst.append(record_op_stats) - - return DataRecordSet(drs, record_op_stats_lst) - - def _assign_groups_llm(self, candidates: list[DataRecord]) -> tuple[list[str], GenerationStats]: - """ - Phase 1: Use LLM to assign each candidate to a semantic group. 
- + + return idx, dr, record_op_stats + + # Execute group aggregations in parallel + with ThreadPoolExecutor(max_workers=self.agg_parallelism) as executor: + futures = { + executor.submit(_aggregate_one_group, idx, key): idx + for idx, key in enumerate(ordered_keys) + } + for future in as_completed(futures): + idx, dr, stats = future.result() + results[idx] = (dr, stats) + + drs = [r[0] for r in results] # type: ignore[index] + stats_lst = [r[1] for r in results] # type: ignore[index] + return drs, stats_lst + + # -- single semantic aggregation call ------------------------------ + def _apply_semantic_agg_llm( + self, + group_candidates: list[DataRecord], + agg_field: str, + agg_func: str, + ) -> tuple[Any, GenerationStats]: + """Call the LLM to perform a semantic aggregation on *group_candidates*. + Args: - candidates: List of DataRecords to classify into groups - + group_candidates: Records belonging to one group. + agg_field: The field name being aggregated. + agg_func: Natural-language description of the aggregation + (e.g. ``"most positive review"``). 
+ Returns: - Tuple of (list of group labels, generation stats) + ``(aggregated_value, generation_stats)`` """ - # Create a schema that just extracts the group-by field - # Use the description from the field spec if available from palimpzest.core.lib.schemas import create_schema_from_fields - - first_gby_spec = self.gby_fields_spec[0] - if isinstance(first_gby_spec, dict): - field_desc = first_gby_spec["desc"] - field_name = first_gby_spec['name'] - field_type = first_gby_spec.get('type', str) + + # Determine output type for this field + field_type: type = str + for spec in self.agg_fields_spec: + if isinstance(spec, dict) and spec.get("name") == agg_field: + field_type = spec.get("type", str) + break else: - field_desc = f"The semantic category for {first_gby_spec}" - field_name = first_gby_spec - field_type = str - - groupby_schema = create_schema_from_fields([ - {"name": field_name, "type": field_type, "desc": field_desc} + if agg_field in self.output_schema.model_fields: + field_type = self.output_schema.model_fields[agg_field].annotation or str + + agg_schema = create_schema_from_fields([ + {"name": agg_field, "type": field_type, "desc": agg_func}, ]) - - # Process candidates to extract group labels - group_labels = [] - total_stats = GenerationStats() - - # Get input fields - but only use the groupby field to avoid image detection issues - # Since ImageFilepath is just an alias for str, passing all string fields causes - # the prompt factory to try to open them as image files - input_fields = [self.gby_fields[0]] # Only pass the groupby field - fields = {self.gby_fields[0]: str} - - # Build the aggregation instruction that includes the field descriptions from field spec - # This tells the LLM HOW to categorize/group the values semantically - agg_instruction = f"Categorize this record into a semantic group based on the field '{field_name}' Return the category name (one of those specified in '{field_desc}'s)" - - print(f"\nSemanticGroupByOp: Processing 
{len(candidates)} records for group assignment...") - print(f" Grouping instruction: {agg_instruction}") - for idx, candidate in enumerate(candidates): - # Show progress every 10 records - if idx % 10 == 0: - print(f" Processing record {idx+1}/{len(candidates)}...") - - # Ask LLM to categorize the record according to the field description - gen_kwargs = { - "project_cols": input_fields, - "output_schema": groupby_schema, - "agg_instruction": agg_instruction - } - - field_answers, _, gen_stats, _ = self.generator(candidate, fields, **gen_kwargs) - - # Extract the group label - field_answers returns dict with field->list mapping - field_answer = field_answers.get(self.gby_fields[0]) - if field_answer is None or not isinstance(field_answer, list) or len(field_answer) == 0: - group_label = "unknown" - else: - group_label = field_answer[0] - group_labels.append(group_label) - - # Accumulate stats - total_stats += gen_stats - - print(f" Completed! Found {len(set(group_labels))} unique groups from {len(candidates)} records") - return group_labels, total_stats + agg_instruction = ( + f"Apply the following aggregation: {agg_func} on field '{agg_field}'" + ) + input_fields = [agg_field] + fields = {agg_field: field_type} + + gen_kwargs = { + "project_cols": input_fields, + "output_schema": agg_schema, + "agg_instruction": agg_instruction, + } + field_answers, _, gen_stats, _ = self.generator( + group_candidates, fields, **gen_kwargs, + ) + + value = None + answer = field_answers.get(agg_field) + if isinstance(answer, list) and len(answer) > 0: + value = answer[0] + elif answer is not None: + value = answer + + return value, gen_stats if gen_stats is not None else GenerationStats() + + # ================================================================== + # HIERARCHICAL GROUPBY + # ================================================================== def hierarchical_groupby( self, candidates: list[DataRecord], groupby_fields: list[list[str | dict]], agg_fields: list[list[str | 
dict]], agg_funcs: list[list[str]], - model: Model = None, + model: Model | None = None, prompt_strategy: PromptStrategy = PromptStrategy.AGG, reasoning_effort: str | None = None, ) -> dict: - """ - Perform hierarchical (nested) semantic groupby operations using LLMs. + """Perform hierarchical (nested) semantic groupby operations. - At each intermediate level the LLM assigns group labels to the original records - (without aggregation) so that inner levels operate on the same raw records. - The final level runs a full semantic groupby with aggregation. + At each intermediate level the LLM assigns group labels to the original + records (without aggregation) so that inner levels operate on the same + raw records. The final level runs a full semantic groupby with + aggregation. Args: candidates: Input DataRecords. groupby_fields: List of lists of field specs per level. agg_fields: List of lists of aggregate field specs per level. agg_funcs: List of lists of aggregation function names per level. - model: Optional LLM model override (falls back to self.model). + model: Optional LLM model override (falls back to ``self.model``). prompt_strategy: Prompt strategy (defaults to AGG). reasoning_effort: Optional reasoning effort override. Returns: - A DataRecordSet for a single level, or a nested dict for multiple levels. + A ``DataRecordSet`` for a single level, or a nested dict for + multiple levels. 
""" from palimpzest.core.lib.schemas import create_groupby_schema_from_fields - assert len(groupby_fields) == len(agg_fields) == len(agg_funcs), \ + assert len(groupby_fields) == len(agg_fields) == len(agg_funcs), ( "groupby_fields, agg_fields, and agg_funcs must all have the same length" + ) - def normalize_fields(fields): - out = [] - for f in fields: - if isinstance(f, str): - out.append({'name': f, 'desc': f'Group by {f}', 'type': str}) - else: - out.append(f) - return out + def _normalize(fields): + return [ + f + if isinstance(f, dict) + else {"name": f, "desc": f"Group by {f}", "type": str} + for f in fields + ] _model = model or self.model - _prompt_strategy = prompt_strategy or self.prompt_strategy - _reasoning_effort = reasoning_effort or self.reasoning_effort + _ps = prompt_strategy or self.prompt_strategy + _re = reasoning_effort or self.reasoning_effort - def run_level(candidates, level): - gby_specs = normalize_fields(groupby_fields[level]) - agg_specs = normalize_fields(agg_fields[level]) + def _run_level(cands, level): + gby_specs = _normalize(groupby_fields[level]) + agg_specs = _normalize(agg_fields[level]) funcs = agg_funcs[level] - gby_names = [s['name'] for s in gby_specs] - agg_names = [s['name'] for s in agg_specs] - output_schema = create_groupby_schema_from_fields(gby_names, agg_names) + gby_names = [s["name"] for s in gby_specs] + agg_names = [s["name"] for s in agg_specs] + out_schema = create_groupby_schema_from_fields(gby_names, agg_names) + op = SemanticGroupByOp( gby_fields=gby_specs, agg_fields=agg_specs, agg_funcs=funcs, model=_model, - prompt_strategy=_prompt_strategy, - reasoning_effort=_reasoning_effort, - output_schema=output_schema, + prompt_strategy=_ps, + reasoning_effort=_re, + batch_size=self.batch_size, + groupby_parallelism=self.groupby_parallelism, + agg_parallelism=self.agg_parallelism, + output_schema=out_schema, input_schema=self.input_schema, ) + if level == len(groupby_fields) - 1: - return op(candidates) - # 
Intermediate: LLM assigns labels, original records are forwarded - group_labels, _ = op._assign_groups_llm(candidates) - outer_groups = {} - for candidate, label in zip(candidates, group_labels): - key = (label,) if not isinstance(label, tuple) else label - outer_groups.setdefault(key, []).append(candidate) - return {key: run_level(grp, level + 1) for key, grp in outer_groups.items()} + return op(cands) + + # Intermediate: assign labels, forward raw records + labels, _ = op._perform_semantic_groupby(cands) + outer_groups: dict[tuple, list[DataRecord]] = {} + for cand, label in zip(cands, labels): + key = label if isinstance(label, tuple) else (label,) + outer_groups.setdefault(key, []).append(cand) + return { + key: _run_level(grp, level + 1) + for key, grp in outer_groups.items() + } - return run_level(candidates, 0) + return _run_level(candidates, 0)