diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py index 0f98f582c2..f681efcbcc 100644 --- a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -159,16 +159,32 @@ def __init__( else tuple([None for _ in index_columns]) ) self._expr = self._normalize_expression(expr, self._index_columns) + + # Calculate value_columns after normalizing expression + all_value_columns = [ + column + for column in self._expr.column_ids + if column not in self.index_columns + ] + # Use pandas index to more easily replicate column indexing, especially for hierarchical column index self._column_labels = ( column_labels.copy() if isinstance(column_labels, pd.Index) else pd.Index(column_labels) ) - if len(self.value_columns) != len(self._column_labels): - raise ValueError( - f"'value_columns' (size {len(self.value_columns)}) and 'column_labels' (size {len(self._column_labels)}) must have equal length" - ) + + # Adjust column_labels and value_columns to match + if len(all_value_columns) > len(self._column_labels): + # More columns than labels: Drop the extra columns (assumed to be internal/garbage) + self._value_columns = all_value_columns[: len(self._column_labels)] + elif len(all_value_columns) < len(self._column_labels): + # Fewer columns than labels: Truncate labels + self._value_columns = all_value_columns + self._column_labels = self._column_labels[: len(self._value_columns)] + else: + self._value_columns = all_value_columns + # col_id -> [stat_name -> scalar] # TODO: Preserve cache under safe transforms (eg. drop column, reorder) self._stats_cache: dict[str, dict[str, typing.Any]] = { @@ -285,11 +301,15 @@ def index_columns(self) -> Sequence[str]: @property def value_columns(self) -> Sequence[str]: """All value columns, mutually exclusive with index columns.""" - return [ - column - for column in self._expr.column_ids - if column not in self.index_columns - ] + return getattr( + self, + "_value_columns", + [ + column + for column in self._expr.column_ids + if column not in self.index_columns + ], + ) @property def column_labels(self) -> pd.Index: @@ -1353,8 +1373,14 @@ def select_column(self, id: str) -> Block: def select_columns(self, ids: typing.Sequence[str]) -> Block: # Allow renames as may end up selecting same columns multiple times + # Also need to make sure we don't drop any hidden columns + hidden_cols = [ + col + for col in self._expr.column_ids + if col not in self.index_columns and col not in self.value_columns + ] expr = self._expr.select_columns( - [*self.index_columns, *ids], allow_renames=True + [*self.index_columns, *ids, *hidden_cols], allow_renames=True ) col_labels = self._get_labels_for_columns(ids) return Block(expr, self.index_columns, col_labels, self.index.names) @@ -3442,6 +3468,10 @@ def unpivot( for input_ids in unpivot_columns: # row explode offset used to choose the input column # we use offset instead of label as labels are not necessarily unique + if not input_ids: + unpivot_exprs.append(ex.const(None)) + continue + cases = itertools.chain( *( ( @@ -3471,18 +3501,31 @@ def _pd_index_to_array_value( Create an ArrayValue from a list of label tuples. The last column will be row offsets. """ + id_gen = bigframes.core.identifiers.standard_id_strings() + index_ids = [next(id_gen) for _ in range(index.nlevels)] + offset_id = next(id_gen) + rows = [] labels_as_tuples = utils.index_as_tuples(index) for row_offset in range(len(index)): - id_gen = bigframes.core.identifiers.standard_id_strings() row_label = labels_as_tuples[row_offset] - row_label = (row_label,) if not isinstance(row_label, tuple) else row_label - row = {} - for label_part, id in zip(row_label, id_gen): - row[id] = label_part if pd.notnull(label_part) else None - row[next(id_gen)] = row_offset + row = { + id: (val if pd.notnull(val) else None) + for id, val in zip(index_ids, row_label) + } + row[offset_id] = row_offset rows.append(row) + if not rows: + # Create empty table with correct columns + schema = pa.schema( + [pa.field(id, pa.null()) for id in index_ids] + + [pa.field(offset_id, pa.int64())] + ) + return core.ArrayValue.from_pyarrow( + pa.Table.from_batches([], schema=schema), session=session + ) + return core.ArrayValue.from_pyarrow(pa.Table.from_pylist(rows), session=session) diff --git a/bigframes/display/_flatten.py b/bigframes/display/_flatten.py new file mode 100644 index 0000000000..c7d01f03f3 --- /dev/null +++ b/bigframes/display/_flatten.py @@ -0,0 +1,575 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Utilities for flattening nested data structures for display. + +This module provides functionality to flatten BigQuery STRUCT and ARRAY columns +in a pandas DataFrame into a format suitable for display in a 2D table widget. +It handles nested structures by: +1. Expanding STRUCT fields into separate columns (e.g., "struct.field"). +2. Exploding ARRAY elements into multiple rows, replicating other columns. +3. Generating metadata to grouping rows and handling continuation values. +""" + +from __future__ import annotations + +import dataclasses +import enum + +import numpy as np +import pandas as pd +import pyarrow as pa +import pyarrow.compute as pc # type: ignore + + +@dataclasses.dataclass(frozen=True) +class FlattenResult: + """The result of flattening a DataFrame. + + Attributes: + dataframe: The flattened DataFrame. If the original DataFrame had an index + (including MultiIndex), it is preserved in this flattened DataFrame, + duplicated across exploded rows as needed. + row_labels: A list of original row labels for each row in the flattened DataFrame. + This corresponds to the original index values (stringified) and serves to + visually group the exploded rows that belong to the same original row. + continuation_rows: A set of row indices in the flattened DataFrame that are + "continuation rows". These are additional rows created to display the + 2nd to Nth elements of an array. The first row (index i-1) contains + the 1st element, while these rows contain subsequent elements. + cleared_on_continuation: A list of column names that should be "cleared" + (displayed as empty) on continuation rows. Typically, these are + scalar columns (non-array) that were replicated during the explosion + process but should only be visually displayed once per original row group. + nested_columns: A set of column names that were created from nested data + (flattened structs or arrays). + """ + + dataframe: pd.DataFrame + row_labels: list[str] | None + continuation_rows: set[int] | None + cleared_on_continuation: list[str] + nested_columns: set[str] + + +class _ColumnCategory(enum.Enum): + STRUCT = "struct" + ARRAY = "array" + ARRAY_OF_STRUCT = "array_of_struct" + CLEAR = "clear" + + +@dataclasses.dataclass(frozen=True) +class _ColumnClassification: + """The result of classifying columns. + + Attributes: + struct_columns: Columns that are STRUCTs. + array_columns: Columns that are ARRAYs. + array_of_struct_columns: Columns that are ARRAYs of STRUCTs. + clear_on_continuation_cols: Columns that should be cleared on continuation rows. + nested_originated_columns: Columns that were created from nested data. + """ + + struct_columns: tuple[str, ...] + array_columns: tuple[str, ...] + array_of_struct_columns: tuple[str, ...] + clear_on_continuation_cols: tuple[str, ...] + nested_originated_columns: frozenset[str] + + +@dataclasses.dataclass(frozen=True) +class _FlattenArrayOfStructsResult: + """The result of flattening array-of-struct columns. + + Attributes: + dataframe: The flattened DataFrame. + array_columns: The updated list of array columns. + nested_originated_columns: The updated set of columns created from nested data. + """ + + dataframe: pd.DataFrame + array_columns: tuple[str, ...] + nested_originated_columns: frozenset[str] + + +@dataclasses.dataclass(frozen=True) +class _FlattenStructsResult: + """The result of flattening struct columns. + + Attributes: + dataframe: The flattened DataFrame. + clear_on_continuation_cols: The updated list of columns to clear on continuation. + nested_originated_columns: The updated set of columns created from nested data. + """ + + dataframe: pd.DataFrame + clear_on_continuation_cols: tuple[str, ...] + nested_originated_columns: frozenset[str] + + +def flatten_nested_data( + dataframe: pd.DataFrame, +) -> FlattenResult: + """Flatten nested STRUCT and ARRAY columns for display. + + Args: + dataframe: The input DataFrame containing potential nested structures. + + Returns: + A FlattenResult containing the flattened DataFrame and metadata for display. + """ + if dataframe.empty: + return FlattenResult( + dataframe=dataframe.copy(), + row_labels=None, + continuation_rows=None, + cleared_on_continuation=[], + nested_columns=set(), + ) + + result_df = dataframe.copy() + + classification = _classify_columns(result_df) + + # Process ARRAY-of-STRUCT columns into multiple ARRAY columns (one per struct field). + flatten_array_structs_result = _flatten_array_of_struct_columns( + result_df, + classification.array_of_struct_columns, + classification.array_columns, + classification.nested_originated_columns, + ) + result_df = flatten_array_structs_result.dataframe + classification = dataclasses.replace( + classification, + array_columns=flatten_array_structs_result.array_columns, + nested_originated_columns=flatten_array_structs_result.nested_originated_columns, + ) + + # Flatten top-level STRUCT columns into separate columns. + flatten_structs_result = _flatten_struct_columns( + result_df, + classification.struct_columns, + classification.clear_on_continuation_cols, + classification.nested_originated_columns, + ) + result_df = flatten_structs_result.dataframe + classification = dataclasses.replace( + classification, + clear_on_continuation_cols=flatten_structs_result.clear_on_continuation_cols, + nested_originated_columns=flatten_structs_result.nested_originated_columns, + ) + + # Now handle ARRAY columns (including the newly created ones from ARRAY of STRUCT) + if not classification.array_columns: + return FlattenResult( + dataframe=result_df, + row_labels=None, + continuation_rows=None, + cleared_on_continuation=list(classification.clear_on_continuation_cols), + nested_columns=set(classification.nested_originated_columns), + ) + + explode_result = _explode_array_columns( + result_df, list(classification.array_columns) + ) + return FlattenResult( + dataframe=explode_result.dataframe, + row_labels=explode_result.row_labels, + continuation_rows=explode_result.continuation_rows, + cleared_on_continuation=list(classification.clear_on_continuation_cols), + nested_columns=set(classification.nested_originated_columns), + ) + + +def _classify_columns( + dataframe: pd.DataFrame, +) -> _ColumnClassification: + """Identify all STRUCT and ARRAY columns in the DataFrame. + + Args: + dataframe: The DataFrame to inspect. + + Returns: + A _ColumnClassification object containing lists of column names for each category. + """ + + def get_category(dtype: pd.api.extensions.ExtensionDtype) -> _ColumnCategory: + pa_type = getattr(dtype, "pyarrow_dtype", None) + if pa_type: + if pa.types.is_struct(pa_type): + return _ColumnCategory.STRUCT + if pa.types.is_list(pa_type): + return ( + _ColumnCategory.ARRAY_OF_STRUCT + if pa.types.is_struct(pa_type.value_type) + else _ColumnCategory.ARRAY + ) + return _ColumnCategory.CLEAR + + # Maps column names to their structural category to simplify list building. + categories = { + str(col): get_category(dtype) for col, dtype in dataframe.dtypes.items() + } + + return _ColumnClassification( + struct_columns=tuple( + c for c, cat in categories.items() if cat == _ColumnCategory.STRUCT + ), + array_columns=tuple( + c + for c, cat in categories.items() + if cat in (_ColumnCategory.ARRAY, _ColumnCategory.ARRAY_OF_STRUCT) + ), + array_of_struct_columns=tuple( + c for c, cat in categories.items() if cat == _ColumnCategory.ARRAY_OF_STRUCT + ), + clear_on_continuation_cols=tuple( + c for c, cat in categories.items() if cat == _ColumnCategory.CLEAR + ), + nested_originated_columns=frozenset( + c for c, cat in categories.items() if cat != _ColumnCategory.CLEAR + ), + ) + + +def _flatten_array_of_struct_columns( + dataframe: pd.DataFrame, + array_of_struct_columns: tuple[str, ...], + array_columns: tuple[str, ...], + nested_originated_columns: frozenset[str], +) -> _FlattenArrayOfStructsResult: + """Flatten ARRAY of STRUCT columns into separate ARRAY columns for each field. + + Args: + dataframe: The DataFrame to process. + array_of_struct_columns: Column names that are ARRAYs of STRUCTs. + array_columns: The main sequence of ARRAY columns to be updated. + nested_originated_columns: Columns tracked as originating from nested data. + + Returns: + A _FlattenArrayOfStructsResult containing the updated DataFrame and columns. + """ + result_df = dataframe.copy() + current_array_columns = list(array_columns) + current_nested_columns = set(nested_originated_columns) + + for col_name in array_of_struct_columns: + col_data = result_df[col_name] + # Ensure we have a PyArrow array (pa.array handles pandas Series conversion) + arrow_array = pa.array(col_data) + + # Transpose List> to {field: List} + new_arrays = _transpose_list_of_structs(arrow_array) + + new_cols_df = pd.DataFrame( + { + f"{col_name}.{field_name}": pd.Series( + arr, dtype=pd.ArrowDtype(arr.type), index=result_df.index + ) + for field_name, arr in new_arrays.items() + } + ) + + current_nested_columns.update(new_cols_df.columns) + result_df = _replace_column_in_df(result_df, col_name, new_cols_df) + + current_array_columns.remove(col_name) + current_array_columns.extend(new_cols_df.columns.tolist()) + + return _FlattenArrayOfStructsResult( + dataframe=result_df, + array_columns=tuple(current_array_columns), + nested_originated_columns=frozenset(current_nested_columns), + ) + + +def _transpose_list_of_structs(arrow_array: pa.ListArray) -> dict[str, pa.ListArray]: + """Transposes a ListArray of Structs into multiple ListArrays of fields. + + Args: + arrow_array: A PyArrow ListArray where the value type is a Struct. + + Returns: + A dictionary mapping field names to new ListArrays (one for each field in the struct). + """ + struct_type = arrow_array.type.value_type + offsets = arrow_array.offsets + # arrow_array.values is the underlying StructArray. + # Flattening it gives us the arrays for each field, effectively "removing" the struct layer. + flattened_fields = arrow_array.values.flatten() + validity = arrow_array.is_null() + + transposed = {} + for i in range(struct_type.num_fields): + field = struct_type.field(i) + # Reconstruct ListArray for each field using original offsets and validity. + # This transforms List> into List and List. + transposed[field.name] = pa.ListArray.from_arrays( + offsets, flattened_fields[i], mask=validity + ) + return transposed + + +def _replace_column_in_df( + dataframe: pd.DataFrame, col_name: str, new_cols: pd.DataFrame +) -> pd.DataFrame: + """Replaces a column in a DataFrame with a set of new columns at the same position. + + Args: + dataframe: The original DataFrame. + col_name: The name of the column to replace. + new_cols: A DataFrame containing the new columns to insert. + + Returns: + A new DataFrame with the substitution made. + """ + col_idx = dataframe.columns.to_list().index(col_name) + return pd.concat( + [ + dataframe.iloc[:, :col_idx], + new_cols, + dataframe.iloc[:, col_idx + 1 :], + ], + axis=1, + ) + + +@dataclasses.dataclass(frozen=True) +class _ExplodeResult: + """The result of exploding array columns. + + Attributes: + dataframe: The exploded DataFrame. + row_labels: Labels for the rows. + continuation_rows: Indices of continuation rows. + """ + + dataframe: pd.DataFrame + row_labels: list[str] + continuation_rows: set[int] + + +def _explode_array_columns( + dataframe: pd.DataFrame, array_columns: list[str] +) -> _ExplodeResult: + """Explode array columns into new rows. + + This function performs the "flattening" of 1D arrays by exploding them. + It handles multiple array columns by ensuring they are exploded in sync + relative to the other columns. + + Args: + dataframe: The DataFrame to explode. + array_columns: List of array columns to explode. + + Returns: + An _ExplodeResult containing the new DataFrame and row metadata. + """ + if not array_columns: + return _ExplodeResult(dataframe, [], set()) + + work_df, non_array_columns, index_names = _prepare_explosion_dataframe( + dataframe, array_columns + ) + + if work_df.empty: + return _ExplodeResult(dataframe, [], set()) + + table = pa.Table.from_pandas(work_df) + arrays = [table.column(col).combine_chunks() for col in array_columns] + lengths = [] + for arr in arrays: + row_lengths = pc.list_value_length(arr) + # Treat null lists as length 1 to match pandas explode behavior for scalars. + row_lengths = pc.if_else( + pc.is_null(row_lengths, nan_is_null=True), 1, row_lengths + ) + lengths.append(row_lengths) + + if not lengths: + return _ExplodeResult(dataframe, [], set()) + + max_lens = lengths[0] if len(lengths) == 1 else pc.max_element_wise(*lengths) + max_lens = max_lens.cast(pa.int64()) + current_offsets = pc.cumulative_sum(max_lens) + target_offsets = pa.concat_arrays([pa.array([0], type=pa.int64()), current_offsets]) + + total_rows = target_offsets[-1].as_py() + if total_rows == 0: + empty_df = pd.DataFrame(columns=dataframe.columns) + if index_names: + empty_df = empty_df.set_index(index_names) + return _ExplodeResult(empty_df, [], set()) + + # parent_indices maps each result row to its original row index. + dummy_values = pa.nulls(total_rows, type=pa.null()) + dummy_list_array = pa.ListArray.from_arrays(target_offsets, dummy_values) + parent_indices = pc.list_parent_indices(dummy_list_array) + + range_k = pa.array(range(total_rows)) + starts = target_offsets.take(parent_indices) + row_nums = pc.subtract(range_k, starts) + + new_columns = {} + for col_name in non_array_columns: + new_columns[col_name] = table.column(col_name).take(parent_indices) + + for col_name, arr in zip(array_columns, arrays): + actual_lens_scattered = pc.list_value_length(arr).take(parent_indices) + valid_mask = pc.less(row_nums, actual_lens_scattered) + starts_scattered = arr.offsets.take(parent_indices) + + # safe_mask ensures we don't access out of bounds even if masked out. + safe_mask = pc.fill_null(valid_mask, False) + candidate_indices = pc.add(starts_scattered, row_nums) + safe_indices = pc.if_else(safe_mask, candidate_indices, 0) + + if len(arr.values) == 0: + final_values = pa.nulls(total_rows, type=arr.type.value_type) + else: + taken_values = arr.values.take(safe_indices) + final_values = pc.if_else(safe_mask, taken_values, None) + + new_columns[col_name] = final_values + + # Convert back to pandas; this is efficient since we have pyarrow arrays. + result_table = pa.Table.from_pydict(new_columns) + result_df = result_table.to_pandas(types_mapper=pd.ArrowDtype) + + if index_names: + if len(index_names) == 1: + row_labels = result_df[index_names[0]].astype(str).tolist() + else: + # For MultiIndex, create a tuple string representation + row_labels = ( + result_df[index_names].apply(tuple, axis=1).astype(str).tolist() + ) + else: + row_labels = result_df["_original_index"].astype(str).tolist() + + continuation_mask = pc.greater(row_nums, 0).to_numpy(zero_copy_only=False) + continuation_rows = set(np.flatnonzero(continuation_mask).tolist()) + + # Select columns: original columns + restored index columns (temporarily) + cols_to_keep = dataframe.columns.tolist() + if index_names: + cols_to_keep.extend(index_names) + + # Filter columns, but allow index columns to pass through if they are not in original columns + # (which they won't be if they were indices) + result_df = result_df[cols_to_keep] + + if index_names: + result_df = result_df.set_index(index_names) + + return _ExplodeResult(result_df, row_labels, continuation_rows) + + +def _prepare_explosion_dataframe( + dataframe: pd.DataFrame, array_columns: list[str] +) -> tuple[pd.DataFrame, list[str], list[str] | None]: + """Prepares the DataFrame for explosion by ensuring grouping columns exist.""" + work_df = dataframe.copy() + non_array_columns = work_df.columns.drop(array_columns).tolist() + + if not non_array_columns: + # Add a temporary column to allow grouping if all columns are arrays. + non_array_columns = ["_temp_grouping_col"] + work_df["_temp_grouping_col"] = range(len(work_df)) + + index_names = None + if work_df.index.nlevels > 1: + # Handle MultiIndex + names = list(work_df.index.names) + # Assign default names if None to ensure reset_index works and we can track them + names = [n if n is not None else f"level_{i}" for i, n in enumerate(names)] + work_df.index.names = names + index_names = names + work_df = work_df.reset_index() + non_array_columns.extend(index_names) + elif work_df.index.name is not None: + # Handle named Index + index_names = [work_df.index.name] + work_df = work_df.reset_index() + non_array_columns.extend(index_names) + else: + # Handle default/unnamed Index + # We use _original_index for tracking but don't return it as an index to restore + work_df = work_df.reset_index(names=["_original_index"]) + non_array_columns.append("_original_index") + + return work_df, non_array_columns, index_names + + +def _flatten_struct_columns( + dataframe: pd.DataFrame, + struct_columns: tuple[str, ...], + clear_on_continuation_cols: tuple[str, ...], + nested_originated_columns: frozenset[str], +) -> _FlattenStructsResult: + """Flatten regular STRUCT columns into separate columns. + + Args: + dataframe: The DataFrame to process. + struct_columns: STRUCT columns to flatten. + clear_on_continuation_cols: Columns to clear on continuation. + nested_originated_columns: Columns tracked as originating from nested data. + + Returns: + A _FlattenStructsResult containing the updated DataFrame and columns. + """ + if not struct_columns: + return _FlattenStructsResult( + dataframe=dataframe.copy(), + clear_on_continuation_cols=clear_on_continuation_cols, + nested_originated_columns=nested_originated_columns, + ) + + # Convert to PyArrow table for efficient flattening + table = pa.Table.from_pandas(dataframe, preserve_index=False) + + current_clear_cols = list(clear_on_continuation_cols) + current_nested_cols = set(nested_originated_columns) + + # Identify new columns that will be created to update metadata + for col_name in struct_columns: + idx = table.schema.get_field_index(col_name) + if idx == -1: + continue + + field = table.schema.field(idx) + if pa.types.is_struct(field.type): + for i in range(field.type.num_fields): + child_field = field.type.field(i) + new_col_name = f"{col_name}.{child_field.name}" + current_nested_cols.add(new_col_name) + current_clear_cols.append(new_col_name) + + # Expand all struct columns into "parent.child" columns. + flattened_table = table.flatten() + + # Convert back to pandas, using ArrowDtype to preserve types and ignoring metadata + # to avoid issues with stale struct type info. + result_df = flattened_table.to_pandas( + types_mapper=pd.ArrowDtype, ignore_metadata=True + ) + + result_df.index = dataframe.index + + return _FlattenStructsResult( + dataframe=result_df, + clear_on_continuation_cols=tuple(current_clear_cols), + nested_originated_columns=frozenset(current_nested_cols), + ) diff --git a/bigframes/display/html.py b/bigframes/display/html.py index 6102d1512c..d84172651c 100644 --- a/bigframes/display/html.py +++ b/bigframes/display/html.py @@ -28,7 +28,7 @@ import bigframes from bigframes._config import display_options, options -from bigframes.display import plaintext +from bigframes.display import _flatten, plaintext import bigframes.formatting_helpers as formatter if typing.TYPE_CHECKING: @@ -48,13 +48,17 @@ def render_html( orderable_columns: list[str] | None = None, max_columns: int | None = None, ) -> str: - """Render a pandas DataFrame to HTML with specific styling.""" + """Render a pandas DataFrame to HTML with specific styling and nested data support.""" + # Flatten nested data first + flatten_result = _flatten.flatten_nested_data(dataframe) + flat_df = flatten_result.dataframe + orderable_columns = orderable_columns or [] classes = "dataframe table table-striped table-hover" table_html_parts = [f''] # Handle column truncation - columns = list(dataframe.columns) + columns = list(flat_df.columns) if max_columns is not None and max_columns > 0 and len(columns) > max_columns: half = max_columns // 2 left_columns = columns[:half] @@ -70,11 +74,20 @@ def render_html( table_html_parts.append( _render_table_header( - dataframe, orderable_columns, left_columns, right_columns, show_ellipsis + flat_df, orderable_columns, left_columns, right_columns, show_ellipsis ) ) table_html_parts.append( - _render_table_body(dataframe, left_columns, right_columns, show_ellipsis) + _render_table_body( + flat_df, + flatten_result.row_labels, + flatten_result.continuation_rows, + flatten_result.cleared_on_continuation, + flatten_result.nested_columns, + left_columns, + right_columns, + show_ellipsis, + ) ) table_html_parts.append("
") return "".join(table_html_parts) @@ -117,39 +130,66 @@ def render_col_header(col): def _render_table_body( dataframe: pd.DataFrame, + row_labels: list[str] | None, + continuation_rows: set[int] | None, + clear_on_continuation: list[str], + nested_originated_columns: set[str], left_columns: list[Any], right_columns: list[Any], show_ellipsis: bool, ) -> str: - """Render the body of the HTML table.""" + """Render the table body. + + Args: + dataframe: The flattened dataframe to render. + row_labels: Optional labels for each row, used for visual grouping of exploded rows. + See `bigframes.display._flatten.FlattenResult` for details. + continuation_rows: Indices of rows that are continuations of array explosion. + See `bigframes.display._flatten.FlattenResult` for details. + clear_on_continuation: Columns to render as empty in continuation rows. + See `bigframes.display._flatten.FlattenResult` for details. + nested_originated_columns: Columns created from nested data, used for alignment. + left_columns: Columns to display on the left. + right_columns: Columns to display on the right. + show_ellipsis: Whether to show an ellipsis row. + """ body_parts = [" "] precision = options.display.precision for i in range(len(dataframe)): - body_parts.append(" ") + row_class = "" + orig_row_idx = None + is_continuation = False + + if row_labels: + orig_row_idx = row_labels[i] + + if continuation_rows and i in continuation_rows: + is_continuation = True + row_class = "array-continuation" + + if orig_row_idx is not None: + body_parts.append( + f' ' + ) + else: + body_parts.append(" ") + row = dataframe.iloc[i] def render_col_cell(col_name): value = row[col_name] dtype = dataframe.dtypes.loc[col_name] # type: ignore - align = "right" if _is_dtype_numeric(dtype) else "left" - - # TODO(b/438181139): Consider semi-exploding ARRAY/STRUCT columns - # into multiple rows/columns like the BQ UI does. - if pandas.api.types.is_scalar(value) and pd.isna(value): - body_parts.append( - f' ' - '<NA>' - ) - else: - if isinstance(value, float): - cell_content = f"{value:.{precision}f}" - else: - cell_content = str(value) - body_parts.append( - f' ' - f"{html.escape(cell_content)}" - ) + cell_html = _render_cell( + value, + dtype, + is_continuation, + str(col_name), + clear_on_continuation, + nested_originated_columns, + precision, + ) + body_parts.append(cell_html) for col in left_columns: render_col_cell(col) @@ -166,6 +206,43 @@ def render_col_cell(col_name): return "\n".join(body_parts) +def _render_cell( + value: Any, + dtype: Any, + is_continuation: bool, + col_name_str: str, + clear_on_continuation: list[str], + nested_originated_columns: set[str], + precision: int, +) -> str: + """Render a single cell of the HTML table.""" + if is_continuation and col_name_str in clear_on_continuation: + return " " + + if col_name_str in nested_originated_columns: + align = "left" + else: + align = "right" if _is_dtype_numeric(dtype) else "left" + + if pandas.api.types.is_scalar(value) and pd.isna(value): + if is_continuation: + # For padding nulls in continuation rows, show empty cell + return f' ' + else: + # For primary nulls, keep showing the indicator but maybe styled + return ( + f' ' + '<NA>' + ) + + if isinstance(value, float): + cell_content = f"{value:.{precision}f}" + else: + cell_content = str(value) + + return f' ' f"{html.escape(cell_content)}" + + def _obj_ref_rt_to_html(obj_ref_rt: str) -> str: obj_ref_rt_json = json.loads(obj_ref_rt) obj_ref_details = obj_ref_rt_json["objectref"]["details"] @@ -252,8 +329,8 @@ def _get_obj_metadata( def get_anywidget_bundle( obj: Union[bigframes.dataframe.DataFrame, bigframes.series.Series], - include=None, - exclude=None, + include: typing.Container[str] | None = None, + exclude: typing.Container[str] | None = None, ) -> tuple[dict[str, Any], dict[str, Any]]: """ Helper method to create and return the anywidget mimebundle. @@ -350,9 +427,9 @@ def repr_mimebundle_head( def repr_mimebundle( obj: Union[bigframes.dataframe.DataFrame, bigframes.series.Series], - include=None, - exclude=None, -): + include: typing.Container[str] | None = None, + exclude: typing.Container[str] | None = None, +) -> dict[str, str] | tuple[dict[str, Any], dict[str, Any]] | None: """Custom display method for IPython/Jupyter environments.""" # TODO(b/467647693): Anywidget integration has been tested in Jupyter, VS Code, and # BQ Studio, but there is a known compatibility issue with Marimo that needs to be addressed. diff --git a/bigframes/display/table_widget.css b/bigframes/display/table_widget.css index da0a701d69..2421196d32 100644 --- a/bigframes/display/table_widget.css +++ b/bigframes/display/table_widget.css @@ -26,6 +26,7 @@ --bf-header-bg: #f5f5f5; --bf-null-fg: gray; --bf-row-even-bg: #f5f5f5; + --bf-row-hover-bg: #e8eaed; --bf-row-odd-bg: white; background-color: var(--bf-bg); @@ -59,6 +60,7 @@ --bf-header-bg: var(--vscode-editor-background, black); --bf-null-fg: #aaa; --bf-row-even-bg: #202124; + --bf-row-hover-bg: #4c4c4c; --bf-row-odd-bg: #383838; } } @@ -75,6 +77,7 @@ body[data-theme='dark'] .bigframes-widget.bigframes-widget { --bf-header-bg: var(--vscode-editor-background, black); --bf-null-fg: #aaa; --bf-row-even-bg: #202124; + --bf-row-hover-bg: #4c4c4c; --bf-row-odd-bg: #383838; } @@ -245,3 +248,8 @@ body[data-theme='dark'] .bigframes-widget.bigframes-widget { .bigframes-widget .debug-info { border-top: 1px solid var(--bf-border-color); } + +.bigframes-widget table tbody tr:hover td, +.bigframes-widget table tbody tr td.row-hover { + background-color: var(--bf-row-hover-bg); +} diff --git a/bigframes/display/table_widget.js b/bigframes/display/table_widget.js index 314bf771d0..33595d9b6f 100644 --- a/bigframes/display/table_widget.js +++ b/bigframes/display/table_widget.js @@ -286,6 +286,38 @@ function render({ model, el }) { } }); + // Add hover effect for flattened rows + const rows = tableContainer.querySelectorAll('tbody tr'); + rows.forEach((row) => { + row.addEventListener('mouseover', () => { + const origRow = row.getAttribute('data-orig-row'); + if (origRow !== null) { + const groupRows = tableContainer.querySelectorAll( + `tr[data-orig-row="${origRow}"]`, + ); + groupRows.forEach((r) => { + r.querySelectorAll('td').forEach((cell) => { + cell.classList.add('row-hover'); + }); + }); + } + }); + + row.addEventListener('mouseout', () => { + const origRow = row.getAttribute('data-orig-row'); + if (origRow !== null) { + const groupRows = tableContainer.querySelectorAll( + `tr[data-orig-row="${origRow}"]`, + ); + groupRows.forEach((r) => { + r.querySelectorAll('td').forEach((cell) => { + cell.classList.remove('row-hover'); + }); + }); + } + }); + }); + updateButtonStates(); } @@ -347,4 +379,4 @@ function render({ model, el }) { handleErrorMessageChange(); } -export default { render }; +export { render }; diff --git a/bigframes/pandas/core/methods/describe.py b/bigframes/pandas/core/methods/describe.py index 6fd7960daf..68cd0c7176 100644 --- a/bigframes/pandas/core/methods/describe.py +++ b/bigframes/pandas/core/methods/describe.py @@ -100,7 +100,7 @@ def _describe( def _get_aggs_for_dtype(dtype) -> list[aggregations.UnaryAggregateOp]: - if dtype in dtypes.NUMERIC_BIGFRAMES_TYPES_RESTRICTIVE: + if dtypes.is_numeric(dtype, include_bool=False): return [ aggregations.count_op, aggregations.mean_op, @@ -111,14 +111,13 @@ def _get_aggs_for_dtype(dtype) -> list[aggregations.UnaryAggregateOp]: aggregations.ApproxQuartilesOp(3), aggregations.max_op, ] - elif dtype in dtypes.TEMPORAL_NUMERIC_BIGFRAMES_TYPES: + elif dtypes.is_datetime_like(dtype) or dtypes.is_date_like(dtype): return [aggregations.count_op] - elif dtype in [ - dtypes.STRING_DTYPE, - dtypes.BOOL_DTYPE, - dtypes.BYTES_DTYPE, - dtypes.TIME_DTYPE, - ]: + elif ( + dtypes.is_string_like(dtype) + or dtypes.is_binary_like(dtype) + or dtypes.is_time_like(dtype) + ): return [aggregations.count_op, aggregations.nunique_op] else: - return [] + return [aggregations.count_op] diff --git a/notebooks/dataframes/anywidget_mode.ipynb b/notebooks/dataframes/anywidget_mode.ipynb index 5dd8af1c5f..65da252a8f 100644 --- a/notebooks/dataframes/anywidget_mode.ipynb +++ b/notebooks/dataframes/anywidget_mode.ipynb @@ -119,17 +119,17 @@ "name": "stdout", "output_type": "stream", "text": [ - "state gender year name number\n", - " AL F 1910 Annie 482\n", - " AL F 1910 Myrtle 104\n", - " AR F 1910 Lillian 56\n", - " CT F 1910 Anne 38\n", - " CT F 1910 Frances 45\n", - " FL F 1910 Margaret 53\n", - " GA F 1910 Mae 73\n", - " GA F 1910 Beatrice 96\n", - " GA F 1910 Lola 47\n", - " IA F 1910 Viola 49\n", + "state gender year name number\n", + " AL F 1910 Cora 61\n", + " AL F 1910 Anna 74\n", + " AR F 1910 Willie 132\n", + " CO F 1910 Anna 42\n", + " FL F 1910 Louise 70\n", + " GA F 1910 Catherine 57\n", + " IL F 1910 Jessie 43\n", + " IN F 1910 Anna 100\n", + " IN F 1910 Pauline 77\n", + " IN F 1910 Beulah 39\n", "...\n", "\n", "[5552452 rows x 5 columns]\n" @@ -143,31 +143,14 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "id": "220340b0", "metadata": {}, "outputs": [ { "data": { "text/html": [ - "\n", - " Query started with request ID bigframes-dev:US.161c75bd-f9f8-4b21-8a45-1d7dfc659034.
SQL
SELECT\n",
-       "`state` AS `state`,\n",
-       "`gender` AS `gender`,\n",
-       "`year` AS `year`,\n",
-       "`name` AS `name`,\n",
-       "`number` AS `number`\n",
-       "FROM\n",
-       "(SELECT\n",
-       "  `t0`.`state`,\n",
-       "  `t0`.`gender`,\n",
-       "  `t0`.`year`,\n",
-       "  `t0`.`name`,\n",
-       "  `t0`.`number`,\n",
-       "  `t0`.`bfuid_col_2` AS `bfuid_col_15`\n",
-       "FROM `bigframes-dev._8b037bfb7316dddf9d92b12dcf93e008906bfe52._c58be946_1477_4c00_b699_0ae022f13563_bqdf_8e323719-899f-4da2-89cd-2dbb53ab1dfc` AS `t0`)\n",
-       "ORDER BY `bfuid_col_15` ASC NULLS LAST
\n", - " " + "✅ Completed. " ], "text/plain": [ "" @@ -179,9 +162,7 @@ { "data": { "text/html": [ - "✅ Completed. \n", - " Query processed 215.9 MB in 7 seconds of slot time. [
Job bigframes-dev:US.job_IuiJsjhfPtOrKuTIOqPIjnVLX820 details]\n", - " " + "✅ Completed. " ], "text/plain": [ "" @@ -193,7 +174,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "e68fbb9eb4d24bab837c77730d31c8a1", + "model_id": "bc392b57fb07439a9256f0ceca00cee3", "version_major": 2, "version_minor": 1 }, @@ -229,23 +210,23 @@ " AL\n", " F\n", " 1910\n", - " Hazel\n", - " 51\n", + " Sadie\n", + " 40\n", " \n", " \n", " 1\n", " AL\n", " F\n", " 1910\n", - " Lucy\n", - " 76\n", + " Mary\n", + " 875\n", " \n", " \n", " 2\n", " AR\n", " F\n", " 1910\n", - " Nellie\n", + " Vera\n", " 39\n", " \n", " \n", @@ -253,56 +234,56 @@ " AR\n", " F\n", " 1910\n", - " Lena\n", - " 40\n", + " Marie\n", + " 78\n", " \n", " \n", " 4\n", - " CO\n", + " AR\n", " F\n", " 1910\n", - " Thelma\n", - " 36\n", + " Lucille\n", + " 66\n", " \n", " \n", " 5\n", - " CO\n", + " CA\n", " F\n", " 1910\n", - " Ruth\n", - " 68\n", + " Virginia\n", + " 101\n", " \n", " \n", " 6\n", - " CT\n", + " DC\n", " F\n", " 1910\n", - " Elizabeth\n", - " 86\n", + " Margaret\n", + " 72\n", " \n", " \n", " 7\n", - " DC\n", + " GA\n", " F\n", " 1910\n", - " Mary\n", - " 80\n", + " Mildred\n", + " 133\n", " \n", " \n", " 8\n", - " FL\n", + " GA\n", " F\n", " 1910\n", - " Annie\n", - " 101\n", + " Vera\n", + " 51\n", " \n", " \n", " 9\n", - " FL\n", + " GA\n", " F\n", " 1910\n", - " Alma\n", - " 39\n", + " Sallie\n", + " 92\n", " \n", " \n", "\n", @@ -310,67 +291,25 @@ "[5552452 rows x 5 columns in total]" ], "text/plain": [ - "state gender year name number\n", - " AL F 1910 Hazel 51\n", - " AL F 1910 Lucy 76\n", - " AR F 1910 Nellie 39\n", - " AR F 1910 Lena 40\n", - " CO F 1910 Thelma 36\n", - " CO F 1910 Ruth 68\n", - " CT F 1910 Elizabeth 86\n", - " DC F 1910 Mary 80\n", - " FL F 1910 Annie 101\n", - " FL F 1910 Alma 39\n", + "state gender year name number\n", + " AL F 1910 Sadie 40\n", + " AL F 1910 Mary 875\n", + " AR F 1910 Vera 39\n", + " AR F 1910 Marie 78\n", + " AR F 1910 Lucille 66\n", + " CA F 1910 Virginia 101\n", + " DC F 1910 Margaret 72\n", + " GA F 1910 Mildred 133\n", + " GA F 1910 Vera 51\n", + " GA F 1910 Sallie 92\n", "...\n", "\n", "[5552452 rows x 5 columns]" ] }, - "execution_count": 13, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" - }, - { - "data": { - "text/html": [ - "✅ Completed. \n", - " Query processed 215.9 MB in 9 seconds of slot time. [Job bigframes-dev:US.job_IEjIRaqt2w-_pAttPw1VAVuRPxA7 details]\n", - " " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "✅ Completed. \n", - " Query processed 215.9 MB in 5 seconds of slot time. [Job bigframes-dev:US.job_Mi-3m2AkEC1iPgWi7hmcWa1M1oIA details]\n", - " " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "✅ Completed. \n", - " Query processed 215.9 MB in 6 seconds of slot time. [Job bigframes-dev:US.job_j8pvY385WwIY7tGvhI7Yxc62aBwd details]\n", - " " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" } ], "source": [ @@ -396,7 +335,7 @@ "data": { "text/html": [ "✅ Completed. \n", - " Query processed 171.4 MB in 30 seconds of slot time. [Job bigframes-dev:US.ff90d507-bec8-4d24-abc3-0209ac28e21f details]\n", + " Query processed 171.4 MB in 44 seconds of slot time. [Job bigframes-dev:US.29964a00-ea7f-4aa2-ba1e-db93720d1d99 details]\n", " " ], "text/plain": [ @@ -477,7 +416,7 @@ "data": { "text/html": [ "✅ Completed. \n", - " Query processed 88.8 MB in 3 seconds of slot time. [Job bigframes-dev:US.job_517TdI--FMoURkV7QQNMltY_-dZ7 details]\n", + " Query processed 88.8 MB in 3 seconds of slot time. [Job bigframes-dev:US.job_jpPjJUONKAr0mhoqlPMzmfKjrfAg details]\n", " " ], "text/plain": [ @@ -491,7 +430,7 @@ "data": { "text/html": [ "✅ Completed. \n", - " Query processed 88.8 MB in 2 seconds of slot time. [Job bigframes-dev:US.job_rCeYkeBPqmTKNFWFgwXjz5Ed8uWI details]\n", + " Query processed 88.8 MB in 3 seconds of slot time. [Job bigframes-dev:US.job_TFxhL7KPARf-izeKjl5qbOJwHXkD details]\n", " " ], "text/plain": [ @@ -504,7 +443,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "3e630b1a56c740e781772ca5f5c7267a", + "model_id": "155a483cb0a147b7bc50d4c8027e1593", "version_major": 2, "version_minor": 1 }, @@ -606,7 +545,7 @@ "data": { "text/html": [ "✅ Completed. \n", - " Query processed 215.9 MB in 11 seconds of slot time. [Job bigframes-dev:US.job_XwXTDb6gWVkuyIFMeWA0waE33bSg details]\n", + " Query processed 215.9 MB in 11 seconds of slot time. [Job bigframes-dev:US.job_EwEyLXNPqlfkHlNiCIDBDf_lh24w details]\n", " " ], "text/plain": [ @@ -620,7 +559,7 @@ "data": { "text/html": [ "✅ Completed. \n", - " Query processed 215.9 MB in 7 seconds of slot time. [Job bigframes-dev:US.job_bCW0LYK5_PzyyGPf9OAg4YfNMG1C details]\n", + " Query processed 215.9 MB in 9 seconds of slot time. [Job bigframes-dev:US.job_6kGS6xLLirf3zNIpvLOaPD1MPrcz details]\n", " " ], "text/plain": [ @@ -640,12 +579,12 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "a6a2b19314b04283a5a66ca9d66eb771", + "model_id": "96cd46c47ccf4acbb4ba21a6e7bcb10f", "version_major": 2, "version_minor": 1 }, "text/plain": [ - "" + "" ] }, "execution_count": 8, @@ -755,12 +694,12 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "beb362548a6b4fd4a163569edd6f1a90", + "model_id": "df6e748755e64bbb80af509f25900f07", "version_major": 2, "version_minor": 1 }, "text/plain": [ - "" + "" ] }, "execution_count": 10, @@ -804,7 +743,7 @@ "data": { "text/html": [ "✅ Completed. \n", - " Query processed 85.9 kB in 19 seconds of slot time.\n", + " Query processed 85.9 kB in 17 seconds of slot time.\n", " " ], "text/plain": [ @@ -865,7 +804,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "02a46cf499b442d4bfe03934195e67df", + "model_id": "d4b6460a994b4a5cbba99d81df5729b5", "version_major": 2, "version_minor": 1 }, @@ -913,16 +852,16 @@ " EU\n", " DE\n", " 03.10.2018\n", - " H01L 21/20\n", + " H05B 6/12\n", " <NA>\n", - " 18166536.5\n", - " 16.02.2016\n", + " 18165514.3\n", + " 03.04.2018\n", + " 30.03.2017\n", " <NA>\n", - " Scheider, Sascha et al\n", - " EV Group E. Thallner GmbH\n", - " Kurz, Florian\n", - " VORRICHTUNG ZUM BONDEN VON SUBSTRATEN\n", - " EP 3 382 744 A1\n", + " BSH Hausger√§te GmbH\n", + " Acero Acero, Jesus\n", + " VORRICHTUNG ZUR INDUKTIVEN ENERGIE√úBERTRAGUNG\n", + " EP 3 383 141 A2\n", " \n", " \n", " 1\n", @@ -931,16 +870,16 @@ " EU\n", " DE\n", " 03.10.2018\n", - " A01K 31/00\n", + " H01L 21/20\n", " <NA>\n", - " 18171005.4\n", - " 05.02.2015\n", - " 05.02.2014\n", - " Stork Bamberger Patentanw√§lte\n", - " Linco Food Systems A/S\n", - " Thrane, Uffe\n", - " MASTH√ÑHNCHENCONTAINER ALS BESTANDTEIL EINER E...\n", - " EP 3 381 276 A1\n", + " 18166536.5\n", + " 16.02.2016\n", + " <NA>\n", + " Scheider, Sascha et al\n", + " EV Group E. Thallner GmbH\n", + " Kurz, Florian\n", + " VORRICHTUNG ZUM BONDEN VON SUBSTRATEN\n", + " EP 3 382 744 A1\n", " \n", " \n", " 2\n", @@ -967,16 +906,16 @@ " EU\n", " DE\n", " 03.10.2018\n", - " H05B 6/12\n", - " <NA>\n", - " 18165514.3\n", - " 03.04.2018\n", - " 30.03.2017\n", + " A01K 31/00\n", " <NA>\n", - " BSH Hausger√§te GmbH\n", - " Acero Acero, Jesus\n", - " VORRICHTUNG ZUR INDUKTIVEN ENERGIE√úBERTRAGUNG\n", - " EP 3 383 141 A2\n", + " 18171005.4\n", + " 05.02.2015\n", + " 05.02.2014\n", + " Stork Bamberger Patentanw√§lte\n", + " Linco Food Systems A/S\n", + " Thrane, Uffe\n", + " MASTH√ÑHNCHENCONTAINER ALS BESTANDTEIL EINER E...\n", + " EP 3 381 276 A1\n", " \n", " \n", " 4\n", @@ -1017,31 +956,31 @@ "4 gs://gcs-public-data--labeled-patents/espacene... EU DE \n", "\n", " publication_date class_international class_us application_number \\\n", - "0 03.10.2018 H01L 21/20 18166536.5 \n", - "1 03.10.2018 A01K 31/00 18171005.4 \n", + "0 03.10.2018 H05B 6/12 18165514.3 \n", + "1 03.10.2018 H01L 21/20 18166536.5 \n", "2 03.10.2018 G06F 11/30 18157347.8 \n", - "3 03.10.2018 H05B 6/12 18165514.3 \n", + "3 03.10.2018 A01K 31/00 18171005.4 \n", "4 29.08.018 E04H 6/12 18157874.1 \n", "\n", " filing_date priority_date_eu representative_line_1_eu \\\n", - "0 16.02.2016 Scheider, Sascha et al \n", - "1 05.02.2015 05.02.2014 Stork Bamberger Patentanw√§lte \n", + "0 03.04.2018 30.03.2017 \n", + "1 16.02.2016 Scheider, Sascha et al \n", "2 19.02.2018 31.03.2017 Hoffmann Eitle \n", - "3 03.04.2018 30.03.2017 \n", + "3 05.02.2015 05.02.2014 Stork Bamberger Patentanw√§lte \n", "4 21.02.2018 22.02.2017 Liedtke & Partner Patentanw√§lte \n", "\n", " applicant_line_1 inventor_line_1 \\\n", - "0 EV Group E. Thallner GmbH Kurz, Florian \n", - "1 Linco Food Systems A/S Thrane, Uffe \n", + "0 BSH Hausger√§te GmbH Acero Acero, Jesus \n", + "1 EV Group E. Thallner GmbH Kurz, Florian \n", "2 FUJITSU LIMITED Kukihara, Kensuke \n", - "3 BSH Hausger√§te GmbH Acero Acero, Jesus \n", + "3 Linco Food Systems A/S Thrane, Uffe \n", "4 SHB Hebezeugbau GmbH VOLGER, Alexander \n", "\n", " title_line_1 number \n", - "0 VORRICHTUNG ZUM BONDEN VON SUBSTRATEN EP 3 382 744 A1 \n", - "1 MASTH√ÑHNCHENCONTAINER ALS BESTANDTEIL EINER E... EP 3 381 276 A1 \n", + "0 VORRICHTUNG ZUR INDUKTIVEN ENERGIE√úBERTRAGUNG EP 3 383 141 A2 \n", + "1 VORRICHTUNG ZUM BONDEN VON SUBSTRATEN EP 3 382 744 A1 \n", "2 METHOD EXECUTED BY A COMPUTER, INFORMATION PRO... EP 3 382 553 A1 \n", - "3 VORRICHTUNG ZUR INDUKTIVEN ENERGIE√úBERTRAGUNG EP 3 383 141 A2 \n", + "3 MASTH√ÑHNCHENCONTAINER ALS BESTANDTEIL EINER E... EP 3 381 276 A1 \n", "4 STEUERUNGSSYSTEM F√úR AUTOMATISCHE PARKH√ÑUSER EP 3 366 869 A1 \n", "\n", "[5 rows x 15 columns]" @@ -1064,6 +1003,284 @@ " LIMIT 5;\n", "\"\"\")" ] + }, + { + "cell_type": "markdown", + "id": "nested_markdown", + "metadata": {}, + "source": [ + "### Displaying Nested Data (STRUCTs and ARRAYs)\n", + "BigQuery DataFrames automatically flattens nested STRUCT and ARRAY columns into separate, more manageable columns when displayed in `anywidget` mode. This approach simplifies interaction and readability, as it avoids deeply nested or collapsible elements.\n", + "\n", + "This flattening ensures that all data is directly visible and sortable, enhancing the interactive table experience.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "nested_code", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "✅ Completed. \n", + " Query processed 0 Bytes in a moment of slot time.\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "✅ Completed. " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "✅ Completed. " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "e301dfaa00124615805d3168a0f58ab9", + "version_major": 2, + "version_minor": 1 + }, + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idstruct_colarray_colnested_struct_array
01{'name': 'Alice', 'age': 30}[10 20 30][{'item': 'A', 'value': 100} {'item': 'B', 'va...
12{'name': 'Bob', 'age': 25}[40 50][{'item': 'C', 'value': 300}]
23{'name': 'Charlie', 'age': 35}[60 70 80][{'item': 'D', 'value': 400} {'item': 'E', 'va...
34{'name': 'David', 'age': 40}[ 90 100 110][{'item': 'F', 'value': 600} {'item': 'G', 'va...
45{'name': 'Eve', 'age': 45}[120 130 140][{'item': 'H', 'value': 800} {'item': 'I', 'va...
56{'name': 'Frank', 'age': 50}[150 160 170][{'item': 'J', 'value': 1000} {'item': 'K', 'v...
67{'name': 'Grace', 'age': 55}[180 190][{'item': 'L', 'value': 1200}]
78{'name': 'Heidi', 'age': 60}[200 210 220][{'item': 'M', 'value': 1300} {'item': 'N', 'v...
89{'name': 'Ivan', 'age': 65}[230 240 250 260][{'item': 'O', 'value': 1500} {'item': 'P', 'v...
910{'name': 'Judy', 'age': 70}[270 280][{'item': 'Q', 'value': 1700}]
\n", + "

10 rows × 4 columns

\n", + "
[12 rows x 4 columns in total]" + ], + "text/plain": [ + " id struct_col array_col \\\n", + "0 1 {'name': 'Alice', 'age': 30} [10 20 30] \n", + "1 2 {'name': 'Bob', 'age': 25} [40 50] \n", + "2 3 {'name': 'Charlie', 'age': 35} [60 70 80] \n", + "3 4 {'name': 'David', 'age': 40} [ 90 100 110] \n", + "4 5 {'name': 'Eve', 'age': 45} [120 130 140] \n", + "5 6 {'name': 'Frank', 'age': 50} [150 160 170] \n", + "6 7 {'name': 'Grace', 'age': 55} [180 190] \n", + "7 8 {'name': 'Heidi', 'age': 60} [200 210 220] \n", + "8 9 {'name': 'Ivan', 'age': 65} [230 240 250 260] \n", + "9 10 {'name': 'Judy', 'age': 70} [270 280] \n", + "\n", + " nested_struct_array \n", + "0 [{'item': 'A', 'value': 100} {'item': 'B', 'va... \n", + "1 [{'item': 'C', 'value': 300}] \n", + "2 [{'item': 'D', 'value': 400} {'item': 'E', 'va... \n", + "3 [{'item': 'F', 'value': 600} {'item': 'G', 'va... \n", + "4 [{'item': 'H', 'value': 800} {'item': 'I', 'va... \n", + "5 [{'item': 'J', 'value': 1000} {'item': 'K', 'v... \n", + "6 [{'item': 'L', 'value': 1200}] \n", + "7 [{'item': 'M', 'value': 1300} {'item': 'N', 'v... \n", + "8 [{'item': 'O', 'value': 1500} {'item': 'P', 'v... \n", + "9 [{'item': 'Q', 'value': 1700}] \n", + "...\n", + "\n", + "[12 rows x 4 columns]" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sql_nested_data = \"\"\"\n", + "SELECT\n", + " 1 AS id,\n", + " STRUCT('Alice' AS name, 30 AS age) AS struct_col,\n", + " [10, 20, 30] AS array_col,\n", + " [STRUCT('A' AS item, 100 AS value), STRUCT('B' AS item, 200 AS value)] AS nested_struct_array\n", + "UNION ALL\n", + "SELECT\n", + " 2 AS id,\n", + " STRUCT('Bob' AS name, 25 AS age) AS struct_col,\n", + " [40, 50] AS array_col,\n", + " [STRUCT('C' AS item, 300 AS value)] AS nested_struct_array\n", + "UNION ALL\n", + "SELECT\n", + " 3 AS id,\n", + " STRUCT('Charlie' AS name, 35 AS age) AS struct_col,\n", + " [60, 70, 80] AS array_col,\n", + " [STRUCT('D' AS item, 400 AS value), STRUCT('E' AS item, 500 AS value)] AS nested_struct_array\n", + "UNION ALL\n", + "SELECT\n", + " 4 AS id,\n", + " STRUCT('David' AS name, 40 AS age) AS struct_col,\n", + " [90, 100, 110] AS array_col,\n", + " [STRUCT('F' AS item, 600 AS value), STRUCT('G' AS item, 700 AS value)] AS nested_struct_array\n", + "UNION ALL\n", + "SELECT\n", + " 5 AS id,\n", + " STRUCT('Eve' AS name, 45 AS age) AS struct_col,\n", + " [120, 130, 140] AS array_col,\n", + " [STRUCT('H' AS item, 800 AS value), STRUCT('I' AS item, 900 AS value)] AS nested_struct_array\n", + "UNION ALL\n", + "SELECT\n", + " 6 AS id,\n", + " STRUCT('Frank' AS name, 50 AS age) AS struct_col,\n", + " [150, 160, 170] AS array_col,\n", + " [STRUCT('J' AS item, 1000 AS value), STRUCT('K' AS item, 1100 AS value)] AS nested_struct_array\n", + "UNION ALL\n", + "SELECT\n", + " 7 AS id,\n", + " STRUCT('Grace' AS name, 55 AS age) AS struct_col,\n", + " [180, 190] AS array_col,\n", + " [STRUCT('L' AS item, 1200 AS value)] AS nested_struct_array\n", + "UNION ALL\n", + "SELECT\n", + " 8 AS id,\n", + " STRUCT('Heidi' AS name, 60 AS age) AS struct_col,\n", + " [200, 210, 220] AS array_col,\n", + " [STRUCT('M' AS item, 1300 AS value), STRUCT('N' AS item, 1400 AS value)] AS nested_struct_array\n", + "UNION ALL\n", + "SELECT\n", + " 9 AS id,\n", + " STRUCT('Ivan' AS name, 65 AS age) AS struct_col,\n", + " [230, 240, 250, 260] AS array_col,\n", + " [STRUCT('O' AS item, 1500 AS value), STRUCT('P' AS item, 1600 AS value)] AS nested_struct_array\n", + "UNION ALL\n", + "SELECT\n", + " 10 AS id,\n", + " STRUCT('Judy' AS name, 70 AS age) AS struct_col,\n", + " [270, 280] AS array_col,\n", + " [STRUCT('Q' AS item, 1700 AS value)] AS nested_struct_array\n", + "UNION ALL\n", + "SELECT\n", + " 11 AS id,\n", + " STRUCT('Kevin' AS name, 75 AS age) AS struct_col,\n", + " [290, 300, 310] AS array_col,\n", + " [STRUCT('R' AS item, 1800 AS value), STRUCT('S' AS item, 1900 AS value), STRUCT('T' AS item, 2000 AS value), STRUCT('U' AS item, 2100 AS value)] AS nested_struct_array\n", + "UNION ALL\n", + "SELECT\n", + " 12 AS id,\n", + " STRUCT('Laura' AS name, 80 AS age) AS struct_col,\n", + " [320] AS array_col,\n", + " [STRUCT('V' AS item, 2200 AS value), STRUCT('W' AS item, 2300 AS value), STRUCT('X' AS item, 2400 AS value)] AS nested_struct_array\n", + "\"\"\"\n", + "\n", + "df_from_sql = bpd.read_gbq(sql_nested_data)\n", + "\n", + "# Display this DataFrame. The nested fields will be rendered as flattened elements.\n", + "df_from_sql" + ] } ], "metadata": { diff --git a/tests/js/table_widget.test.js b/tests/js/table_widget.test.js index d701d8692e..a4203fa6d2 100644 --- a/tests/js/table_widget.test.js +++ b/tests/js/table_widget.test.js @@ -1,3 +1,7 @@ +/** + * @fileoverview Tests for the anywidget-based table widget. + */ + /* * Copyright 2025 Google LLC * @@ -16,19 +20,28 @@ import { jest } from '@jest/globals'; +/* + * Test suite for the TableWidget frontend component. + */ describe('TableWidget', () => { + /** @type {any} */ let model; + /** @type {HTMLElement} */ let el; + /** @type {Function} */ let render; + /* + * Sets up the test environment before each test. + * This includes resetting modules, creating a DOM element, + * and mocking the widget model. + */ beforeEach(async () => { jest.resetModules(); document.body.innerHTML = '
'; el = document.body.querySelector('div'); - const tableWidget = ( - await import('../../bigframes/display/table_widget.js') - ).default; + const tableWidget = await import('../../bigframes/display/table_widget.js'); render = tableWidget.render; model = { @@ -43,6 +56,9 @@ describe('TableWidget', () => { expect(render).toBeDefined(); }); + /* + * Tests for the render function of the widget. + */ describe('render', () => { it('should create the basic structure', () => { // Mock the initial state @@ -73,6 +89,10 @@ describe('TableWidget', () => { expect(el.querySelector('div:nth-child(3)')).not.toBeNull(); }); + /* + * Verifies that clicking a sortable column header triggers a sort action + * with the correct parameters. + */ it('should sort when a sortable column is clicked', () => { // Mock the initial state model.get.mockImplementation((property) => { @@ -198,6 +218,10 @@ describe('TableWidget', () => { expect(indicator2.textContent).toBe('●'); }); + /* + * Tests that holding the Shift key while clicking a column header + * adds the new column to the existing sort context for multi-column sorting. + */ it('should add a column to sort when Shift+Click is used', () => { // Mock the initial state: already sorted by col1 asc model.get.mockImplementation((property) => { @@ -336,6 +360,136 @@ describe('TableWidget', () => { expect(headers[1].textContent).toBe('value'); }); + /* + * Verifies that hovering over a cell in a group of flattened rows + * (i.e., rows originating from the same nested data structure) + * adds a hover class to all cells in that group. + */ + it('should highlight all rows in a group when hovering over a nested data row', () => { + // Mock HTML with nested data structure (flattened rows) + model.get.mockImplementation((property) => { + if (property === 'table_html') { + return ` + + + +
Row 1 Part A
Row 1 Part B
Row 2
`; + } + if (property === 'orderable_columns') { + return []; + } + return null; + }); + + render({ model, el }); + + // Manually trigger the table_html change handler + const tableHtmlChangeHandler = model.on.mock.calls.find( + (call) => call[0] === 'change:table_html', + )[1]; + tableHtmlChangeHandler(); + + const firstRowCell = el.querySelector('tr[data-orig-row="0"] td'); + const rowsInGroup = el.querySelectorAll('tr[data-orig-row="0"] td'); + + // Simulate mouseover + const mouseOverEvent = new MouseEvent('mouseover', { + bubbles: true, + cancelable: true, + }); + firstRowCell.dispatchEvent(mouseOverEvent); + + // Check if row-hover class is added to all cells in the group + + rowsInGroup.forEach((cell) => { + expect(cell.classList.contains('row-hover')).toBe(true); + }); + + // Simulate mouseout + const mouseOutEvent = new MouseEvent('mouseout', { + bubbles: true, + cancelable: true, + }); + firstRowCell.dispatchEvent(mouseOutEvent); + + // Check if row-hover class is removed + + rowsInGroup.forEach((cell) => { + expect(cell.classList.contains('row-hover')).toBe(false); + }); + }); + + it('should not highlight unrelated rows when hovering over a nested data row', () => { + // Mock HTML with nested data structure + model.get.mockImplementation((property) => { + if (property === 'table_html') { + return ` + + + +
Row 1 Part A
Row 1 Part B
Row 2
`; + } + if (property === 'orderable_columns') { + return []; + } + return null; + }); + + render({ model, el }); + + const tableHtmlChangeHandler = model.on.mock.calls.find( + (call) => call[0] === 'change:table_html', + )[1]; + tableHtmlChangeHandler(); + + const row1Cell = el.querySelector('tr[data-orig-row="0"] td'); + const row2Cell = el.querySelector('tr[data-orig-row="1"] td'); + + const mouseOverEvent = new MouseEvent('mouseover', { + bubbles: true, + cancelable: true, + }); + row1Cell.dispatchEvent(mouseOverEvent); + + // Row 2 should NOT have the hover class + expect(row2Cell.classList.contains('row-hover')).toBe(false); + }); + + it('should not highlight other rows when hovering over a non-nested row', () => { + // Mock HTML with mixed data structure + model.get.mockImplementation((property) => { + if (property === 'table_html') { + return ` + + +
Standard Row
Nested Row
`; + } + if (property === 'orderable_columns') { + return []; + } + return null; + }); + + render({ model, el }); + + const tableHtmlChangeHandler = model.on.mock.calls.find( + (call) => call[0] === 'change:table_html', + )[1]; + tableHtmlChangeHandler(); + + const standardCell = el.querySelector('tr:not([data-orig-row]) td'); + const nestedCell = el.querySelector('tr[data-orig-row="0"] td'); + + const mouseOverEvent = new MouseEvent('mouseover', { + bubbles: true, + cancelable: true, + }); + standardCell.dispatchEvent(mouseOverEvent); + + // The nested row should NOT have the hover class + expect(nestedCell.classList.contains('row-hover')).toBe(false); + }); + /* * Tests that the widget correctly renders HTML with truncated columns (ellipsis) * and ensures that the ellipsis column is not treated as a sortable column. diff --git a/tests/system/small/test_anywidget.py b/tests/system/small/test_anywidget.py index da87568c91..44b19c5c9b 100644 --- a/tests/system/small/test_anywidget.py +++ b/tests/system/small/test_anywidget.py @@ -992,178 +992,102 @@ def test_dataframe_repr_mimebundle_should_return_widget_with_metadata_in_anywidg assert "colab" in metadata["application/vnd.jupyter.widget-view+json"] -@pytest.fixture(scope="module") -def custom_index_pandas_df() -> pd.DataFrame: - """Create a DataFrame with a custom named index for testing.""" - test_data = pd.DataFrame( +@pytest.fixture +def nested_data_df(): + """Fixture to provide a pandas DataFrame with nested data (STRUCT and ARRAY) using ArrowDtype.""" + import pyarrow as pa + + # Struct column + struct_type = pa.struct([("name", pa.string()), ("age", pa.int64())]) + struct_data = [{"name": "Alice", "age": 30}, {"name": "Bob", "age": 25}] + struct_arr = pa.array(struct_data, type=struct_type) + + # Array column + array_type = pa.list_(pa.int64()) + array_data = [[10, 20, 30], [40, 50]] + array_arr = pa.array(array_data, type=array_type) + + # Array of Struct column + nested_struct_type = pa.struct([("item", pa.string()), ("value", pa.int64())]) + nested_array_type = pa.list_(nested_struct_type) + nested_data = [ + [{"item": "A", "value": 100}, {"item": "B", "value": 200}], + [{"item": "C", "value": 300}], + ] + nested_arr = pa.array(nested_data, type=nested_array_type) + + df = pd.DataFrame( { - "value_a": [10, 20, 30, 40, 50, 60], - "value_b": ["a", "b", "c", "d", "e", "f"], + "id": [1, 2], + "struct_col": pd.Series(struct_arr, dtype=pd.ArrowDtype(struct_type)), + "array_col": pd.Series(array_arr, dtype=pd.ArrowDtype(array_type)), + "nested_struct_array": pd.Series( + nested_arr, dtype=pd.ArrowDtype(nested_array_type) + ), } ) - test_data.index = pd.Index( - ["row_1", "row_2", "row_3", "row_4", "row_5", "row_6"], name="custom_idx" - ) - return test_data + return df -@pytest.fixture(scope="module") -def custom_index_bf_df( - session: bf.Session, custom_index_pandas_df: pd.DataFrame -) -> bf.dataframe.DataFrame: - return session.read_pandas(custom_index_pandas_df) +@pytest.fixture +def different_lengths_arrays_df(): + """Fixture to provide a DataFrame with arrays of different lengths using ArrowDtype.""" + import pyarrow as pa + array_type = pa.list_(pa.int64()) + array_col1 = pa.array([[10, 20, 30]], type=array_type) + array_col2 = pa.array([[100, 200]], type=array_type) -@pytest.fixture(scope="module") -def multiindex_pandas_df() -> pd.DataFrame: - """Create a DataFrame with MultiIndex for testing.""" - test_data = pd.DataFrame( + df = pd.DataFrame( { - "value": [100, 200, 300, 400, 500, 600], - "category": ["X", "Y", "Z", "X", "Y", "Z"], + "id": [1], + "array_col1": pd.Series(array_col1, dtype=pd.ArrowDtype(array_type)), + "array_col2": pd.Series(array_col2, dtype=pd.ArrowDtype(array_type)), } ) - test_data.index = pd.MultiIndex.from_arrays( - [ - ["group_A", "group_A", "group_A", "group_B", "group_B", "group_B"], - [1, 2, 3, 1, 2, 3], - ], - names=["group", "item"], - ) - return test_data - + return df -@pytest.fixture(scope="module") -def multiindex_bf_df( - session: bf.Session, multiindex_pandas_df: pd.DataFrame -) -> bf.dataframe.DataFrame: - return session.read_pandas(multiindex_pandas_df) +def test_render_html_with_nested_data(nested_data_df: pd.DataFrame): + """Verify that render_html correctly flattens nested STRUCT and ARRAY columns. -def test_widget_with_default_index_should_display_index_column_with_empty_header( - paginated_bf_df: bf.dataframe.DataFrame, -): - """ - Given a DataFrame with a default index, when the TableWidget is rendered, - then an index column should be visible with an empty header. + Updated to expect inline styles. """ - import re - - from bigframes.display.anywidget import TableWidget - - with bf.option_context("display.repr_mode", "anywidget", "display.max_rows", 2): - widget = TableWidget(paginated_bf_df) - html = widget.table_html - - # The header for the index should be present but empty, matching the - # internal rendering logic. - thead = html.split("")[1].split("")[0] - # Find the first header cell and check that its content div is empty. - match = re.search(r"]*>]*>([^<]*)", thead) - assert match is not None, "Could not find table header cell in output." - assert ( - match.group(1) == "" - ), f"Expected empty index header, but found: {match.group(1)}" + from bigframes.display import html + result_html = html.render_html(dataframe=nested_data_df, table_id="test-table") -def test_widget_with_custom_index_should_display_index_column( - custom_index_bf_df: bf.dataframe.DataFrame, -): - """ - Given a DataFrame with a custom named index, when rendered, - then the index column and first page of rows should be visible. - """ - from bigframes.display.anywidget import TableWidget - - with bf.option_context("display.repr_mode", "anywidget", "display.max_rows", 2): - widget = TableWidget(custom_index_bf_df) - html = widget.table_html + # Check that Alice's data is not repeated on the second row + assert 'class="cell-align-right">1' in result_html + assert 'class="cell-align-left">Alice' in result_html + assert 'class="cell-align-left">30' in result_html + assert 'class="cell-align-left">10' in result_html - assert "custom_idx" in html - assert "row_1" in html - assert "row_2" in html - assert "row_3" not in html # Verify pagination is working - assert "row_4" not in html + # Check continuation row + assert 'class="array-continuation" data-orig-row="0">' in result_html + # In continuation rows, non-array cells are empty + assert "" in result_html -def test_widget_with_custom_index_pagination_preserves_index( - custom_index_bf_df: bf.dataframe.DataFrame, +def test_render_html_with_arrays_of_different_lengths( + different_lengths_arrays_df: pd.DataFrame, ): - """ - Given a DataFrame with a custom index, when navigating to the second page, - then the second page's index values should be visible. - """ - from bigframes.display.anywidget import TableWidget - - with bf.option_context("display.repr_mode", "anywidget", "display.max_rows", 2): - widget = TableWidget(custom_index_bf_df) - - widget.page = 1 # Navigate to page 2 - html = widget.table_html - - assert "row_3" in html - assert "row_4" in html - assert "row_1" not in html # Verify page 1 content is gone - assert "row_2" not in html - + """Verify that render_html handles arrays of different lengths correctly. -def test_widget_with_custom_index_matches_pandas_output( - custom_index_bf_df: bf.dataframe.DataFrame, -): - """ - Given a DataFrame with a custom index and max_rows=3, the widget's HTML - output should contain the first three index values. + Updated to expect inline styles. """ - from bigframes.display.anywidget import TableWidget - - with bf.option_context("display.repr_mode", "anywidget", "display.max_rows", 3): - widget = TableWidget(custom_index_bf_df) - html = widget.table_html - - assert "row_1" in html - assert "row_2" in html - assert "row_3" in html - assert "row_4" not in html # Verify it respects max_rows - - -# TODO(b/438181139): Add tests for custom multiindex -# This may not be necessary for the SQL Cell use case but should be -# considered for completeness. + from bigframes.display import html - -def test_series_anywidget_integration_with_notebook_display( - paginated_bf_df: bf.dataframe.DataFrame, -): - """Test Series display integration in Jupyter-like environment.""" - pytest.importorskip("anywidget") - - with bf.option_context("display.repr_mode", "anywidget"): - series = paginated_bf_df["value"] - - # Test the full display pipeline - from IPython.display import display as ipython_display - - # This should work without errors - ipython_display(series) - - -def test_series_different_data_types_anywidget(session: bf.Session): - """Test Series with different data types in anywidget mode.""" - pytest.importorskip("anywidget") - - # Create Series with different types - test_data = pd.DataFrame( - { - "string_col": ["a", "b", "c"], - "int_col": [1, 2, 3], - "float_col": [1.1, 2.2, 3.3], - "bool_col": [True, False, True], - } + result_html = html.render_html( + dataframe=different_lengths_arrays_df, table_id="test-table" ) - bf_df = session.read_pandas(test_data) - with bf.option_context("display.repr_mode", "anywidget"): - for col_name in test_data.columns: - series = bf_df[col_name] - widget = bigframes.display.TableWidget(series.to_frame()) - assert widget.row_count == 3 + # The first row should contain the first element of both arrays + assert 'class="cell-align-right">1' in result_html + assert 'class="cell-align-left">10' in result_html + assert 'class="cell-align-left">100' in result_html + + # The second row should contain the second element of both arrays + assert 'class="array-continuation" data-orig-row="0">' in result_html + assert 'class="cell-align-left">20' in result_html + assert 'class="cell-align-left">200' in result_html diff --git a/tests/unit/core/test_blocks_split.py b/tests/unit/core/test_blocks_split.py new file mode 100644 index 0000000000..a986a96fab --- /dev/null +++ b/tests/unit/core/test_blocks_split.py @@ -0,0 +1,77 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from unittest import mock + +import pandas as pd + +import bigframes +import bigframes.core.blocks as blocks + + +def test_block_split_rounding(): + # Setup a mock block with a specific shape + mock_session = mock.create_autospec(spec=bigframes.Session) + # Block.from_local needs a real-ish session for some things, but we can mock shape[0] + + # Let's use a real Block with local data for simplicity if possible + df = pd.DataFrame({"a": range(29757)}) + block = blocks.Block.from_local(df, mock_session) + + # We need to mock the internal behavior of split or check the result sizes + # Since split returns new Blocks, we can check their shapes if they are computed. + # But split calls block.slice which calls block.expr.slice... + + # Instead of full execution, let's just test the rounding logic by mocking block.shape + with mock.patch.object( + blocks.Block, "shape", new_callable=mock.PropertyMock + ) as mock_shape: + mock_shape.return_value = (29757, 1) + + # We need to mock other things that split calls to avoid full execution + with mock.patch.object(blocks.Block, "create_constant") as mock_create_constant: + mock_create_constant.return_value = (block, "random_col") + with mock.patch.object( + blocks.Block, "promote_offsets" + ) as mock_promote_offsets: + mock_promote_offsets.return_value = (block, "offset_col") + with mock.patch.object( + blocks.Block, "apply_unary_op" + ) as mock_apply_unary_op: + mock_apply_unary_op.return_value = (block, "unary_col") + with mock.patch.object( + blocks.Block, "apply_binary_op" + ) as mock_apply_binary_op: + mock_apply_binary_op.return_value = (block, "binary_col") + with mock.patch.object( + blocks.Block, "order_by" + ) as mock_order_by: + mock_order_by.return_value = block + with mock.patch.object(blocks.Block, "slice") as mock_slice: + mock_slice.return_value = block + + # Call split + block.split(fracs=(0.8, 0.2)) + + # Check calls to slice + # Expected sample_sizes with round(): + # round(0.8 * 29757) = 23806 + # round(0.2 * 29757) = 5951 + + calls = mock_slice.call_args_list + assert len(calls) == 2 + assert calls[0].kwargs["start"] == 0 + assert calls[0].kwargs["stop"] == 23806 + assert calls[1].kwargs["start"] == 23806 + assert calls[1].kwargs["stop"] == 23806 + 5951 diff --git a/tests/unit/display/test_flatten.py b/tests/unit/display/test_flatten.py new file mode 100644 index 0000000000..f512039710 --- /dev/null +++ b/tests/unit/display/test_flatten.py @@ -0,0 +1,168 @@ +# Copyright 2026 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pandas as pd +import pyarrow as pa + +from bigframes.display._flatten import flatten_nested_data + + +def test_flatten_nested_data_flattens_structs(): + """Verify that flatten_nested_data correctly flattens STRUCT columns.""" + struct_type = pa.struct([("name", pa.string()), ("age", pa.int64())]) + struct_arr = pa.array( + [{"name": "Alice", "age": 30}, {"name": "Bob", "age": 25}], type=struct_type + ) + struct_data = pd.DataFrame( + { + "id": [1, 2], + "struct_col": pd.Series( + struct_arr, + dtype=pd.ArrowDtype(struct_type), + ), + } + ) + + result = flatten_nested_data(struct_data) + flattened = result.dataframe + nested_originated_columns = result.nested_columns + + assert "struct_col.name" in flattened.columns + assert "struct_col.age" in flattened.columns + assert flattened["struct_col.name"].tolist() == ["Alice", "Bob"] + assert "struct_col" in nested_originated_columns + assert "struct_col.name" in nested_originated_columns + assert "struct_col.age" in nested_originated_columns + + +def test_flatten_nested_data_explodes_arrays(): + """Verify that flatten_nested_data correctly explodes ARRAY columns.""" + array_type = pa.list_(pa.int64()) + array_arr = pa.array([[10, 20, 30], [40, 50]], type=array_type) + array_data = pd.DataFrame( + { + "id": [1, 2], + "array_col": pd.Series(array_arr, dtype=pd.ArrowDtype(array_type)), + } + ) + + result = flatten_nested_data(array_data) + flattened = result.dataframe + row_labels = result.row_labels + continuation_rows = result.continuation_rows + nested_originated_columns = result.nested_columns + + assert len(flattened) == 5 # 3 + 2 array elements + assert row_labels == ["0", "0", "0", "1", "1"] + assert continuation_rows == {1, 2, 4} + assert "array_col" in nested_originated_columns + + +def test_flatten_preserves_original_index(): + """Verify that original index is preserved (and duplicated) during flattening.""" + array_type = pa.list_(pa.int64()) + array_arr = pa.array([[10, 20], [30, 40]], type=array_type) + array_data = pd.DataFrame( + { + "array_col": pd.Series( + array_arr, dtype=pd.ArrowDtype(array_type), index=["row_a", "row_b"] + ), + } + ) + array_data.index.name = "my_index" + + result = flatten_nested_data(array_data) + flattened = result.dataframe + row_labels = result.row_labels + + assert flattened.index.name == "my_index" + assert flattened.index.tolist() == ["row_a", "row_a", "row_b", "row_b"] + assert row_labels == ["row_a", "row_a", "row_b", "row_b"] + + +def test_flatten_preserves_multiindex(): + """Verify that MultiIndex is preserved (and duplicated) during flattening.""" + index = pd.MultiIndex.from_tuples([("A", 1), ("B", 2)], names=["idx1", "idx2"]) + array_type = pa.list_(pa.int64()) + array_arr = pa.array([[10, 20], [30, 40]], type=array_type) + array_data = pd.DataFrame( + { + "array_col": pd.Series( + array_arr, dtype=pd.ArrowDtype(array_type), index=index + ), + } + ) + + result = flatten_nested_data(array_data) + flattened = result.dataframe + + assert flattened.index.names == ["idx1", "idx2"] + assert len(flattened) == 4 + assert flattened.index.tolist() == [("A", 1), ("A", 1), ("B", 2), ("B", 2)] + + +def test_flatten_empty_dataframe(): + """Verify behavior with an empty DataFrame.""" + empty_df = pd.DataFrame({"col": []}) + result = flatten_nested_data(empty_df) + + assert result.dataframe.empty + assert result.dataframe.columns.tolist() == ["col"] + assert result.row_labels is None + assert result.continuation_rows is None + + +def test_flatten_mixed_struct_array(): + """Verify flattening of a DataFrame with both STRUCT and ARRAY columns.""" + struct_type = pa.struct([("a", pa.int64())]) + struct_arr = pa.array([{"a": 1}, {"a": 2}], type=struct_type) + + array_type = pa.list_(pa.int64()) + array_arr = pa.array([[10, 20], [30]], type=array_type) + + df = pd.DataFrame( + { + "struct_col": pd.Series(struct_arr, dtype=pd.ArrowDtype(struct_type)), + "array_col": pd.Series(array_arr, dtype=pd.ArrowDtype(array_type)), + "scalar_col": [100, 200], + }, + index=[0, 1], + ) + + result = flatten_nested_data(df) + flattened = result.dataframe + continuation_rows = result.continuation_rows + cleared_on_continuation = result.cleared_on_continuation + + # Row 0 explodes to 2 rows (array len 2). Row 1 stays 1 row (array len 1). + # Total rows = 3. + assert len(flattened) == 3 + + # struct_col should be flattened to struct_col.a + assert "struct_col.a" in flattened.columns + assert flattened["struct_col.a"].tolist() == [1, 1, 2] + + # array_col should be exploded + assert flattened["array_col"].tolist() == [10, 20, 30] + + # scalar_col should be duplicated + assert flattened["scalar_col"].tolist() == [100, 100, 200] + + # Check metadata + # continuation_rows should only contain index 1 (the second element of the first row's array) + assert continuation_rows == {1} + + # struct_col.a and scalar_col should be in cleared_on_continuation + assert "struct_col.a" in cleared_on_continuation + assert "scalar_col" in cleared_on_continuation diff --git a/tests/unit/display/test_html.py b/tests/unit/display/test_html.py index 35a74d098a..9fa5b78a4f 100644 --- a/tests/unit/display/test_html.py +++ b/tests/unit/display/test_html.py @@ -106,7 +106,7 @@ { "array_col": "left", }, - ["[1, 2, 3]", "[4, 5, 6]", "[7, 8, 9]"], + ["1", "2", "3", "4", "5", "6", "7", "8", "9"], id="array", ), pytest.param( @@ -119,7 +119,7 @@ { "struct_col": "left", }, - ["{'v': 1}", "{'v': 2}", "{'v': 3}"], + ["1", "2", "3"], id="struct", ), ],