feat: Create TableFormatter and MarkdownTableFormatter for improved table formatting

yotam319-sparkbeyond · yotam319-sparkbeyond · commit cd0cdaecf2ca · 2026-01-12T14:44:47.000+02:00
diff --git a/agentune/core/formatter/base.py b/agentune/core/formatter/base.py
@@ -65,36 +65,37 @@ async def aformat_batch(self, input: Dataset, conn: DuckDBPyConnection) -> pl.Se
         ...
 
 
+
 @attrs.define
-class TablesFormatter(ABC, UseTypeTag):
-    """Abstract base class for formatting database tables.
+class TableFormatter(ABC, UseTypeTag):
+    """Abstract base class for formatting a single database table.
     
-    Formats information about available database tables (schemas and sample data) for use in LLM prompts.
+    Formats information about a table (its schema and sample data) as a string representation.
     """
 
-    primary_table_name: str = 'primary_table'
-
-    def _format_schema(self, schema: Schema) -> str:
-        """Format schema to human-readable string for LLM prompts."""
-        lines = []
-        for field in schema.cols:
-            # Convert Dtype to simple string representation
-            dtype_str = repr(field.dtype.polars_type)
-            lines.append(f'- {field.name}: {dtype_str}')
-
-        return '\n'.join(lines)
-
-    def _format_sample_data(self, dataset: Dataset) -> str:
-        """Format sample data rows as table for LLM prompts."""
-        return dataset.data.write_csv()
-    
     @abstractmethod
     def format_table(
         self,
         sample_data: Dataset,
     ) -> str:
-        """Format schema and sample data for a single table."""
+        """Format schema and sample data for a single table.
+
+        Args:
+            sample_data: Dataset containing the sample data to format
+        Returns:
+            String representation of the table with its schema and sample data
+        """
         ...
+
+
+@attrs.define
+class TablesFormatter(ABC, UseTypeTag):
+    """Abstract base class for formatting database tables.
+    
+    Formats information about available database tables (schemas and sample data) as a single string representation.
+    """
+
+    primary_table_name: str = 'primary_table'
         
     @abstractmethod
     def format_all_tables(
@@ -104,12 +105,13 @@ def format_all_tables(
         conn: DuckDBPyConnection,
         random_seed: int | None = None,
     ) -> str:
-        """Format the primary dataset and all auxiliary tables with their schemas and sample data for LLM prompts.
+        """Format the primary dataset and all auxiliary tables with their schemas and sample data.
 
         Args:
             input: Input dataset (primary table)
             tables: Available tables with their join strategies
             conn: Database connection to query sample data
+            random_seed: Random seed for sampling (if applicable)
 
         Returns:
             String representation of all tables with their schemas and sample data
diff --git a/agentune/core/formatter/tables.py b/agentune/core/formatter/tables.py
@@ -7,35 +7,91 @@
 
 from agentune.analyze.join.base import TablesWithJoinStrategies
 from agentune.core.dataset import Dataset
-from agentune.core.formatter.base import TablesFormatter
-from agentune.core.sampler.base import DataSampler, HeadSampler
+from agentune.core.formatter.base import TableFormatter, TablesFormatter
+from agentune.core.sampler.base import DataSampler, RandomSampler, TableSampler
+from agentune.core.sampler.table_samples import HeadTableSampler
+from agentune.core.schema import Schema
 
 
 @attrs.frozen
-class SimpleTablesFormatter(TablesFormatter):
-    """Simple tables formatter for LLM prompts.
+class MarkdownTableFormatter(TableFormatter):
+    """Markdown table formatter.
     
-    Formats all available tables with their schemas and sample data.
-    Each table is formatted with:
-    - Table name
-    - Schema (list of columns with types)
-    - Sample data (CSV format)
+    Formats a single table with its schema and sample data using markdown headers.
+    The schema is displayed as a bulleted list of columns with their DuckDB types,
+    and the sample data is formatted as CSV for readability.
+    
+    Args:
+        markdown_level: The markdown header level to use for sections (default: 3).
     """
-    num_samples: int = 5
-    sampler: DataSampler = HeadSampler()
+    markdown_level: int = 3
+
+    def _format_schema(self, schema: Schema) -> str:
+        """Format schema to human-readable string."""
+        lines = []
+        for field in schema.cols:
+            # Convert Dtype to simple string representation using duckdb_type
+            dtype_str = repr(field.dtype.duckdb_type)
+            lines.append(f'- {field.name}: {dtype_str}')
+
+        return '\n'.join(lines)
+
+    def _format_sample_data(self, dataset: Dataset) -> str:
+        """Format sample data rows as a CSV-formatted table."""
+        return dataset.data.write_csv()
+    
+    @override
+    def format_table(
+        self,
+        sample_data: Dataset,
+    ) -> str:
+        """Format schema and sample data for a single table.
+        
+        Includes markdown headers at the specified level. Formats the schema as a list
+        of columns with their DuckDB types, and formats the sample data as CSV.
+        
+        Args:
+            sample_data: Dataset containing the sample data to format
 
-    def format_table(self, sample_data: Dataset) -> str:
-        """Format schema and sample data for a single table."""
+        Returns:
+            String representation of the table with its schema and sample data
+        """
+        markdown_header = '#' * self.markdown_level
         # Schema
-        out = ['### Schema:']
+        out = [f'{markdown_header} Schema:']
         out.append(self._format_schema(sample_data.schema))
         out.append('')
         
         # Sample data
-        out.append(f'### Sample Data ({self.num_samples} rows):')
+        out.append(f'{markdown_header} Sample Data:')
         out.append(self._format_sample_data(sample_data))
         out.append('')
         return '\n'.join(out)
+    
+
+@attrs.frozen
+class MarkdownTablesFormatter(TablesFormatter):
+    """Markdown tables formatter.
+    
+    Formats all available tables (primary and secondary) with their schemas and sample data
+    in markdown format. Each table includes a header with its name, followed by its schema
+    and sample data sections.
+    
+    Args:
+        markdown_level: The markdown header level to use for table sections (default: 2).
+        num_samples: Number of sample rows to retrieve for each table (default: 5).
+        table_formatter: TableFormatter to use for formatting individual tables.
+                        Defaults to MarkdownTableFormatter with markdown_level + 1.
+        primary_dataset_sampler: DataSampler to use for sampling the primary dataset.
+                                Defaults to RandomSampler for representative sampling.
+        tables_sampler: TableSampler to use for sampling the secondary tables.
+                       Defaults to HeadTableSampler for consistent sampling.
+    """
+    markdown_level: int = 2
+    num_samples: int = 5
+    table_formatter: TableFormatter = attrs.field(default=attrs.Factory(lambda self: MarkdownTableFormatter(markdown_level=self.markdown_level + 1), takes_self=True))
+    primary_dataset_sampler: DataSampler = RandomSampler()
+    tables_sampler: TableSampler = HeadTableSampler()
 
     @override
     def format_all_tables(
@@ -46,31 +102,38 @@ def format_all_tables(
         random_seed: int | None = None,
     ) -> str:
         """Format all available tables with their schemas and sample data for LLM prompts.
+        
+        Formats the primary table followed by all secondary tables. Each table is formatted
+        with a header at the specified markdown level, followed by its schema and sample data
+        using the configured table formatter.
 
         Args:
-            input: Input dataset (primary table)
-            tables: Available tables with their join strategies
-            conn: Database connection to query sample data
+            input: Primary input dataset to format.
+            tables: Secondary tables with their join strategies to format.
+            conn: Database connection for querying sample data from secondary tables.
+            random_seed: Optional random seed for reproducible sampling.
 
         Returns:
-            String representation of all tables with their schemas and sample data
+            Markdown-formatted string containing all tables with their schemas and sample data.
         """
         sections = []
-        
+        markdown_header = '#' * self.markdown_level
         # get sample data for primary table
-        sample_data = self.sampler.sample(input, self.num_samples, random_seed=random_seed)
+        sample_data = self.primary_dataset_sampler.sample(input, self.num_samples, random_seed=random_seed)
         # Format primary table
-        sections.append(f'## Primary Table: {self.primary_table_name}\n')
-        sections.append(self.format_table(sample_data))
-
+        sections.append(f'{markdown_header} Primary Table: {self.primary_table_name}\n')
+        sections.append(self.table_formatter.format_table(sample_data))
         # Format secondary tables
         for table_with_strategies in tables:
             # get sample data for the table
-            table = table_with_strategies.table
-            dataset = table.as_source().to_dataset(conn)
-            sample_data = self.sampler.sample(dataset, self.num_samples, random_seed=random_seed)
+            sample_data = self.tables_sampler.sample(
+                table_with_strategies,
+                conn,
+                self.num_samples,
+                random_seed=random_seed,
+            )
             # Format table section
-            sections.append(f'## Table: {table.name.name}\n')
-            sections.append(self.format_table(sample_data))
+            sections.append(f'{markdown_header} Table: {table_with_strategies.table.name.name}\n')
+            sections.append(self.table_formatter.format_table(sample_data))
         
         return '\n'.join(sections)
diff --git a/agentune/core/sampler/base.py b/agentune/core/sampler/base.py
@@ -61,27 +61,3 @@ def sample(self, dataset: Dataset, sample_size: int, random_seed: int | None = N
             schema=dataset.schema,
             data=sampled_df,
         )
-
-@attrs.define
-class HeadSampler(DataSampler):
-    """Simple head sampling.
-
-    This sampler selects the first `sample_size` rows from the dataset.
-    """
-
-    @override
-    def sample(self, dataset: Dataset, sample_size: int, random_seed: int | None = None) -> Dataset:
-        """Sample data using head sampling."""
-        self._validate_inputs(dataset, sample_size)
-
-        # If sample size >= dataset size, return the entire dataset
-        if sample_size >= dataset.data.height:
-            return dataset
-
-        # Head sample
-        sampled_df = dataset.data.head(sample_size)
-
-        return Dataset(
-            schema=dataset.schema,
-            data=sampled_df,
-        )