Skip to content

Commit cd0cdae

Browse files
feat: Create TableFormatter and MarkdownTableFormatter for improved table formatting
1 parent e1a6435 commit cd0cdae

3 files changed

Lines changed: 115 additions & 74 deletions

File tree

agentune/core/formatter/base.py

Lines changed: 23 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -65,36 +65,37 @@ async def aformat_batch(self, input: Dataset, conn: DuckDBPyConnection) -> pl.Se
6565
...
6666

6767

68+
6869
@attrs.define
69-
class TablesFormatter(ABC, UseTypeTag):
70-
"""Abstract base class for formatting database tables.
70+
class TableFormatter(ABC, UseTypeTag):
71+
"""Abstract base class for formatting a single database table.
7172
72-
Formats information about available database tables (schemas and sample data) for use in LLM prompts.
73+
Formats information about a table (its schema and sample data) as a string representation.
7374
"""
7475

75-
primary_table_name: str = 'primary_table'
76-
77-
def _format_schema(self, schema: Schema) -> str:
78-
"""Format schema to human-readable string for LLM prompts."""
79-
lines = []
80-
for field in schema.cols:
81-
# Convert Dtype to simple string representation
82-
dtype_str = repr(field.dtype.polars_type)
83-
lines.append(f'- {field.name}: {dtype_str}')
84-
85-
return '\n'.join(lines)
86-
87-
def _format_sample_data(self, dataset: Dataset) -> str:
88-
"""Format sample data rows as table for LLM prompts."""
89-
return dataset.data.write_csv()
90-
9176
@abstractmethod
9277
def format_table(
9378
self,
9479
sample_data: Dataset,
9580
) -> str:
96-
"""Format schema and sample data for a single table."""
81+
"""Format schema and sample data for a single table.
82+
83+
Args:
84+
sample_data: Dataset containing the sample data to format
85+
Returns:
86+
String representation of the table with its schema and sample data
87+
"""
9788
...
89+
90+
91+
@attrs.define
92+
class TablesFormatter(ABC, UseTypeTag):
93+
"""Abstract base class for formatting database tables.
94+
95+
Formats information about available database tables (schemas and sample data) to string representations.
96+
"""
97+
98+
primary_table_name: str = 'primary_table'
9899

99100
@abstractmethod
100101
def format_all_tables(
@@ -104,12 +105,13 @@ def format_all_tables(
104105
conn: DuckDBPyConnection,
105106
random_seed: int | None = None,
106107
) -> str:
107-
"""Format the primary dataset and all auxiliary tables with their schemas and sample data for LLM prompts.
108+
"""Format the primary dataset and all auxiliary tables with their schemas and sample data.
108109
109110
Args:
110111
input: Input dataset (primary table)
111112
tables: Available tables with their join strategies
112113
conn: Database connection to query sample data
114+
random_seed: Random seed for sampling (if applicable)
113115
114116
Returns:
115117
String representation of all tables with their schemas and sample data

agentune/core/formatter/tables.py

Lines changed: 92 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -7,35 +7,91 @@
77

88
from agentune.analyze.join.base import TablesWithJoinStrategies
99
from agentune.core.dataset import Dataset
10-
from agentune.core.formatter.base import TablesFormatter
11-
from agentune.core.sampler.base import DataSampler, HeadSampler
10+
from agentune.core.formatter.base import TableFormatter, TablesFormatter
11+
from agentune.core.sampler.base import DataSampler, RandomSampler, TableSampler
12+
from agentune.core.sampler.table_samples import HeadTableSampler
13+
from agentune.core.schema import Schema
1214

1315

1416
@attrs.frozen
15-
class SimpleTablesFormatter(TablesFormatter):
16-
"""Simple tables formatter for LLM prompts.
17+
class MarkdownTableFormatter(TableFormatter):
18+
"""Markdown table formatter.
1719
18-
Formats all available tables with their schemas and sample data.
19-
Each table is formatted with:
20-
- Table name
21-
- Schema (list of columns with types)
22-
- Sample data (CSV format)
20+
Formats a single table with its schema and sample data using markdown headers.
21+
The schema is displayed as a bulleted list of columns with their DuckDB types,
22+
and the sample data is formatted as CSV for readability.
23+
24+
Args:
25+
markdown_level: The markdown header level to use for sections (default: 3).
2326
"""
24-
num_samples: int = 5
25-
sampler: DataSampler = HeadSampler()
27+
markdown_level: int = 3
28+
29+
def _format_schema(self, schema: Schema) -> str:
30+
"""Format schema to human-readable string."""
31+
lines = []
32+
for field in schema.cols:
33+
# Convert Dtype to simple string representation using duckdb_type
34+
dtype_str = repr(field.dtype.duckdb_type)
35+
lines.append(f'- {field.name}: {dtype_str}')
36+
37+
return '\n'.join(lines)
38+
39+
def _format_sample_data(self, dataset: Dataset) -> str:
40+
"""Format sample data rows as a table in CSV format."""
41+
return dataset.data.write_csv()
42+
43+
@override
44+
def format_table(
45+
self,
46+
sample_data: Dataset,
47+
) -> str:
48+
"""Format schema and sample data for a single table.
49+
50+
Includes markdown headers at the specified level. Formats the schema as a list
51+
of columns with their DuckDB types, and formats the sample data as CSV.
52+
53+
Args:
54+
sample_data: Dataset containing the sample data to format
2655
27-
def format_table(self, sample_data: Dataset) -> str:
28-
"""Format schema and sample data for a single table."""
56+
Returns:
57+
String representation of the table with its schema and sample data
58+
"""
59+
markdown_header = '#' * self.markdown_level
2960
# Schema
30-
out = ['### Schema:']
61+
out = [f'{markdown_header} Schema:']
3162
out.append(self._format_schema(sample_data.schema))
3263
out.append('')
3364

3465
# Sample data
35-
out.append(f'### Sample Data ({self.num_samples} rows):')
66+
out.append(f'{markdown_header} Sample Data:')
3667
out.append(self._format_sample_data(sample_data))
3768
out.append('')
3869
return '\n'.join(out)
70+
71+
72+
@attrs.frozen
73+
class MarkdownTablesFormatter(TablesFormatter):
74+
"""Markdown tables formatter.
75+
76+
Formats all available tables (primary and secondary) with their schemas and sample data
77+
in markdown format. Each table includes a header with its name, followed by its schema
78+
and sample data sections.
79+
80+
Args:
81+
markdown_level: The markdown header level to use for table sections (default: 2).
82+
num_samples: Number of sample rows to retrieve for each table (default: 5).
83+
table_formatter: TableFormatter to use for formatting individual tables.
84+
Defaults to MarkdownTableFormatter with markdown_level + 1.
85+
primary_dataset_sampler: DataSampler to use for sampling the primary dataset.
86+
Defaults to RandomSampler for representative sampling.
87+
tables_sampler: TableSampler to use for sampling the secondary tables.
88+
Defaults to HeadTableSampler for consistent sampling.
89+
"""
90+
markdown_level: int = 2
91+
num_samples: int = 5
92+
table_formatter: TableFormatter = attrs.field(default=attrs.Factory(lambda self: MarkdownTableFormatter(markdown_level=self.markdown_level + 1), takes_self=True))
93+
primary_dataset_sampler: DataSampler = RandomSampler()
94+
tables_sampler: TableSampler = HeadTableSampler()
3995

4096
@override
4197
def format_all_tables(
@@ -46,31 +102,38 @@ def format_all_tables(
46102
random_seed: int | None = None,
47103
) -> str:
48104
"""Format all available tables with their schemas and sample data for LLM prompts.
105+
106+
Formats the primary table followed by all secondary tables. Each table is formatted
107+
with a header at the specified markdown level, followed by its schema and sample data
108+
using the configured table formatter.
49109
50110
Args:
51-
input: Input dataset (primary table)
52-
tables: Available tables with their join strategies
53-
conn: Database connection to query sample data
111+
input: Primary input dataset to format.
112+
tables: Secondary tables with their join strategies to format.
113+
conn: Database connection for querying sample data from secondary tables.
114+
random_seed: Optional random seed for reproducible sampling.
54115
55116
Returns:
56-
String representation of all tables with their schemas and sample data
117+
Markdown-formatted string containing all tables with their schemas and sample data.
57118
"""
58119
sections = []
59-
120+
markdown_header = '#' * self.markdown_level
60121
# get sample data for primary table
61-
sample_data = self.sampler.sample(input, self.num_samples, random_seed=random_seed)
122+
sample_data = self.primary_dataset_sampler.sample(input, self.num_samples, random_seed=random_seed)
62123
# Format primary table
63-
sections.append(f'## Primary Table: {self.primary_table_name}\n')
64-
sections.append(self.format_table(sample_data))
65-
124+
sections.append(f'{markdown_header} Primary Table: {self.primary_table_name}\n')
125+
sections.append(self.table_formatter.format_table(sample_data))
66126
# Format secondary tables
67127
for table_with_strategies in tables:
68128
# get sample data for the table
69-
table = table_with_strategies.table
70-
dataset = table.as_source().to_dataset(conn)
71-
sample_data = self.sampler.sample(dataset, self.num_samples, random_seed=random_seed)
129+
sample_data = self.tables_sampler.sample(
130+
table_with_strategies,
131+
conn,
132+
self.num_samples,
133+
random_seed=random_seed,
134+
)
72135
# Format table section
73-
sections.append(f'## Table: {table.name.name}\n')
74-
sections.append(self.format_table(sample_data))
136+
sections.append(f'{markdown_header} Table: {table_with_strategies.table.name.name}\n')
137+
sections.append(self.table_formatter.format_table(sample_data))
75138

76139
return '\n'.join(sections)

agentune/core/sampler/base.py

Lines changed: 0 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -61,27 +61,3 @@ def sample(self, dataset: Dataset, sample_size: int, random_seed: int | None = N
6161
schema=dataset.schema,
6262
data=sampled_df,
6363
)
64-
65-
@attrs.define
66-
class HeadSampler(DataSampler):
67-
"""Simple head sampling.
68-
69-
This sampler selects the first `sample_size` rows from the dataset.
70-
"""
71-
72-
@override
73-
def sample(self, dataset: Dataset, sample_size: int, random_seed: int | None = None) -> Dataset:
74-
"""Sample data using head sampling."""
75-
self._validate_inputs(dataset, sample_size)
76-
77-
# If sample size >= dataset size, return the entire dataset
78-
if sample_size >= dataset.data.height:
79-
return dataset
80-
81-
# Head sample
82-
sampled_df = dataset.data.head(sample_size)
83-
84-
return Dataset(
85-
schema=dataset.schema,
86-
data=sampled_df,
87-
)

0 commit comments

Comments
 (0)