77
88from agentune .analyze .join .base import TablesWithJoinStrategies
99from agentune .core .dataset import Dataset
10- from agentune .core .formatter .base import TablesFormatter
11- from agentune .core .sampler .base import DataSampler , HeadSampler
10+ from agentune .core .formatter .base import TableFormatter , TablesFormatter
11+ from agentune .core .sampler .base import DataSampler , RandomSampler , TableSampler
12+ from agentune .core .sampler .table_samples import HeadTableSampler
13+ from agentune .core .schema import Schema
1214
1315
@attrs.frozen
class MarkdownTableFormatter(TableFormatter):
    """Markdown table formatter.

    Formats a single table with its schema and sample data using markdown headers.
    The schema is displayed as a bulleted list of columns with their DuckDB types,
    and the sample data is formatted as CSV for readability.

    Args:
        markdown_level: The markdown header level to use for sections (default: 3).
    """
    markdown_level: int = 3

    def _format_schema(self, schema: Schema) -> str:
        """Format the schema as one ``- <name>: <type>`` bullet per column.

        Args:
            schema: Schema whose ``cols`` are listed, in iteration order.

        Returns:
            Newline-joined bullet list; an empty string when there are no columns.
        """
        # The type label is the repr() of each column's ``dtype.duckdb_type``.
        return '\n'.join(
            f'- {field.name}: {field.dtype.duckdb_type!r}'
            for field in schema.cols
        )

    def _format_sample_data(self, dataset: Dataset) -> str:
        """Format sample data rows as CSV text.

        Delegates to ``dataset.data.write_csv()`` with no arguments and returns
        whatever that call produces.
        """
        return dataset.data.write_csv()

    @override
    def format_table(
        self,
        sample_data: Dataset,
    ) -> str:
        """Format schema and sample data for a single table.

        Emits two sections, ``Schema:`` and ``Sample Data:``, each introduced by a
        markdown header of ``markdown_level`` ``#`` characters and each followed
        by an empty line.

        Args:
            sample_data: Dataset containing the sample data to format.

        Returns:
            String representation of the table with its schema and sample data.
        """
        markdown_header = '#' * self.markdown_level

        # Schema
        out = [f'{markdown_header} Schema:']
        out.append(self._format_schema(sample_data.schema))
        out.append('')

        # Sample data
        out.append(f'{markdown_header} Sample Data:')
        out.append(self._format_sample_data(sample_data))
        out.append('')
        return '\n'.join(out)
70+
71+
@attrs.frozen
class MarkdownTablesFormatter(TablesFormatter):
    """Markdown tables formatter.

    Formats all available tables (primary and secondary) with their schemas and sample data
    in markdown format. Each table includes a header with its name, followed by its schema
    and sample data sections.

    Args:
        markdown_level: The markdown header level to use for table sections (default: 2).
        num_samples: Number of sample rows to retrieve for each table (default: 5).
        table_formatter: TableFormatter to use for formatting individual tables.
            Defaults to MarkdownTableFormatter with markdown_level + 1.
        primary_dataset_sampler: DataSampler to use for sampling the primary dataset.
            Defaults to RandomSampler for representative sampling.
        tables_sampler: TableSampler to use for sampling the secondary tables.
            Defaults to HeadTableSampler for consistent sampling.
    """
    markdown_level: int = 2
    num_samples: int = 5
    # takes_self=True lets the default be derived from this instance's own
    # markdown_level, so per-table sections nest one level below the table header.
    table_formatter: TableFormatter = attrs.field(
        default=attrs.Factory(
            lambda self: MarkdownTableFormatter(markdown_level=self.markdown_level + 1),
            takes_self=True,
        )
    )
    primary_dataset_sampler: DataSampler = RandomSampler()
    tables_sampler: TableSampler = HeadTableSampler()

    @override
    def format_all_tables(
        self,
        input: Dataset,
        tables: TablesWithJoinStrategies,
        # NOTE(review): the four signature lines before ``random_seed`` were elided
        # in the visible diff hunk; names/order follow the documented Args and the
        # annotation for ``conn`` was not visible -- restore it from the original file.
        conn,
        random_seed: int | None = None,
    ) -> str:
        """Format all available tables with their schemas and sample data for LLM prompts.

        Formats the primary table followed by all secondary tables. Each table is formatted
        with a header at the specified markdown level, followed by its schema and sample data
        using the configured table formatter.

        Args:
            input: Primary input dataset to format.
            tables: Secondary tables with their join strategies to format.
            conn: Database connection for querying sample data from secondary tables.
            random_seed: Optional random seed for reproducible sampling.

        Returns:
            Markdown-formatted string containing all tables with their schemas and sample data.
        """
        sections = []
        markdown_header = '#' * self.markdown_level

        # Primary table: sample it, then emit its header and formatted body.
        sample_data = self.primary_dataset_sampler.sample(
            input,
            self.num_samples,
            random_seed=random_seed,
        )
        # ``primary_table_name`` is not defined in this class; presumably it is
        # provided by TablesFormatter -- confirm against the base class.
        sections.append(f'{markdown_header} Primary Table: {self.primary_table_name}\n')
        sections.append(self.table_formatter.format_table(sample_data))

        # Secondary tables: the table sampler receives the table together with the
        # connection and returns the sample to format.
        for table_with_strategies in tables:
            sample_data = self.tables_sampler.sample(
                table_with_strategies,
                conn,
                self.num_samples,
                random_seed=random_seed,
            )
            sections.append(f'{markdown_header} Table: {table_with_strategies.table.name.name}\n')
            sections.append(self.table_formatter.format_table(sample_data))

        return '\n'.join(sections)
0 commit comments