22
33from __future__ import annotations
44
5+ import re
56from dataclasses import dataclass , field
67from pathlib import Path
78from typing import Any
@@ -82,17 +83,31 @@ def plot_missing(self) -> plt.Figure:
8283 return fig
8384
8485
@dataclass
class SubsetReport:
    """Analysis results for a single subset/split partition.

    Attributes:
        subset: Subset (configuration) name.
        split: Split name within the subset.
        shape: ``(rows, columns)`` of this partition.
        schema: Inferred data schema for this partition.
        stats: Statistical analysis results for this partition.
        viz: Visualization access object for this partition.
        warnings: Warnings collected while analyzing this partition.
    """

    subset: str
    split: str
    shape: tuple[int, int]
    schema: DataSchema
    stats: StatsResult
    viz: VizResult
    warnings: list[str] = field(default_factory=list)
97+
98+
8599@dataclass
86100class AnalysisReport :
87101 """Top-level container for analysis results.
88102
89103 Attributes:
90104 dataset_name: Dataset name.
91- shape: ``(rows, columns)`` tuple.
92- schema: Data schema.
93- stats: Statistical analysis results.
94- viz: Visualization access object.
105+ shape: ``(rows, columns)`` tuple (total across all subsets) .
106+ schema: Data schema (of the first / single partition) .
107+ stats: Statistical analysis results (of the first / single partition) .
108+ viz: Visualization access object (of the first / single partition) .
95109 warnings: List of warnings found during analysis.
110+ subsets: Per-subset/split reports (empty when only one partition).
96111 """
97112
98113 dataset_name : str
@@ -101,29 +116,50 @@ class AnalysisReport:
101116 stats : StatsResult
102117 viz : VizResult
103118 warnings : list [str ] = field (default_factory = list )
119+ subsets : list [SubsetReport ] = field (default_factory = list )
104120
105121 def show (self ) -> None :
106122 """Print analysis summary to console."""
107123 sep = "=" * 60
108124 print (sep )
109125 print (f" f2a Analysis Report: { self .dataset_name } " )
110126 print (sep )
111- print (f"\n Rows: { self .shape [0 ]:,} | Columns: { self .shape [1 ]} " )
112- print (f" Memory: { self .schema .memory_usage_mb } MB" )
113- print (f"\n Numeric: { len (self .schema .numeric_columns )} " )
114- print (f" Categorical: { len (self .schema .categorical_columns )} " )
115- print (f" Text: { len (self .schema .text_columns )} " )
116- print (f" Datetime: { len (self .schema .datetime_columns )} " )
117-
118- print (f"\n { '─' * 60 } " )
119- print (" Summary Statistics:" )
120- print (self .stats .summary .to_string ())
121-
122- if self .warnings :
123- print (f"\n { '─' * 60 } " )
124- print (" ⚠ Warnings:" )
125- for w in self .warnings :
126- print (f" • { w } " )
127+
128+ if self .subsets :
129+ # Multi-subset mode
130+ print (f"\n Total Rows: { self .shape [0 ]:,} | Subsets: { len (self .subsets )} " )
131+ for sr in self .subsets :
132+ print (f"\n { '-' * 60 } " )
133+ print (f" [{ sr .subset } / { sr .split } ] { sr .shape [0 ]:,} rows x { sr .shape [1 ]} cols" )
134+ print (f" Memory: { sr .schema .memory_usage_mb } MB" )
135+ print (f" Numeric: { len (sr .schema .numeric_columns )} | "
136+ f"Categorical: { len (sr .schema .categorical_columns )} | "
137+ f"Text: { len (sr .schema .text_columns )} | "
138+ f"Datetime: { len (sr .schema .datetime_columns )} " )
139+ print ()
140+ print (sr .stats .summary .to_string ())
141+ if sr .warnings :
142+ print ("\n Warnings:" )
143+ for w in sr .warnings :
144+ print (f" - { w } " )
145+ else :
146+ # Single-partition mode
147+ print (f"\n Rows: { self .shape [0 ]:,} | Columns: { self .shape [1 ]} " )
148+ print (f" Memory: { self .schema .memory_usage_mb } MB" )
149+ print (f"\n Numeric: { len (self .schema .numeric_columns )} " )
150+ print (f" Categorical: { len (self .schema .categorical_columns )} " )
151+ print (f" Text: { len (self .schema .text_columns )} " )
152+ print (f" Datetime: { len (self .schema .datetime_columns )} " )
153+
154+ print (f"\n { '-' * 60 } " )
155+ print (" Summary Statistics:" )
156+ print (self .stats .summary .to_string ())
157+
158+ if self .warnings :
159+ print (f"\n { '-' * 60 } " )
160+ print (" Warnings:" )
161+ for w in self .warnings :
162+ print (f" - { w } " )
127163
128164 print (sep )
129165
@@ -136,40 +172,77 @@ def to_html(self, output_dir: str = ".") -> Path:
136172 Returns:
137173 Path to the saved HTML file.
138174 """
139- # Generate visualizations
140- figures : dict [str , plt .Figure ] = {}
141- try :
142- figures ["Distribution Histograms" ] = self .viz .plot_distributions ()
143- except Exception :
144- pass
145- try :
146- figures ["Boxplots" ] = self .viz .plot_boxplots ()
147- except Exception :
148- pass
149- try :
150- figures ["Correlation Heatmap" ] = self .viz .plot_correlation ()
151- except Exception :
152- pass
153- try :
154- figures ["Missing Data" ] = self .viz .plot_missing ()
155- except Exception :
156- pass
157-
158175 generator = ReportGenerator ()
159- output_path = Path (output_dir ) / f"{ self .dataset_name } _report.html"
160- generator .save_html (
161- output_path = output_path ,
162- dataset_name = self .dataset_name ,
163- schema_summary = self .schema .summary_dict (),
164- stats_df = self .stats .summary ,
165- figures = figures ,
166- warnings = self .warnings ,
167- )
176+ safe_name = re .sub (r'[<>:"/\\|?*]' , "_" , self .dataset_name )
177+ safe_name = safe_name .strip (". " )[:120 ] or "report"
178+ output_path = Path (output_dir ) / f"{ safe_name } _report.html"
179+
180+ if self .subsets :
181+ # Multi-subset mode: build per-subset section dicts
182+ subset_sections : list [dict [str , Any ]] = []
183+ for sr in self .subsets :
184+ figures : dict [str , plt .Figure ] = {}
185+ try :
186+ figures ["Distribution Histograms" ] = sr .viz .plot_distributions ()
187+ except Exception :
188+ pass
189+ try :
190+ figures ["Boxplots" ] = sr .viz .plot_boxplots ()
191+ except Exception :
192+ pass
193+ try :
194+ figures ["Correlation Heatmap" ] = sr .viz .plot_correlation ()
195+ except Exception :
196+ pass
197+ try :
198+ figures ["Missing Data" ] = sr .viz .plot_missing ()
199+ except Exception :
200+ pass
201+ subset_sections .append ({
202+ "subset" : sr .subset ,
203+ "split" : sr .split ,
204+ "schema_summary" : sr .schema .summary_dict (),
205+ "stats_df" : sr .stats .summary ,
206+ "figures" : figures ,
207+ "warnings" : sr .warnings ,
208+ })
209+ generator .save_html_multi (
210+ output_path = output_path ,
211+ dataset_name = self .dataset_name ,
212+ sections = subset_sections ,
213+ )
214+ else :
215+ # Single-partition mode
216+ figures : dict [str , plt .Figure ] = {}
217+ try :
218+ figures ["Distribution Histograms" ] = self .viz .plot_distributions ()
219+ except Exception :
220+ pass
221+ try :
222+ figures ["Boxplots" ] = self .viz .plot_boxplots ()
223+ except Exception :
224+ pass
225+ try :
226+ figures ["Correlation Heatmap" ] = self .viz .plot_correlation ()
227+ except Exception :
228+ pass
229+ try :
230+ figures ["Missing Data" ] = self .viz .plot_missing ()
231+ except Exception :
232+ pass
233+ generator .save_html (
234+ output_path = output_path ,
235+ dataset_name = self .dataset_name ,
236+ schema_summary = self .schema .summary_dict (),
237+ stats_df = self .stats .summary ,
238+ figures = figures ,
239+ warnings = self .warnings ,
240+ )
168241 return output_path
169242
170243 def to_dict (self ) -> dict [str , Any ]:
171244 """Return analysis results as a dictionary."""
172- return {
245+ result : dict [ str , Any ] = {
173246 "dataset_name" : self .dataset_name ,
174247 "shape" : self .shape ,
175248 "schema" : self .schema .summary_dict (),
@@ -179,6 +252,19 @@ def to_dict(self) -> dict[str, Any]:
179252 else {},
180253 "warnings" : self .warnings ,
181254 }
255+ if self .subsets :
256+ result ["subsets" ] = [
257+ {
258+ "subset" : sr .subset ,
259+ "split" : sr .split ,
260+ "shape" : sr .shape ,
261+ "schema" : sr .schema .summary_dict (),
262+ "stats_summary" : sr .stats .summary .to_dict (),
263+ "warnings" : sr .warnings ,
264+ }
265+ for sr in self .subsets
266+ ]
267+ return result
182268
183269
184270class Analyzer :
@@ -209,16 +295,30 @@ def run(self, source: str, **kwargs: Any) -> AnalysisReport:
209295 # 1. Load data
210296 df = self ._loader .load (source , ** kwargs )
211297
212- # 2. Infer schema
298+ # 2. Check for multi-subset HuggingFace data
299+ has_partitions = "__subset__" in df .columns and "__split__" in df .columns
300+
301+ if has_partitions :
302+ return self ._run_multi_subset (source , df )
303+
304+ # Single-partition analysis
305+ return self ._run_single (source , df )
306+
307+ def _run_single (
308+ self , source : str , df : pd .DataFrame
309+ ) -> AnalysisReport :
310+ """Run analysis on a single DataFrame."""
213311 schema = infer_schema (df )
214312 logger .info ("Schema inference complete: %s" , schema .summary_dict ())
215313
216- # 3. Statistical analysis
217314 warnings : list [str ] = []
218315 stats = self ._compute_stats (df , schema , warnings )
219316
220- # 4. Assemble results
221- dataset_name = Path (source ).stem if "/" not in source or "://" not in source else source
317+ dataset_name = (
318+ Path (source ).stem
319+ if "/" not in source or "://" not in source
320+ else source
321+ )
222322 viz = VizResult (_df = df , _schema = schema )
223323
224324 report = AnalysisReport (
@@ -229,10 +329,65 @@ def run(self, source: str, **kwargs: Any) -> AnalysisReport:
229329 viz = viz ,
230330 warnings = warnings ,
231331 )
232-
233332 logger .info ("Analysis complete: %s" , source )
234333 return report
235334
335+ def _run_multi_subset (
336+ self , source : str , df : pd .DataFrame
337+ ) -> AnalysisReport :
338+ """Run analysis on a multi-subset HuggingFace DataFrame."""
339+ groups = df .groupby (["__subset__" , "__split__" ], sort = False )
340+
341+ subset_reports : list [SubsetReport ] = []
342+ all_warnings : list [str ] = []
343+
344+ for (subset_name , split_name ), group_df in groups :
345+ # Drop the metadata columns before analysis
346+ part_df = group_df .drop (columns = ["__subset__" , "__split__" ]).reset_index (drop = True )
347+
348+ schema = infer_schema (part_df )
349+ warnings : list [str ] = []
350+ stats = self ._compute_stats (part_df , schema , warnings )
351+ viz = VizResult (_df = part_df , _schema = schema )
352+
353+ sr = SubsetReport (
354+ subset = str (subset_name ),
355+ split = str (split_name ),
356+ shape = (len (part_df ), len (part_df .columns )),
357+ schema = schema ,
358+ stats = stats ,
359+ viz = viz ,
360+ warnings = warnings ,
361+ )
362+ subset_reports .append (sr )
363+ all_warnings .extend (
364+ f"[{ subset_name } /{ split_name } ] { w } " for w in warnings
365+ )
366+ logger .info (
367+ "Subset analysis complete: %s/%s (%d rows × %d cols)" ,
368+ subset_name , split_name , len (part_df ), len (part_df .columns ),
369+ )
370+
371+ # Use the first subset for top-level schema/stats/viz
372+ first = subset_reports [0 ]
373+ total_rows = sum (sr .shape [0 ] for sr in subset_reports )
374+ total_cols = first .shape [1 ]
375+
376+ report = AnalysisReport (
377+ dataset_name = source ,
378+ shape = (total_rows , total_cols ),
379+ schema = first .schema ,
380+ stats = first .stats ,
381+ viz = first .viz ,
382+ warnings = all_warnings ,
383+ subsets = subset_reports ,
384+ )
385+ logger .info (
386+ "Multi-subset analysis complete: %s (%d subsets, %d total rows)" ,
387+ source , len (subset_reports ), total_rows ,
388+ )
389+ return report
390+
236391 def _compute_stats (
237392 self ,
238393 df : pd .DataFrame ,
0 commit comments