Skip to content

Commit 32a22b1

Browse files
author
CocoRoF
committed
Enhance HuggingFace dataset loading and reporting features
- Added support for multi-subset analysis in the analyzer module.
- Updated DataLoader to handle HuggingFace dataset URLs and configurations.
- Improved HTML report generation to include multi-subset sections with drag-to-scroll functionality.
- Enhanced input validation to detect HuggingFace URLs.
- Added unit tests for new HuggingFace URL detection logic.
1 parent 1721e03 commit 32a22b1

File tree

12 files changed

+614
-836
lines changed

12 files changed

+614
-836
lines changed

.gitignore

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,9 @@ htmlcov/
3030
.coverage
3131
coverage.xml
3232

33+
# Manual test folder
34+
test/
35+
3336
# Output
3437
examples/output/
3538
examples/sample_data.csv

pyproject.toml

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -85,11 +85,6 @@ line-length = 100
8585
profile = "black"
8686
line_length = 100
8787

88-
# ── pytest ────────────────────────────────────────────
89-
[tool.pytest.ini_options]
90-
testpaths = ["tests"]
91-
addopts = "-v --tb=short"
92-
9388
# ── mypy ──────────────────────────────────────────────
9489
[tool.mypy]
9590
python_version = "3.10"

src/f2a/core/analyzer.py

Lines changed: 209 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22

33
from __future__ import annotations
44

5+
import re
56
from dataclasses import dataclass, field
67
from pathlib import Path
78
from typing import Any
@@ -82,17 +83,31 @@ def plot_missing(self) -> plt.Figure:
8283
return fig
8384

8485

86+
@dataclass
87+
class SubsetReport:
88+
"""Analysis results for a single subset/split partition."""
89+
90+
subset: str
91+
split: str
92+
shape: tuple[int, int]
93+
schema: DataSchema
94+
stats: StatsResult
95+
viz: VizResult
96+
warnings: list[str] = field(default_factory=list)
97+
98+
8599
@dataclass
86100
class AnalysisReport:
87101
"""Top-level container for analysis results.
88102
89103
Attributes:
90104
dataset_name: Dataset name.
91-
shape: ``(rows, columns)`` tuple.
92-
schema: Data schema.
93-
stats: Statistical analysis results.
94-
viz: Visualization access object.
105+
shape: ``(rows, columns)`` tuple (total across all subsets).
106+
schema: Data schema (of the first / single partition).
107+
stats: Statistical analysis results (of the first / single partition).
108+
viz: Visualization access object (of the first / single partition).
95109
warnings: List of warnings found during analysis.
110+
subsets: Per-subset/split reports (empty when only one partition).
96111
"""
97112

98113
dataset_name: str
@@ -101,29 +116,50 @@ class AnalysisReport:
101116
stats: StatsResult
102117
viz: VizResult
103118
warnings: list[str] = field(default_factory=list)
119+
subsets: list[SubsetReport] = field(default_factory=list)
104120

105121
def show(self) -> None:
106122
"""Print analysis summary to console."""
107123
sep = "=" * 60
108124
print(sep)
109125
print(f" f2a Analysis Report: {self.dataset_name}")
110126
print(sep)
111-
print(f"\n Rows: {self.shape[0]:,} | Columns: {self.shape[1]}")
112-
print(f" Memory: {self.schema.memory_usage_mb} MB")
113-
print(f"\n Numeric: {len(self.schema.numeric_columns)}")
114-
print(f" Categorical: {len(self.schema.categorical_columns)}")
115-
print(f" Text: {len(self.schema.text_columns)}")
116-
print(f" Datetime: {len(self.schema.datetime_columns)}")
117-
118-
print(f"\n{'─' * 60}")
119-
print(" Summary Statistics:")
120-
print(self.stats.summary.to_string())
121-
122-
if self.warnings:
123-
print(f"\n{'─' * 60}")
124-
print(" ⚠ Warnings:")
125-
for w in self.warnings:
126-
print(f" • {w}")
127+
128+
if self.subsets:
129+
# Multi-subset mode
130+
print(f"\n Total Rows: {self.shape[0]:,} | Subsets: {len(self.subsets)}")
131+
for sr in self.subsets:
132+
print(f"\n{'-' * 60}")
133+
print(f" [{sr.subset} / {sr.split}] {sr.shape[0]:,} rows x {sr.shape[1]} cols")
134+
print(f" Memory: {sr.schema.memory_usage_mb} MB")
135+
print(f" Numeric: {len(sr.schema.numeric_columns)} | "
136+
f"Categorical: {len(sr.schema.categorical_columns)} | "
137+
f"Text: {len(sr.schema.text_columns)} | "
138+
f"Datetime: {len(sr.schema.datetime_columns)}")
139+
print()
140+
print(sr.stats.summary.to_string())
141+
if sr.warnings:
142+
print("\n Warnings:")
143+
for w in sr.warnings:
144+
print(f" - {w}")
145+
else:
146+
# Single-partition mode
147+
print(f"\n Rows: {self.shape[0]:,} | Columns: {self.shape[1]}")
148+
print(f" Memory: {self.schema.memory_usage_mb} MB")
149+
print(f"\n Numeric: {len(self.schema.numeric_columns)}")
150+
print(f" Categorical: {len(self.schema.categorical_columns)}")
151+
print(f" Text: {len(self.schema.text_columns)}")
152+
print(f" Datetime: {len(self.schema.datetime_columns)}")
153+
154+
print(f"\n{'-' * 60}")
155+
print(" Summary Statistics:")
156+
print(self.stats.summary.to_string())
157+
158+
if self.warnings:
159+
print(f"\n{'-' * 60}")
160+
print(" Warnings:")
161+
for w in self.warnings:
162+
print(f" - {w}")
127163

128164
print(sep)
129165

@@ -136,40 +172,77 @@ def to_html(self, output_dir: str = ".") -> Path:
136172
Returns:
137173
Path to the saved HTML file.
138174
"""
139-
# Generate visualizations
140-
figures: dict[str, plt.Figure] = {}
141-
try:
142-
figures["Distribution Histograms"] = self.viz.plot_distributions()
143-
except Exception:
144-
pass
145-
try:
146-
figures["Boxplots"] = self.viz.plot_boxplots()
147-
except Exception:
148-
pass
149-
try:
150-
figures["Correlation Heatmap"] = self.viz.plot_correlation()
151-
except Exception:
152-
pass
153-
try:
154-
figures["Missing Data"] = self.viz.plot_missing()
155-
except Exception:
156-
pass
157-
158175
generator = ReportGenerator()
159-
output_path = Path(output_dir) / f"{self.dataset_name}_report.html"
160-
generator.save_html(
161-
output_path=output_path,
162-
dataset_name=self.dataset_name,
163-
schema_summary=self.schema.summary_dict(),
164-
stats_df=self.stats.summary,
165-
figures=figures,
166-
warnings=self.warnings,
167-
)
176+
safe_name = re.sub(r'[<>:"/\\|?*]', "_", self.dataset_name)
177+
safe_name = safe_name.strip(". ")[:120] or "report"
178+
output_path = Path(output_dir) / f"{safe_name}_report.html"
179+
180+
if self.subsets:
181+
# Multi-subset mode: build per-subset section dicts
182+
subset_sections: list[dict[str, Any]] = []
183+
for sr in self.subsets:
184+
figures: dict[str, plt.Figure] = {}
185+
try:
186+
figures["Distribution Histograms"] = sr.viz.plot_distributions()
187+
except Exception:
188+
pass
189+
try:
190+
figures["Boxplots"] = sr.viz.plot_boxplots()
191+
except Exception:
192+
pass
193+
try:
194+
figures["Correlation Heatmap"] = sr.viz.plot_correlation()
195+
except Exception:
196+
pass
197+
try:
198+
figures["Missing Data"] = sr.viz.plot_missing()
199+
except Exception:
200+
pass
201+
subset_sections.append({
202+
"subset": sr.subset,
203+
"split": sr.split,
204+
"schema_summary": sr.schema.summary_dict(),
205+
"stats_df": sr.stats.summary,
206+
"figures": figures,
207+
"warnings": sr.warnings,
208+
})
209+
generator.save_html_multi(
210+
output_path=output_path,
211+
dataset_name=self.dataset_name,
212+
sections=subset_sections,
213+
)
214+
else:
215+
# Single-partition mode
216+
figures: dict[str, plt.Figure] = {}
217+
try:
218+
figures["Distribution Histograms"] = self.viz.plot_distributions()
219+
except Exception:
220+
pass
221+
try:
222+
figures["Boxplots"] = self.viz.plot_boxplots()
223+
except Exception:
224+
pass
225+
try:
226+
figures["Correlation Heatmap"] = self.viz.plot_correlation()
227+
except Exception:
228+
pass
229+
try:
230+
figures["Missing Data"] = self.viz.plot_missing()
231+
except Exception:
232+
pass
233+
generator.save_html(
234+
output_path=output_path,
235+
dataset_name=self.dataset_name,
236+
schema_summary=self.schema.summary_dict(),
237+
stats_df=self.stats.summary,
238+
figures=figures,
239+
warnings=self.warnings,
240+
)
168241
return output_path
169242

170243
def to_dict(self) -> dict[str, Any]:
171244
"""Return analysis results as a dictionary."""
172-
return {
245+
result: dict[str, Any] = {
173246
"dataset_name": self.dataset_name,
174247
"shape": self.shape,
175248
"schema": self.schema.summary_dict(),
@@ -179,6 +252,19 @@ def to_dict(self) -> dict[str, Any]:
179252
else {},
180253
"warnings": self.warnings,
181254
}
255+
if self.subsets:
256+
result["subsets"] = [
257+
{
258+
"subset": sr.subset,
259+
"split": sr.split,
260+
"shape": sr.shape,
261+
"schema": sr.schema.summary_dict(),
262+
"stats_summary": sr.stats.summary.to_dict(),
263+
"warnings": sr.warnings,
264+
}
265+
for sr in self.subsets
266+
]
267+
return result
182268

183269

184270
class Analyzer:
@@ -209,16 +295,30 @@ def run(self, source: str, **kwargs: Any) -> AnalysisReport:
209295
# 1. Load data
210296
df = self._loader.load(source, **kwargs)
211297

212-
# 2. Infer schema
298+
# 2. Check for multi-subset HuggingFace data
299+
has_partitions = "__subset__" in df.columns and "__split__" in df.columns
300+
301+
if has_partitions:
302+
return self._run_multi_subset(source, df)
303+
304+
# Single-partition analysis
305+
return self._run_single(source, df)
306+
307+
def _run_single(
308+
self, source: str, df: pd.DataFrame
309+
) -> AnalysisReport:
310+
"""Run analysis on a single DataFrame."""
213311
schema = infer_schema(df)
214312
logger.info("Schema inference complete: %s", schema.summary_dict())
215313

216-
# 3. Statistical analysis
217314
warnings: list[str] = []
218315
stats = self._compute_stats(df, schema, warnings)
219316

220-
# 4. Assemble results
221-
dataset_name = Path(source).stem if "/" not in source or "://" not in source else source
317+
dataset_name = (
318+
Path(source).stem
319+
if "/" not in source or "://" not in source
320+
else source
321+
)
222322
viz = VizResult(_df=df, _schema=schema)
223323

224324
report = AnalysisReport(
@@ -229,10 +329,65 @@ def run(self, source: str, **kwargs: Any) -> AnalysisReport:
229329
viz=viz,
230330
warnings=warnings,
231331
)
232-
233332
logger.info("Analysis complete: %s", source)
234333
return report
235334

335+
def _run_multi_subset(
336+
self, source: str, df: pd.DataFrame
337+
) -> AnalysisReport:
338+
"""Run analysis on a multi-subset HuggingFace DataFrame."""
339+
groups = df.groupby(["__subset__", "__split__"], sort=False)
340+
341+
subset_reports: list[SubsetReport] = []
342+
all_warnings: list[str] = []
343+
344+
for (subset_name, split_name), group_df in groups:
345+
# Drop the metadata columns before analysis
346+
part_df = group_df.drop(columns=["__subset__", "__split__"]).reset_index(drop=True)
347+
348+
schema = infer_schema(part_df)
349+
warnings: list[str] = []
350+
stats = self._compute_stats(part_df, schema, warnings)
351+
viz = VizResult(_df=part_df, _schema=schema)
352+
353+
sr = SubsetReport(
354+
subset=str(subset_name),
355+
split=str(split_name),
356+
shape=(len(part_df), len(part_df.columns)),
357+
schema=schema,
358+
stats=stats,
359+
viz=viz,
360+
warnings=warnings,
361+
)
362+
subset_reports.append(sr)
363+
all_warnings.extend(
364+
f"[{subset_name}/{split_name}] {w}" for w in warnings
365+
)
366+
logger.info(
367+
"Subset analysis complete: %s/%s (%d rows × %d cols)",
368+
subset_name, split_name, len(part_df), len(part_df.columns),
369+
)
370+
371+
# Use the first subset for top-level schema/stats/viz
372+
first = subset_reports[0]
373+
total_rows = sum(sr.shape[0] for sr in subset_reports)
374+
total_cols = first.shape[1]
375+
376+
report = AnalysisReport(
377+
dataset_name=source,
378+
shape=(total_rows, total_cols),
379+
schema=first.schema,
380+
stats=first.stats,
381+
viz=first.viz,
382+
warnings=all_warnings,
383+
subsets=subset_reports,
384+
)
385+
logger.info(
386+
"Multi-subset analysis complete: %s (%d subsets, %d total rows)",
387+
source, len(subset_reports), total_rows,
388+
)
389+
return report
390+
236391
def _compute_stats(
237392
self,
238393
df: pd.DataFrame,

0 commit comments

Comments (0)