Skip to content

Commit c65cdff

Browse files
authored
Merge pull request #4 from CocoRoF/main
ver 0.1.3
2 parents 4ffdd26 + 43c0e3e commit c65cdff

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

53 files changed

+8112
-972
lines changed

ADVANCED_ANALYSIS_PLAN.md

Lines changed: 371 additions & 0 deletions
Large diffs are not rendered by default.
File renamed without changes.
File renamed without changes.
Lines changed: 424 additions & 4 deletions
Large diffs are not rendered by default.
Lines changed: 52 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -67,6 +67,50 @@ class AnalysisConfig:
6767
max_plot_columns: int = 20
6868
"""Maximum columns per plot grid (prevents overly large figures)."""
6969

70+
# ── Advanced analysis ─────────────────────────────────
71+
advanced: bool = True
72+
"""Enable the Advanced analysis tab (clustering, anomaly, etc.)."""
73+
74+
advanced_distribution: bool = True
75+
"""Best-fit distribution, power transform, Jarque-Bera, ECDF."""
76+
77+
advanced_correlation: bool = True
78+
"""Partial correlation, MI matrix, bootstrap CI, network graph."""
79+
80+
clustering: bool = True
81+
"""K-Means, DBSCAN, hierarchical clustering."""
82+
83+
advanced_dimreduction: bool = True
84+
"""t-SNE, UMAP (optional), Factor Analysis."""
85+
86+
feature_insights: bool = True
87+
"""Interaction, monotonic, binning, cardinality, leakage detection."""
88+
89+
advanced_anomaly: bool = True
90+
"""Isolation Forest, LOF, Mahalanobis, consensus."""
91+
92+
statistical_tests: bool = True
93+
"""Levene, Kruskal-Wallis, Mann-Whitney, goodness-of-fit, Grubbs."""
94+
95+
data_profiling: bool = True
96+
"""Automated insights, type recommendation, health dashboard."""
97+
98+
# ── Advanced sub-options ──────────────────────────────
99+
max_cluster_k: int = 10
100+
"""Maximum k for K-Means elbow search."""
101+
102+
tsne_perplexity: float = 30.0
103+
"""t-SNE perplexity parameter."""
104+
105+
bootstrap_iterations: int = 1000
106+
"""Number of bootstrap resamples for correlation CI."""
107+
108+
max_sample_for_advanced: int = 5000
109+
"""Max rows sampled for expensive advanced analyses (t-SNE, UMAP, etc.)."""
110+
111+
n_distribution_fits: int = 7
112+
"""Number of candidate distributions to fit."""
113+
70114
@staticmethod
71115
def minimal() -> "AnalysisConfig":
72116
"""Return a config with only core analyses (descriptive + missing)."""
@@ -80,12 +124,19 @@ def minimal() -> "AnalysisConfig":
80124
pca=False,
81125
duplicates=False,
82126
quality_score=False,
127+
advanced=False,
83128
)
84129

85130
@staticmethod
def fast() -> "AnalysisConfig":
    """Build a config with the expensive analyses switched off.

    PCA, feature importance and the Advanced analysis tab are disabled.
    Every other field is left to the constructor's defaults.
    """
    # Collect the overrides first so the disabled switches read as one group.
    disabled = {
        "pca": False,
        "feature_importance": False,
        "advanced": False,
    }
    return AnalysisConfig(**disabled)
138+
139+
@staticmethod
def basic_only() -> "AnalysisConfig":
    """Build a config with the Basic analyses on and Advanced off.

    Only ``advanced`` is overridden (set to ``False``); all other fields
    are left to the constructor's defaults.
    """
    config = AnalysisConfig(advanced=False)
    return config
Lines changed: 6 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -77,12 +77,17 @@ def infer_schema(df: pd.DataFrame) -> DataSchema:
7777

7878
for col in df.columns:
7979
n_missing = int(df[col].isna().sum())
80+
try:
81+
n_unique = int(df[col].nunique())
82+
except TypeError:
83+
# Column contains unhashable types (e.g. numpy arrays, lists)
84+
n_unique = len(df[col].dropna())
8085
columns.append(
8186
ColumnInfo(
8287
name=col,
8388
dtype=str(df[col].dtype),
8489
inferred_type=type_map[col],
85-
n_unique=int(df[col].nunique()),
90+
n_unique=n_unique,
8691
n_missing=n_missing,
8792
missing_ratio=round(n_missing / len(df), 4) if len(df) > 0 else 0.0,
8893
)

0 commit comments

Comments
 (0)