diff --git a/WHAT-WE-LEARNED.md b/WHAT-WE-LEARNED.md index 2b3a1c77..c6cbd864 100644 --- a/WHAT-WE-LEARNED.md +++ b/WHAT-WE-LEARNED.md @@ -119,3 +119,7 @@ Selectize menus should overflow the containing card. */ .card, .card-body { overflow: visible !important; } ``` + +## Hard to make elements with controls display conditionally. + +I tried moving a `ui.input_selectize` out of the top-level UI function because I needed it to be conditional, but the event that should have updated the list no longer worked: I guess it's not visible if it's not part of the static render? diff --git a/dp_wizard/shiny/__init__.py b/dp_wizard/shiny/__init__.py index c965cb9e..66a98f6a 100644 --- a/dp_wizard/shiny/__init__.py +++ b/dp_wizard/shiny/__init__.py @@ -18,7 +18,7 @@ from dp_wizard.types import AppState, Product from dp_wizard.utils import config from dp_wizard.utils.argparse_helpers import CLIInfo -from dp_wizard.utils.csv_helper import read_csv_names +from dp_wizard.utils.csv_helper import read_csv_names, read_csv_numeric_names _shiny_root = package_root / "shiny" _assets_root = _shiny_root / "assets" @@ -211,10 +211,14 @@ def server(input: Inputs, output: Outputs, session: Session): # pragma: no cove initial_private_csv_path = package_root / "tmp/sample.csv" _make_sample_csv(initial_private_csv_path, initial_contributions) initial_column_names = read_csv_names(Path(initial_private_csv_path)) + initial_numeric_column_names = read_csv_numeric_names( + Path(initial_private_csv_path) + ) else: initial_contributions = 1 initial_private_csv_path = "" initial_column_names = [] + initial_numeric_column_names = [] initial_product = Product.STATISTICS @@ -236,7 +240,8 @@ def server(input: Inputs, output: Outputs, session: Session): # pragma: no cove initial_product=initial_product, product=reactive.value(initial_product), # Analysis choices: - column_names=reactive.value(initial_column_names), + all_column_names=reactive.value(initial_column_names), + 
numeric_column_names=reactive.value(initial_numeric_column_names), groups=reactive.value([]), epsilon=reactive.value(1.0), # Per-column choices: diff --git a/dp_wizard/shiny/components/summaries.py b/dp_wizard/shiny/components/summaries.py index a9118259..4c532e7a 100644 --- a/dp_wizard/shiny/components/summaries.py +++ b/dp_wizard/shiny/components/summaries.py @@ -8,7 +8,7 @@ product_icon, unit_of_privacy_icon, ) -from dp_wizard.types import AppState +from dp_wizard.types import AppState, Product _css = "display: block; padding: 0 1em 1em 1em;" @@ -47,10 +47,16 @@ def analysis_summary(state: AppState): # pragma: no cover budget = state.epsilon() return tags.small( - columns_icon, - f"Columns: {columns}; ", - groups_icon, - f"Groups: {groups}; ", + ( + [] + if state.product() == Product.CSV_DESCRIPTION + else [ + columns_icon, + f"Columns: {columns}; ", + groups_icon, + f"Groups: {groups}; ", + ] + ), budget_icon, f"Privacy Budget: {budget} epsilon.", style=_css, diff --git a/dp_wizard/shiny/panels/analysis_panel/__init__.py b/dp_wizard/shiny/panels/analysis_panel/__init__.py index 6d07da81..6b18c3e2 100644 --- a/dp_wizard/shiny/panels/analysis_panel/__init__.py +++ b/dp_wizard/shiny/panels/analysis_panel/__init__.py @@ -21,7 +21,7 @@ ) from dp_wizard.shiny.components.summaries import dataset_summary from dp_wizard.shiny.panels.analysis_panel.column_module import column_server, column_ui -from dp_wizard.types import AppState +from dp_wizard.types import AppState, Product from dp_wizard.utils.code_generators import make_privacy_loss_block from dp_wizard.utils.csv_helper import ( get_csv_row_count, @@ -36,10 +36,11 @@ def analysis_ui(): ui.output_ui("analysis_requirements_warning_ui"), ui.output_ui("analysis_release_warning_ui"), ui.output_ui("previous_summary_ui"), + ui.output_ui("conditional_css_ui"), ui.layout_columns( ui.card( ui.card_header(columns_icon, "Columns"), - ui.markdown("Select columns to calculate statistics on."), + ui.markdown("Select numeric 
columns to calculate statistics on."), ui.input_selectize( "columns_selectize", "Columns", @@ -47,6 +48,7 @@ def analysis_ui(): multiple=True, ), ui.output_ui("columns_selectize_tutorial_ui"), + class_="columns-card", ), ui.card( ui.card_header(groups_icon, "Grouping"), @@ -66,6 +68,7 @@ def analysis_ui(): multiple=True, ), ui.output_ui("groups_selectize_tutorial_ui"), + class_="grouping-card", ), ui.card( ui.card_header(budget_icon, "Privacy Budget"), @@ -83,10 +86,12 @@ def analysis_ui(): log_slider("log_epsilon_slider", 0.1, 10.0), ui.output_ui("epsilon_ui"), ui.output_ui("privacy_loss_python_ui"), + class_="budget-card", ), ui.card( ui.card_header(simulation_icon, "Simulation"), ui.output_ui("simulation_card_ui"), + class_="simulation-card", ), col_widths={ "sm": [12, 12, 12, 12], # 4 rows @@ -152,10 +157,11 @@ def analysis_server( # contributions_entity = state.contributions_entity max_rows = state.max_rows # initial_product = state.initial_product - # product = state.product + product = state.product # Analysis choices: - column_names = state.column_names + all_column_names = state.all_column_names + numeric_column_names = state.numeric_column_names groups = state.groups epsilon = state.epsilon @@ -173,26 +179,37 @@ def analysis_server( @reactive.calc def button_enabled(): + # TODO: Get this in sync with results panel warning: + # https://github.com/opendp/dp-wizard/issues/562 at_least_one_column = bool(weights()) no_errors = not any(analysis_errors().values()) - return at_least_one_column and no_errors + return ( + at_least_one_column and no_errors + ) or product() == Product.CSV_DESCRIPTION @reactive.effect def _update_columns(): - csv_ids_labels = { + all_ids_labels = { # Cast to string for type checking. 
- str(k): v - for k, v in csv_ids_labels_calc().items() + str(col_id): label + for col_id, label in csv_ids_labels_calc().items() } ui.update_selectize( "groups_selectize", label=None, - choices=csv_ids_labels, + choices=all_ids_labels, ) + + numeric_column_ids = id_names_dict_from_names(numeric_column_names()).keys() + numeric_ids_labels = { + col_id: label + for col_id, label in all_ids_labels.items() + if col_id in numeric_column_ids + } ui.update_selectize( "columns_selectize", label=None, - choices=csv_ids_labels, + choices=numeric_ids_labels, ) @reactive.effect @@ -205,7 +222,7 @@ def _on_groups_change(): @render.ui def analysis_requirements_warning_ui(): return hide_if( - bool(column_names()), + bool(all_column_names()), info_md_box( """ Please select your dataset on the previous tab @@ -227,6 +244,26 @@ def analysis_release_warning_ui(): ), ) + @render.ui + def conditional_css_ui(): + # This is hacky, but other approaches for conditional card display + # didn't work for me. + # - Adding a wrapping element caused the card not to fill the whole height. + # - The selectize lists for columns and groups weren't updating. + # If we can find something better, great! 
+ if product() == Product.CSV_DESCRIPTION: + return ui.tags.style( + """ + .bslib-grid-item:has( + .columns-card, + .grouping-card, + .simulation-card + ) { + display: none; + } + """ + ) + @render.ui def previous_summary_ui(): return dataset_summary(state) @@ -359,11 +396,11 @@ def columns_ui(): @reactive.calc def csv_ids_names_calc(): - return id_names_dict_from_names(column_names()) + return id_names_dict_from_names(all_column_names()) @reactive.calc def csv_ids_labels_calc(): - return id_labels_dict_from_names(column_names()) + return id_labels_dict_from_names(all_column_names()) @reactive.effect @reactive.event(input.log_epsilon_slider) diff --git a/dp_wizard/shiny/panels/dataset_panel/__init__.py b/dp_wizard/shiny/panels/dataset_panel/__init__.py index 8c67ca55..5780f8b2 100644 --- a/dp_wizard/shiny/panels/dataset_panel/__init__.py +++ b/dp_wizard/shiny/panels/dataset_panel/__init__.py @@ -26,7 +26,11 @@ PUBLIC_TEXT, ) from dp_wizard.utils.code_generators import make_privacy_unit_block -from dp_wizard.utils.csv_helper import get_csv_names_mismatch, read_csv_names +from dp_wizard.utils.csv_helper import ( + get_csv_names_mismatch, + read_csv_names, + read_csv_numeric_names, +) dataset_panel_id = "dataset_panel" @@ -127,7 +131,8 @@ def dataset_server( product = state.product # Analysis choices: - column_names = state.column_names + all_column_names = state.all_column_names + numeric_column_names = state.numeric_column_names # groups = state.groups # epsilon = state.epsilon @@ -148,25 +153,27 @@ def dataset_server( def _on_public_csv_path_change(): path = input.public_csv_path()[0]["datapath"] public_csv_path.set(path) - column_names.set(read_csv_names(Path(path))) + all_column_names.set(read_csv_names(Path(path))) + numeric_column_names.set(read_csv_numeric_names(Path(path))) @reactive.effect @reactive.event(input.private_csv_path) def _on_private_csv_path_change(): path = input.private_csv_path()[0]["datapath"] private_csv_path.set(path) - 
column_names.set(read_csv_names(Path(path))) + all_column_names.set(read_csv_names(Path(path))) + numeric_column_names.set(read_csv_numeric_names(Path(path))) @reactive.effect - @reactive.event(input.column_names) + @reactive.event(input.all_column_names) def _on_column_names_change(): - column_names.set( - [ - clean - for line in input.column_names().splitlines() - if (clean := line.strip()) - ] - ) + column_names = [ + clean + for line in input.all_column_names().splitlines() + if (clean := line.strip()) + ] + all_column_names.set(column_names) + numeric_column_names.set(column_names) @reactive.calc def csv_column_mismatch_calc() -> Optional[tuple[set, set]]: @@ -239,7 +246,7 @@ def csv_or_columns_ui(): """, responsive=False, ), - ui.input_text_area("column_names", "CSV Column Names", rows=5), + ui.input_text_area("all_column_names", "CSV Column Names", rows=5), ] else: content = [ @@ -452,7 +459,7 @@ def button_enabled(): return ( contributions_valid() and not get_row_count_errors(max_rows()) - and len(column_names()) > 0 + and len(all_column_names()) > 0 and (in_cloud or not csv_column_mismatch_calc()) ) @@ -578,17 +585,20 @@ def product_ui(): ), tutorial_box( is_tutorial_mode(), - """ + f""" Although the underlying OpenDP library is very flexible, DP Wizard offers only a few analysis options: - - The **DP Statistics** option supports + - The **{Product.STATISTICS}** option supports grouping, histograms, mean, median, and count. - - With **DP Synthetic Data**, your privacy budget is used + - With **{Product.SYNTHETIC_DATA}**, your privacy budget is used to infer the distributions of values within the selected columns, and the correlations between columns. This is less accurate than calculating the desired statistics directly, but can be easier to work with downstream. + - The **{Product.CSV_DESCRIPTION}** summarizes the contents of CSVs + with a large number of columns, without revealing details + from individual rows. 
""", responsive=False, ), diff --git a/dp_wizard/shiny/panels/results_panel/__init__.py b/dp_wizard/shiny/panels/results_panel/__init__.py index 4a6310a4..47ae8f6f 100644 --- a/dp_wizard/shiny/panels/results_panel/__init__.py +++ b/dp_wizard/shiny/panels/results_panel/__init__.py @@ -16,7 +16,7 @@ tutorial_box, ) from dp_wizard.shiny.components.summaries import analysis_summary, dataset_summary -from dp_wizard.types import AppState +from dp_wizard.types import AppState, Product from dp_wizard.utils.code_generators import AnalysisPlan, AnalysisPlanColumn from dp_wizard.utils.code_generators.notebook_generator import ( PLACEHOLDER_CSV_NAME, @@ -115,7 +115,8 @@ def results_server( product = state.product # Analysis choices: - # column_names = state.column_names + # all_column_names = state.all_column_names + # numeric_column_names = state.numeric_column_names groups = state.groups epsilon = state.epsilon @@ -134,7 +135,9 @@ def results_server( @render.ui def results_requirements_warning_ui(): return hide_if( - bool(weights()), + # TODO: Get this in sync with analysis_panel validation + # https://github.com/opendp/dp-wizard/issues/562 + bool(weights()) or product() == Product.CSV_DESCRIPTION, info_md_box( """ Please define your analysis on the previous tab @@ -195,7 +198,7 @@ def clean_download_stem() -> str: def download_results_ui(): if in_cloud: return None - disabled = not weights() + disabled = not (weights() or product() == Product.CSV_DESCRIPTION) return [ ui.h3("Download Results"), tutorial_box( diff --git a/dp_wizard/types.py b/dp_wizard/types.py index 39873091..3ba9187d 100644 --- a/dp_wizard/types.py +++ b/dp_wizard/types.py @@ -8,19 +8,20 @@ class Product(Enum): STATISTICS = auto() SYNTHETIC_DATA = auto() + CSV_DESCRIPTION = auto() @classmethod def to_dict(cls) -> dict[str, str]: """ >>> Product.to_dict() - {'1': 'DP Statistics', '2': 'DP Synthetic Data'} + {'1': 'DP Statistics', '2': 'DP Synthetic Data', '3': 'DP CSV Description'} """ return { 
str(member.value): str(member) for (name, member) in cls.__members__.items() } def __str__(self) -> str: - return "DP " + self.name.replace("_", " ").title() + return "DP " + self.name.replace("_", " ").title().replace("Csv", "CSV") class AnalysisName(str): @@ -98,7 +99,8 @@ class AppState: product: reactive.Value[Product] # Analysis choices: - column_names: reactive.Value[list[ColumnName]] + all_column_names: reactive.Value[list[ColumnName]] + numeric_column_names: reactive.Value[list[ColumnName]] groups: reactive.Value[list[ColumnName]] epsilon: reactive.Value[float] diff --git a/dp_wizard/utils/code_generators/__init__.py b/dp_wizard/utils/code_generators/__init__.py index 024a9b9d..7da81e49 100644 --- a/dp_wizard/utils/code_generators/__init__.py +++ b/dp_wizard/utils/code_generators/__init__.py @@ -48,6 +48,9 @@ class AnalysisPlan(NamedTuple): columns: dict[ColumnName, list[AnalysisPlanColumn]] def __str__(self) -> str: + if self.product == Product.CSV_DESCRIPTION: + return str(self.product) + def md_list(names) -> str: return ", ".join(f"`{name}`" for name in names) diff --git a/dp_wizard/utils/code_generators/abstract_generator.py b/dp_wizard/utils/code_generators/abstract_generator.py index 9ffc200f..ff2ec1c0 100644 --- a/dp_wizard/utils/code_generators/abstract_generator.py +++ b/dp_wizard/utils/code_generators/abstract_generator.py @@ -28,12 +28,14 @@ def __init__(self, analysis_plan: AnalysisPlan, note: str): self.analysis_plan = analysis_plan self.note = note - def _get_synth_or_stats(self) -> str: + def _get_product(self) -> str: match self.analysis_plan.product: case Product.STATISTICS: return "stats" case Product.SYNTHETIC_DATA: return "synth" + case Product.CSV_DESCRIPTION: + return "description" case _: # pragma: no cover raise ValueError(self.analysis_plan.product) @@ -46,6 +48,8 @@ def _get_extra(self) -> str: return "polars" case Product.SYNTHETIC_DATA: return "mbi" + case Product.CSV_DESCRIPTION: + return "polars" case _: # pragma: no cover 
raise ValueError(self.analysis_plan.product) @@ -53,7 +57,7 @@ def _get_extra(self) -> str: def _get_notebook_or_script(self) -> str: ... # pragma: no cover def _get_root_template(self) -> str: - adj = self._get_synth_or_stats() + adj = self._get_product() noun = self._get_notebook_or_script() return f"{adj}_{noun}" @@ -92,7 +96,6 @@ def template(): ) .fill_code_blocks( IMPORTS_BLOCK=Template(template).finish(), - UTILS_BLOCK=(package_root / "utils/shared.py").read_text(), **self._make_extra_blocks(), ) .fill_comment_blocks( @@ -320,6 +323,27 @@ def _make_partial_synth_context(self): ) ) + def _make_partial_description_context(self): + privacy_unit_block = make_privacy_unit_block( + contributions=self.analysis_plan.contributions, + contributions_entity=self.analysis_plan.contributions_entity, + ) + privacy_loss_block = make_privacy_loss_block( + pure=False, + epsilon=self.analysis_plan.epsilon, + max_rows=self.analysis_plan.max_rows, + ) + return ( + Template("description_context", template_root) + .fill_expressions( + OPENDP_V_VERSION=f"v{opendp_version}", + ) + .fill_code_blocks( + PRIVACY_UNIT_BLOCK=privacy_unit_block, + PRIVACY_LOSS_BLOCK=privacy_loss_block, + ) + ) + def _make_synth_query(self): def template(synth_context, COLUMNS, CUTS): synth_query = ( diff --git a/dp_wizard/utils/code_generators/no-tests/_description_notebook.py b/dp_wizard/utils/code_generators/no-tests/_description_notebook.py new file mode 100644 index 00000000..00a85a29 --- /dev/null +++ b/dp_wizard/utils/code_generators/no-tests/_description_notebook.py @@ -0,0 +1,54 @@ +# # TITLE +# +# CUSTOM_NOTE +# +# Jump ahead: +# - [Analysis](#Analysis) +# - [Results](#Results) +# +# ## Prerequisites +# +# First install and import the required dependencies: +# WINDOWS_COMMENT_BLOCK + +# + +# %pip install DEPENDENCIES +# - + +# + +IMPORTS_BLOCK +# - + +# ## Analysis +# +# For each numeric column we'll create a Polars expression +# for a histogram that spans orders of magnitude. 
+ +DESCRIPTION_COLUMNS_BLOCK + +# ### Context +# +# Next, we'll define our Context. This is where we set the privacy budget, +# and set the weight for each query under that overall budget. + +# + +DESCRIPTION_CONTEXT_BLOCK +# - + +# ENCODING_COMMENT_BLOCK +# +# ## Results +# +# Finally, we run the queries. + +DESCRIPTION_QUERIES_BLOCK + +# If we try to run more queries at this point, it will error. Once the privacy budget +# is consumed, the library prevents you from running any more queries. + +# # Coda +# The code below produces a summary report. + +# + +REPORTS_BLOCK +# - diff --git a/dp_wizard/utils/code_generators/no-tests/_description_script.py b/dp_wizard/utils/code_generators/no-tests/_description_script.py new file mode 100644 index 00000000..784128a8 --- /dev/null +++ b/dp_wizard/utils/code_generators/no-tests/_description_script.py @@ -0,0 +1,31 @@ +# TITLE +# +# CUSTOM_NOTE + +# Install the following dependencies, if you haven't already: +# WINDOWS_COMMENT_BLOCK +# +# $ pip install DEPENDENCIES + +from argparse import ArgumentParser + +IMPORTS_BLOCK + +DESCRIPTION_COLUMNS_BLOCK + + +def get_stats_context_contributions(csv_path): + DESCRIPTION_CONTEXT_BLOCK + # ENCODING_COMMENT_BLOCK + return stats_context, contributions + + +if __name__ == "__main__": + parser = ArgumentParser(description="Describes the columns of a csv") + parser.add_argument( + "--csv", required=True, help="Path to csv containing private data" + ) + args = parser.parse_args() + stats_context, contributions = get_stats_context_contributions(csv_path=args.csv) + + DESCRIPTION_QUERIES_BLOCK diff --git a/dp_wizard/utils/code_generators/no-tests/_stats_notebook.py b/dp_wizard/utils/code_generators/no-tests/_stats_notebook.py index 144af0b6..d5669044 100644 --- a/dp_wizard/utils/code_generators/no-tests/_stats_notebook.py +++ b/dp_wizard/utils/code_generators/no-tests/_stats_notebook.py @@ -30,7 +30,7 @@ # Based on the input you provided, for each column we'll create a Polars expression # 
that describes how we want to summarize that column. -COLUMNS_BLOCK +STATS_COLUMNS_BLOCK # ### Context # @@ -56,5 +56,5 @@ # The code below produces a summary report. # + -STATS_REPORTS_BLOCK +REPORTS_BLOCK # - diff --git a/dp_wizard/utils/code_generators/no-tests/_stats_script.py b/dp_wizard/utils/code_generators/no-tests/_stats_script.py index 4a910972..07a4bd97 100644 --- a/dp_wizard/utils/code_generators/no-tests/_stats_script.py +++ b/dp_wizard/utils/code_generators/no-tests/_stats_script.py @@ -13,7 +13,7 @@ UTILS_BLOCK -COLUMNS_BLOCK +STATS_COLUMNS_BLOCK def get_stats_context_contributions(csv_path): diff --git a/dp_wizard/utils/code_generators/no-tests/_synth_notebook.py b/dp_wizard/utils/code_generators/no-tests/_synth_notebook.py index 73b06a55..64355aa8 100644 --- a/dp_wizard/utils/code_generators/no-tests/_synth_notebook.py +++ b/dp_wizard/utils/code_generators/no-tests/_synth_notebook.py @@ -58,5 +58,5 @@ # The code below produces a summary report. # + -SYNTH_REPORTS_BLOCK +REPORTS_BLOCK # - diff --git a/dp_wizard/utils/code_generators/notebook_generator.py b/dp_wizard/utils/code_generators/notebook_generator.py index f7ec5322..83140edf 100644 --- a/dp_wizard/utils/code_generators/notebook_generator.py +++ b/dp_wizard/utils/code_generators/notebook_generator.py @@ -78,11 +78,13 @@ def template(synthetic_data): ) + "}" ) + case Product.CSV_DESCRIPTION: + outputs_expression = "TODO" case _: # pragma: no cover raise ValueError(self.analysis_plan.product) tmp_path = package_root / "tmp" reports_block = ( - Template(f"{self._get_synth_or_stats()}_reports", root) + Template(f"{self._get_product()}_reports", root) .fill_expressions( OUTPUTS=outputs_expression, COLUMNS={ @@ -100,19 +102,37 @@ def template(synthetic_data): return reports_block def _make_extra_blocks(self): + report_blocks = { + "REPORTS_BLOCK": self._make_reports_block(), + } + utils_blocks = { + "UTILS_BLOCK": (package_root / "utils/shared.py").read_text(), + } match 
self.analysis_plan.product: case Product.SYNTHETIC_DATA: - return { - "SYNTH_CONTEXT_BLOCK": self._make_synth_context(), - "SYNTH_QUERY_BLOCK": self._make_synth_query(), - "SYNTH_REPORTS_BLOCK": self._make_reports_block(), - } + return ( + report_blocks + | utils_blocks + | { + "SYNTH_CONTEXT_BLOCK": self._make_synth_context(), + "SYNTH_QUERY_BLOCK": self._make_synth_query(), + } + ) case Product.STATISTICS: - return { - "COLUMNS_BLOCK": self._make_columns(), - "STATS_CONTEXT_BLOCK": self._make_stats_context(), - "STATS_QUERIES_BLOCK": self._make_stats_queries(), - "STATS_REPORTS_BLOCK": self._make_reports_block(), + return ( + report_blocks + | utils_blocks + | { + "STATS_COLUMNS_BLOCK": self._make_columns(), + "STATS_CONTEXT_BLOCK": self._make_stats_context(), + "STATS_QUERIES_BLOCK": self._make_stats_queries(), + } + ) + case Product.CSV_DESCRIPTION: + # Doesn't need the shared utils + return report_blocks | { + "DESCRIPTION_CONTEXT_BLOCK": self._make_stats_context(), + "DESCRIPTION_QUERIES_BLOCK": self._make_stats_queries(), } case _: # pragma: no cover raise ValueError(self.analysis_plan.product) diff --git a/dp_wizard/utils/code_generators/script_generator.py b/dp_wizard/utils/code_generators/script_generator.py index b750bd9a..92c680bc 100644 --- a/dp_wizard/utils/code_generators/script_generator.py +++ b/dp_wizard/utils/code_generators/script_generator.py @@ -21,17 +21,17 @@ def _make_columns(self): ) def _make_stats_context(self): - return ( - self._make_partial_stats_context() - .fill_expressions(CSV_PATH="csv_path") - .fill_code_blocks(OPTIONAL_CSV_BLOCK="") - .finish() - ) + return self._fill_partial_context(self._make_partial_stats_context()) def _make_synth_context(self): + return self._fill_partial_context(self._make_partial_synth_context()) + + def _make_description_context(self): + return self._fill_partial_context(self._make_partial_description_context()) + + def _fill_partial_context(self, partial_context): return ( - 
self._make_partial_synth_context() - .fill_expressions(CSV_PATH="csv_path") + partial_context.fill_expressions(CSV_PATH="csv_path") .fill_code_blocks(OPTIONAL_CSV_BLOCK="") .finish() ) @@ -50,9 +50,11 @@ def _make_extra_blocks(self): } case Product.STATISTICS: return { - "COLUMNS_BLOCK": self._make_columns(), + "STATS_COLUMNS_BLOCK": self._make_columns(), "STATS_CONTEXT_BLOCK": self._make_stats_context(), "STATS_QUERIES_BLOCK": self._make_stats_queries(), } + case Product.CSV_DESCRIPTION: + return {} case _: # pragma: no cover raise ValueError(self.analysis_plan.product) diff --git a/dp_wizard/utils/csv_helper.py b/dp_wizard/utils/csv_helper.py index f1857606..35c00b48 100644 --- a/dp_wizard/utils/csv_helper.py +++ b/dp_wizard/utils/csv_helper.py @@ -18,6 +18,30 @@ def read_csv_names(csv_path: Path) -> list[ColumnName]: return [ColumnName(name) for name in all_names if name.strip() != ""] +def read_csv_numeric_names(csv_path: Path) -> list[ColumnName]: # pragma: no cover + lf = pl.scan_csv(csv_path) + numeric_names = [ + name + for name, pl_type in lf.collect_schema().items() + if pl_type + in [ + pl.Int8, + pl.Int16, + pl.Int32, + pl.Int64, + pl.Int128, + pl.Float32, + pl.Float64, + pl.UInt8, + pl.UInt16, + pl.UInt32, + pl.UInt64, + ] + ] + # Exclude columns missing names: + return [ColumnName(name) for name in numeric_names if name.strip() != ""] + + def get_csv_names_mismatch( public_csv_path: Path, private_csv_path: Path ) -> tuple[set[ColumnName], set[ColumnName]]: diff --git a/tests/test_app.py b/tests/test_app.py index 7ca5e6a9..11d1cd61 100644 --- a/tests/test_app.py +++ b/tests/test_app.py @@ -72,7 +72,7 @@ def test_qa_app(page: Page, qa_app: ShinyAppProc): # pragma: no cover def test_local_app_validations(page: Page, local_app: ShinyAppProc): # pragma: no cover pick_dataset_text = "How many rows of the CSV" - perform_analysis_text = "Select columns to calculate statistics on" + perform_analysis_text = "Select numeric columns to calculate statistics on" 
download_results_text = "You can now make a differentially private release" # -- Select Dataset -- diff --git a/tests/utils/test_code_generators.py b/tests/utils/test_code_generators.py index c9d43d17..97f0fa12 100644 --- a/tests/utils/test_code_generators.py +++ b/tests/utils/test_code_generators.py @@ -237,6 +237,8 @@ def test_make_notebook(plan): context_global = "synth_context" case Product.STATISTICS: context_global = "stats_context" + case Product.CSV_DESCRIPTION: + context_global = "description_context" case _: # pragma: no cover raise ValueError(plan.product) assert isinstance(globals[context_global], dp.Context)