diff --git a/dp_wizard/shiny/components/summaries.py b/dp_wizard/shiny/components/summaries.py index a9118259..4c532e7a 100644 --- a/dp_wizard/shiny/components/summaries.py +++ b/dp_wizard/shiny/components/summaries.py @@ -8,7 +8,7 @@ product_icon, unit_of_privacy_icon, ) -from dp_wizard.types import AppState +from dp_wizard.types import AppState, Product _css = "display: block; padding: 0 1em 1em 1em;" @@ -47,10 +47,16 @@ def analysis_summary(state: AppState): # pragma: no cover budget = state.epsilon() return tags.small( - columns_icon, - f"Columns: {columns}; ", - groups_icon, - f"Groups: {groups}; ", + ( + [] + if state.product() == Product.CSV_DESCRIPTION + else [ + columns_icon, + f"Columns: {columns}; ", + groups_icon, + f"Groups: {groups}; ", + ] + ), budget_icon, f"Privacy Budget: {budget} epsilon.", style=_css, diff --git a/dp_wizard/shiny/panels/analysis_panel/__init__.py b/dp_wizard/shiny/panels/analysis_panel/__init__.py index 507f3f79..019fb0a4 100644 --- a/dp_wizard/shiny/panels/analysis_panel/__init__.py +++ b/dp_wizard/shiny/panels/analysis_panel/__init__.py @@ -21,7 +21,7 @@ ) from dp_wizard.shiny.components.summaries import dataset_summary from dp_wizard.shiny.panels.analysis_panel.column_module import column_server, column_ui -from dp_wizard.types import AppState +from dp_wizard.types import AppState, Product from dp_wizard.utils.code_generators import make_privacy_loss_block from dp_wizard.utils.csv_helper import ( get_csv_row_count, @@ -36,64 +36,7 @@ def analysis_ui(): ui.output_ui("analysis_requirements_warning_ui"), ui.output_ui("analysis_release_warning_ui"), ui.output_ui("previous_summary_ui"), - ui.layout_columns( - ui.card( - ui.card_header(columns_icon, "Columns"), - ui.markdown("Select numeric columns to calculate statistics on."), - ui.input_selectize( - "columns_selectize", - "Columns", - [], - multiple=True, - ), - ui.output_ui("columns_selectize_tutorial_ui"), - ), - ui.card( - ui.card_header(groups_icon, "Grouping"), - ui.markdown( - """ - Select columns to group by, or leave empty - to calculate statistics across the entire dataset. - - Groups aren't applied to the previews on this page - but will be used in the final release. - """ - ), - ui.input_selectize( - "groups_selectize", - "Group by", - [], - multiple=True, - ), - ui.output_ui("groups_selectize_tutorial_ui"), - ), - ui.card( - ui.card_header(budget_icon, "Privacy Budget"), - ui.markdown( - f""" - What is your privacy budget, or epsilon, for this release? - Many factors including the sensitivity of your data, - the frequency of DP releases, - and the regulatory landscape can be considered. - Consider how your budget compares to that of - other projects. - """ - ), - log_slider("log_epsilon_slider", 0.1, 10.0), - ui.output_ui("epsilon_ui"), - ui.output_ui("privacy_loss_python_ui"), - ), - ui.card( - ui.card_header(simulation_icon, "Simulation"), - ui.output_ui("simulation_card_ui"), - ), - col_widths={ - "sm": [12, 12, 12, 12], # 4 rows - "md": [6, 6, 6, 6], # 2 rows - "xxl": [3, 3, 3, 3], # 1 row - }, - ), + ui.output_ui("top_cards_ui"), ui.output_ui("columns_ui"), ui.output_ui("download_results_button_ui"), value="analysis_panel", @@ -152,7 +95,7 @@ def analysis_server( # contributions_entity = state.contributions_entity max_rows = state.max_rows # initial_product = state.initial_product - # product = state.product + product = state.product # Analysis choices: all_column_names = state.all_column_names @@ -174,9 +117,13 @@ def analysis_server( @reactive.calc def button_enabled(): + # TODO: Get this in sync with results panel warning: + # https://github.com/opendp/dp-wizard/issues/562 at_least_one_column = bool(weights()) no_errors = not any(analysis_errors().values()) - return at_least_one_column and no_errors + return ( + at_least_one_column and no_errors + ) or product() == Product.CSV_DESCRIPTION @reactive.effect def _update_columns(): @@ -239,6 +186,89 @@ def analysis_release_warning_ui(): def previous_summary_ui(): return dataset_summary(state) + @render.ui + def top_cards_ui(): + columns_card = ( + ui.card( + ui.card_header(columns_icon, "Columns"), + ui.markdown("Select numeric columns to calculate statistics on."), + ui.input_selectize( + "columns_selectize", + "Columns", + [], + multiple=True, + ), + ui.output_ui("columns_selectize_tutorial_ui"), + ), + ) + grouping_card = ( + ui.card( + ui.card_header(groups_icon, "Grouping"), + ui.markdown( + """ + Select columns to group by, or leave empty + to calculate statistics across the entire dataset. + + Groups aren't applied to the previews on this page + but will be used in the final release. + """ + ), + ui.input_selectize( + "groups_selectize", + "Group by", + [], + multiple=True, + ), + ui.output_ui("groups_selectize_tutorial_ui"), + ), + ) + budget_card = ( + ui.card( + ui.card_header(budget_icon, "Privacy Budget"), + ui.markdown( + f""" + What is your privacy budget, or epsilon, for this release? + Many factors including the sensitivity of your data, + the frequency of DP releases, + and the regulatory landscape can be considered. + Consider how your budget compares to that of + other projects. + """ + ), + log_slider("log_epsilon_slider", 0.1, 10.0), + ui.output_ui("epsilon_ui"), + ui.output_ui("privacy_loss_python_ui"), + ), + ) + simulation_card = ( + ui.card( + ui.card_header(simulation_icon, "Simulation"), + ui.output_ui("simulation_card_ui"), + ), + ) + + if product() == Product.CSV_DESCRIPTION: + return ( + ui.layout_columns( + budget_card, + col_widths={"md": [12], "lg": [6]}, + ), + ) + return ( + ui.layout_columns( + columns_card, + grouping_card, + budget_card, + simulation_card, + col_widths={ + "sm": [12, 12, 12, 12], # 4 rows + "md": [6, 6, 6, 6], # 2 rows + "xxl": [3, 3, 3, 3], # 1 row + }, + ), + ) + @reactive.effect @reactive.event(input.columns_selectize) def _on_columns_change(): diff --git a/dp_wizard/shiny/panels/dataset_panel/__init__.py b/dp_wizard/shiny/panels/dataset_panel/__init__.py index 23153ed4..e3a3b428 100644 --- a/dp_wizard/shiny/panels/dataset_panel/__init__.py +++ b/dp_wizard/shiny/panels/dataset_panel/__init__.py @@ -585,17 +585,20 @@ def product_ui(): ), tutorial_box( is_tutorial_mode(), - """ + f""" Although the underlying OpenDP library is very flexible, - DP Wizard offers only a few analysis options: + DP Wizard offers a few analysis options to help you get started: - - The **DP Statistics** option supports + - The **{Product.STATISTICS}** option supports grouping, histograms, mean, median, and count. - - With **DP Synthetic Data**, your privacy budget is used + - With **{Product.SYNTHETIC_DATA}**, your privacy budget is used to infer the distributions of values within the selected columns, and the correlations between columns. This is less accurate than calculating the desired statistics directly, but can be easier to work with downstream. + - The **{Product.CSV_DESCRIPTION}** summarizes the contents of CSVs + with a large number of columns, without revealing details + from individual rows. """, responsive=False, ), diff --git a/dp_wizard/shiny/panels/results_panel/__init__.py b/dp_wizard/shiny/panels/results_panel/__init__.py index 51a3589d..47ae8f6f 100644 --- a/dp_wizard/shiny/panels/results_panel/__init__.py +++ b/dp_wizard/shiny/panels/results_panel/__init__.py @@ -16,7 +16,7 @@ tutorial_box, ) from dp_wizard.shiny.components.summaries import analysis_summary, dataset_summary -from dp_wizard.types import AppState +from dp_wizard.types import AppState, Product from dp_wizard.utils.code_generators import AnalysisPlan, AnalysisPlanColumn from dp_wizard.utils.code_generators.notebook_generator import ( PLACEHOLDER_CSV_NAME, @@ -135,7 +135,9 @@ def results_server( @render.ui def results_requirements_warning_ui(): return hide_if( - bool(weights()), + # TODO: Get this in sync with analysis_panel validation + # https://github.com/opendp/dp-wizard/issues/562 + bool(weights()) or product() == Product.CSV_DESCRIPTION, info_md_box( """ Please define your analysis on the previous tab @@ -196,7 +198,7 @@ def clean_download_stem() -> str: def download_results_ui(): if in_cloud: return None - disabled = not weights() + disabled = not (weights() or product() == Product.CSV_DESCRIPTION) return [ ui.h3("Download Results"), tutorial_box( diff --git a/dp_wizard/types.py b/dp_wizard/types.py index f982489a..8ee32704 100644 --- a/dp_wizard/types.py +++ b/dp_wizard/types.py @@ -8,19 +8,20 @@ class Product(Enum): STATISTICS = auto() SYNTHETIC_DATA = auto() + CSV_DESCRIPTION = auto() @classmethod def to_dict(cls) -> dict[str, str]: """ >>> Product.to_dict() - {'1': 'DP Statistics', '2': 'DP Synthetic Data'} + {'1': 'DP Statistics', '2': 'DP Synthetic Data', '3': 'DP Codebook'} """ return { str(member.value): str(member) for (name, member) in cls.__members__.items() } def __str__(self) -> str: - return "DP " + self.name.replace("_", " ").title() + return "DP " + self.name.replace("_", " ").title().replace("Csv", "CSV") class AnalysisName(str): diff --git a/dp_wizard/utils/code_generators/__init__.py b/dp_wizard/utils/code_generators/__init__.py index 024a9b9d..5efbd7da 100644 --- a/dp_wizard/utils/code_generators/__init__.py +++ b/dp_wizard/utils/code_generators/__init__.py @@ -34,7 +34,7 @@ class AnalysisPlan(NamedTuple): >>> print(plan.to_stem()) dp_statistics_for_data_col_grouped_by_grouping_col >>> print(plan.to_note()) - This demonstrates how to calculate ... + This demonstrates how to create ... Generated by DP Wizard ... """ @@ -48,6 +48,9 @@ class AnalysisPlan(NamedTuple): columns: dict[ColumnName, list[AnalysisPlanColumn]] def __str__(self) -> str: + if self.product == Product.CSV_DESCRIPTION: + return str(self.product) + def md_list(names) -> str: return ", ".join(f"`{name}`" for name in names) @@ -62,7 +65,7 @@ def to_stem(self) -> str: def to_note(self) -> str: now = datetime.now().strftime("%b %d, %Y at %I:%M%p") return f""" -This demonstrates how to calculate {self} using OpenDP (https://docs.opendp.org). +This demonstrates how to create {self} using OpenDP (https://docs.opendp.org). Generated by DP Wizard v{__version__} (https://github.com/opendp/dp-wizard) on {now}. """.strip() diff --git a/dp_wizard/utils/code_generators/abstract_generator.py b/dp_wizard/utils/code_generators/abstract_generator.py index 9ffc200f..b8a9391f 100644 --- a/dp_wizard/utils/code_generators/abstract_generator.py +++ b/dp_wizard/utils/code_generators/abstract_generator.py @@ -28,12 +28,14 @@ def __init__(self, analysis_plan: AnalysisPlan, note: str): self.analysis_plan = analysis_plan self.note = note - def _get_synth_or_stats(self) -> str: + def _get_product(self) -> str: match self.analysis_plan.product: case Product.STATISTICS: return "stats" case Product.SYNTHETIC_DATA: return "synth" + case Product.CSV_DESCRIPTION: + return "codebook" case _: # pragma: no cover raise ValueError(self.analysis_plan.product) @@ -46,6 +48,8 @@ def _get_extra(self) -> str: return "polars" case Product.SYNTHETIC_DATA: return "mbi" + case Product.CSV_DESCRIPTION: + return "polars" case _: # pragma: no cover raise ValueError(self.analysis_plan.product) @@ -53,7 +57,7 @@ def _get_extra(self) -> str: def _get_notebook_or_script(self) -> str: ... # pragma: no cover def _get_root_template(self) -> str: - adj = self._get_synth_or_stats() + adj = self._get_product() noun = self._get_notebook_or_script() return f"{adj}_{noun}" diff --git a/dp_wizard/utils/code_generators/notebook_generator.py b/dp_wizard/utils/code_generators/notebook_generator.py index f7ec5322..5ee36b56 100644 --- a/dp_wizard/utils/code_generators/notebook_generator.py +++ b/dp_wizard/utils/code_generators/notebook_generator.py @@ -78,11 +78,13 @@ def template(synthetic_data): ) + "}" ) + case Product.CSV_DESCRIPTION: + outputs_expression = "TODO" case _: # pragma: no cover raise ValueError(self.analysis_plan.product) tmp_path = package_root / "tmp" reports_block = ( - Template(f"{self._get_synth_or_stats()}_reports", root) + Template(f"{self._get_product()}_reports", root) .fill_expressions( OUTPUTS=outputs_expression, COLUMNS={ @@ -114,5 +116,7 @@ def _make_extra_blocks(self): "STATS_QUERIES_BLOCK": self._make_stats_queries(), "STATS_REPORTS_BLOCK": self._make_reports_block(), } + case Product.CSV_DESCRIPTION: + return {} # TODO case _: # pragma: no cover raise ValueError(self.analysis_plan.product) diff --git a/dp_wizard/utils/code_generators/script_generator.py b/dp_wizard/utils/code_generators/script_generator.py index b750bd9a..be1b0cb4 100644 --- a/dp_wizard/utils/code_generators/script_generator.py +++ b/dp_wizard/utils/code_generators/script_generator.py @@ -54,5 +54,7 @@ def _make_extra_blocks(self): "STATS_CONTEXT_BLOCK": self._make_stats_context(), "STATS_QUERIES_BLOCK": self._make_stats_queries(), } + case Product.CSV_DESCRIPTION: + return {} # TODO case _: # pragma: no cover raise ValueError(self.analysis_plan.product) diff --git a/tests/utils/test_code_generators.py b/tests/utils/test_code_generators.py index c9d43d17..7a46e80c 100644 --- a/tests/utils/test_code_generators.py +++ b/tests/utils/test_code_generators.py @@ -237,6 +237,8 @@ def test_make_notebook(plan): context_global = "synth_context" case Product.STATISTICS: context_global = "stats_context" + case Product.CSV_DESCRIPTION: + context_global = "codebook_context" case _: # pragma: no cover raise ValueError(plan.product) assert isinstance(globals[context_global], dp.Context)