diff --git a/dp_wizard/shiny/panels/analysis_panel/__init__.py b/dp_wizard/shiny/panels/analysis_panel/__init__.py index 63f965f5..868d5372 100644 --- a/dp_wizard/shiny/panels/analysis_panel/__init__.py +++ b/dp_wizard/shiny/panels/analysis_panel/__init__.py @@ -322,45 +322,35 @@ def simulation_card_ui(): responsive=False, ), ) + choices = ["100", "1000", "10000"] if public_path(): row_count_str = str(get_csv_row_count(Path(public_path()))) - return [ - ui.markdown( - f""" - Because you've provided public data, - it *will be read* to generate previews. - - The confidence interval depends on the number of rows. - Your public data has {row_count_str} rows, - but if you believe the private data will be - much larger or smaller, please update. - """ - ), - ui.input_select( - "row_count", - "Estimated Rows", - choices=[row_count_str, "100", "1000", "10000"], - selected=row_count_str, - ), - help, - ] + choices.insert(0, row_count_str) + message = f""" + Because you've provided public data, + it *will be read* to generate previews. + + The confidence interval depends on the number of rows. + Your public data has {row_count_str} rows, + but if you believe the private data will be + much larger or smaller, please update. + """ else: - return [ - ui.markdown( - """ - What is the approximate number of rows in the dataset? - This number is only used for the simulation - and not the final calculation. - """ - ), - ui.input_select( - "row_count", - "Estimated Rows", - choices=["100", "1000", "10000"], - selected="100", - ), - help, - ] + message = """ + What is the approximate number of rows in the dataset? + This number is only used for the simulation + and not the final calculation. + """ + return [ + ui.markdown(message), + ui.input_select( + "row_count", + "Estimated Rows", + choices=choices, + selected=choices[0], + ), + help, + ] @render.ui def columns_ui(): diff --git a/dp_wizard/shiny/panels/analysis_panel/column_module.py b/dp_wizard/shiny/panels/analysis_panel/column_module.py index c626e61c..1e135e83 100644 --- a/dp_wizard/shiny/panels/analysis_panel/column_module.py +++ b/dp_wizard/shiny/panels/analysis_panel/column_module.py @@ -213,6 +213,9 @@ def accuracy_histogram(): # so not worth optimizing. lf = ( pl.scan_csv(public_path, ignore_errors=True) + .collect() + .sample(n=row_count, with_replacement=True) + .lazy() if public_path else pl.LazyFrame( mock_data({name: ColumnDef(lower_x, upper_x)}, row_count=row_count) @@ -221,7 +224,7 @@ def accuracy_histogram(): return make_accuracy_histogram( lf=lf, column_name=name, - row_count=row_count, + max_length=row_count, lower_bound=lower_x, upper_bound=upper_x, bin_count=bin_count, diff --git a/dp_wizard/utils/dp_helper.py b/dp_wizard/utils/dp_helper.py index 10e5a8f5..31435603 100644 --- a/dp_wizard/utils/dp_helper.py +++ b/dp_wizard/utils/dp_helper.py @@ -12,7 +12,7 @@ def make_accuracy_histogram( lf: pl.LazyFrame, column_name: str, - row_count: int, + max_length: int, lower_bound: float, upper_bound: float, bin_count: int, @@ -33,7 +33,7 @@ def make_accuracy_histogram( >>> accuracy, histogram = make_accuracy_histogram( ... lf=pl.LazyFrame(df), ... column_name=column_name, - ... row_count=100, + ... max_length=100, ... lower_bound=0, upper_bound=10, ... bin_count=5, ... contributions=1, @@ -76,7 +76,7 @@ def make_accuracy_histogram( margins=[ dp.polars.Margin( # type: ignore by=["bin"], - max_length=row_count, + max_length=max_length, # Range bins names are not private information: This is safe. invariant="keys", ),