diff --git a/docs/index.html b/docs/index.html index 8fd8ccc4..fd8c2e5a 100644 --- a/docs/index.html +++ b/docs/index.html @@ -827,7 +827,8 @@

Results

>>> grade_query = ( ... stats_context.query().group_by(groups).agg(pl.len().dp.noise().alias("count")) ... ) ->>> grade_accuracy = grade_query.summarize(alpha=1 - confidence)["accuracy"].item() +>>> summary = grade_query.summarize(alpha=1 - confidence) +>>> grade_accuracy = summary["accuracy"].item() >>> grade_stats = grade_query.release().collect()

If we try to run more queries at this point, it will error. Once the diff --git a/docs/index.md b/docs/index.md index 9299bf9c..6c6b1840 100644 --- a/docs/index.md +++ b/docs/index.md @@ -582,7 +582,8 @@ Query for grade: >>> grade_query = ( ... stats_context.query().group_by(groups).agg(pl.len().dp.noise().alias("count")) ... ) ->>> grade_accuracy = grade_query.summarize(alpha=1 - confidence)["accuracy"].item() +>>> summary = grade_query.summarize(alpha=1 - confidence) +>>> grade_accuracy = summary["accuracy"].item() >>> grade_stats = grade_query.release().collect() ``` diff --git a/docs/screenshots/download-results.png b/docs/screenshots/download-results.png index 7b111e3c..3c9d351c 100644 Binary files a/docs/screenshots/download-results.png and b/docs/screenshots/download-results.png differ diff --git a/docs/screenshots/select-dataset.png b/docs/screenshots/select-dataset.png index edd15616..440d45cd 100644 Binary files a/docs/screenshots/select-dataset.png and b/docs/screenshots/select-dataset.png differ diff --git a/dp_wizard/utils/code_generators/analyses/histogram/__init__.py b/dp_wizard/utils/code_generators/analyses/histogram/__init__.py index 18d21df3..1428c8c0 100644 --- a/dp_wizard/utils/code_generators/analyses/histogram/__init__.py +++ b/dp_wizard/utils/code_generators/analyses/histogram/__init__.py @@ -32,9 +32,23 @@ def template(BIN_NAME, GROUP_NAMES, stats_context, confidence): .agg(pl.len().dp.noise().alias("count")) # type: ignore .WITH_KEYS ) - ACCURACY_NAME = QUERY_NAME.summarize(alpha=1 - confidence)[ # noqa: F841 - "accuracy" - ].item() + + # + [markdown] tags=["tutorial"] + # We can summarize the statistic to get the accuracy. + # More on [`summarize()` in the OpenDP + # docs](https://docs.opendp.org/en/OPENDP_V_VERSION/api/python/opendp.extras.polars.html#opendp.extras.polars.LazyFrameQuery.summarize). 
+ # - + + # + tags=["tutorial"] + summary = QUERY_NAME.summarize(alpha=1 - confidence) + summary + # - + + # + [markdown] tags=["tutorial"] + # Proceeding to the DP release: + # - + + ACCURACY_NAME = summary["accuracy"].item() # noqa: F841 STATS_NAME = QUERY_NAME.release().collect() STATS_NAME # type: ignore diff --git a/dp_wizard/utils/code_generators/analyses/mean/__init__.py b/dp_wizard/utils/code_generators/analyses/mean/__init__.py index 993cfdc4..c912c228 100644 --- a/dp_wizard/utils/code_generators/analyses/mean/__init__.py +++ b/dp_wizard/utils/code_generators/analyses/mean/__init__.py @@ -19,13 +19,29 @@ def make_query(code_gen, identifier, accuracy_name, stats_name): - def template(GROUP_NAMES, stats_context, EXPR_NAME): + def template(GROUP_NAMES, stats_context, EXPR_NAME, confidence): groups = GROUP_NAMES QUERY_NAME = ( stats_context.query().group_by(groups).agg(EXPR_NAME).WITH_KEYS if groups else stats_context.query().select(EXPR_NAME) ) + + # + [markdown] tags=["tutorial"] + # If we summarize the statistic, we see that a mean is composed + # of a sum and a length, each with their own accuracy. + # More on [`summarize()` in the OpenDP + # docs](https://docs.opendp.org/en/OPENDP_V_VERSION/api/python/opendp.extras.polars.html#opendp.extras.polars.LazyFrameQuery.summarize). 
+ # - + + # + tags=["tutorial"] + QUERY_NAME.summarize(alpha=1 - confidence) + # - + + # + [markdown] tags=["tutorial"] + # Proceeding to the DP release: + # - + STATS_NAME = QUERY_NAME.release().collect() STATS_NAME # type: ignore diff --git a/dp_wizard/utils/code_generators/analyses/median/__init__.py b/dp_wizard/utils/code_generators/analyses/median/__init__.py index fd128d80..6b303ef9 100644 --- a/dp_wizard/utils/code_generators/analyses/median/__init__.py +++ b/dp_wizard/utils/code_generators/analyses/median/__init__.py @@ -21,13 +21,29 @@ def make_query(code_gen, identifier, accuracy_name, stats_name): - def template(GROUP_NAMES, stats_context, EXPR_NAME): + def template(GROUP_NAMES, stats_context, EXPR_NAME, confidence): groups = GROUP_NAMES QUERY_NAME = ( stats_context.query().group_by(groups).agg(EXPR_NAME).WITH_KEYS if groups else stats_context.query().select(EXPR_NAME) ) + + # + tags=["tutorial"] + # Because the median is based on selection from candidate values, + # it does not have an accuracy, unlike histogram and mean. + # More on [`summarize()` in the OpenDP + # docs](https://docs.opendp.org/en/OPENDP_V_VERSION/api/python/opendp.extras.polars.html#opendp.extras.polars.LazyFrameQuery.summarize). + # - + + # + tags=["tutorial"] + QUERY_NAME.summarize(alpha=1 - confidence) + # - + + # + tags=["tutorial"] + # Proceeding to the DP release: + # - + STATS_NAME = QUERY_NAME.release().collect() STATS_NAME # type: ignore diff --git a/dp_wizard/utils/code_generators/script_generator.py b/dp_wizard/utils/code_generators/script_generator.py index 98ca4b17..7bec5326 100644 --- a/dp_wizard/utils/code_generators/script_generator.py +++ b/dp_wizard/utils/code_generators/script_generator.py @@ -10,7 +10,9 @@ def _get_notebook_or_script(self): def _clean_up_py(self, py: str): # The output is passed through black, so we don't need to overdo this regex. - py = re.sub(r"# [+-]", "", py) + # Strip jupytext light annotations. 
+ py = re.sub(r"# \+.*", "", py) + py = re.sub(r"# -$", "", py, flags=re.MULTILINE) return py def _make_columns(self): diff --git a/tests/test_docs.py b/tests/test_docs.py index 255a65ed..c0c6f62b 100644 --- a/tests/test_docs.py +++ b/tests/test_docs.py @@ -60,12 +60,13 @@ def test_doc_examples_up_to_date(): ) expected_code = NotebookGenerator(plan, "Note goes here!").make_py(reformat=True) - if any( - # csv_path is expanded to an absolute path, so ignore it: - line not in expected_code and csv_path not in line + unexpected_lines = [ + line for line in doc_code.splitlines() - ): - # It's fine for the docs to be a subset of the generated code, - # but if a line is missing, the "pytest -vv" diff - # will give us context to fix it. - assert expected_code == doc_code # pragma: no cover + # csv_path is absolute and it will have local information + # that shouldn't be checked in. + if line not in expected_code and csv_path not in line + ] + assert ( + not unexpected_lines + ), f"These lines are missing from {index_md}:\n" + "\n".join(unexpected_lines) diff --git a/tests/utils/test_code_generators.py b/tests/utils/test_code_generators.py index ddc07cee..82d71a2e 100644 --- a/tests/utils/test_code_generators.py +++ b/tests/utils/test_code_generators.py @@ -189,6 +189,7 @@ def id_for_plan(plan: AnalysisPlan): expected_urls = [ "https://docs.opendp.org/", "https://github.com/opendp/dp-wizard", + "https://docs.opendp.org/en/v0.14.1/api/python/opendp.extras.polars.html#opendp.extras.polars.LazyFrameQuery.summarize", "https://docs.opendp.org/en/v0.14.1/api/python/opendp.extras.mbi.html#opendp.extras.mbi.ContingencyTable.synthesize", "https://docs.opendp.org/en/v0.14.1/api/python/opendp.extras.mbi.html#opendp.extras.mbi.ContingencyTable.project_melted", ]