opendp · mccalluc · Oct 18, 2024 · Oct 16, 2024 · Oct 16, 2024 · Oct 16, 2024
diff --git a/.pytest.ini b/.pytest.ini
@@ -0,0 +1,10 @@
+[pytest]
+
+# Treat warnings as errors:
+filterwarnings =
+	error
+
+addopts = --doctest-glob '*.md' --doctest-modules --ignore dp_creator_ii/templates/ --ignore dp_creator_ii/tests/fixtures/
+
+# If an xfail starts passing unexpectedly, that should count as a failure:
+xfail_strict=true
diff --git a/README.md b/README.md
@@ -15,14 +15,13 @@ We plan to implement a [proof of concept](https://docs.google.com/document/d/1dt
 ## Usage
 
 ```
-usage: dp-creator-ii [-h] [--csv CSV_PATH] [--contrib CONTRIB]
-
-DP Creator II makes it easier to get started with Differential Privacy.
+usage: dp-creator-ii [-h] [--csv CSV_PATH] [--contrib CONTRIB] [--demo]
 
 options:
   -h, --help         show this help message and exit
   --csv CSV_PATH     Path to CSV containing private data
   --contrib CONTRIB  How many rows can an individual contribute?
+  --demo             Use generated fake CSV for a quick demo
 ```
 
 

diff --git a/dp_creator_ii/__init__.py b/dp_creator_ii/__init__.py
@@ -1,47 +1,16 @@
 """DP Creator II makes it easier to get started with Differential Privacy."""
 
-from pathlib import Path
-from argparse import ArgumentParser, ArgumentTypeError
-
 import shiny
+from dp_creator_ii.argparse_helpers import get_csv_contrib
 
 
 __version__ = "0.0.1"
 
 
-def existing_csv(arg):
-    path = Path(arg)
-    if not path.exists():
-        raise ArgumentTypeError(f"No such file: {arg}")
-    if path.suffix != ".csv":
-        raise ArgumentTypeError(f'Must have ".csv" extension: {arg}')
-    return path
-
-
-def get_arg_parser():
-    parser = ArgumentParser(description=__doc__)
-    parser.add_argument(
-        "--csv",
-        dest="csv_path",
-        type=existing_csv,
-        help="Path to CSV containing private data",
-    )
-    parser.add_argument(
-        "--contrib",
-        dest="contributions",
-        metavar="CONTRIB",
-        type=int,
-        default=1,
-        help="How many rows can an individual contribute?",
-    )
-    return parser
-
-
 def main():  # pragma: no cover
-    # We call parse_args() again inside the app.
-    # We only call it here so "--help" is handled,
-    # and to validate inputs.
-    get_arg_parser().parse_args()
+    # We only call this here so "--help" is handled,
+    # and to validate inputs before starting the server.
+    get_csv_contrib()
 
     shiny.run_app(
         app="dp_creator_ii.app",

diff --git a/dp_creator_ii/app/dataset_panel.py b/dp_creator_ii/app/dataset_panel.py
@@ -1,26 +1,13 @@
-from sys import argv
-
 from shiny import ui, reactive, render
 
-from dp_creator_ii import get_arg_parser
+from dp_creator_ii.argparse_helpers import get_csv_contrib
 from dp_creator_ii.csv_helper import read_field_names
 from dp_creator_ii.app.ui_helpers import output_code_sample
 from dp_creator_ii.template import make_privacy_unit_block
 
 
-def get_args():
-    arg_parser = get_arg_parser()
-    if argv[1:3] == ["run", "--port"]:
-        # We are running a Playwright test,
-        # and ARGV is polluted, so override:
-        return arg_parser.parse_args([])
-    else:
-        # Normal parsing:
-        return arg_parser.parse_args()
-
-
 def dataset_ui():
-    args = get_args()
+    (_csv_path, contributions) = get_csv_contrib()
 
     return ui.nav_panel(
         "Select Dataset",
@@ -34,17 +21,17 @@ def dataset_ui():
             'This is the "unit of privacy" which will be protected.'
         ),
         ui.output_text("csv_fields"),
-        ui.input_numeric("contributions", "Contributions", args.contributions),
+        ui.input_numeric("contributions", "Contributions", contributions),
         output_code_sample("unit_of_privacy_python"),
         ui.input_action_button("go_to_analysis", "Define analysis"),
         value="dataset_panel",
     )
 
 
 def dataset_server(input, output, session):
-    args = get_args()
+    (csv_path, _contributions) = get_csv_contrib()
 
-    csv_path_from_cli_value = reactive.value(args.csv_path)
+    csv_path_from_cli_value = reactive.value(csv_path)
 
     @reactive.calc
     def csv_path_calc():

diff --git a/dp_creator_ii/argparse_helpers.py b/dp_creator_ii/argparse_helpers.py
@@ -0,0 +1,113 @@
+from sys import argv
+from pathlib import Path
+from argparse import ArgumentParser, ArgumentTypeError
+import csv
+import random
+from warnings import warn
+
+
+def _existing_csv_type(arg):
+    """
+    argparse "type" callable for "--csv": validate the argument, return a Path.
+
+    Raises ArgumentTypeError if the argument is not an existing regular file,
+    or if its suffix is not exactly ".csv".
+    """
+    path = Path(arg)
+    # is_file(), not exists(): a directory that happens to be named
+    # "something.csv" can not be read as a CSV, so reject it here
+    # instead of failing later when the app tries to open it.
+    if not path.is_file():
+        raise ArgumentTypeError(f"No such file: {arg}")
+    if path.suffix != ".csv":
+        raise ArgumentTypeError(f'Must have ".csv" extension: {arg}')
+    return path
+
+
+def _get_arg_parser():
+    """
+    Build the command-line parser with "--csv", "--contrib", and "--demo".
+    """
+    # NOTE: This module has no docstring, so __doc__ is None here
+    # and the generated help text carries no description.
+    cli = ArgumentParser(description=__doc__)
+
+    # Options are registered in this order,
+    # which is also the order they are listed in "--help".
+    option_specs = (
+        (
+            "--csv",
+            {
+                "dest": "csv_path",
+                "type": _existing_csv_type,
+                "help": "Path to CSV containing private data",
+            },
+        ),
+        (
+            "--contrib",
+            {
+                "dest": "contributions",
+                "metavar": "CONTRIB",
+                "type": int,
+                "default": 1,
+                "help": "How many rows can an individual contribute?",
+            },
+        ),
+        (
+            "--demo",
+            {
+                "action": "store_true",
+                "help": "Use generated fake CSV for a quick demo",
+            },
+        ),
+    )
+    for flag, settings in option_specs:
+        cli.add_argument(flag, **settings)
+    return cli
+
+
+def _get_args():
+    """
+    Parse the command line, falling back to all defaults under a test runner.
+
+    >>> _get_args()
+    Namespace(csv_path=None, contributions=1, demo=False)
+    """
+    parser = _get_arg_parser()
+    # Flags that belong to a test runner rather than to this application:
+    test_runner_flags = ("--port", "-v", "-k")
+    if any(flag in argv for flag in test_runner_flags):
+        # We are running a test,
+        # and ARGV is polluted, so ignore it and use the defaults:
+        return parser.parse_args([])
+    # Normal parsing:
+    return parser.parse_args()  # pragma: no cover
+
+
+def _clip(n, lower, upper):
+    """
+    Limit n to the range from lower to upper:
+    first cap it at upper, then raise it to at least lower.
+
+    >>> _clip(-5, 0, 10)
+    0
+    >>> _clip(5, 0, 10)
+    5
+    >>> _clip(15, 0, 10)
+    10
+    """
+    capped = upper if upper < n else n
+    return lower if lower > capped else capped
+
+
+def _get_demo_csv_contrib():
+    """
+    Write a deterministic fake gradebook CSV; return (csv_path, contributions).
+
+    The CSV has one row per student per homework assignment: 100 students,
+    each contributing `contributions` (10) rows. The file is rewritten
+    in the system temp directory on every call, and its path is returned
+    as a string.
+
+    >>> csv_path, contributions = _get_demo_csv_contrib()
+    >>> with open(csv_path, newline="") as csv_handle:
+    ...     reader = csv.DictReader(csv_handle)
+    ...     reader.fieldnames
+    ...     rows = list(reader)
+    ...     rows[0]
+    ...     rows[-1]
+    ['student_id', 'class_year', 'hw_number', 'grade']
+    {'student_id': '1', 'class_year': '2', 'hw_number': '1', 'grade': '73'}
+    {'student_id': '100', 'class_year': '1', 'hw_number': '10', 'grade': '78'}
+    """
+    from tempfile import gettempdir
+
+    # A private generator, seeded so the mock data will be stable across runs,
+    # without resetting the global "random" state other code may rely on.
+    rng = random.Random(0)
+
+    # gettempdir() rather than a hard-coded "/tmp",
+    # which does not exist on every platform.
+    csv_path = str(Path(gettempdir()) / "demo.csv")
+    contributions = 10
+
+    with open(csv_path, "w", newline="") as demo_handle:
+        fields = ["student_id", "class_year", "hw_number", "grade"]
+        writer = csv.DictWriter(demo_handle, fieldnames=fields)
+        writer.writeheader()
+        for student_id in range(1, 101):
+            class_year = int(_clip(rng.gauss(2, 1), 1, 4))
+            # Older students do slightly better in the class:
+            mean_grade = rng.gauss(80, 5) + class_year * 2
+            for hw_number in range(1, contributions + 1):
+                grade = int(_clip(rng.gauss(mean_grade, 5), 0, 100))
+                writer.writerow(
+                    {
+                        "student_id": student_id,
+                        "class_year": class_year,
+                        "hw_number": hw_number,
+                        "grade": grade,
+                    }
+                )
+
+    return csv_path, contributions
+
+
+def get_csv_contrib():  # pragma: no cover
+    """
+    Return (csv_path, contributions) as chosen on the command line.
+
+    With "--demo", a generated CSV and its matching contributions
+    are returned instead, and a warning is issued if "--csv" or "--contrib"
+    were also supplied. Otherwise csv_path is None if "--csv" was not given.
+    """
+    args = _get_args()
+    if args.demo:
+        # "--contrib" has a non-None default, so compare against that default
+        # rather than None: otherwise this would warn on every "--demo" run.
+        # (An explicit "--contrib" equal to the default is indistinguishable
+        # from omitting it, so that case does not warn.)
+        default_contributions = _get_arg_parser().get_default("contributions")
+        if args.csv_path is not None or args.contributions != default_contributions:
+            warn('"--demo" overrides "--csv" and "--contrib"')
+        return _get_demo_csv_contrib()
+    return (args.csv_path, args.contributions)
diff --git a/dp_creator_ii/tests/test_arg_parser.py b/dp_creator_ii/tests/test_arg_parser.py
@@ -3,12 +3,12 @@
 
 import pytest
 
-from dp_creator_ii import get_arg_parser, existing_csv
+from dp_creator_ii.argparse_helpers import _get_arg_parser, _existing_csv_type
 
 
 def test_help():
     help = (
-        get_arg_parser()
+        _get_arg_parser()
         .format_help()
         # argparse doesn't actually know the name of the script
         # and inserts the name of the running program instead.
@@ -25,14 +25,14 @@ def test_help():
 
 def test_arg_validation_no_file():
     with pytest.raises(ArgumentTypeError, match="No such file: no-such-file"):
-        existing_csv("no-such-file")
+        _existing_csv_type("no-such-file")
 
 
 def test_arg_validation_not_csv():
     with pytest.raises(ArgumentTypeError, match='Must have ".csv" extension:'):
-        existing_csv(Path(__file__).parent / "fixtures" / "fake.ipynb")
+        _existing_csv_type(Path(__file__).parent / "fixtures" / "fake.ipynb")
 
 
 def test_arg_validation_works():
-    path = existing_csv(Path(__file__).parent / "fixtures" / "fake.csv")
+    path = _existing_csv_type(Path(__file__).parent / "fixtures" / "fake.csv")
     assert path.name == "fake.csv"