From d9e9cde2ccd8a7d6452e1a03ef6fcbb7a43ed81c Mon Sep 17 00:00:00 2001
From: Chuck McCallum <mccallucc@gmail.com>
Date: Wed, 6 Nov 2024 10:46:54 -0500
Subject: [PATCH 1/6] tests pass: do not really like ui

---
 WHAT-WE-LEARNED.md                  | 22 ++++++++++++++++++----
 dp_creator_ii/app/analysis_panel.py | 12 ++++--------
 dp_creator_ii/utils/csv_helper.py   | 24 +++++++++++++++++++++++-
 tests/test_csv.py                   |  6 +++---
 4 files changed, 48 insertions(+), 16 deletions(-)

diff --git a/WHAT-WE-LEARNED.md b/WHAT-WE-LEARNED.md
index 9045739..3d9bc88 100644
--- a/WHAT-WE-LEARNED.md
+++ b/WHAT-WE-LEARNED.md
@@ -21,18 +21,18 @@ Typing might help here. I've also wondered about naming conventions, but I haven
 It seems like the returned value would be the same, so I would like to compress something like this:
 ```python
 @reactive.calc
-def csv_fields_calc():
+def csv_labels_calc():
     return read_field_names(req(csv_path()))
 
 @render.text
-def csv_fields():
-    return csv_fields_calc()
+def csv_labels():
+    return csv_labels_calc()
 ```
 into:
 ```python
 @reactive.calc
 @render.text
-def csv_fields():
+def csv_labels():
     return read_field_names(req(csv_path()))
 ```
 but that returns an error:
@@ -44,6 +44,20 @@ Renderer.__call__() missing 1 required positional argument: '_fn'
 
 It feels like a gap in the library that there is no component testing. The only advice is to pull out testable logic from the server functions, and for the rest, use end-to-end tests: There's not a recommended way to test the ui+server interaction for just one component.
 
+## Unstated requirements for module IDs
+
+The [docs](https://shiny.posit.co/py/docs/modules.html#how-to-use-modules) only say:
+
+> This id has two requirements. First, it must be unique in a single scope, and can’t be duplicated in a given application or module definition. ... Second, the UI and server ids must match.
+
+But it's fussier than that:
+
+```
+ValueError: The string 'Will this work?' is not a valid id; only letters, numbers, and underscore are permitted
+```
+
+Was planning to just use the CSV column headers as IDs, but that's not going to work. Could Shiny just hash whatever we provide, so we wouldn't have to worry about this?
+
 ## Normal tooling doesn't work inside of app?
 
 There are several bits of tooling that don't seem to work inside end-to-end app tests. My guess is that this isn't related to Shiny per se, but rather the ASGI framework: It's not running in the same process as pytest, so it's not surprising that the pytest process can't instrument this.
diff --git a/dp_creator_ii/app/analysis_panel.py b/dp_creator_ii/app/analysis_panel.py
index ab17b60..7804d1e 100644
--- a/dp_creator_ii/app/analysis_panel.py
+++ b/dp_creator_ii/app/analysis_panel.py
@@ -4,7 +4,7 @@
 
 from dp_creator_ii.app.components.inputs import log_slider
 from dp_creator_ii.app.components.column_module import column_ui, column_server
-from dp_creator_ii.utils.csv_helper import read_field_names
+from dp_creator_ii.utils.csv_helper import read_csv_ids_labels
 from dp_creator_ii.app.components.outputs import output_code_sample
 from dp_creator_ii.utils.templates import make_privacy_loss_block
 
@@ -66,7 +66,7 @@ def _update_checkbox_group():
         ui.update_checkbox_group(
             "columns_checkbox_group",
             label=None,
-            choices=csv_fields_calc(),
+            choices=csv_ids_labels_calc(),
         )
 
     @reactive.effect
@@ -96,12 +96,8 @@ def columns_ui():
         ]
 
     @reactive.calc
-    def csv_fields_calc():
-        return read_field_names(req(csv_path()))
-
-    @render.text
-    def csv_fields():
-        return csv_fields_calc()
+    def csv_ids_labels_calc():
+        return read_csv_ids_labels(req(csv_path()))
 
     @reactive.calc
     def epsilon_calc():
diff --git a/dp_creator_ii/utils/csv_helper.py b/dp_creator_ii/utils/csv_helper.py
index 147551b..a1256d3 100644
--- a/dp_creator_ii/utils/csv_helper.py
+++ b/dp_creator_ii/utils/csv_helper.py
@@ -1,7 +1,15 @@
+"""
+We'll use the following terms consistently throughout the application:
+- name: This is the exact column header in the CSV.
+- label: This is the string we'll display.
+- id: This is the string we'll pass as a module ID.
+"""
+
+import re
 import polars as pl
 
 
-def read_field_names(csv_path):
+def read_csv_names(csv_path):
     # Polars is overkill, but it is more robust against
     # variations in encoding than Python stdlib csv.
     # However, it could be slow:
@@ -10,3 +18,17 @@ def read_field_names(csv_path):
     # > resolving its schema, which is a potentially expensive operation.
     lf = pl.scan_csv(csv_path)
     return lf.collect_schema().names()
+
+
+def read_csv_ids_labels(csv_path):
+    return {
+        name_to_id(name): f"{i+1}: {name}"
+        for i, name in enumerate(read_csv_names(csv_path))
+    }
+
+
+def name_to_id(name):
+    # Remember to handle empty strings!
+    # TODO: There is a risk of name collision if the only distinction
+    # is non-word characters. Switch to hash()?
+    return "id_" + re.sub(r"\W", "_", name)
diff --git a/tests/test_csv.py b/tests/test_csv.py
index c35c39a..40abf20 100644
--- a/tests/test_csv.py
+++ b/tests/test_csv.py
@@ -4,7 +4,7 @@
 import tempfile
 import pytest
 
-from dp_creator_ii.utils.csv_helper import read_field_names
+from dp_creator_ii.utils.csv_helper import read_csv_ids_labels
 
 
 # We will not reference the encoding when reading:
@@ -49,5 +49,5 @@ def test_csv_loading(write_encoding):
             pl_testing.assert_frame_equal(write_lf, read_lf)
 
         # Preceding lines are reading the whole DF via Polars.
-        field_names_read = read_field_names(fp.name)
-        assert field_names_read == list(data.keys())
+        ids_labels = read_csv_ids_labels(fp.name)
+        assert ids_labels == {"id_AGE": "2: AGE", "id_NAME": "1: NAME"}

From f2b26d733a00668ccf8c705f591eb1a2eb9f37d2 Mon Sep 17 00:00:00 2001
From: Chuck McCallum <mccallucc@gmail.com>
Date: Wed, 6 Nov 2024 10:51:27 -0500
Subject: [PATCH 2/6] demo that we can handle spaces

---
 tests/fixtures/fake.csv | 2 +-
 tests/test_app.py       | 8 ++++----
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/tests/fixtures/fake.csv b/tests/fixtures/fake.csv
index baabea7..6718bba 100644
--- a/tests/fixtures/fake.csv
+++ b/tests/fixtures/fake.csv
@@ -1,4 +1,4 @@
-student_id,class_year,hw_number,grade
+,class year,hw number,grade
 1234,1,1,90
 1234,1,2,95
 1234,1,3,85
diff --git a/tests/test_app.py b/tests/test_app.py
index 624b276..0033d10 100644
--- a/tests/test_app.py
+++ b/tests/test_app.py
@@ -62,10 +62,10 @@ def expect_no_error():
     expect_visible(perform_analysis_text)
     expect_not_visible(download_results_text)
     # Columns:
-    expect_visible("student_id")
-    expect_visible("class_year")
-    expect_visible("hw_number")
-    expect_visible("grade")
+    expect_visible("1:")
+    expect_visible("2: class year")
+    expect_visible("3: hw number")
+    expect_visible("4: grade")
     # Epsilon slider:
     # (Note: Slider tests failed on CI when run after column details,
     # although it worked locally. This works in either environment.

From 710bfac677d836a0a263aecbaad57d54cbdc1a5f Mon Sep 17 00:00:00 2001
From: Chuck McCallum <mccallucc@gmail.com>
Date: Wed, 6 Nov 2024 10:53:34 -0500
Subject: [PATCH 3/6] [blank] for empty headers

---
 dp_creator_ii/utils/csv_helper.py | 2 +-
 tests/test_app.py                 | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/dp_creator_ii/utils/csv_helper.py b/dp_creator_ii/utils/csv_helper.py
index a1256d3..ff8ace0 100644
--- a/dp_creator_ii/utils/csv_helper.py
+++ b/dp_creator_ii/utils/csv_helper.py
@@ -22,7 +22,7 @@ def read_csv_names(csv_path):
 
 def read_csv_ids_labels(csv_path):
     return {
-        name_to_id(name): f"{i+1}: {name}"
+        name_to_id(name): f"{i+1}: {name or '[blank]'}"
         for i, name in enumerate(read_csv_names(csv_path))
     }
 
diff --git a/tests/test_app.py b/tests/test_app.py
index 0033d10..22c412f 100644
--- a/tests/test_app.py
+++ b/tests/test_app.py
@@ -62,7 +62,7 @@ def expect_no_error():
     expect_visible(perform_analysis_text)
     expect_not_visible(download_results_text)
     # Columns:
-    expect_visible("1:")
+    expect_visible("1: [blank]")
     expect_visible("2: class year")
     expect_visible("3: hw number")
     expect_visible("4: grade")

From cfce236dfd9c7294230794a0229659d78a04a20c Mon Sep 17 00:00:00 2001
From: Chuck McCallum <mccallucc@gmail.com>
Date: Wed, 6 Nov 2024 11:47:13 -0500
Subject: [PATCH 4/6] Show labels in UI instead of names or IDs

---
 WHAT-WE-LEARNED.md                  |  4 ++--
 dp_creator_ii/app/analysis_panel.py | 12 +++++++++---
 dp_creator_ii/utils/csv_helper.py   |  4 ++++
 tests/test_csv.py                   |  5 ++++-
 4 files changed, 19 insertions(+), 6 deletions(-)

diff --git a/WHAT-WE-LEARNED.md b/WHAT-WE-LEARNED.md
index 3d9bc88..4966c4f 100644
--- a/WHAT-WE-LEARNED.md
+++ b/WHAT-WE-LEARNED.md
@@ -22,7 +22,7 @@ It seems like the returned value would be the same, so I would like to compress
 ```python
 @reactive.calc
 def csv_labels_calc():
-    return read_field_names(req(csv_path()))
+    return read_labels(req(csv_path()))
 
 @render.text
 def csv_labels():
@@ -33,7 +33,7 @@ into:
 @reactive.calc
 @render.text
 def csv_labels():
-    return read_field_names(req(csv_path()))
+    return read_labels(req(csv_path()))
 ```
 but that returns an error:
 ```
diff --git a/dp_creator_ii/app/analysis_panel.py b/dp_creator_ii/app/analysis_panel.py
index 7804d1e..e52c45b 100644
--- a/dp_creator_ii/app/analysis_panel.py
+++ b/dp_creator_ii/app/analysis_panel.py
@@ -4,7 +4,7 @@
 
 from dp_creator_ii.app.components.inputs import log_slider
 from dp_creator_ii.app.components.column_module import column_ui, column_server
-from dp_creator_ii.utils.csv_helper import read_csv_ids_labels
+from dp_creator_ii.utils.csv_helper import read_csv_ids_labels, read_csv_ids_names
 from dp_creator_ii.app.components.outputs import output_code_sample
 from dp_creator_ii.utils.templates import make_privacy_loss_block
 
@@ -78,10 +78,12 @@ def _on_column_set_change():
     @render.ui
     def columns_ui():
         column_ids = input.columns_checkbox_group()
+        column_ids_to_names = csv_ids_names_calc()
+        column_ids_to_labels = csv_ids_labels_calc()
         for column_id in column_ids:
             column_server(
                 column_id,
-                name=column_id,
+                name=column_ids_to_names[column_id],
                 contributions=contributions(),
                 epsilon=epsilon_calc(),
                 set_column_weight=set_column_weight,
@@ -89,12 +91,16 @@ def columns_ui():
             )
         return [
             [
-                ui.h3(column_id),
+                ui.h3(column_ids_to_labels[column_id]),
                 column_ui(column_id),
             ]
             for column_id in column_ids
         ]
 
+    @reactive.calc
+    def csv_ids_names_calc():
+        return read_csv_ids_names(req(csv_path()))
+
     @reactive.calc
     def csv_ids_labels_calc():
         return read_csv_ids_labels(req(csv_path()))
diff --git a/dp_creator_ii/utils/csv_helper.py b/dp_creator_ii/utils/csv_helper.py
index ff8ace0..e70dc46 100644
--- a/dp_creator_ii/utils/csv_helper.py
+++ b/dp_creator_ii/utils/csv_helper.py
@@ -27,6 +27,10 @@ def read_csv_ids_labels(csv_path):
     }
 
 
+def read_csv_ids_names(csv_path):
+    return {name_to_id(name): name for name in read_csv_names(csv_path)}
+
+
 def name_to_id(name):
     # Remember to handle empty strings!
     # TODO: There is a risk of name collision if the only distinction
diff --git a/tests/test_csv.py b/tests/test_csv.py
index 40abf20..87e35c6 100644
--- a/tests/test_csv.py
+++ b/tests/test_csv.py
@@ -4,7 +4,7 @@
 import tempfile
 import pytest
 
-from dp_creator_ii.utils.csv_helper import read_csv_ids_labels
+from dp_creator_ii.utils.csv_helper import read_csv_ids_labels, read_csv_ids_names
 
 
 # We will not reference the encoding when reading:
@@ -51,3 +51,6 @@ def test_csv_loading(write_encoding):
         # Preceding lines are reading the whole DF via Polars.
         ids_labels = read_csv_ids_labels(fp.name)
         assert ids_labels == {"id_AGE": "2: AGE", "id_NAME": "1: NAME"}
+
+        ids_names = read_csv_ids_names(fp.name)
+        assert ids_names == {"id_AGE": "AGE", "id_NAME": "NAME"}

From 3590f1b0998c5d0adfd697aeb6818b6e7b691fee Mon Sep 17 00:00:00 2001
From: Chuck McCallum <mccallucc@gmail.com>
Date: Wed, 6 Nov 2024 12:17:36 -0500
Subject: [PATCH 5/6] What if two columns are only distinguished by a non-word
 character?

---
 dp_creator_ii/utils/csv_helper.py |  8 +++-----
 tests/fixtures/fake.csv           | 14 +++++++-------
 tests/test_app.py                 |  3 ++-
 tests/test_csv.py                 |  9 ++++++---
 4 files changed, 18 insertions(+), 16 deletions(-)

diff --git a/dp_creator_ii/utils/csv_helper.py b/dp_creator_ii/utils/csv_helper.py
index e70dc46..4c279ee 100644
--- a/dp_creator_ii/utils/csv_helper.py
+++ b/dp_creator_ii/utils/csv_helper.py
@@ -5,7 +5,6 @@
 - id: This is the string we'll pass as a module ID.
 """
 
-import re
 import polars as pl
 
 
@@ -32,7 +31,6 @@ def read_csv_ids_names(csv_path):
 
 
 def name_to_id(name):
-    # Remember to handle empty strings!
-    # TODO: There is a risk of name collision if the only distinction
-    # is non-word characters. Switch to hash()?
-    return "id_" + re.sub(r"\W", "_", name)
+    # Shiny is fussy about module IDs,
+    # but we don't need them to be human readable.
+    return str(hash(name)).replace("-", "_")
diff --git a/tests/fixtures/fake.csv b/tests/fixtures/fake.csv
index 6718bba..cb1796b 100644
--- a/tests/fixtures/fake.csv
+++ b/tests/fixtures/fake.csv
@@ -1,7 +1,7 @@
-,class year,hw number,grade
-1234,1,1,90
-1234,1,2,95
-1234,1,3,85
-6789,2,1,70
-6789,2,2,100
-6789,2,3,90
+,class year,hw number,hw-number,grade
+1234,1,1,4,90
+1234,1,2,5,95
+1234,1,3,6,85
+6789,2,1,4,70
+6789,2,2,5,100
+6789,2,3,6,90
diff --git a/tests/test_app.py b/tests/test_app.py
index 22c412f..37c00ed 100644
--- a/tests/test_app.py
+++ b/tests/test_app.py
@@ -65,7 +65,8 @@ def expect_no_error():
     expect_visible("1: [blank]")
     expect_visible("2: class year")
     expect_visible("3: hw number")
-    expect_visible("4: grade")
+    expect_visible("4: hw-number")
+    expect_visible("5: grade")
     # Epsilon slider:
     # (Note: Slider tests failed on CI when run after column details,
     # although it worked locally. This works in either environment.
diff --git a/tests/test_csv.py b/tests/test_csv.py
index 87e35c6..de6f219 100644
--- a/tests/test_csv.py
+++ b/tests/test_csv.py
@@ -48,9 +48,12 @@ def test_csv_loading(write_encoding):
         else:
             pl_testing.assert_frame_equal(write_lf, read_lf)
 
-        # Preceding lines are reading the whole DF via Polars.
+        # Preceding lines are reading the whole DF via Polars:
+        # Now test how we read just the headers.
+        # Keys are hashes, and won't be stable across platforms,
+        # so let's just look at the values.
         ids_labels = read_csv_ids_labels(fp.name)
-        assert ids_labels == {"id_AGE": "2: AGE", "id_NAME": "1: NAME"}
+        assert set(ids_labels.values()) == {"2: AGE", "1: NAME"}
 
         ids_names = read_csv_ids_names(fp.name)
-        assert ids_names == {"id_AGE": "AGE", "id_NAME": "NAME"}
+        assert set(ids_names.values()) == {"AGE", "NAME"}

From 165c6873bb13a2ac5923c8897f2f0fd99b420ffe Mon Sep 17 00:00:00 2001
From: Chuck McCallum <mccallucc@gmail.com>
Date: Wed, 6 Nov 2024 12:27:27 -0500
Subject: [PATCH 6/6] duplicated columns: another CSV edge case

---
 tests/fixtures/fake.csv | 14 +++++++-------
 tests/test_app.py       |  9 +++++----
 2 files changed, 12 insertions(+), 11 deletions(-)

diff --git a/tests/fixtures/fake.csv b/tests/fixtures/fake.csv
index cb1796b..e11908a 100644
--- a/tests/fixtures/fake.csv
+++ b/tests/fixtures/fake.csv
@@ -1,7 +1,7 @@
-,class year,hw number,hw-number,grade
-1234,1,1,4,90
-1234,1,2,5,95
-1234,1,3,6,85
-6789,2,1,4,70
-6789,2,2,5,100
-6789,2,3,6,90
+,class year,class year,hw number,hw-number,grade
+1234,1,1,1,4,90
+1234,1,1,2,5,95
+1234,1,1,3,6,85
+6789,2,2,1,4,70
+6789,2,2,2,5,100
+6789,2,2,3,6,90
diff --git a/tests/test_app.py b/tests/test_app.py
index 37c00ed..638fb90 100644
--- a/tests/test_app.py
+++ b/tests/test_app.py
@@ -62,11 +62,12 @@ def expect_no_error():
     expect_visible(perform_analysis_text)
     expect_not_visible(download_results_text)
     # Columns:
-    expect_visible("1: [blank]")
+    expect_visible("1: [blank]")  # Empty column!
     expect_visible("2: class year")
-    expect_visible("3: hw number")
-    expect_visible("4: hw-number")
-    expect_visible("5: grade")
+    expect_visible("3: class year_duplicated_0")  # Duplicated column!
+    expect_visible("4: hw number")
+    expect_visible("5: hw-number")  # Distinguished by non-word character!
+    expect_visible("6: grade")
     # Epsilon slider:
     # (Note: Slider tests failed on CI when run after column details,
     # although it worked locally. This works in either environment.