Skip to content

Commit

Permalink
Merge pull request moj-analytical-services#2323 from moj-analytical-services/bug/completeness-source-dataset
Browse files Browse the repository at this point in the history

Completeness - handle existing source dataset column
  • Loading branch information
ADBond authored Aug 12, 2024
2 parents 30a7cc7 + d3384d4 commit 116e2ff
Show file tree
Hide file tree
Showing 4 changed files with 34 additions and 6 deletions.
4 changes: 3 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

### Fixed

- Completeness chart now works correectly with indexed columns in spark ([#2309](https://github.com/moj-analytical-services/splink/pull/2309))
- Completeness chart now works correctly with indexed columns in spark ([#2309](https://github.com/moj-analytical-services/splink/pull/2309))
- Completeness chart works even if you have a `source_dataset` column ([#2323](https://github.com/moj-analytical-services/splink/pull/2323))


## [4.0.0] - 2024-07-24

Expand Down
15 changes: 11 additions & 4 deletions splink/internals/completeness.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,17 +29,24 @@ def completeness_data(

pipeline.enqueue_sql(sql, "__splink__df_concat")

# use an internal name for source_dataset, as if it already exists in dataframe
# we may run into ambiguous column issues. This name unlikely to clash
internal_source_colname = "__completeness_source_dataset"

# In the case of a single input dataframe, a source_dataset column
# will not have been created, create one
first_df = next(iter(splink_df_dict.values()))
if len(splink_df_dict) == 1:
sql = f"""
select '{first_df.physical_name}' as source_dataset, *
select '{first_df.physical_name}' as {internal_source_colname}, *
from __splink__df_concat
"""

else:
sql = "select * from __splink__df_concat"
sql = (
f"select *, source_dataset AS {internal_source_colname} "
f"from __splink__df_concat"
)

pipeline.enqueue_sql(sql, "__splink__df_concat_with_source_dataset")

Expand All @@ -56,13 +63,13 @@ def completeness_data(

sql = f"""
(select
source_dataset,
{internal_source_colname} AS source_dataset,
'{unquoted_col}' as column_name,
count(*) - count({quoted_col}) as total_null_rows,
count(*) as total_rows_inc_nulls,
cast(count({quoted_col})*1.0/count(*) as float) as completeness
from __splink__df_concat_with_source_dataset
group by source_dataset
group by {internal_source_colname}
order by count(*) desc)
"""
sqls.append(sql)
Expand Down
8 changes: 8 additions & 0 deletions tests/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,10 @@ def load_frame_from_csv(self, path):
def load_frame_from_parquet(self, path):
return pd.read_parquet(path)

@property
def arrays_from(self) -> int:
return 1


class DuckDBTestHelper(TestHelper):
@property
Expand Down Expand Up @@ -84,6 +88,10 @@ def load_frame_from_parquet(self, path):
df.persist()
return df

@property
def arrays_from(self) -> int:
return 0


class SQLiteTestHelper(TestHelper):
_frame_counter = 0
Expand Down
13 changes: 12 additions & 1 deletion tests/test_completeness.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,17 @@ def test_completeness_chart_complex_columns(dialect, test_helpers):
}
)
df = helper.convert_frame(df)
first = helper.arrays_from
# check completeness when we have more complicated column constructs, such as
# indexing into array columns
completeness_chart(df, db_api, cols=["first_name", "surname", "city_arr[0]"])
completeness_chart(df, db_api, cols=["first_name", "surname", f"city_arr[{first}]"])


@mark_with_dialects_excluding("sqlite")
def test_completeness_chart_source_dataset(dialect, test_helpers):
helper = test_helpers[dialect]
db_api = helper.DatabaseAPI(**helper.db_api_args())
df_pd = pd.read_csv("./tests/datasets/fake_1000_from_splink_demos.csv")
df_pd["source_dataset"] = "fake_1000"
df = helper.convert_frame(df_pd)
completeness_chart(df, db_api)

0 comments on commit 116e2ff

Please sign in to comment.