Skip to content

Commit

Permalink
Merge pull request moj-analytical-services#2323 from moj-analytical-services/bug/completeness-source-dataset
Browse files Browse the repository at this point in the history

Completeness - handle existing source dataset column
  • Loading branch information
ADBond authored Aug 12, 2024
2 parents 30a7cc7 + d3384d4 commit 116e2ff
Show file tree
Hide file tree
Showing 4 changed files with 34 additions and 6 deletions.
4 changes: 3 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

### Fixed

- Completeness chart now works correectly with indexed columns in spark ([#2309](https://github.com/moj-analytical-services/splink/pull/2309))
- Completeness chart now works correctly with indexed columns in spark ([#2309](https://github.com/moj-analytical-services/splink/pull/2309))
- Completeness chart works even if you have a `source_dataset` column ([#2323](https://github.com/moj-analytical-services/splink/pull/2323))


## [4.0.0] - 2024-07-24

Expand Down
15 changes: 11 additions & 4 deletions splink/internals/completeness.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,17 +29,24 @@ def completeness_data(

pipeline.enqueue_sql(sql, "__splink__df_concat")

# use an internal name for source_dataset, as if it already exists in dataframe
# we may run into ambiguous column issues. This name unlikely to clash
internal_source_colname = "__completeness_source_dataset"

# In the case of a single input dataframe, a source_dataset column
# will not have been created, create one
first_df = next(iter(splink_df_dict.values()))
if len(splink_df_dict) == 1:
sql = f"""
select '{first_df.physical_name}' as source_dataset, *
select '{first_df.physical_name}' as {internal_source_colname}, *
from __splink__df_concat
"""

else:
sql = "select * from __splink__df_concat"
sql = (
f"select *, source_dataset AS {internal_source_colname} "
f"from __splink__df_concat"
)

pipeline.enqueue_sql(sql, "__splink__df_concat_with_source_dataset")

Expand All @@ -56,13 +63,13 @@ def completeness_data(

sql = f"""
(select
source_dataset,
{internal_source_colname} AS source_dataset,
'{unquoted_col}' as column_name,
count(*) - count({quoted_col}) as total_null_rows,
count(*) as total_rows_inc_nulls,
cast(count({quoted_col})*1.0/count(*) as float) as completeness
from __splink__df_concat_with_source_dataset
group by source_dataset
group by {internal_source_colname}
order by count(*) desc)
"""
sqls.append(sql)
Expand Down
8 changes: 8 additions & 0 deletions tests/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,10 @@ def load_frame_from_csv(self, path):
def load_frame_from_parquet(self, path):
return pd.read_parquet(path)

@property
def arrays_from(self) -> int:
return 1


class DuckDBTestHelper(TestHelper):
@property
Expand Down Expand Up @@ -84,6 +88,10 @@ def load_frame_from_parquet(self, path):
df.persist()
return df

@property
def arrays_from(self) -> int:
return 0


class SQLiteTestHelper(TestHelper):
_frame_counter = 0
Expand Down
13 changes: 12 additions & 1 deletion tests/test_completeness.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,17 @@ def test_completeness_chart_complex_columns(dialect, test_helpers):
}
)
df = helper.convert_frame(df)
first = helper.arrays_from
# check completeness when we have more complicated column constructs, such as
# indexing into array columns
completeness_chart(df, db_api, cols=["first_name", "surname", "city_arr[0]"])
completeness_chart(df, db_api, cols=["first_name", "surname", f"city_arr[{first}]"])


@mark_with_dialects_excluding("sqlite")
def test_completeness_chart_source_dataset(dialect, test_helpers):
helper = test_helpers[dialect]
db_api = helper.DatabaseAPI(**helper.db_api_args())
df_pd = pd.read_csv("./tests/datasets/fake_1000_from_splink_demos.csv")
df_pd["source_dataset"] = "fake_1000"
df = helper.convert_frame(df_pd)
completeness_chart(df, db_api)

0 comments on commit 116e2ff

Please sign in to comment.