From 5490c6a655db3a06b84cba12c39858263783df4d Mon Sep 17 00:00:00 2001 From: Felipe Date: Fri, 21 Jul 2023 08:37:10 -0700 Subject: [PATCH 1/4] Move constraint reverse_transform order --- sdv/data_processing/data_processor.py | 10 +++++++--- sdv/data_processing/test.py | 17 +++++++++++++++++ 2 files changed, 24 insertions(+), 3 deletions(-) create mode 100644 sdv/data_processing/test.py diff --git a/sdv/data_processing/data_processor.py b/sdv/data_processing/data_processor.py index d2cdc3c74..1f87a5ede 100644 --- a/sdv/data_processing/data_processor.py +++ b/sdv/data_processing/data_processor.py @@ -710,9 +710,6 @@ def reverse_transform(self, data, reset_keys=False): except rdt.errors.NotFittedError: LOGGER.info(f'HyperTransformer has not been fitted for table {self.table_name}') - for constraint in reversed(self._constraints_to_reverse): - reversed_data = constraint.reverse_transform(reversed_data) - num_rows = len(reversed_data) sampled_columns = list(reversed_data.columns) missing_columns = [ @@ -731,6 +728,13 @@ def reverse_transform(self, data, reset_keys=False): generated_keys = self.generate_keys(num_rows, reset_keys) sampled_columns.extend(self._keys) + for constraint in reversed(self._constraints_to_reverse): + reversed_data = constraint.reverse_transform(reversed_data) + + # Add new columns generated by the constraint + new_columns = list(set(reversed_data.columns) - set(sampled_columns)) + sampled_columns.extend(new_columns) + # Sort the sampled columns in the order of the metadata # In multitable there may be missing columns in the sample such as foreign keys # And alternate keys. 
Thats the reason of ensuring that the metadata column is within diff --git a/sdv/data_processing/test.py b/sdv/data_processing/test.py new file mode 100644 index 000000000..319d797fe --- /dev/null +++ b/sdv/data_processing/test.py @@ -0,0 +1,17 @@ +from sdv.data_processing.data_processor import DataProcessor +from sdv.metadata.single_table import SingleTableMetadata +from sdv.single_table.copulas import GaussianCopulaSynthesizer +import pandas as pd + +data = pd.DataFrame({ + 'low': [1, 2, 3], +}) +metadata = SingleTableMetadata() +metadata.add_column('low', sdtype='numerical') +metadata.update_column('low', sdtype='job', pii=True) + +dp = DataProcessor(metadata) +dp.fit(data) +transformed = dp.transform(data) +reverse_transformed = dp.reverse_transform(transformed) +print(reverse_transformed) From 4710b7ba21a40eb2e15535267cf2450c4f0a56db Mon Sep 17 00:00:00 2001 From: Felipe Date: Fri, 21 Jul 2023 08:42:08 -0700 Subject: [PATCH 2/4] Remove unneeded file --- sdv/data_processing/test.py | 17 ----------------- 1 file changed, 17 deletions(-) delete mode 100644 sdv/data_processing/test.py diff --git a/sdv/data_processing/test.py b/sdv/data_processing/test.py deleted file mode 100644 index 319d797fe..000000000 --- a/sdv/data_processing/test.py +++ /dev/null @@ -1,17 +0,0 @@ -from sdv.data_processing.data_processor import DataProcessor -from sdv.metadata.single_table import SingleTableMetadata -from sdv.single_table.copulas import GaussianCopulaSynthesizer -import pandas as pd - -data = pd.DataFrame({ - 'low': [1, 2, 3], -}) -metadata = SingleTableMetadata() -metadata.add_column('low', sdtype='numerical') -metadata.update_column('low', sdtype='job', pii=True) - -dp = DataProcessor(metadata) -dp.fit(data) -transformed = dp.transform(data) -reverse_transformed = dp.reverse_transform(transformed) -print(reverse_transformed) From 3ba4bfa6dc44baac5fd3286b65db12ddc3333af7 Mon Sep 17 00:00:00 2001 From: Felipe Date: Mon, 24 Jul 2023 10:17:36 -0700 Subject: [PATCH 3/4] Add 
comment --- sdv/data_processing/data_processor.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/sdv/data_processing/data_processor.py b/sdv/data_processing/data_processor.py index 39bebf359..1c0f8de3b 100644 --- a/sdv/data_processing/data_processor.py +++ b/sdv/data_processing/data_processor.py @@ -718,13 +718,15 @@ def reverse_transform(self, data, reset_keys=False): sampled_columns.extend(self._keys) for constraint in reversed(self._constraints_to_reverse): + sampled_columns = constraint.reverse_transform(sampled_columns) reversed_data = constraint.reverse_transform(reversed_data) # Add new columns generated by the constraint new_columns = list(set(reversed_data.columns) - set(sampled_columns)) sampled_columns.extend(new_columns) - # Sort the sampled columns in the order of the metadata + # Sort the sampled columns in the order of the metadata. + # Any extra columns not present in the metadata will be dropped. # In multitable there may be missing columns in the sample such as foreign keys # And alternate keys. Thats the reason of ensuring that the metadata column is within # The sampled columns. From 0041b98a4c42fb9ce5afddf8feb07a5ea78789e9 Mon Sep 17 00:00:00 2001 From: Felipe Date: Mon, 24 Jul 2023 10:24:49 -0700 Subject: [PATCH 4/4] Remove mistake --- sdv/data_processing/data_processor.py | 1 - 1 file changed, 1 deletion(-) diff --git a/sdv/data_processing/data_processor.py b/sdv/data_processing/data_processor.py index 1c0f8de3b..563c9aa81 100644 --- a/sdv/data_processing/data_processor.py +++ b/sdv/data_processing/data_processor.py @@ -718,7 +718,6 @@ def reverse_transform(self, data, reset_keys=False): sampled_columns.extend(self._keys) for constraint in reversed(self._constraints_to_reverse): - sampled_columns = constraint.reverse_transform(sampled_columns) reversed_data = constraint.reverse_transform(reversed_data) # Add new columns generated by the constraint