Skip to content

Commit

Permalink
Use IDGenerator for key columns without specified regex patterns
Browse files: browse the repository at this point in the history
  • Loading branch information
frances-h committed Aug 14, 2023
1 parent 72f7e1f commit b4a2e22
Show file tree
Hide file tree
Showing 2 changed files with 51 additions and 9 deletions.
31 changes: 23 additions & 8 deletions sdv/data_processing/data_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
import pandas as pd
import rdt
from pandas.api.types import is_float_dtype, is_integer_dtype
from rdt.transformers import AnonymizedFaker, RegexGenerator, get_default_transformers
from rdt.transformers import AnonymizedFaker, IDGenerator, RegexGenerator, get_default_transformers

from sdv.constraints import Constraint
from sdv.constraints.base import get_subclasses
Expand Down Expand Up @@ -458,13 +458,28 @@ def _create_config(self, data, columns_created_by_constraints):

if sdtype == 'id':
is_numeric = pd.api.types.is_numeric_dtype(data[column].dtype)
transformers[column] = self.create_regex_generator(
column,
sdtype,
column_metadata,
is_numeric
)
sdtypes[column] = 'text'
if column_metadata.get('regex_format', False):
transformers[column] = self.create_regex_generator(
column,
sdtype,
column_metadata,
is_numeric
)
sdtypes[column] = 'text'
elif column == self.metadata.primary_key or column in self.metadata.alternate_keys:
prefix = None
if not is_numeric:
prefix = 'sdv-id-'

transformers[column] = IDGenerator(prefix=prefix)
sdtypes[column] = 'text'
else:
transformers[column] = AnonymizedFaker(
provider_name=None,
function_name='bothify',
function_kwargs={'text': '#####'}
)
sdtypes[column] = 'pii'

elif pii:
enforce_uniqueness = bool(column in self._keys)
Expand Down
29 changes: 28 additions & 1 deletion tests/unit/data_processing/test_data_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
import pytest
from rdt.errors import ConfigNotSetError
from rdt.errors import NotFittedError as RDTNotFittedError
from rdt.transformers import FloatFormatter, LabelEncoder, UnixTimestampEncoder
from rdt.transformers import AnonymizedFaker, FloatFormatter, IDGenerator, LabelEncoder, UnixTimestampEncoder

from sdv.constraints.errors import MissingConstraintColumnError
from sdv.constraints.tabular import Positive, ScalarRange
Expand Down Expand Up @@ -1052,15 +1052,21 @@ def test__create_config(self):
'email': ['a@aol.com', 'b@gmail.com', 'c@gmx.com'],
'first_name': ['John', 'Doe', 'Johanna'],
'id': ['ID_001', 'ID_002', 'ID_003'],
'id_no_regex': ['ID_001', 'ID_002', 'ID_003'],
'id_numeric': [0, 1, 2],
'id_column': ['ID_999', 'ID_999', 'ID_007'],
'date': ['2021-02-01', '2022-03-05', '2023-01-31']
})
dp = DataProcessor(SingleTableMetadata(), locales=locales)
dp.metadata = Mock()
dp.create_anonymized_transformer = Mock()
dp.create_regex_generator = Mock()
dp.create_id_generator = Mock()
dp.create_anonymized_transformer.return_value = 'AnonymizedFaker'
dp.create_regex_generator.return_value = 'RegexGenerator'
dp.create_id_generator.return_value = 'IDGenerator'
dp.metadata.primary_key = 'id'
dp.metadata.alternate_keys = ['id_no_regex', 'id_numeric']
dp._primary_key = 'id'
dp._keys = ['id']
dp.metadata.columns = {
Expand All @@ -1071,6 +1077,9 @@ def test__create_config(self):
'email': {'sdtype': 'email', 'pii': True},
'first_name': {'sdtype': 'first_name'},
'id': {'sdtype': 'id', 'regex_format': 'ID_\\d{3}[0-9]'},
'id_no_regex': {'sdtype': 'id'},
'id_numeric': {'sdtype': 'id'},
'id_column': {'sdtype': 'id'},
'date': {'sdtype': 'datetime', 'datetime_format': '%Y-%m-%d'}
}

Expand All @@ -1091,6 +1100,9 @@ def test__create_config(self):
'email': 'pii',
'first_name': 'pii',
'id': 'text',
'id_no_regex': 'text',
'id_numeric': 'text',
'id_column': 'pii',
'date': 'datetime'
}

Expand Down Expand Up @@ -1126,6 +1138,21 @@ def test__create_config(self):
assert datetime_transformer.datetime_format == '%Y-%m-%d'
assert dp._primary_key == 'id'

id_no_regex_transformer = config['transformers']['id_no_regex']
assert isinstance(id_no_regex_transformer, IDGenerator)
assert id_no_regex_transformer.prefix == 'sdv-id-'
assert id_no_regex_transformer.starting_value == 0

id_numeric_transformer = config['transformers']['id_numeric']
assert isinstance(id_numeric_transformer, IDGenerator)
assert id_numeric_transformer.prefix == None
assert id_numeric_transformer.starting_value == 0

id_column_transformer = config['transformers']['id_column']
assert isinstance(id_column_transformer, AnonymizedFaker)
assert id_column_transformer.function_name == 'bothify'
assert id_column_transformer.function_kwargs == {'text': '#####'}

dp.create_anonymized_transformer.calls == [
call('email', {'sdtype': 'email', 'pii': True, 'locales': locales}),
call('first_name', {'sdtype': 'first_name', 'locales': locales})
Expand Down

0 comments on commit b4a2e22

Please sign in to comment.