Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added top_k as parameter to transforms #80

Open
wants to merge 13 commits into
base: master
Choose a base branch
from
57 changes: 45 additions & 12 deletions decepticonlp/transforms/transforms.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,9 +27,9 @@ class Transforms(object):
def extractor_not_valid_message(self):
return "Extractor chosen invalid. Please choose from " + str(extractor_list)

def apply(self, text, extractor, perturb_type, **kwargs):
def apply(self, text, extractor, top_k, perturb_type, **kwargs):
words = text.split(" ")
indices = extractor.extract(words)
indices = extractor.extract(words=words, top_k=top_k)

for index in indices:
words[index] = perturb_type.apply(words[index], **kwargs)
Expand Down Expand Up @@ -82,6 +82,7 @@ class AddChar(Transforms):
Args:
extractor: str (default: "RandomWordExtractor")
-One of ["RandomWordExtractor"]
top_k: int (default: 1); number of words to be extracted
char_perturb: boolean (default: False)
-If True, add character in word randomly.
-If False, add space in word randomly.
Expand All @@ -97,21 +98,25 @@ class AddChar(Transforms):
"""

def __init__(
self, extractor="RandomWordExtractor", char_perturb=False, ignore=True
self, extractor="RandomWordExtractor", top_k=1, char_perturb=False, ignore=True
):

assert extractor in ["RandomWordExtractor"], self.extractor_not_valid_message()

if extractor == "RandomWordExtractor":
self.extractor = basic.RandomImportantWordExtractor()

self.top_k = top_k

self.char_perturb = char_perturb
self.space_char_perturb = perturbations.InsertSpaceCharacterPerturbations()
self.ignore = ignore

def __call__(self, text):
kwargs = {"char_perturb": self.char_perturb, "ignore": self.ignore}
return self.apply(text, self.extractor, self.space_char_perturb, **kwargs)
return self.apply(
text, self.extractor, self.top_k, self.space_char_perturb, **kwargs
)

def __repr__(self):
return self.__class__.__name__ + "()"
Expand All @@ -124,6 +129,7 @@ class ShuffleChar(Transforms):
Args:
extractor: str (default: "RandomWordExtractor")
-One of ["RandomWordExtractor"]
top_k: int (default: 1); number of words to be extracted
mid: boolean (default: False)
-if True, shuffles the characters of a word at random, barring the initial and last character
-if False, swaps any two characters of a word at random, barring the initial and last character
Expand All @@ -138,20 +144,26 @@ class ShuffleChar(Transforms):
This is fascinatign!
"""

def __init__(self, extractor="RandomWordExtractor", mid=False, ignore=True):
def __init__(
self, extractor="RandomWordExtractor", top_k=1, mid=False, ignore=True
):

assert extractor in ["RandomWordExtractor"], self.extractor_not_valid_message()

if extractor == "RandomWordExtractor":
self.extractor = basic.RandomImportantWordExtractor()

self.top_k = top_k

self.shuffle_char_perturb = perturbations.ShuffleCharacterPerturbations()
self.mid = mid
self.ignore = ignore

def __call__(self, text):
kwargs = {"mid": self.mid, "ignore": self.ignore}
return self.apply(text, self.extractor, self.shuffle_char_perturb, **kwargs)
return self.apply(
text, self.extractor, self.top_k, self.shuffle_char_perturb, **kwargs
)

def __repr__(self):
return self.__class__.__name__ + "()"
Expand All @@ -164,6 +176,7 @@ class DeleteChar(Transforms):
Args:
extractor: str (default: "RandomWordExtractor")
-One of ["RandomWordExtractor"]
top_k: int (default: 1); number of words to be extracted
ignore: boolean (default: True)
-If True, ignore assertion errors (recommended).
-If False, do not ignore assertion errors.
Expand All @@ -175,19 +188,23 @@ class DeleteChar(Transforms):
This is fascinting!
"""

def __init__(self, extractor="RandomWordExtractor", ignore=True):
def __init__(self, extractor="RandomWordExtractor", top_k=1, ignore=True):

assert extractor in ["RandomWordExtractor"], self.extractor_not_valid_message()

if extractor == "RandomWordExtractor":
self.extractor = basic.RandomImportantWordExtractor()

self.top_k = top_k

self.delete_char_perturb = perturbations.DeleteCharacterPerturbations()
self.ignore = ignore

def __call__(self, text):
kwargs = {"ignore": self.ignore}
return self.apply(text, self.extractor, self.delete_char_perturb, **kwargs)
return self.apply(
text, self.extractor, self.top_k, self.delete_char_perturb, **kwargs
)

def __repr__(self):
return self.__class__.__name__ + "()"
Expand All @@ -200,6 +217,7 @@ class TypoChar(Transforms):
Args:
extractor: str (default: "RandomWordExtractor")
-One of ["RandomWordExtractor"]
top_k: int (default: 1); number of words to be extracted
probability: float in range [0,1] (default: 0.1)
-probability*100 percent characters in the word will become typos.
ignore: boolean (default: True)
Expand All @@ -213,20 +231,26 @@ class TypoChar(Transforms):
This us fascinating!
"""

def __init__(self, extractor="RandomWordExtractor", probability=0.1, ignore=True):
def __init__(
self, extractor="RandomWordExtractor", top_k=1, probability=0.1, ignore=True
):

assert extractor in ["RandomWordExtractor"], self.extractor_not_valid_message()

if extractor == "RandomWordExtractor":
self.extractor = basic.RandomImportantWordExtractor()

self.top_k = top_k

self.typo_char_perturb = perturbations.TypoCharacterPerturbations()
self.probability = probability
self.ignore = ignore

def __call__(self, text):
kwargs = {"probability": self.probability, "ignore": self.ignore}
return self.apply(text, self.extractor, self.typo_char_perturb, **kwargs)
return self.apply(
text, self.extractor, self.top_k, self.typo_char_perturb, **kwargs
)

def __repr__(self):
return self.__class__.__name__ + "()"
Expand All @@ -239,6 +263,7 @@ class VisuallySimilarChar(Transforms):
Args:
extractor: str (default: "RandomWordExtractor")
-One of ["RandomWordExtractor"]
top_k: int (default: 1); number of words to be extracted
seed: int (default: None)
-seed for random
ignore: boolean (default: True)
Expand All @@ -252,13 +277,17 @@ class VisuallySimilarChar(Transforms):
T̕h̒i̕s̒ is fascinating!
"""

def __init__(self, extractor="RandomWordExtractor", seed=None, ignore=True):
def __init__(
self, extractor="RandomWordExtractor", top_k=1, seed=None, ignore=True
):

assert extractor in ["RandomWordExtractor"], self.extractor_not_valid_message()

if extractor == "RandomWordExtractor":
self.extractor = basic.RandomImportantWordExtractor()

self.top_k = top_k

self.visually_similar_char_perturb = perturbations.VisuallySimilarCharacterPerturbations(
"unicode", "homoglyph"
)
Expand All @@ -268,7 +297,11 @@ def __init__(self, extractor="RandomWordExtractor", seed=None, ignore=True):
def __call__(self, text):
kwargs = {"seed": self.seed, "ignore": self.ignore}
return self.apply(
text, self.extractor, self.visually_similar_char_perturb, **kwargs
text,
self.extractor,
self.top_k,
self.visually_similar_char_perturb,
**kwargs
)

def __repr__(self):
Expand Down
2 changes: 2 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@ Sphinx==1.8.5
twine==1.14.0
tensorflow==2.2.0
tensorflow-hub==0.8.0
tqdm==4.46.0
torch==1.5.0

setuptools
pytest==5.4.2
Expand Down
34 changes: 21 additions & 13 deletions tests/test_transforms.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,9 @@
)
def test_add_space(text, expected_result):
random.seed(42)
tfms = transforms.AddChar(extractor="RandomWordExtractor", char_perturb=False)
tfms = transforms.AddChar(
extractor="RandomWordExtractor", top_k=1, char_perturb=False
)
assert tfms(text) == expected_result


Expand All @@ -30,7 +32,9 @@ def test_add_space(text, expected_result):
)
def test_add_char(text, expected_result):
random.seed(42)
tfms = transforms.AddChar(extractor="RandomWordExtractor", char_perturb=True)
tfms = transforms.AddChar(
extractor="RandomWordExtractor", top_k=1, char_perturb=True
)
assert tfms(text) == expected_result


Expand All @@ -44,7 +48,7 @@ def test_add_char(text, expected_result):
)
def test_shuffle_char(text, expected_result):
random.seed(42)
tfms = transforms.ShuffleChar(extractor="RandomWordExtractor", mid=False)
tfms = transforms.ShuffleChar(extractor="RandomWordExtractor", top_k=1, mid=False)
assert tfms(text) == expected_result


Expand All @@ -58,7 +62,7 @@ def test_shuffle_char(text, expected_result):
)
def test_shuffle_char(text, expected_result):
random.seed(42)
tfms = transforms.ShuffleChar(extractor="RandomWordExtractor", mid=True)
tfms = transforms.ShuffleChar(extractor="RandomWordExtractor", top_k=1, mid=True)
assert tfms(text) == expected_result


Expand All @@ -72,7 +76,7 @@ def test_shuffle_char(text, expected_result):
)
def test_delete_char(text, expected_result):
random.seed(42)
tfms = transforms.DeleteChar(extractor="RandomWordExtractor")
tfms = transforms.DeleteChar(extractor="RandomWordExtractor", top_k=1)
assert tfms(text) == expected_result


Expand All @@ -86,7 +90,9 @@ def test_delete_char(text, expected_result):
)
def test_typo_char(text, expected_result):
random.seed(42)
tfms = transforms.TypoChar(extractor="RandomWordExtractor", probability=0.3)
tfms = transforms.TypoChar(
extractor="RandomWordExtractor", top_k=1, probability=0.3
)
assert tfms(text) == expected_result


Expand All @@ -100,26 +106,28 @@ def test_typo_char(text, expected_result):
)
def test_visually_similar_char(text, expected_result):
random.seed(42)
tfms = transforms.VisuallySimilarChar(extractor="RandomWordExtractor", seed=None)
tfms = transforms.VisuallySimilarChar(
extractor="RandomWordExtractor", top_k=1, seed=None
)
assert tfms(text) == expected_result


@pytest.mark.parametrize(
"text, expected_result",
[
("Twinkle twinkle little star.", "R winkle tknwlie little 𝒔𝘵𝖆𝘳."),
("Hey, this is so fascinating!", "Y̐ ey, this is so fitcgaasnin!"),
("The earthen pot has cold water.", "The earthen oor has cold 𝚠 ater."),
("Twinkle twinkle little star.", "𝘛 wlnkie tkwnlie little 𝕤𝐭а𝑟."),
("Hey, this is so fascinating!", "𝙃 sy, tihs is so f́n̂s̐i̐t̂n̐a̐ác̒g̐í!́"),
("The earthen pot has cold water.", "The 𝐞𝐚ⲅ𝓽𝚑℮𝐧 𝚙0𝗍 has cold w ater."),
],
)
def test_compose_transforms(text, expected_result):
random.seed(42)
tfms = transforms.Compose(
[
transforms.AddChar(),
transforms.ShuffleChar("RandomWordExtractor", True),
transforms.VisuallySimilarChar(),
transforms.TypoChar("RandomWordExtractor", probability=0.5),
transforms.ShuffleChar("RandomWordExtractor", 2, True),
transforms.VisuallySimilarChar(top_k=2),
transforms.TypoChar("RandomWordExtractor", top_k=1, probability=0.5),
]
)
assert tfms(text) == expected_result