SforAiDl · abheesht17 · May 14, 2020 · May 14, 2020 · May 20, 2020 · May 20, 2020
diff --git a/decepticonlp/transforms/transforms.py b/decepticonlp/transforms/transforms.py
@@ -27,9 +27,9 @@ class Transforms(object):
     def extractor_not_valid_message(self):
         return "Extractor chosen invalid. Please choose from " + str(extractor_list)
 
-    def apply(self, text, extractor, perturb_type, **kwargs):
+    def apply(self, text, extractor, top_k, perturb_type, **kwargs):
         words = text.split(" ")
-        indices = extractor.extract(words)
+        indices = extractor.extract(words=words, top_k=top_k)
 
         for index in indices:
             words[index] = perturb_type.apply(words[index], **kwargs)
@@ -82,6 +82,7 @@ class AddChar(Transforms):
 		Args:
 		extractor: str (default: "RandomWordCharacter")
 			-One of ["RandomWordExtractor"]
+		top_k: int (default: 1) -Number of words to be extracted for perturbation.
 		char_perturb: boolean (default: False)
 			-If True, add space in word randomly.
 			-If False, add character in word randomly.
@@ -97,21 +98,25 @@ class AddChar(Transforms):
 	"""
 
     def __init__(
-        self, extractor="RandomWordExtractor", char_perturb=False, ignore=True
+        self, extractor="RandomWordExtractor", top_k=1, char_perturb=False, ignore=True
     ):
 
         assert extractor in ["RandomWordExtractor"], self.extractor_not_valid_message()
 
         if extractor == "RandomWordExtractor":
             self.extractor = basic.RandomImportantWordExtractor()
 
+        self.top_k = top_k
+
         self.char_perturb = char_perturb
         self.space_char_perturb = perturbations.InsertSpaceCharacterPerturbations()
         self.ignore = ignore
 
     def __call__(self, text):
         kwargs = {"char_perturb": self.char_perturb, "ignore": self.ignore}
-        return self.apply(text, self.extractor, self.space_char_perturb, **kwargs)
+        return self.apply(
+            text, self.extractor, self.top_k, self.space_char_perturb, **kwargs
+        )
 
     def __repr__(self):
         return self.__class__.__name__ + "()"
@@ -124,6 +129,7 @@ class ShuffleChar(Transforms):
 		Args:
 		extractor: str (default: "RandomWordExtractor")
 			-One of ["RandomWordExtractor"]
+		top_k: int (default: 1) -Number of words to be extracted for perturbation.
 		mid: boolean (default: False)
 			-if True, shuffles the characters of a word at random, barring the initial and last character
             -if False, swaps any two characters of a word at random, barring the initial and last character
@@ -138,20 +144,26 @@ class ShuffleChar(Transforms):
 		This is fascinatign!
 	"""
 
-    def __init__(self, extractor="RandomWordExtractor", mid=False, ignore=True):
+    def __init__(
+        self, extractor="RandomWordExtractor", top_k=1, mid=False, ignore=True
+    ):
 
         assert extractor in ["RandomWordExtractor"], self.extractor_not_valid_message()
 
         if extractor == "RandomWordExtractor":
             self.extractor = basic.RandomImportantWordExtractor()
 
+        self.top_k = top_k
+
         self.shuffle_char_perturb = perturbations.ShuffleCharacterPerturbations()
         self.mid = mid
         self.ignore = ignore
 
     def __call__(self, text):
         kwargs = {"mid": self.mid, "ignore": self.ignore}
-        return self.apply(text, self.extractor, self.shuffle_char_perturb, **kwargs)
+        return self.apply(
+            text, self.extractor, self.top_k, self.shuffle_char_perturb, **kwargs
+        )
 
     def __repr__(self):
         return self.__class__.__name__ + "()"
@@ -164,6 +176,7 @@ class DeleteChar(Transforms):
 		Args:
 		extractor: str (default: "RandomWordExtractor")
 			-One of ["RandomWordExtractor"]
+		top_k: int (default: 1) -Number of words to be extracted for perturbation.
 		ignore: boolean (default: True)
 			-If True, ignore assertion errors (recommended).
 			-If False, do not ignore assertion errors.
@@ -175,19 +188,23 @@ class DeleteChar(Transforms):
 		This is fascinting!
 	"""
 
-    def __init__(self, extractor="RandomWordExtractor", ignore=True):
+    def __init__(self, extractor="RandomWordExtractor", top_k=1, ignore=True):
 
         assert extractor in ["RandomWordExtractor"], self.extractor_not_valid_message()
 
         if extractor == "RandomWordExtractor":
             self.extractor = basic.RandomImportantWordExtractor()
 
+        self.top_k = top_k
+
         self.delete_char_perturb = perturbations.DeleteCharacterPerturbations()
         self.ignore = ignore
 
     def __call__(self, text):
         kwargs = {"ignore": self.ignore}
-        return self.apply(text, self.extractor, self.delete_char_perturb, **kwargs)
+        return self.apply(
+            text, self.extractor, self.top_k, self.delete_char_perturb, **kwargs
+        )
 
     def __repr__(self):
         return self.__class__.__name__ + "()"
@@ -200,6 +217,7 @@ class TypoChar(Transforms):
 		Args:
 		extractor: str (default: "RandomWordExtractor")
 			-One of ["RandomWordExtractor"]
+		top_k: int (default: 1) -Number of words to be extracted for perturbation.
 		probability: float in range [0,1] (default: 0.1)
 			-probability*100 percent characters in the word will become typos.
 		ignore: boolean (default: True)
@@ -213,20 +231,26 @@ class TypoChar(Transforms):
 		This us fascinating!
 	"""
 
-    def __init__(self, extractor="RandomWordExtractor", probability=0.1, ignore=True):
+    def __init__(
+        self, extractor="RandomWordExtractor", top_k=1, probability=0.1, ignore=True
+    ):
 
         assert extractor in ["RandomWordExtractor"], self.extractor_not_valid_message()
 
         if extractor == "RandomWordExtractor":
             self.extractor = basic.RandomImportantWordExtractor()
 
+        self.top_k = top_k
+
         self.typo_char_perturb = perturbations.TypoCharacterPerturbations()
         self.probability = probability
         self.ignore = ignore
 
     def __call__(self, text):
         kwargs = {"probability": self.probability, "ignore": self.ignore}
-        return self.apply(text, self.extractor, self.typo_char_perturb, **kwargs)
+        return self.apply(
+            text, self.extractor, self.top_k, self.typo_char_perturb, **kwargs
+        )
 
     def __repr__(self):
         return self.__class__.__name__ + "()"
@@ -239,6 +263,7 @@ class VisuallySimilarChar(Transforms):
 		Args:
 		extractor: str (default: "RandomWordExtractor")
 			-One of ["RandomWordExtractor"]
+		top_k: int (default: 1) -Number of words to be extracted for perturbation.
 		seed: int (default: None)
 			-seed for random
 		ignore: boolean (default: True)
@@ -252,13 +277,17 @@ class VisuallySimilarChar(Transforms):
 		T̕h̒i̕s̒ is fascinating!
 	"""
 
-    def __init__(self, extractor="RandomWordExtractor", seed=None, ignore=True):
+    def __init__(
+        self, extractor="RandomWordExtractor", top_k=1, seed=None, ignore=True
+    ):
 
         assert extractor in ["RandomWordExtractor"], self.extractor_not_valid_message()
 
         if extractor == "RandomWordExtractor":
             self.extractor = basic.RandomImportantWordExtractor()
 
+        self.top_k = top_k
+
         self.visually_similar_char_perturb = perturbations.VisuallySimilarCharacterPerturbations(
             "unicode", "homoglyph"
         )
@@ -268,7 +297,11 @@ def __init__(self, extractor="RandomWordExtractor", seed=None, ignore=True):
     def __call__(self, text):
         kwargs = {"seed": self.seed, "ignore": self.ignore}
         return self.apply(
-            text, self.extractor, self.visually_similar_char_perturb, **kwargs
+            text,
+            self.extractor,
+            self.top_k,
+            self.visually_similar_char_perturb,
+            **kwargs
         )
 
     def __repr__(self):

diff --git a/requirements.txt b/requirements.txt
@@ -9,6 +9,8 @@ Sphinx==1.8.5
 twine==1.14.0
 tensorflow==2.2.0
 tensorflow-hub==0.8.0
+tqdm==4.46.0
+torch==1.5.0
 
 setuptools
 pytest==5.4.2

diff --git a/tests/test_transforms.py b/tests/test_transforms.py
@@ -16,7 +16,9 @@
 )
 def test_add_space(text, expected_result):
     random.seed(42)
-    tfms = transforms.AddChar(extractor="RandomWordExtractor", char_perturb=False)
+    tfms = transforms.AddChar(
+        extractor="RandomWordExtractor", top_k=1, char_perturb=False
+    )
     assert tfms(text) == expected_result
 
 
@@ -30,7 +32,9 @@ def test_add_space(text, expected_result):
 )
 def test_add_char(text, expected_result):
     random.seed(42)
-    tfms = transforms.AddChar(extractor="RandomWordExtractor", char_perturb=True)
+    tfms = transforms.AddChar(
+        extractor="RandomWordExtractor", top_k=1, char_perturb=True
+    )
     assert tfms(text) == expected_result
 
 
@@ -44,7 +48,7 @@ def test_add_char(text, expected_result):
 )
 def test_shuffle_char(text, expected_result):
     random.seed(42)
-    tfms = transforms.ShuffleChar(extractor="RandomWordExtractor", mid=False)
+    tfms = transforms.ShuffleChar(extractor="RandomWordExtractor", top_k=1, mid=False)
     assert tfms(text) == expected_result
 
 
@@ -58,7 +62,7 @@ def test_shuffle_char(text, expected_result):
 )
 def test_shuffle_char(text, expected_result):
     random.seed(42)
-    tfms = transforms.ShuffleChar(extractor="RandomWordExtractor", mid=True)
+    tfms = transforms.ShuffleChar(extractor="RandomWordExtractor", top_k=1, mid=True)
     assert tfms(text) == expected_result
 
 
@@ -72,7 +76,7 @@ def test_shuffle_char(text, expected_result):
 )
 def test_delete_char(text, expected_result):
     random.seed(42)
-    tfms = transforms.DeleteChar(extractor="RandomWordExtractor")
+    tfms = transforms.DeleteChar(extractor="RandomWordExtractor", top_k=1)
     assert tfms(text) == expected_result
 
 
@@ -86,7 +90,9 @@ def test_delete_char(text, expected_result):
 )
 def test_typo_char(text, expected_result):
     random.seed(42)
-    tfms = transforms.TypoChar(extractor="RandomWordExtractor", probability=0.3)
+    tfms = transforms.TypoChar(
+        extractor="RandomWordExtractor", top_k=1, probability=0.3
+    )
     assert tfms(text) == expected_result
 
 
@@ -100,26 +106,28 @@ def test_typo_char(text, expected_result):
 )
 def test_visually_similar_char(text, expected_result):
     random.seed(42)
-    tfms = transforms.VisuallySimilarChar(extractor="RandomWordExtractor", seed=None)
+    tfms = transforms.VisuallySimilarChar(
+        extractor="RandomWordExtractor", top_k=1, seed=None
+    )
     assert tfms(text) == expected_result
 
 
 @pytest.mark.parametrize(
     "text, expected_result",
     [
-        ("Twinkle twinkle little star.", "R winkle tknwlie little 𝒔𝘵𝖆𝘳."),
-        ("Hey, this is so fascinating!", "Y̐ ey, this is so fitcgaasnin!"),
-        ("The earthen pot has cold water.", "The earthen oor has cold 𝚠 ater."),
+        ("Twinkle twinkle little star.", "𝘛 wlnkie tkwnlie little 𝕤𝐭а𝑟."),
+        ("Hey, this is so fascinating!", "𝙃 sy, tihs is so f́n̂s̐i̐t̂n̐a̐ác̒g̐í!́"),
+        ("The earthen pot has cold water.", "The 𝐞𝐚ⲅ𝓽𝚑℮𝐧 𝚙0𝗍 has cold w ater."),
     ],
 )
 def test_compose_transforms(text, expected_result):
     random.seed(42)
     tfms = transforms.Compose(
         [
             transforms.AddChar(),
-            transforms.ShuffleChar("RandomWordExtractor", True),
-            transforms.VisuallySimilarChar(),
-            transforms.TypoChar("RandomWordExtractor", probability=0.5),
+            transforms.ShuffleChar("RandomWordExtractor", 2, True),
+            transforms.VisuallySimilarChar(top_k=2),
+            transforms.TypoChar("RandomWordExtractor", top_k=1, probability=0.5),
         ]
     )
     assert tfms(text) == expected_result