Merge pull request #7 from toriving/develop

Develop
toriving · Sep 26, 2021 · 5dfbb0e · 5dfbb0e
2 parents fe88c2c + 0b1847a
commit 5dfbb0e
Show file tree

Hide file tree

Showing 15 changed files with 254 additions and 59 deletions.
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
@@ -13,7 +13,7 @@ jobs:
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        python-version: ['3.6', '3.7', '3.8', '3.9']
+        python-version: ['3.7', '3.8', '3.9']
 
     steps:
     - uses: actions/checkout@v2

diff --git a/README.md b/README.md
@@ -20,69 +20,141 @@ KoEDA
 <p>Easy Data Augmentation for Korean
 </h3>
 
+This project re-implements Easy Data Augmentation (EDA) and An Easier Data Augmentation (AEDA), which were originally implemented for English, so that they work for Korean.
+
 ## Prerequisites
-- python >= 3.6
+- python >= 3.7
 
 ## Installation
-This repository is tested on Python 3.6 - 3.9.  
+This repository is tested on Python 3.7 - 3.9.  
+
 KoEDA can be installed using pip as follows:
 ```shell script
 $ pip install koeda
 ```
 
 ## Quick Start
-
+- EDA
 ```python
-from koeda import EasyDataAugmentation
+from koeda import EDA
 
 
-EDA = EasyDataAugmentation(
-    morpheme_analyzer=None, alpha_sr=0.3, alpha_ri=0.3, alpha_rs=0.3, prob_rd=0.3
+eda = EDA(
+    morpheme_analyzer="Okt", alpha_sr=0.3, alpha_ri=0.3, alpha_rs=0.3, prob_rd=0.3
 )
 
 text = "아버지가 방에 들어가신다"
 
-result = EDA(text)
+result = eda(text)
 print(result)
 # 아버지가 정실에 들어가신다
+
+result = eda(text, p=(0.9, 0.9, 0.9, 0.9), repetition=2)
+print(result)
+# ['아버지가 객실 아빠 안방 방에 정실 들어가신다', '아버지가 탈의실 방 휴게실 에 안방 탈의실 들어가신다']
 ```
 
+- AEDA
+```python
+from koeda import AEDA
+
+
+aeda = AEDA(
+    morpheme_analyzer="Okt", punc_ratio=0.3, punctuations=[".", ",", "!", "?", ";", ":"]
+)
+
+text = "어머니가 집을 나가신다"
+
+result = aeda(text)
+print(result)
+# 어머니가 ! 집을 , 나가신다
+
+result = aeda(text, p=0.9, repetition=2)
+print(result)
+# ['! 어머니가 ! 집 ; 을 ? 나가신다', '. 어머니 ? 가 . 집 , 을 , 나가신다']
+```
 ## Augmenters
 - EasyDataAugmentation (EDA)
+- AEasierDataAugmentation (AEDA)
 - RandomDeletion (RD)
 - RandomInsertion (RI)
 - SynonymReplacement (SR)
 - RandomSwap (RS)
 
+There are two ways to import an augmenter.
+
+The first is to use the full name.
+```python
+from koeda import EasyDataAugmentation
+```
+The second is to use the abbreviation.
+```python
+from koeda import EDA
+```
 
 ## Usage
-- EDA class
+- EDA
 ```python
-EDA = EasyDataAugmentation(
-    morpheme_analyzer: str = None,
-    alpha_sr: float = 0.1,
-    alpha_ri: float = 0.1,
-    alpha_rs: float = 0.1,
-    prob_rd: float = 0.1,
-):
-
-text = "아버지가방에들어가신다"
-
-# EDA(data: Union[List[str], str], p: List[float] = None, repetition: int = 1)
-result = EDA(data=text, p=None, repetition=1)
+augmenter = EDA(
+              morpheme_analyzer: str = None,  # Default = "Okt"
+              alpha_sr: float = 0.1,
+              alpha_ri: float = 0.1,
+              alpha_rs: float = 0.1,
+              prob_rd: float = 0.1
+            )
+
+result = augmenter(
+            data: Union[List[str], str], 
+            p: List[float] = None,  # Default = (0.1, 0.1, 0.1, 0.1)
+            repetition: int = 1
+          )
 ```
 
-- The others (RD, RI, SR, RS)
+- AEDA
 ```python
-augmenter = Augmenter(morpheme_analyzer: str = None, stopword: bool = False)
-
-text = "아버지가방에들어가신다"
+augmenter = AEDA(
+              morpheme_analyzer: str = None,  # Default = "Okt"
+              punc_ratio: float = 0.3,
+              punctuations: List[str] = None  # Default = ('.', ',', '!', '?', ';', ':')
+            )
+
+result = augmenter(
+            data: Union[List[str], str], 
+            p: float = None,  # Default = 0.3 
+            repetition: int = 1
+          )
+```
 
-# augmenter(data: Union[List[str], str], p: float = 0.1, repetition: int = 1)
-result = augmenter(data=text, p=0.5, repetiion=1)
+- The others (RD, RI, SR, RS)
+```python
+augmenter = RD(
+              morpheme_analyzer: str = None, 
+            )
+
+augmenter = RI(
+              morpheme_analyzer: str = None, 
+              stopword: bool = False
+            )
+
+augmenter = SR(
+              morpheme_analyzer: str = None, 
+              stopword: bool = False
+            )
+
+augmenter = RS(
+              morpheme_analyzer: str = None, 
+            )
+
+result = augmenter(
+            data: Union[List[str], str], 
+            p: float = 0.1,
+            repetition: int = 1
+          )
 ```
 
 ## Reference
 [Easy Data Augmentation Paper](https://www.aclweb.org/anthology/D19-1670.pdf)  
 [Easy Data Augmentation Repository](https://github.com/jasonwei20/eda_nlp)  
+[An Easier Data Augmentation Paper](https://arxiv.org/pdf/2108.13230.pdf)  
+[An Easier Data Augmentation Repository](https://github.com/akkarimi/aeda_nlp)  
 [Korean WordNet](http://wordnet.kaist.ac.kr/)
diff --git a/requirements.txt b/requirements.txt
@@ -1,2 +1,3 @@
-numpy==1.19.4
-konlpy==0.5.2
+numpy>=1.19.4
+konlpy>=0.5.2
+tweepy==3.10.0
diff --git a/setup.py b/setup.py
@@ -5,7 +5,7 @@
 
 setup(
     name="koeda",
-    version="0.0.3",
+    version="0.0.4",
     description="Korean Easy Data Augmentation Package",
     long_description=open("README.md", "r", encoding="utf-8").read(),
     long_description_content_type="text/markdown",
@@ -17,7 +17,7 @@
     install_requires=requirements,
     keywords=["NLP deep learning koeda korean easy data augmentation"],
     license="MIT",
-    python_requires=">=3.6.0",
+    python_requires=">=3.7.0",
     include_package_data=True,
     zip_safe=False,
     classifiers=[
@@ -28,7 +28,6 @@
         "License :: OSI Approved :: MIT License",
         "Operating System :: OS Independent",
         "Programming Language :: Python :: 3",
-        "Programming Language :: Python :: 3.6",
         "Programming Language :: Python :: 3.7",
         "Programming Language :: Python :: 3.8",
         "Programming Language :: Python :: 3.9",

diff --git a/src/koeda/__init__.py b/src/koeda/__init__.py
@@ -1,5 +1,5 @@
 __title__ = "KoEDA"
-__version__ = "0.0.3"
+__version__ = "0.0.4"
 
 __author__ = "Dongju Park"
 __email__ = "toriving@gmail.com"
@@ -11,7 +11,13 @@
 
 
 from .eda import EasyDataAugmentation
+from .aeda import AEasierDataAugmentation
+from .augmenters import RandomDeletion, RandomInsertion, \
+    SynonymReplacement, RandomSwap
 
-from .augmenters import RandomDeletion, RandomInsertion, SynonymReplacement, RandomSwap
-
-from .utils import STOPWORD, WORDNET, get_synonyms
+from .aeda import AEasierDataAugmentation as AEDA
+from .eda import EasyDataAugmentation as EDA
+from .augmenters import RandomDeletion as RD
+from .augmenters import RandomInsertion as RI
+from .augmenters import SynonymReplacement as SR
+from .augmenters import RandomSwap as RS
diff --git a/src/koeda/aeda.py b/src/koeda/aeda.py
@@ -0,0 +1,94 @@
+import random
+from typing import Union, List
+from itertools import repeat, chain
+
+from konlpy.tag import *
+
+from .utils import replace_space, revert_space, SPACE_TOKEN
+
+
+class AEasierDataAugmentation:
+    def __init__(
+        self,
+        morpheme_analyzer: str = None,
+        punc_ratio: float = 0.3,
+        punctuations: List[str] = None
+    ):
+        if punctuations is None or not isinstance(punctuations, list):
+            self.punctuations = ('.', ',', '!', '?', ';', ':')
+        else:
+            self.punctuations = punctuations
+
+        if morpheme_analyzer is None:
+            self.morpheme_analyzer = Okt()
+        elif morpheme_analyzer in ["Okt", "Kkma", "Komoran", "Mecab", "Hannanum"]:
+            self.morpheme_analyzer = eval(morpheme_analyzer)()
+        elif hasattr(morpheme_analyzer, "morphs"):
+            self.morpheme_analyzer = morpheme_analyzer
+        else:
+            raise ValueError(f'Does not support {morpheme_analyzer} morpheme analyzer. '
+                             f'Choose one of ["Okt", "Kkma", "Komoran", "Mecab", "Hannanum"]')
+
+        self.ratio = punc_ratio
+
+    def __call__(self, *args, **kwargs):
+        return self.aeda(*args, **kwargs)
+
+    def aeda(
+            self, data: Union[List[str], str], p: float = None, repetition: int = 1
+    ) -> Union[List[str], str]:
+        if isinstance(data, str):
+            if repetition <= 1:
+                return self._aeda(data, p)
+            else:
+                return list(
+                    map(self._aeda, repeat(data, repetition), repeat(p, repetition))
+                )
+        elif isinstance(data, list):
+            if repetition <= 1:
+                return list(map(self._aeda, data, repeat(p, len(data))))
+            else:
+                return list(
+                    map(
+                        self._aeda,
+                        chain.from_iterable(repeat(x, repetition) for x in data),
+                        repeat(p, len(data) * repetition),
+                    )
+                )
+        else:
+            raise TypeError(f"Does not support the data type : {type(data)}")
+
+    def _aeda(self, data: str, p: float) -> str:
+        if p is None:
+            p = self.ratio
+
+        split_words = self.morpheme_analyzer.morphs(replace_space(data))
+        words = self.morpheme_analyzer.morphs(data)
+
+        new_words = []
+        q = random.randint(1, int(p * len(words) + 1))
+        qs = random.sample(range(0, len(split_words)), q)
+
+        while self.check_special_selection(split_words, qs):
+            qs = random.sample(range(0, len(split_words)), q)
+
+        for j, word in enumerate(split_words):
+            if j in qs:
+                new_words.append(SPACE_TOKEN)
+                new_words.append(
+                    self.punctuations[random.randint(0, len(self.punctuations) - 1)])
+                new_words.append(SPACE_TOKEN)
+                new_words.append(word)
+            else:
+                new_words.append(word)
+
+        augmented_sentences = revert_space(new_words)
+
+        return augmented_sentences
+
+    @staticmethod
+    def check_special_selection(split_words: list, qs: list) -> bool:
+        for i in qs:
+            if split_words[i] == SPACE_TOKEN:
+                return True
+        return False
diff --git a/src/koeda/augmenters/deletion.py b/src/koeda/augmenters/deletion.py
@@ -4,7 +4,7 @@
 
 from konlpy.tag import *
 
-from koeda.utils import replace_space, revert_space, SPACE_TOKEN
+from ..utils import replace_space, revert_space, SPACE_TOKEN
 
 
 class RandomDeletion:
@@ -16,7 +16,8 @@ def __init__(self, morpheme_analyzer: str = None):
         elif hasattr(morpheme_analyzer, "morphs"):
             self.morpheme_analyzer = morpheme_analyzer
         else:
-            raise Exception("Does not support morpheme analyzer.")
+            raise ValueError(f'Does not support {morpheme_analyzer} morpheme analyzer. '
+                             f'Choose one of ["Okt", "Kkma", "Komoran", "Mecab", "Hannanum"]')
 
     def __call__(self, *args, **kwargs):
         return self.random_deletion(*args, **kwargs)
@@ -43,7 +44,7 @@ def random_deletion(
                     )
                 )
         else:
-            raise Exception(f"Does not support the data type : {type(data)}")
+            raise TypeError(f"Does not support the data type : {type(data)}")
 
     def _deletion(self, data: str, p: float = 0.1) -> str:
         split_words = self.morpheme_analyzer.morphs(replace_space(data))

diff --git a/src/koeda/augmenters/insertion.py b/src/koeda/augmenters/insertion.py
@@ -4,7 +4,7 @@
 
 from konlpy.tag import *
 
-from koeda.utils import replace_space, revert_space, get_synonyms, STOPWORD, SPACE_TOKEN
+from ..utils import replace_space, revert_space, get_synonyms, STOPWORD, SPACE_TOKEN
 
 
 class RandomInsertion:
@@ -18,7 +18,8 @@ def __init__(self, morpheme_analyzer: str = None, stopword: bool = False):
         elif hasattr(morpheme_analyzer, "morphs"):
             self.morpheme_analyzer = morpheme_analyzer
         else:
-            raise Exception("Does not support morpheme analyzer.")
+            raise ValueError(f'Does not support {morpheme_analyzer} morpheme analyzer. '
+                             f'Choose one of ["Okt", "Kkma", "Komoran", "Mecab", "Hannanum"]')
 
     def __call__(self, *args, **kwargs):
         return self.random_insertion(*args, **kwargs)
@@ -47,7 +48,7 @@ def random_insertion(
                     )
                 )
         else:
-            raise Exception(f"Does not support the data type : {type(data)}")
+            raise TypeError(f"Does not support the data type : {type(data)}")
 
     def _insertion(self, data: str, p: float = 0.1) -> str:
         split_words = self.morpheme_analyzer.morphs(replace_space(data))