From dc5655aee1ab5c48297e188d73afc1370ee715eb Mon Sep 17 00:00:00 2001 From: toriving Date: Sun, 26 Sep 2021 20:00:52 +0900 Subject: [PATCH 1/7] Implement a easier data augmentation --- src/koeda/__init__.py | 14 ++++-- src/koeda/aeda.py | 94 +++++++++++++++++++++++++++++++++++++ src/koeda/utils/__init__.py | 3 +- src/koeda/utils/space.py | 1 - tests/conftest.py | 12 ++++- tests/test_aeda.py | 10 ++++ 6 files changed, 127 insertions(+), 7 deletions(-) create mode 100644 src/koeda/aeda.py create mode 100644 tests/test_aeda.py diff --git a/src/koeda/__init__.py b/src/koeda/__init__.py index 3885373..4b7320a 100644 --- a/src/koeda/__init__.py +++ b/src/koeda/__init__.py @@ -1,5 +1,5 @@ __title__ = "KoEDA" -__version__ = "0.0.3" +__version__ = "0.0.4" __author__ = "Dongju Park" __email__ = "toriving@gmail.com" @@ -11,7 +11,13 @@ from .eda import EasyDataAugmentation +from .aeda import AEasierDataAugmentation +from .augmenters import RandomDeletion, RandomInsertion, \ + SynonymReplacement, RandomSwap -from .augmenters import RandomDeletion, RandomInsertion, SynonymReplacement, RandomSwap - -from .utils import STOPWORD, WORDNET, get_synonyms +from .aeda import AEasierDataAugmentation as AEDA +from .eda import EasyDataAugmentation as EDA +from .augmenters import RandomDeletion as RD +from .augmenters import RandomInsertion as RI +from .augmenters import SynonymReplacement as SR +from .augmenters import RandomSwap as RS diff --git a/src/koeda/aeda.py b/src/koeda/aeda.py new file mode 100644 index 0000000..8bd7c65 --- /dev/null +++ b/src/koeda/aeda.py @@ -0,0 +1,94 @@ +import random +from typing import Union, List +from itertools import repeat, chain + +from konlpy.tag import * + +from .utils import replace_space, revert_space, SPACE_TOKEN + + +class AEasierDataAugmentation: + def __init__( + self, + morpheme_analyzer: str = None, + punc_ratio: float = 0.3, + punctuations: List[str] = None + ): + if punctuations is None or not isinstance(punctuations, list): + 
self.punctuations = ['.', ',', '!', '?', ';', ':'] + else: + self.punctuations = punctuations + + if morpheme_analyzer is None: + self.morpheme_analyzer = Okt() + elif morpheme_analyzer in ["Okt", "Kkma", "Komoran", "Mecab", "Hannanum"]: + self.morpheme_analyzer = eval(morpheme_analyzer)() + elif hasattr(morpheme_analyzer, "morphs"): + self.morpheme_analyzer = morpheme_analyzer + else: + raise ValueError(f'Does not support {morpheme_analyzer} morpheme analyzer. ' + f'Choose one of ["Okt", "Kkma", "Komoran", "Mecab", "Hannanum"]') + + self.ratio = punc_ratio + + def __call__(self, *args, **kwargs): + return self.aeda(*args, **kwargs) + + def aeda( + self, data: Union[List[str], str], p: float = None, repetition: int = 1 + ) -> Union[List[str], str]: + if isinstance(data, str): + if repetition <= 1: + return self._aeda(data, p) + else: + return list( + map(self._aeda, repeat(data, repetition), repeat(p, repetition)) + ) + elif isinstance(data, list): + if repetition <= 1: + return list(map(self._aeda, data, repeat(p, len(data)))) + else: + return list( + map( + self._aeda, + chain.from_iterable(repeat(x, repetition) for x in data), + repeat(p, len(data) * repetition), + ) + ) + else: + raise TypeError(f"Does not support the data type : {type(data)}") + + def _aeda(self, data: str, p: float) -> str: + if p is None: + p = self.ratio + + split_words = self.morpheme_analyzer.morphs(replace_space(data)) + words = self.morpheme_analyzer.morphs(data) + + new_words = [] + q = random.randint(1, int(p * len(words) + 1)) + qs = random.sample(range(0, len(split_words)), q) + + while self.check_special_selection(split_words, qs): + qs = random.sample(range(0, len(split_words)), q) + + for j, word in enumerate(split_words): + if j in qs: + new_words.append(SPACE_TOKEN) + new_words.append( + self.punctuations[random.randint(0, len(self.punctuations) - 1)]) + new_words.append(SPACE_TOKEN) + new_words.append(word) + else: + new_words.append(word) + + augmented_sentences = 
revert_space(new_words) + + return augmented_sentences + + @staticmethod + def check_special_selection(split_words: list, qs: list) -> bool: + for i in qs: + if split_words[i] == SPACE_TOKEN: + return True + return False diff --git a/src/koeda/utils/__init__.py b/src/koeda/utils/__init__.py index 357cad0..caa630e 100644 --- a/src/koeda/utils/__init__.py +++ b/src/koeda/utils/__init__.py @@ -1,4 +1,5 @@ -__all__ = ["WORDNET", "STOPWORD", "get_synonyms"] +__all__ = ["WORDNET", "STOPWORD", "get_synonyms", "replace_space", + "revert_space", "SPACE_TOKEN"] from .wordnet import * from .stopwords import * diff --git a/src/koeda/utils/space.py b/src/koeda/utils/space.py index 9c922e0..38d866e 100644 --- a/src/koeda/utils/space.py +++ b/src/koeda/utils/space.py @@ -8,7 +8,6 @@ def replace_space(text: str) -> str: def revert_space(text: list) -> str: clean = ( " ".join("".join(text).replace(SPACE_TOKEN, " ").split()) - .replace(" .", ".") .strip() ) return clean diff --git a/tests/conftest.py b/tests/conftest.py index aa3944d..887028f 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,6 +1,7 @@ import pytest from koeda import EasyDataAugmentation +from koeda import AEasierDataAugmentation from koeda import RandomDeletion from koeda import RandomInsertion from koeda import SynonymReplacement @@ -13,6 +14,12 @@ def EDA(): return EDA +@pytest.fixture +def AEDA(): + AEDA = AEasierDataAugmentation() + return AEDA + + @pytest.fixture def RD(): RD = RandomDeletion() @@ -45,5 +52,8 @@ def str_data(): @pytest.fixture def list_data(): - list_data = ["아버지가 방에 들어가신다.", "어머니가 집을 나가신다."] + list_data = ["아버지가 방에 들어가신다.", + "어머니가 집을 나가신다.", + "아버지가방에들어가신다 .", + "어머니가집을나가신다 ."] return list_data diff --git a/tests/test_aeda.py b/tests/test_aeda.py new file mode 100644 index 0000000..82d5812 --- /dev/null +++ b/tests/test_aeda.py @@ -0,0 +1,10 @@ +def test_str(AEDA, str_data): + assert AEDA(str_data, 0.8) != str_data + assert type(AEDA(str_data, 0.8)) is str + assert 
len(AEDA(str_data, 0.8, repetition=3)) == 3 + + +def test_list(AEDA, list_data): + assert AEDA(list_data, 0.8) != list_data + assert type(AEDA(list_data, 0.8)) is list + assert len(AEDA(list_data, 0.8, repetition=3)) == len(list_data) * 3 From 5afe196189edecb87c7e227f2697ad46160841cf Mon Sep 17 00:00:00 2001 From: toriving Date: Sun, 26 Sep 2021 20:01:35 +0900 Subject: [PATCH 2/7] Update misc --- requirements.txt | 4 ++-- setup.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/requirements.txt b/requirements.txt index 6267328..371d85e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,2 +1,2 @@ -numpy==1.19.4 -konlpy==0.5.2 \ No newline at end of file +numpy>=1.19.4 +konlpy>=0.5.2 diff --git a/setup.py b/setup.py index cc95ddc..5196d5d 100644 --- a/setup.py +++ b/setup.py @@ -5,7 +5,7 @@ setup( name="koeda", - version="0.0.3", + version="0.0.4", description="Korean Easy Data Augmentation Package", long_description=open("README.md", "r", encoding="utf-8").read(), long_description_content_type="text/markdown", From f95a6ce72199b67bbcb760a7bc14b1647f9cb134 Mon Sep 17 00:00:00 2001 From: toriving Date: Sun, 26 Sep 2021 20:01:56 +0900 Subject: [PATCH 3/7] Update error type and error check --- src/koeda/augmenters/deletion.py | 7 ++++--- src/koeda/augmenters/insertion.py | 7 ++++--- src/koeda/augmenters/replacement.py | 7 ++++--- src/koeda/augmenters/swap.py | 7 ++++--- src/koeda/eda.py | 15 +++++++-------- 5 files changed, 23 insertions(+), 20 deletions(-) diff --git a/src/koeda/augmenters/deletion.py b/src/koeda/augmenters/deletion.py index c2069c7..8c9fa0f 100644 --- a/src/koeda/augmenters/deletion.py +++ b/src/koeda/augmenters/deletion.py @@ -4,7 +4,7 @@ from konlpy.tag import * -from koeda.utils import replace_space, revert_space, SPACE_TOKEN +from ..utils import replace_space, revert_space, SPACE_TOKEN class RandomDeletion: @@ -16,7 +16,8 @@ def __init__(self, morpheme_analyzer: str = None): elif hasattr(morpheme_analyzer, "morphs"): 
self.morpheme_analyzer = morpheme_analyzer else: - raise Exception("Does not support morpheme analyzer.") + raise ValueError(f'Does not support {morpheme_analyzer} morpheme analyzer. ' + f'Choose one of ["Okt", "Kkma", "Komoran", "Mecab", "Hannanum"]') def __call__(self, *args, **kwargs): return self.random_deletion(*args, **kwargs) @@ -43,7 +44,7 @@ def random_deletion( ) ) else: - raise Exception(f"Does not support the data type : {type(data)}") + raise TypeError(f"Does not support the data type : {type(data)}") def _deletion(self, data: str, p: float = 0.1) -> str: split_words = self.morpheme_analyzer.morphs(replace_space(data)) diff --git a/src/koeda/augmenters/insertion.py b/src/koeda/augmenters/insertion.py index 3cd5df0..e9ed76f 100644 --- a/src/koeda/augmenters/insertion.py +++ b/src/koeda/augmenters/insertion.py @@ -4,7 +4,7 @@ from konlpy.tag import * -from koeda.utils import replace_space, revert_space, get_synonyms, STOPWORD, SPACE_TOKEN +from ..utils import replace_space, revert_space, get_synonyms, STOPWORD, SPACE_TOKEN class RandomInsertion: @@ -18,7 +18,8 @@ def __init__(self, morpheme_analyzer: str = None, stopword: bool = False): elif hasattr(morpheme_analyzer, "morphs"): self.morpheme_analyzer = morpheme_analyzer else: - raise Exception("Does not support morpheme analyzer.") + raise ValueError(f'Does not support {morpheme_analyzer} morpheme analyzer. 
' + f'Choose one of ["Okt", "Kkma", "Komoran", "Mecab", "Hannanum"]') def __call__(self, *args, **kwargs): return self.random_insertion(*args, **kwargs) @@ -47,7 +48,7 @@ def random_insertion( ) ) else: - raise Exception(f"Does not support the data type : {type(data)}") + raise TypeError(f"Does not support the data type : {type(data)}") def _insertion(self, data: str, p: float = 0.1) -> str: split_words = self.morpheme_analyzer.morphs(replace_space(data)) diff --git a/src/koeda/augmenters/replacement.py b/src/koeda/augmenters/replacement.py index a21cf6a..20de17d 100644 --- a/src/koeda/augmenters/replacement.py +++ b/src/koeda/augmenters/replacement.py @@ -4,7 +4,7 @@ from konlpy.tag import * -from koeda.utils import replace_space, revert_space, get_synonyms, STOPWORD +from ..utils import replace_space, revert_space, get_synonyms, STOPWORD class SynonymReplacement: @@ -18,7 +18,8 @@ def __init__(self, morpheme_analyzer: str = None, stopword: bool = False): elif hasattr(morpheme_analyzer, "morphs"): self.morpheme_analyzer = morpheme_analyzer else: - raise Exception("Does not support morpheme analyzer.") + raise ValueError(f'Does not support {morpheme_analyzer} morpheme analyzer. 
' + f'Choose one of ["Okt", "Kkma", "Komoran", "Mecab", "Hannanum"]') def __call__(self, *args, **kwargs): return self.synonym_replacement(*args, **kwargs) @@ -49,7 +50,7 @@ def synonym_replacement( ) ) else: - raise Exception(f"Does not support the data type : {type(data)}") + raise TypeError(f"Does not support the data type : {type(data)}") def _replacement(self, data: str, p: float = 0.1) -> str: split_words = self.morpheme_analyzer.morphs(replace_space(data)) diff --git a/src/koeda/augmenters/swap.py b/src/koeda/augmenters/swap.py index 1edbbea..8bc4667 100644 --- a/src/koeda/augmenters/swap.py +++ b/src/koeda/augmenters/swap.py @@ -4,7 +4,7 @@ from konlpy.tag import * -from koeda.utils import replace_space, revert_space +from ..utils import replace_space, revert_space class RandomSwap: @@ -16,7 +16,8 @@ def __init__(self, morpheme_analyzer: str = None): elif hasattr(morpheme_analyzer, "morphs"): self.morpheme_analyzer = morpheme_analyzer else: - raise Exception("Does not support morpheme analyzer.") + raise ValueError(f'Does not support {morpheme_analyzer} morpheme analyzer. ' + f'Choose one of ["Okt", "Kkma", "Komoran", "Mecab", "Hannanum"]') def __call__(self, *args, **kwargs): return self.random_swap(*args, **kwargs) @@ -43,7 +44,7 @@ def random_swap( ) ) else: - raise Exception(f"Does not support the data type : {type(data)}") + raise TypeError(f"Does not support the data type : {type(data)}") def _swap(self, data: str, p: float = 0.1) -> str: split_words = self.morpheme_analyzer.morphs(replace_space(data)) diff --git a/src/koeda/eda.py b/src/koeda/eda.py index 46d121e..d1bad5e 100644 --- a/src/koeda/eda.py +++ b/src/koeda/eda.py @@ -23,7 +23,8 @@ def __init__( elif hasattr(morpheme_analyzer, "morphs"): self.morpheme_analyzer = morpheme_analyzer else: - raise Exception("Does not support morpheme analyzer.") + raise ValueError(f'Does not support {morpheme_analyzer} morpheme analyzer. 
' + f'Choose one of ["Okt", "Kkma", "Komoran", "Mecab", "Hannanum"]') self.alphas = (alpha_sr, alpha_ri, alpha_rs, prob_rd) @@ -59,15 +60,13 @@ def eda( ) ) else: - raise Exception(f"Does not support the data type : {type(data)}") + raise TypeError(f"Does not support the data type : {type(data)}") def _eda(self, data: str, p: List[float]) -> str: random_idx = random.randint(0, 3) - if p is not None and len(p) == 4: - augmented_sentences = self.augmentations[random_idx](data, p[random_idx]) - else: - augmented_sentences = self.augmentations[random_idx]( - data, self.alphas[random_idx] - ) + if p is None or len(p) != 4: + p = self.alphas + + augmented_sentences = self.augmentations[random_idx](data, p[random_idx]) return augmented_sentences From 03cfaa61091b9b1ccf184f9a508733a5e0846c53 Mon Sep 17 00:00:00 2001 From: toriving Date: Sun, 26 Sep 2021 20:45:21 +0900 Subject: [PATCH 4/7] Change list to tuple --- src/koeda/aeda.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/koeda/aeda.py b/src/koeda/aeda.py index 8bd7c65..2f983af 100644 --- a/src/koeda/aeda.py +++ b/src/koeda/aeda.py @@ -15,7 +15,7 @@ def __init__( punctuations: List[str] = None ): if punctuations is None or not isinstance(punctuations, list): - self.punctuations = ['.', ',', '!', '?', ';', ':'] + self.punctuations = ('.', ',', '!', '?', ';', ':') else: self.punctuations = punctuations From 07a22453adad849976845e8328a4f283ef49d206 Mon Sep 17 00:00:00 2001 From: toriving Date: Sun, 26 Sep 2021 20:45:29 +0900 Subject: [PATCH 5/7] Update README.md --- README.md | 120 +++++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 96 insertions(+), 24 deletions(-) diff --git a/README.md b/README.md index 6171b35..19d5bf6 100644 --- a/README.md +++ b/README.md @@ -20,69 +20,141 @@ KoEDA

Easy Data Augmentation for Korean +This is a project that re-implemented Easy data augmentation and A Easier Data Augmentation, which were implemented for English, to fit Korean. + ## Prerequisites - python >= 3.6 ## Installation This repository is tested on Python 3.6 - 3.9. + KoEDA can be installed using pip as follows: ```shell script $ pip install koeda ``` ## Quick Start - +- EDA ```python -from koeda import EasyDataAugmentation +from koeda import EDA -EDA = EasyDataAugmentation( - morpheme_analyzer=None, alpha_sr=0.3, alpha_ri=0.3, alpha_rs=0.3, prob_rd=0.3 +eda = EDA( + morpheme_analyzer="Okt", alpha_sr=0.3, alpha_ri=0.3, alpha_rs=0.3, prob_rd=0.3 ) text = "아버지가 방에 들어가신다" -result = EDA(text) +result = eda(text) print(result) # 아버지가 정실에 들어가신다 + +result = eda(text, p=(0.9, 0.9, 0.9, 0.9), repetition=2) +print(result) +# ['아버지가 객실 아빠 안방 방에 정실 들어가신다', '아버지가 탈의실 방 휴게실 에 안방 탈의실 들어가신다'] ``` +- AEDA +```python +from koeda import AEDA + + +aeda = AEDA( + morpheme_analyzer="Okt", punc_ratio=0.3, punctuations=[".", ",", "!", "?", ";", ":"] +) + +text = "어머니가 집을 나가신다" + +result = aeda(text) +print(result) +# 어머니가 ! 집을 , 나가신다 + +result = aeda(text, p=0.9, repetition=2) +print(result) +# ['! 어머니가 ! 집 ; 을 ? 나가신다', '. 어머니 ? 가 . 집 , 을 , 나가신다'] +``` ## Augmenters - EasyDataAugmentation (EDA) +- AEasierDataAugmentation (AEDA) - RandomDeletion (RD) - RandomInsertion (RI) - SynonymReplacement (SR) - RandomSwap (RS) +There are two ways to load Augmenter. + +The first is to use the full name. +```python +from koeda import EasyDataAugmentation +``` +The second is to use abbreviations. 
+```python +from koeda import EDA +``` ## Usage -- EDA class +- EDA ```python -EDA = EasyDataAugmentation( - morpheme_analyzer: str = None, - alpha_sr: float = 0.1, - alpha_ri: float = 0.1, - alpha_rs: float = 0.1, - prob_rd: float = 0.1, -): - -text = "아버지가방에들어가신다" - -# EDA(data: Union[List[str], str], p: List[float] = None, repetition: int = 1) -result = EDA(data=text, p=None, repetition=1) +augmenter = EDA( + morpheme_analyzer: str = None, # Default = "Okt" + alpha_sr: float = 0.1, + alpha_ri: float = 0.1, + alpha_rs: float = 0.1, + prob_rd: float = 0.1 + ) + +result = augmenter( + data: Union[List[str], str], + p: List[float] = None, # Default = (0.1, 0.1, 0.1, 0.1) + repetition: int = 1 + ) ``` -- The others (RD, RI, SR, RS) +- AEDA ```python -augmenter = Augmenter(morpheme_analyzer: str = None, stopword: bool = False) - -text = "아버지가방에들어가신다" +augmenter = AEDA( + morpheme_analyzer: str = None, # Default = "Okt" + punc_ratio: float = 0.3, + punctuations: List[str] = None # default = ('.', ',', '!', '?', ';', ':') + ) + +result = augmenter( + data: Union[List[str], str], + p: float = None, # Default = 0.3 + repetition: int = 1 + ) +``` -# augmenter(data: Union[List[str], str], p: float = 0.1, repetition: int = 1) -result = augmenter(data=text, p=0.5, repetiion=1) +- The others (RD, RI, SR, RS) +```python +augmenter = RD( + morpheme_analyzer: str = None, + ) + +augmenter = RI( + morpheme_analyzer: str = None, + stopword: bool = False + ) + +augmenter = SR( + morpheme_analyzer: str = None, + stopword: bool = False + ) + +augmenter = RS( + morpheme_analyzer: str = None, + ) + +result = augmenter( + data: Union[List[str], str], + p: float = 0.1, + repetition: int = 1 + ) ``` ## Reference [Easy Data Augmentation Paper](https://www.aclweb.org/anthology/D19-1670.pdf) [Easy Data Augmentation Repository](https://github.com/jasonwei20/eda_nlp) +[A Easier Data Augmentation Paper](https://arxiv.org/pdf/2108.13230.pdf) +[A Easier Data Augmentation 
Repository](https://github.com/akkarimi/aeda_nlp) [Korean WordNet](http://wordnet.kaist.ac.kr/) From f548522df3e430a958713b3af20c6019c8efd3b9 Mon Sep 17 00:00:00 2001 From: toriving Date: Sun, 26 Sep 2021 20:55:04 +0900 Subject: [PATCH 6/7] Update minimum python version --- README.md | 4 ++-- setup.py | 3 +-- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 19d5bf6..87ece9b 100644 --- a/README.md +++ b/README.md @@ -23,10 +23,10 @@ KoEDA This is a project that re-implemented Easy data augmentation and A Easier Data Augmentation, which were implemented for English, to fit Korean. ## Prerequisites -- python >= 3.6 +- python >= 3.7 ## Installation -This repository is tested on Python 3.6 - 3.9. +This repository is tested on Python 3.7 - 3.9. KoEDA can be installed using pip as follows: ```shell script diff --git a/setup.py b/setup.py index 5196d5d..16c325c 100644 --- a/setup.py +++ b/setup.py @@ -17,7 +17,7 @@ install_requires=requirements, keywords=["NLP deep learning koeda korean easy data augmentation"], license="MIT", - python_requires=">=3.6.0", + python_requires=">=3.7.0", include_package_data=True, zip_safe=False, classifiers=[ @@ -28,7 +28,6 @@ "License :: OSI Approved :: MIT License", "Operating System :: OS Independent", "Programming Language :: Python :: 3", - "Programming Language :: Python :: 3.6", "Programming Language :: Python :: 3.7", "Programming Language :: Python :: 3.8", "Programming Language :: Python :: 3.9", From 0b1847a5c33f898f7dcc45f10bab5c531febebf8 Mon Sep 17 00:00:00 2001 From: toriving Date: Sun, 26 Sep 2021 21:07:45 +0900 Subject: [PATCH 7/7] Update tweepy version --- .github/workflows/test.yml | 2 +- requirements.txt | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 7968749..50ada1d 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -13,7 +13,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: 
- python-version: ['3.6', '3.7', '3.8', '3.9'] + python-version: ['3.7', '3.8', '3.9'] steps: - uses: actions/checkout@v2 diff --git a/requirements.txt b/requirements.txt index 371d85e..5b0b7a7 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,2 +1,3 @@ numpy>=1.19.4 konlpy>=0.5.2 +tweepy==3.10.0