-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #60 from Maitreyapatel/sa_augmentations
Augmentations for Sentiment Analysis
- Loading branch information
Showing
8 changed files
with
899 additions
and
713 deletions.
There are no files selected for viewing
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
57 changes: 57 additions & 0 deletions
57
reliability_checklist/augmentation/sentiment_analysis/back_translate.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,57 @@ | ||
import pandas as pd | ||
from datasets import ClassLabel, Dataset | ||
from tqdm import tqdm | ||
from transformers import MarianMTModel, MarianTokenizer | ||
|
||
|
||
class back_translate_augmentation:
    """Augment a sentiment-analysis dataset via round-trip (back) translation.

    Each example's ``text`` is translated English -> French and back to
    English using MarianMT models. Examples whose round-trip text differs
    from the original are collected into a new augmented dataset.
    """

    def __init__(self, cols=None):
        # Columns treated as "augmented" (e.g. ["text"]); they are excluded
        # from the generic copy loop in infer() because they are filled with
        # the augmented values instead. NOTE(review): assumed from usage
        # below — confirm against the Hydra config that supplies `cols`.
        self.cols = cols
        # English -> Romance-languages model; target language is selected
        # with the ">>fra<<" prefix token at generation time.
        self.model_translate_name = "Helsinki-NLP/opus-mt-en-roa"
        # Romance-languages -> English model for the back-translation step.
        self.model_back_translate_name = "Helsinki-NLP/opus-mt-roa-en"

    def download(self, model_name):
        """Load a MarianMT model and its tokenizer from the HuggingFace hub.

        Args:
            model_name: hub identifier of the MarianMT checkpoint.

        Returns:
            Tuple of ``(model, tokenizer)``.
        """
        tokenizer = MarianTokenizer.from_pretrained(model_name)
        model = MarianMTModel.from_pretrained(model_name)
        return model, tokenizer

    def infer(self, dataset):
        """Return a new Dataset of back-translated examples.

        Args:
            dataset: a ``datasets.Dataset`` with at least ``text`` and
                ``label`` columns.

        Returns:
            ``datasets.Dataset`` holding only examples whose back-translated
            text differs from the original, with an extra ``mapping`` column
            pointing to the source row index; the ``label`` feature is cast
            back to the original (class-label) feature type.
        """
        model_translate, model_translate_tokenizer = self.download(self.model_translate_name)
        model_back_translate, model_back_translate_tokenizer = self.download(
            self.model_back_translate_name
        )

        datacols = list(dataset.features.keys()) + ["mapping"]
        new_dataset = {k: [] for k in datacols}

        for i in tqdm(range(len(dataset))):
            # Forward pass: English -> French.
            src_text = [">>fra<< " + dataset[i]["text"]]
            src_translated = model_translate.generate(
                **model_translate_tokenizer(src_text, return_tensors="pt", padding=True)
            )
            tgt_text = [
                model_translate_tokenizer.decode(t, skip_special_tokens=True)
                for t in src_translated
            ][0]

            # Backward pass: French -> English.
            tgt_text = [">>eng<< " + tgt_text]
            tgt_translated = model_back_translate.generate(
                **model_back_translate_tokenizer(tgt_text, return_tensors="pt", padding=True)
            )
            back_translated_text = [
                model_back_translate_tokenizer.decode(t, skip_special_tokens=True)
                for t in tgt_translated
            ][0]

            # Keep only round-trips that produced genuinely different text.
            if back_translated_text != dataset[i]["text"]:
                # BUG FIX: the original did `new_dataset["text"] = back_translated_text`,
                # replacing the accumulator list with a single string instead of
                # appending — the per-column lists then had mismatched lengths.
                new_dataset["text"].append(back_translated_text)
                new_dataset["label"].append(dataset["label"][i])
                new_dataset["mapping"].append(i)
                # Copy every remaining feature verbatim from the source row.
                for k in datacols:
                    if k not in ["label", "mapping"] + self.cols:
                        new_dataset[k].append(dataset[k][i])

        new_dataset = pd.DataFrame(new_dataset)
        # Re-cast "label" so the augmented dataset keeps the original
        # ClassLabel feature instead of a plain int column.
        return Dataset.from_pandas(new_dataset).cast_column("label", dataset.features["label"])
65 changes: 65 additions & 0 deletions
65
reliability_checklist/augmentation/sentiment_analysis/double_denial.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,65 @@ | ||
import pandas as pd | ||
from datasets import ClassLabel, Dataset | ||
from tqdm import tqdm | ||
from transformers import MarianMTModel, MarianTokenizer | ||
|
||
|
||
class double_denial_augmentation:
    """Augment sentiment text by rewriting polarity words as double negations.

    Whitespace-delimited tokens found in ``polarity_dict`` are replaced by a
    sentiment-preserving negated antonym (e.g. ``"good"`` -> ``"not poor"``,
    ``"love"`` -> ``"don't hate"``); the label is kept unchanged.
    """

    def __init__(self, cols=None):
        # Columns treated as "augmented" (e.g. ["text"]); excluded from the
        # generic copy loop in infer(). NOTE(review): assumed from usage
        # below — confirm against the Hydra config that supplies `cols`.
        self.cols = cols
        # Exact-token replacement table: polarity word -> double negation.
        self.polarity_dict = {
            "poor": "not good",
            "bad": "not great",
            "lame": "not interesting",
            "awful": "not awesome",
            "great": "not bad",
            "good": "not poor",
            "applause": "not discourage",
            "recommend": "don't prevent",
            "best": "not worst",
            "encourage": "don't discourage",
            "entertain": "don't disapprove",
            "wonderfully": "not poorly",
            "love": "don't hate",
            "interesting": "not uninteresting",
            "interested": "not ignorant",
            "glad": "not reluctant",
            "positive": "not negative",
            "perfect": "not imperfect",
            "entertaining": "not uninteresting",
            "moved": "not moved",
            "like": "don't refuse",
            "worth": "not undeserving",
            "better": "not worse",
            "funny": "not uninteresting",
            "awesome": "not ugly",
            "impressed": "not impressed",
        }

    def infer(self, dataset):
        """Return a new Dataset of double-denial augmented examples.

        Args:
            dataset: a ``datasets.Dataset`` with at least ``text`` and
                ``label`` columns.

        Returns:
            ``datasets.Dataset`` holding only examples where at least one
            polarity word was replaced, with an extra ``mapping`` column
            pointing to the source row index; the ``label`` feature is cast
            back to the original (class-label) feature type.
        """
        datacols = list(dataset.features.keys()) + ["mapping"]
        new_dataset = {k: [] for k in datacols}

        for i in tqdm(range(len(dataset))):
            replaced_any = False
            out_tokens = []
            # Exact whitespace tokenization — tokens carrying punctuation
            # (e.g. "good,") will not match the replacement table.
            for token in dataset[i]["text"].split():
                replacement = self.polarity_dict.get(token)
                if replacement is not None:
                    out_tokens.append(replacement)
                    replaced_any = True
                else:
                    out_tokens.append(token)

            if replaced_any:
                # BUG FIX: the original appended " " after every token,
                # leaving a stray trailing space on each augmented text;
                # joining the tokens produces clean output.
                new_dataset["text"].append(" ".join(out_tokens))
                new_dataset["label"].append(dataset["label"][i])
                new_dataset["mapping"].append(i)
                # Copy every remaining feature verbatim from the source row.
                for k in datacols:
                    if k not in ["label", "mapping"] + self.cols:
                        new_dataset[k].append(dataset[k][i])

        new_dataset = pd.DataFrame(new_dataset)
        # Re-cast "label" so the augmented dataset keeps the original
        # ClassLabel feature instead of a plain int column.
        return Dataset.from_pandas(new_dataset).cast_column("label", dataset.features["label"])
5 changes: 5 additions & 0 deletions
5
reliability_checklist/configs/augmentation/back_translate.yaml
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
# Hydra config registering the back-translation augmentation.
back_translate:
  # Factory wrapped as a partial so the framework can finish instantiation
  # later with runtime arguments.
  _target_: reliability_checklist.augmentation.augments.back_translate_aug
  _partial_: true
  # Display name used to identify this augmentation in results/logs.
  __name__: "BACK_TRANS"
  # Dataset-specific columns to augment, interpolated from the datamodule config.
  cols: ${datamodule.dataset_specific_args.cols}
5 changes: 5 additions & 0 deletions
5
reliability_checklist/configs/augmentation/double_denial.yaml
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
# Hydra config registering the double-denial augmentation.
double_denial:
  # Factory wrapped as a partial so the framework can finish instantiation
  # later with runtime arguments.
  _target_: reliability_checklist.augmentation.augments.double_denial_aug
  _partial_: true
  # Display name used to identify this augmentation in results/logs.
  __name__: "DOUBLE_DENIAL"
  # Dataset-specific columns to augment, interpolated from the datamodule config.
  cols: ${datamodule.dataset_specific_args.cols}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,3 +1,5 @@ | ||
# Compose every augmentation config in this group, including the newly
# added back-translation and double-denial augmentations.
defaults:
  - default.yaml
  - parrot.yaml
  - back_translate.yaml
  - double_denial.yaml
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters