-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathrun_build_nsmc.py
31 lines (27 loc) · 1.24 KB
/
run_build_nsmc.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
import pandas as pd
import wandb
from Korpora import Korpora, NSMCKorpus
from cleanrnns.fetchers import fetch_config
from cleanrnns.preprocess import cleanse, stratified_split
def main():
    """Build the NSMC dataset splits and log them as a W&B dataset artifact.

    Steps, in order:
      1. Read the ``"nsmc"`` section of the project config.
      2. Fetch the NSMC corpus through Korpora and turn its train/test
         examples into ``text``/``label`` DataFrames.
      3. Run ``cleanse`` on both frames, then carve a validation set out of
         the training frame with ``stratified_split``.
      4. Wrap each split in a ``wandb.Table`` and log all three inside one
         ``"nsmc"`` artifact, aliased ``"latest"`` and ``config['ver']``.

    The config section is expected to provide the keys ``val_ratio``,
    ``seed``, ``desc`` and ``ver`` (all are read below).
    """
    config = fetch_config()["nsmc"]
    Korpora.fetch("nsmc")
    corpus = NSMCKorpus()

    def to_frame(examples):
        # One (text, label) row per corpus example.
        rows = [(item.text, item.label) for item in examples]
        return pd.DataFrame(rows, columns=["text", "label"])

    train_df = to_frame(corpus.train)
    test_df = to_frame(corpus.test)

    # preprocessing
    test_df = cleanse(test_df)
    # The validation set is built here from the cleansed training data;
    # the result of stratified_split is unpacked as (val, train).
    val_df, train_df = stratified_split(
        cleanse(train_df),
        ratio=config["val_ratio"],
        seed=config["seed"],
    )

    # Tables are created before the run starts, in train/val/test order.
    tables = {
        "train": wandb.Table(data=train_df),
        "val": wandb.Table(data=val_df),
        "test": wandb.Table(data=test_df),
    }

    with wandb.init(project="the-clean-rnns", config=config) as run:
        artifact = wandb.Artifact(
            name="nsmc",
            type="dataset",
            metadata=config,
            description=config["desc"],
        )
        for split_name, table in tables.items():
            artifact.add(table, name=split_name)
        run.log_artifact(artifact, aliases=["latest", config["ver"]])
# Run the dataset build only when this file is executed as a script.
if __name__ == "__main__":
    main()