From 537f5c7f591a221b8ba46b80bb6573044729aa2a Mon Sep 17 00:00:00 2001 From: mponty Date: Thu, 21 Mar 2019 18:38:45 +0300 Subject: [PATCH 1/5] init training on SWAG dataset --- lilbert/lib/swag_utils.py | 213 ++++++++++++++++++++++++++++++++++++++ notebooks/main_swag.py | 52 ++++++++++ 2 files changed, 265 insertions(+) create mode 100644 lilbert/lib/swag_utils.py create mode 100644 notebooks/main_swag.py diff --git a/lilbert/lib/swag_utils.py b/lilbert/lib/swag_utils.py new file mode 100644 index 0000000..c878ae0 --- /dev/null +++ b/lilbert/lib/swag_utils.py @@ -0,0 +1,213 @@ +import csv +import sys +from io import open +from tqdm import tqdm +import numpy as np + + +class SwagExample(object): + """A single training/test example for the SWAG dataset.""" + + def __init__(self, + swag_id, + context_sentence, + start_ending, + ending_0, + ending_1, + ending_2, + ending_3, + label=None): + self.swag_id = swag_id + self.context_sentence = context_sentence + self.start_ending = start_ending + self.endings = [ + ending_0, + ending_1, + ending_2, + ending_3, + ] + self.label = label + + def __str__(self): + return self.__repr__() + + def __repr__(self): + l = [ + "swag_id: {}".format(self.swag_id), + "context_sentence: {}".format(self.context_sentence), + "start_ending: {}".format(self.start_ending), + "ending_0: {}".format(self.endings[0]), + "ending_1: {}".format(self.endings[1]), + "ending_2: {}".format(self.endings[2]), + "ending_3: {}".format(self.endings[3]), + ] + + if self.label is not None: + l.append("label: {}".format(self.label)) + + return ", ".join(l) + + +class InputFeatures(object): + def __init__(self, + example_id, + choices_features, + label + + ): + self.example_id = example_id + self.choices_features = [ + { + 'input_ids': input_ids, + 'input_mask': input_mask, + 'segment_ids': segment_ids + } + for _, input_ids, input_mask, segment_ids in choices_features + ] + self.label = label + + +def read_swag_examples(input_file, is_training): + with open(input_file, 'r', encoding='utf-8') as f: + reader = csv.reader(f) + lines = [] + for line in reader: + if sys.version_info[0] == 2: + line = list(unicode(cell, 'utf-8') for cell in line) + lines.append(line) + + if is_training and lines[0][-1] != 'label': + raise ValueError( + "For training, the input file must contain a label column." + ) + + examples = [ + SwagExample( + swag_id=line[2], + context_sentence=line[4], + start_ending=line[5], # in the swag dataset, the + # common beginning of each + # choice is stored in "sent2". + ending_0=line[7], + ending_1=line[8], + ending_2=line[9], + ending_3=line[10], + label=int(line[11]) if is_training else None + ) for line in lines[1:] # we skip the line with the column names + ] + + return examples + + +def convert_examples_to_features(examples, tokenizer, max_seq_length, + is_training): + """Loads a data file into a list of `InputBatch`s.""" + + # Swag is a multiple choice task. To perform this task using Bert, + # we will use the formatting proposed in "Improving Language + # Understanding by Generative Pre-Training" and suggested by + # @jacobdevlin-google in this issue + # https://github.com/google-research/bert/issues/38. + # + # Each choice will correspond to a sample on which we run the + # inference. 
For a given Swag example, we will create the 4 + # following inputs: + # - [CLS] context [SEP] choice_1 [SEP] + # - [CLS] context [SEP] choice_2 [SEP] + # - [CLS] context [SEP] choice_3 [SEP] + # - [CLS] context [SEP] choice_4 [SEP] + # The model will output a single value for each input. To get the + # final decision of the model, we will run a softmax over these 4 + # outputs. + features = [] + # TODO: remove tqdm + for example_index, example in tqdm(enumerate(examples), total=len(examples), desc='converting examples'): + # for example_index, example in enumerate(examples): + context_tokens = tokenizer.tokenize(example.context_sentence) + start_ending_tokens = tokenizer.tokenize(example.start_ending) + + choices_features = [] + for ending_index, ending in enumerate(example.endings): + # We create a copy of the context tokens in order to be + # able to shrink it according to ending_tokens + context_tokens_choice = context_tokens[:] + ending_tokens = start_ending_tokens + tokenizer.tokenize(ending) + # Modifies `context_tokens_choice` and `ending_tokens` in + # place so that the total length is less than the + # specified length. Account for [CLS], [SEP], [SEP] with + # "- 3" + _truncate_seq_pair(context_tokens_choice, ending_tokens, max_seq_length - 3) + + tokens = ["[CLS]"] + context_tokens_choice + ["[SEP]"] + ending_tokens + ["[SEP]"] + segment_ids = [0] * (len(context_tokens_choice) + 2) + [1] * (len(ending_tokens) + 1) + + input_ids = tokenizer.convert_tokens_to_ids(tokens) + input_mask = [1] * len(input_ids) + + # Zero-pad up to the sequence length. + padding = [0] * (max_seq_length - len(input_ids)) + input_ids += padding + input_mask += padding + segment_ids += padding + + assert len(input_ids) == max_seq_length + assert len(input_mask) == max_seq_length + assert len(segment_ids) == max_seq_length + + choices_features.append((tokens, input_ids, input_mask, segment_ids)) + + label = example.label + + # if example_index < 5: + # print("*** Example ***") + # print("swag_id: {}".format(example.swag_id)) + # for choice_idx, (tokens, input_ids, input_mask, segment_ids) in enumerate(choices_features): + # print("choice: {}".format(choice_idx)) + # print("tokens: {}".format(' '.join(tokens))) + # print("input_ids: {}".format(' '.join(map(str, input_ids)))) + # print("input_mask: {}".format(' '.join(map(str, input_mask)))) + # print("segment_ids: {}".format(' '.join(map(str, segment_ids)))) + # if is_training: + # print("label: {}".format(label)) + + features.append( + InputFeatures( + example_id=example.swag_id, + choices_features=choices_features, + label=label + ) + ) + + return features + + +def _truncate_seq_pair(tokens_a, tokens_b, max_length): + """Truncates a sequence pair in place to the maximum length.""" + + # This is a simple heuristic which will always truncate the longer sequence + # one token at a time. This makes more sense than truncating an equal percent + # of tokens from each, since if one sequence is very short then each token + # that's truncated likely contains more information than a longer sequence. 
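+    # For example, with max_length=8, len(tokens_a)=6 and len(tokens_b)=5,
+    # the loop below pops from the longer list on each pass:
+    # (6, 5) -> (5, 5) -> (5, 4) -> (4, 4), stopping once the total is 8.
+    # Ties pop from tokens_b, so the context sentence keeps its tokens first.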
+ while True: + total_length = len(tokens_a) + len(tokens_b) + if total_length <= max_length: + break + if len(tokens_a) > len(tokens_b): + tokens_a.pop() + else: + tokens_b.pop() + + +def accuracy(out, labels): + outputs = np.argmax(out, axis=1) + return np.sum(outputs == labels) + + +def select_field(features, field): + return [ + [ + choice[field] + for choice in feature.choices_features + ] + for feature in features + ] diff --git a/notebooks/main_swag.py b/notebooks/main_swag.py new file mode 100644 index 0000000..31dccb7 --- /dev/null +++ b/notebooks/main_swag.py @@ -0,0 +1,52 @@ +import os +import sys +# sys.path.append('..') + +import numpy as np +import random +import torch +from pytorch_pretrained_bert import BertForMultipleChoice + +from lib import data_processors, utils +from lib.bert import BertForSequenceClassification +from pytorch_pretrained_bert.tokenization import BertTokenizer +from lib.train_eval import train, evaluate, train_swag +from lib import swag_utils + +params = { + 'data_dir': '../data/SWAG', + 'output_dir': '../output', + 'cache_dir': '../model_cache', + 'task_name': 'swag', + 'bert_model': 'bert-base-uncased', + 'max_seq_length': 12, + # 'max_seq_length': 128, + 'train_batch_size': 1, + # 'train_batch_size': 32, + 'eval_batch_size': 8, + 'learning_rate': 2e-5, + 'warmup_proportion': 0.1, + 'num_train_epochs': 1, + 'seed': 1331 +} + +random.seed(params['seed']) +np.random.seed(params['seed']) +torch.manual_seed(params['seed']) +device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') + + +tokenizer = BertTokenizer.from_pretrained( + params['bert_model'], do_lower_case=True) + +# train_examples = processor.get_train_examples(params['data_dir']) +# eval_examples = processor.get_dev_examples(params['data_dir']) +train_examples = swag_utils.read_swag_examples(os.path.join(params['data_dir'], 'train.csv'), is_training=True) + +model = BertForMultipleChoice.from_pretrained( + params['bert_model'], cache_dir=params['cache_dir'], num_choices=4).to(device) + +# model = BertForSequenceClassification.from_pretrained( +# params['bert_model'], cache_dir=params['cache_dir'], num_labels=num_labels).to(device) + +model, result = train_swag(model, train_examples, params, tokenizer) From 31764afe36e50912e21be36bce69e474c4cbbe8e Mon Sep 17 00:00:00 2001 From: mponty Date: Tue, 26 Mar 2019 15:52:34 +0300 Subject: [PATCH 2/5] rebase with master --- notebooks/main_swag.py | 52 ------------------------------------------ 1 file changed, 52 deletions(-) delete mode 100644 notebooks/main_swag.py diff --git a/notebooks/main_swag.py b/notebooks/main_swag.py deleted file mode 100644 index 31dccb7..0000000 --- a/notebooks/main_swag.py +++ /dev/null @@ -1,52 +0,0 @@ -import os -import sys -# sys.path.append('..') - -import numpy as np -import random -import torch -from pytorch_pretrained_bert import BertForMultipleChoice - -from lib import data_processors, utils -from lib.bert import BertForSequenceClassification -from pytorch_pretrained_bert.tokenization import BertTokenizer -from lib.train_eval import train, evaluate, train_swag -from lib import swag_utils - -params = { - 'data_dir': '../data/SWAG', - 'output_dir': '../output', - 'cache_dir': '../model_cache', - 'task_name': 'swag', - 'bert_model': 'bert-base-uncased', - 'max_seq_length': 12, - # 'max_seq_length': 128, - 'train_batch_size': 1, - # 'train_batch_size': 32, - 'eval_batch_size': 8, - 'learning_rate': 2e-5, - 'warmup_proportion': 0.1, - 'num_train_epochs': 1, - 'seed': 1331 -} - -random.seed(params['seed']) 
-np.random.seed(params['seed']) -torch.manual_seed(params['seed']) -device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') - - -tokenizer = BertTokenizer.from_pretrained( - params['bert_model'], do_lower_case=True) - -# train_examples = processor.get_train_examples(params['data_dir']) -# eval_examples = processor.get_dev_examples(params['data_dir']) -train_examples = swag_utils.read_swag_examples(os.path.join(params['data_dir'], 'train.csv'), is_training=True) - -model = BertForMultipleChoice.from_pretrained( - params['bert_model'], cache_dir=params['cache_dir'], num_choices=4).to(device) - -# model = BertForSequenceClassification.from_pretrained( -# params['bert_model'], cache_dir=params['cache_dir'], num_labels=num_labels).to(device) - -model, result = train_swag(model, train_examples, params, tokenizer) From f13fa89050fa01433a5f1a86478b722367d5af66 Mon Sep 17 00:00:00 2001 From: mponty Date: Tue, 26 Mar 2019 16:49:37 +0300 Subject: [PATCH 3/5] add SWAG benchmark --- lilbert/lib/data_processors.py | 110 ++++++++++++++++++++++++--- lilbert/lib/feature_processors.py | 93 +++++++++++++++++++--- lilbert/lib/tasks.py | 1 + lilbert/lib/train_eval.py | 43 +++++------ lilbert/notebooks/train_bert_swag.py | 58 ++++++++++++++ 5 files changed, 262 insertions(+), 43 deletions(-) create mode 100644 lilbert/notebooks/train_bert_swag.py diff --git a/lilbert/lib/data_processors.py b/lilbert/lib/data_processors.py index 28097c9..3371427 100644 --- a/lilbert/lib/data_processors.py +++ b/lilbert/lib/data_processors.py @@ -46,7 +46,7 @@ def get_train_examples(self, data_dir): def get_dev_examples(self, data_dir): """Gets a collection of `InputExample`s for the dev set.""" raise NotImplementedError() - + def get_labels(self): """Gets the list of labels for this data set.""" raise NotImplementedError() @@ -76,7 +76,7 @@ def get_dev_examples(self, data_dir): """See base class.""" return self._create_examples( self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") - + def get_labels(self): """See base class.""" return ["0", "1"] @@ -109,7 +109,7 @@ def get_dev_examples(self, data_dir): return self._create_examples( self._read_tsv(os.path.join(data_dir, "dev_matched.tsv")), "dev_matched") - + def get_labels(self): """See base class.""" return ["contradiction", "entailment", "neutral"] @@ -156,11 +156,11 @@ def _create_examples(self, lines, set_type): examples.append( InputExample(guid=guid, text_a=text_a, text_b=None, label=label)) return examples - + class SST2Processor(DataProcessor): """Processor for the SST2 data set (GLUE version).""" - + def get_train_examples(self, data_dir): """See base class.""" return self._create_examples( @@ -186,7 +186,7 @@ def _create_examples(self, lines, set_type): InputExample(guid=guid, text_a=text_a, text_b=None, label=label)) return examples - + def get_quora_df(filename): with open(filename, "r", encoding='utf-8') as f: rows = list(csv.reader(f, delimiter='\t', quotechar=None)) @@ -195,11 +195,11 @@ def get_quora_df(filename): df = df[pd.notnull(df['is_duplicate'])] df.columns = ['text_a', 'text_b', 'label'] return df - - + + class QQPProcessor(DataProcessor): """Processor for the SST2 data set (GLUE version).""" - + def get_train_examples(self, data_dir): """See base class.""" return self._create_examples( @@ -225,3 +225,95 @@ def _create_examples(self, df, set_type): examples.append( InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) return examples + + +class SwagExample(object): + """A single training/test example for the SWAG 
dataset.""" + + def __init__(self, + swag_id, + context_sentence, + start_ending, + ending_0, + ending_1, + ending_2, + ending_3, + label=None): + self.swag_id = swag_id + self.context_sentence = context_sentence + self.start_ending = start_ending + self.endings = [ + ending_0, + ending_1, + ending_2, + ending_3, + ] + self.label = label + + def __str__(self): + return self.__repr__() + + def __repr__(self): + l = [ + "swag_id: {}".format(self.swag_id), + "context_sentence: {}".format(self.context_sentence), + "start_ending: {}".format(self.start_ending), + "ending_0: {}".format(self.endings[0]), + "ending_1: {}".format(self.endings[1]), + "ending_2: {}".format(self.endings[2]), + "ending_3: {}".format(self.endings[3]), + ] + + if self.label is not None: + l.append("label: {}".format(self.label)) + + return ", ".join(l) + + +class SWAGProcessor(DataProcessor): + """Processor for the SWAG data set.""" + + def get_train_examples(self, data_dir): + """See base class.""" + lines = self._read_csv(os.path.join(data_dir, "train.csv"), True) + return self._create_examples(lines, True) + + def get_dev_examples(self, data_dir): + """See base class.""" + lines = self._read_csv(os.path.join(data_dir, "val.csv"), False) + return self._create_examples(lines, False) + + def _read_csv(self, input_file, is_training): + with open(input_file, 'r', encoding='utf-8') as f: + reader = csv.reader(f) + lines = [] + for line in reader: + if sys.version_info[0] == 2: + line = list(unicode(cell, 'utf-8') for cell in line) + lines.append(line) + + if is_training and lines[0][-1] != 'label': + raise ValueError( + "For training, the input file must contain a label column." + ) + return lines[1:] + + def _create_examples(self, lines, is_training): + """Creates examples for the training and dev sets.""" + examples = [] + for line in lines: + examples.append( + SwagExample( + swag_id=line[2], + context_sentence=line[4], + start_ending=line[5], # in the swag dataset, the + # common beginning of each + # choice is stored in "sent2". 
+ ending_0=line[7], + ending_1=line[8], + ending_2=line[9], + ending_3=line[10], + label=int(line[11]) if is_training else None + ) + ) + return examples diff --git a/lilbert/lib/feature_processors.py b/lilbert/lib/feature_processors.py index c00073c..9f357bb 100644 --- a/lilbert/lib/feature_processors.py +++ b/lilbert/lib/feature_processors.py @@ -1,14 +1,23 @@ import numpy as np from lib.data_processors import InputFeatures +from tqdm import tqdm -def convert_examples_to_features(examples, label_list, max_seq_length, tokenizer): +def convert_examples_to_features(examples, tokenizer, params): + if params['task_name'] != 'swag': + return convert_examples_to_features_clf(examples, params['label_list'], + params['max_seq_length'], tokenizer) + else: + return convert_examples_to_features_swag(examples, params['max_seq_length'], tokenizer) + + +def convert_examples_to_features_clf(examples, label_list, max_seq_length, tokenizer): """Loads a data file into a list of `InputBatch`s.""" - label_map = {label : i for i, label in enumerate(label_list)} + label_map = {label: i for i, label in enumerate(label_list)} features = [] - for (ex_index, example) in enumerate(examples): + for (ex_index, example) in tqdm(enumerate(examples), total=len(examples), desc='converting examples'): tokens_a = tokenizer.tokenize(example.text_a) tokens_b = None @@ -66,10 +75,76 @@ def convert_examples_to_features(examples, label_list, max_seq_length, tokenizer label_id = label_map[example.label] features.append( - InputFeatures(input_ids=input_ids, - input_mask=input_mask, - segment_ids=segment_ids, - label_id=label_id)) + InputFeatures(input_ids=input_ids, + input_mask=input_mask, + segment_ids=segment_ids, + label_id=label_id)) + return features + + +def convert_examples_to_features_swag(examples, max_seq_length, tokenizer): + """Loads a data file into a list of `InputBatch`s.""" + + # Swag is a multiple choice task. To perform this task using Bert, + # we will use the formatting proposed in "Improving Language + # Understanding by Generative Pre-Training" and suggested by + # @jacobdevlin-google in this issue + # https://github.com/google-research/bert/issues/38. + # + # Each choice will correspond to a sample on which we run the + # inference. For a given Swag example, we will create the 4 + # following inputs: + # - [CLS] context [SEP] choice_1 [SEP] + # - [CLS] context [SEP] choice_2 [SEP] + # - [CLS] context [SEP] choice_3 [SEP] + # - [CLS] context [SEP] choice_4 [SEP] + # The model will output a single value for each input. To get the + # final decision of the model, we will run a softmax over these 4 + # outputs. + features = [] + for example_index, example in tqdm(enumerate(examples), total=len(examples), desc='converting examples'): + # for example_index, example in enumerate(examples): + context_tokens = tokenizer.tokenize(example.context_sentence) + start_ending_tokens = tokenizer.tokenize(example.start_ending) + + choices_input_ids, choices_input_mask, choices_segment_ids = [], [], [] + for ending_index, ending in enumerate(example.endings): + # We create a copy of the context tokens in order to be + # able to shrink it according to ending_tokens + context_tokens_choice = context_tokens[:] + ending_tokens = start_ending_tokens + tokenizer.tokenize(ending) + # Modifies `context_tokens_choice` and `ending_tokens` in + # place so that the total length is less than the + # specified length. 
Account for [CLS], [SEP], [SEP] with + # "- 3" + _truncate_seq_pair(context_tokens_choice, ending_tokens, max_seq_length - 3) + + tokens = ["[CLS]"] + context_tokens_choice + ["[SEP]"] + ending_tokens + ["[SEP]"] + segment_ids = [0] * (len(context_tokens_choice) + 2) + [1] * (len(ending_tokens) + 1) + + input_ids = tokenizer.convert_tokens_to_ids(tokens) + input_mask = [1] * len(input_ids) + + # Zero-pad up to the sequence length. + padding = [0] * (max_seq_length - len(input_ids)) + input_ids += padding + input_mask += padding + segment_ids += padding + + assert len(input_ids) == max_seq_length + assert len(input_mask) == max_seq_length + assert len(segment_ids) == max_seq_length + + choices_input_ids.append(input_ids) + choices_input_mask.append(input_mask) + choices_segment_ids.append(segment_ids) + + features.append( + InputFeatures(input_ids=choices_input_ids, + input_mask=choices_input_mask, + segment_ids=choices_segment_ids, + label_id=example.label) + ) return features @@ -88,7 +163,3 @@ def _truncate_seq_pair(tokens_a, tokens_b, max_length): tokens_a.pop() else: tokens_b.pop() - -def accuracy(out, labels): - outputs = np.argmax(out, axis=1) - return np.sum(outputs == labels) diff --git a/lilbert/lib/tasks.py b/lilbert/lib/tasks.py index 3d7efa3..f33b7f3 100644 --- a/lilbert/lib/tasks.py +++ b/lilbert/lib/tasks.py @@ -6,6 +6,7 @@ 'mrpc': data_processors.MrpcProcessor, 'sst2': data_processors.SST2Processor, 'qqp': data_processors.QQPProcessor, + 'swag': data_processors.SWAGProcessor, } num_labels = { diff --git a/lilbert/lib/train_eval.py b/lilbert/lib/train_eval.py index 80fc772..e4765fc 100644 --- a/lilbert/lib/train_eval.py +++ b/lilbert/lib/train_eval.py @@ -9,7 +9,6 @@ from pytorch_pretrained_bert.modeling import BertConfig from lib import feature_processors, metrics -from lib.bert import BertForSequenceClassification def train(model, tokenizer, params, @@ -20,7 +19,7 @@ def train(model, tokenizer, params, random.seed(params['seed']) np.random.seed(params['seed']) torch.manual_seed(params['seed']) - + train_steps_per_epoch = int(len(train_examples) / params['train_batch_size']) num_train_optimization_steps = train_steps_per_epoch * params['num_train_epochs'] @@ -38,32 +37,31 @@ def train(model, tokenizer, params, lr=params['learning_rate'], warmup=params['warmup_proportion'], t_total=num_train_optimization_steps) - + global_step = 0 nb_tr_steps = 0 tr_loss = 0 - + train_features = feature_processors.convert_examples_to_features( train_examples, - params['label_list'], - params['max_seq_length'], - tokenizer) + tokenizer, + params) print("***** Running training *****") - print("Num examples:", len(train_examples)) + print("Num examples:", len(train_examples)) print("Batch size: ", params['train_batch_size']) print("Num steps: ", num_train_optimization_steps) all_input_ids = torch.tensor( [f.input_ids for f in train_features], - dtype=torch.long) + dtype=torch.long) all_input_mask = torch.tensor( [f.input_mask for f in train_features], - dtype=torch.long) + dtype=torch.long) all_segment_ids = torch.tensor( [f.segment_ids for f in train_features], - dtype=torch.long) + dtype=torch.long) all_label_ids = torch.tensor( [f.label_id for f in train_features], - dtype=torch.long) + dtype=torch.long) train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, @@ -117,7 +115,7 @@ def train(model, tokenizer, params, 'train_loss': tr_loss / nb_tr_steps, 'train_global_step': global_step, } - + return model, train_result @@ -125,21 +123,20 @@ def predict(model, tokenizer, 
params, valid_examples): random.seed(params['seed']) np.random.seed(params['seed']) torch.manual_seed(params['seed']) - + eval_features = feature_processors.convert_examples_to_features( - valid_examples, - params['label_list'], - params['max_seq_length'], - tokenizer) + valid_examples, + tokenizer, + params) all_input_ids = torch.tensor( [f.input_ids for f in eval_features], - dtype=torch.long) + dtype=torch.long) all_input_mask = torch.tensor( [f.input_mask for f in eval_features], - dtype=torch.long) + dtype=torch.long) all_segment_ids = torch.tensor( [f.segment_ids for f in eval_features], - dtype=torch.long) + dtype=torch.long) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids) @@ -167,9 +164,9 @@ def evaluate(model, tokenizer, params, valid_examples): print("***** Running evaluation *****") print("Num examples: ", len(valid_examples)) print("Batch size: ", params['eval_batch_size']) - + prob_preds = predict(model, tokenizer, params, valid_examples) - true_labels = np.array([int(example.label) + true_labels = np.array([int(example.label) for i, example in enumerate(valid_examples)]) result = { 'eval_loss': metrics.log_loss(true_labels, prob_preds), diff --git a/lilbert/notebooks/train_bert_swag.py b/lilbert/notebooks/train_bert_swag.py new file mode 100644 index 0000000..911e57e --- /dev/null +++ b/lilbert/notebooks/train_bert_swag.py @@ -0,0 +1,58 @@ +import numpy as np +import random +import torch +from pytorch_pretrained_bert import BertForMultipleChoice +import os +from pytorch_pretrained_bert.tokenization import BertTokenizer + +from lib import data_processors, tasks +from lib.bert import BertForSequenceClassification +from lib.train_eval import train, evaluate, predict + +params = { + 'data_dir': '../../data/SWAG', + 'output_dir': '../../output', + 'cache_dir': '../../model_cache', + 'task_name': 'swag', + 'bert_model': 'bert-base-uncased', + 'max_seq_length': 12, + 'train_batch_size': 1, + 'eval_batch_size': 8, + 'learning_rate': 2e-5, + 'warmup_proportion': 0.1, + 'num_train_epochs': 1, + 'seed': 1331, + 'device': torch.device( + 'cuda' if torch.cuda.is_available() + else 'cpu') +} + +random.seed(params['seed']) +np.random.seed(params['seed']) +torch.manual_seed(params['seed']) + +processor = tasks.processors[params['task_name']]() +tokenizer = BertTokenizer.from_pretrained( + params['bert_model'], do_lower_case=True) + +train_examples = processor.get_train_examples(params['data_dir']) +dev_examples = processor.get_dev_examples(params['data_dir']) + +model = BertForMultipleChoice.from_pretrained( + params['bert_model'], + cache_dir=params['cache_dir'], num_choices=4).to(params['device']) + +EPOCH_NUM = 1 + +params['num_train_epochs'] = 1 +checkpoint_files = { + 'config': 'bert_config.json', + 'model_weigths': 'model_{}_epoch_{}.pth'.format( + params['task_name'], EPOCH_NUM) +} + +model, result = train(model, tokenizer, params, + train_examples, + valid_examples=dev_examples, + checkpoint_files=checkpoint_files) +print(result) From 7c573ee562a78dd6e026539174822f25e0db7436 Mon Sep 17 00:00:00 2001 From: mponty Date: Fri, 29 Mar 2019 13:12:16 +0000 Subject: [PATCH 4/5] add train_swag notebook and fix SWAGExample label for validation --- lilbert/lib/data_processors.py | 4 +- lilbert/lib/swag_utils.py | 213 --------------------- lilbert/notebooks/train_bert_swag.ipynb | 237 ++++++++++++++++++++++++ lilbert/notebooks/train_bert_swag.py | 58 ------ 4 files changed, 239 insertions(+), 273 deletions(-) delete mode 100644 lilbert/lib/swag_utils.py create mode 
100644 lilbert/notebooks/train_bert_swag.ipynb delete mode 100644 lilbert/notebooks/train_bert_swag.py diff --git a/lilbert/lib/data_processors.py b/lilbert/lib/data_processors.py index 3371427..fec088f 100644 --- a/lilbert/lib/data_processors.py +++ b/lilbert/lib/data_processors.py @@ -280,8 +280,8 @@ def get_train_examples(self, data_dir): def get_dev_examples(self, data_dir): """See base class.""" - lines = self._read_csv(os.path.join(data_dir, "val.csv"), False) - return self._create_examples(lines, False) + lines = self._read_csv(os.path.join(data_dir, "val.csv"), True) + return self._create_examples(lines, True) def _read_csv(self, input_file, is_training): with open(input_file, 'r', encoding='utf-8') as f: diff --git a/lilbert/lib/swag_utils.py b/lilbert/lib/swag_utils.py deleted file mode 100644 index c878ae0..0000000 --- a/lilbert/lib/swag_utils.py +++ /dev/null @@ -1,213 +0,0 @@ -import csv -import sys -from io import open -from tqdm import tqdm -import numpy as np - - -class SwagExample(object): - """A single training/test example for the SWAG dataset.""" - - def __init__(self, - swag_id, - context_sentence, - start_ending, - ending_0, - ending_1, - ending_2, - ending_3, - label=None): - self.swag_id = swag_id - self.context_sentence = context_sentence - self.start_ending = start_ending - self.endings = [ - ending_0, - ending_1, - ending_2, - ending_3, - ] - self.label = label - - def __str__(self): - return self.__repr__() - - def __repr__(self): - l = [ - "swag_id: {}".format(self.swag_id), - "context_sentence: {}".format(self.context_sentence), - "start_ending: {}".format(self.start_ending), - "ending_0: {}".format(self.endings[0]), - "ending_1: {}".format(self.endings[1]), - "ending_2: {}".format(self.endings[2]), - "ending_3: {}".format(self.endings[3]), - ] - - if self.label is not None: - l.append("label: {}".format(self.label)) - - return ", ".join(l) - - -class InputFeatures(object): - def __init__(self, - example_id, - choices_features, - label - - ): - self.example_id = example_id - self.choices_features = [ - { - 'input_ids': input_ids, - 'input_mask': input_mask, - 'segment_ids': segment_ids - } - for _, input_ids, input_mask, segment_ids in choices_features - ] - self.label = label - - -def read_swag_examples(input_file, is_training): - with open(input_file, 'r', encoding='utf-8') as f: - reader = csv.reader(f) - lines = [] - for line in reader: - if sys.version_info[0] == 2: - line = list(unicode(cell, 'utf-8') for cell in line) - lines.append(line) - - if is_training and lines[0][-1] != 'label': - raise ValueError( - "For training, the input file must contain a label column." - ) - - examples = [ - SwagExample( - swag_id=line[2], - context_sentence=line[4], - start_ending=line[5], # in the swag dataset, the - # common beginning of each - # choice is stored in "sent2". - ending_0=line[7], - ending_1=line[8], - ending_2=line[9], - ending_3=line[10], - label=int(line[11]) if is_training else None - ) for line in lines[1:] # we skip the line with the column names - ] - - return examples - - -def convert_examples_to_features(examples, tokenizer, max_seq_length, - is_training): - """Loads a data file into a list of `InputBatch`s.""" - - # Swag is a multiple choice task. To perform this task using Bert, - # we will use the formatting proposed in "Improving Language - # Understanding by Generative Pre-Training" and suggested by - # @jacobdevlin-google in this issue - # https://github.com/google-research/bert/issues/38. 
- # - # Each choice will correspond to a sample on which we run the - # inference. For a given Swag example, we will create the 4 - # following inputs: - # - [CLS] context [SEP] choice_1 [SEP] - # - [CLS] context [SEP] choice_2 [SEP] - # - [CLS] context [SEP] choice_3 [SEP] - # - [CLS] context [SEP] choice_4 [SEP] - # The model will output a single value for each input. To get the - # final decision of the model, we will run a softmax over these 4 - # outputs. - features = [] - # TODO: remove tqdm - for example_index, example in tqdm(enumerate(examples), total=len(examples), desc='converting examples'): - # for example_index, example in enumerate(examples): - context_tokens = tokenizer.tokenize(example.context_sentence) - start_ending_tokens = tokenizer.tokenize(example.start_ending) - - choices_features = [] - for ending_index, ending in enumerate(example.endings): - # We create a copy of the context tokens in order to be - # able to shrink it according to ending_tokens - context_tokens_choice = context_tokens[:] - ending_tokens = start_ending_tokens + tokenizer.tokenize(ending) - # Modifies `context_tokens_choice` and `ending_tokens` in - # place so that the total length is less than the - # specified length. Account for [CLS], [SEP], [SEP] with - # "- 3" - _truncate_seq_pair(context_tokens_choice, ending_tokens, max_seq_length - 3) - - tokens = ["[CLS]"] + context_tokens_choice + ["[SEP]"] + ending_tokens + ["[SEP]"] - segment_ids = [0] * (len(context_tokens_choice) + 2) + [1] * (len(ending_tokens) + 1) - - input_ids = tokenizer.convert_tokens_to_ids(tokens) - input_mask = [1] * len(input_ids) - - # Zero-pad up to the sequence length. - padding = [0] * (max_seq_length - len(input_ids)) - input_ids += padding - input_mask += padding - segment_ids += padding - - assert len(input_ids) == max_seq_length - assert len(input_mask) == max_seq_length - assert len(segment_ids) == max_seq_length - - choices_features.append((tokens, input_ids, input_mask, segment_ids)) - - label = example.label - - # if example_index < 5: - # print("*** Example ***") - # print("swag_id: {}".format(example.swag_id)) - # for choice_idx, (tokens, input_ids, input_mask, segment_ids) in enumerate(choices_features): - # print("choice: {}".format(choice_idx)) - # print("tokens: {}".format(' '.join(tokens))) - # print("input_ids: {}".format(' '.join(map(str, input_ids)))) - # print("input_mask: {}".format(' '.join(map(str, input_mask)))) - # print("segment_ids: {}".format(' '.join(map(str, segment_ids)))) - # if is_training: - # print("label: {}".format(label)) - - features.append( - InputFeatures( - example_id=example.swag_id, - choices_features=choices_features, - label=label - ) - ) - - return features - - -def _truncate_seq_pair(tokens_a, tokens_b, max_length): - """Truncates a sequence pair in place to the maximum length.""" - - # This is a simple heuristic which will always truncate the longer sequence - # one token at a time. This makes more sense than truncating an equal percent - # of tokens from each, since if one sequence is very short then each token - # that's truncated likely contains more information than a longer sequence. 
-    while True:
-        total_length = len(tokens_a) + len(tokens_b)
-        if total_length <= max_length:
-            break
-        if len(tokens_a) > len(tokens_b):
-            tokens_a.pop()
-        else:
-            tokens_b.pop()
-
-
-def accuracy(out, labels):
-    outputs = np.argmax(out, axis=1)
-    return np.sum(outputs == labels)
-
-
-def select_field(features, field):
-    return [
-        [
-            choice[field]
-            for choice in feature.choices_features
-        ]
-        for feature in features
-    ]
diff --git a/lilbert/notebooks/train_bert_swag.ipynb b/lilbert/notebooks/train_bert_swag.ipynb
new file mode 100644
index 0000000..bc4bef3
--- /dev/null
+++ b/lilbert/notebooks/train_bert_swag.ipynb
@@ -0,0 +1,237 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Download the `SWAG` dataset"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!git clone https://github.com/rowanz/swagaf.git\n",
+    "!mv swagaf/data/ ../datasets/SWAG\n",
+    "!rm -fr swagaf"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex.\n"
+     ]
+    }
+   ],
+   "source": [
+    "import sys\n",
+    "%load_ext autoreload\n",
+    "%autoreload 2\n",
+    "sys.path.append('..')\n",
+    "\n",
+    "import numpy as np\n",
+    "import random\n",
+    "import torch\n",
+    "import os\n",
+    "from pytorch_pretrained_bert.tokenization import BertTokenizer\n",
+    "\n",
+    "from lib import data_processors, tasks\n",
+    "from pytorch_pretrained_bert import BertForMultipleChoice\n",
+    "from lib.train_eval import train, evaluate, predict"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       ""
+      ]
+     },
+     "execution_count": 2,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "\n",
+    "params = {\n",
+    "    'data_dir': '../datasets/SWAG',\n",
+    "    'output_dir': '../output',\n",
+    "    'cache_dir': '../model_cache',\n",
+    "    'task_name': 'swag',\n",
+    "    'bert_model': 'bert-base-uncased',\n",
+    "    'max_seq_length': 128,\n",
+    "    'train_batch_size': 12,\n",
+    "    'eval_batch_size': 8,\n",
+    "    'learning_rate': 2e-5,\n",
+    "    'warmup_proportion': 0.1,\n",
+    "    'num_train_epochs': 1,\n",
+    "    'seed': 1331,\n",
+    "    'device': torch.device(\n",
+    "        'cuda' if torch.cuda.is_available()\n",
+    "        else 'cpu')\n",
+    "}\n",
+    "\n",
+    "random.seed(params['seed'])\n",
+    "np.random.seed(params['seed'])\n",
+    "torch.manual_seed(params['seed'])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "processor = tasks.processors[params['task_name']]()\n",
+    "tokenizer = BertTokenizer.from_pretrained(\n",
+    "    params['bert_model'], do_lower_case=True)\n",
+    "\n",
+    "train_examples = processor.get_train_examples(params['data_dir'])\n",
+    "dev_examples = processor.get_dev_examples(params['data_dir'])\n",
+    "\n",
+    "model = BertForMultipleChoice.from_pretrained(\n",
+    "    params['bert_model'],\n",
+    "    cache_dir=params['cache_dir'], num_choices=4).to(params['device'])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "converting examples: 100%|██████████| 73546/73546 [01:18<00:00, 937.95it/s] \n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "***** Running training *****\n",
+      "Num examples: 73546\n",
+      "Batch size: 
12\n", + "Num steps: 6128\n", + "\n", + "Epoch: 1\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration: 3%|▎ | 202/6129 [00:33<17:18, 5.71it/s]" + ] + }, + { + "ename": "KeyboardInterrupt", + "evalue": "", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 11\u001b[0m \u001b[0mtrain_examples\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 12\u001b[0m \u001b[0mvalid_examples\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mdev_examples\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 13\u001b[0;31m checkpoint_files=checkpoint_files)\n\u001b[0m", + "\u001b[0;32m~/projects/lilbert/lilbert/lib/train_eval.py\u001b[0m in \u001b[0;36mtrain\u001b[0;34m(model, tokenizer, params, train_examples, valid_examples, checkpoint_files)\u001b[0m\n\u001b[1;32m 83\u001b[0m \u001b[0mloss\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mbackward\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 84\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 85\u001b[0;31m \u001b[0mtr_loss\u001b[0m \u001b[0;34m+=\u001b[0m \u001b[0mloss\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mitem\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 86\u001b[0m \u001b[0mnb_tr_examples\u001b[0m \u001b[0;34m+=\u001b[0m \u001b[0minput_ids\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msize\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 87\u001b[0m \u001b[0mnb_tr_steps\u001b[0m \u001b[0;34m+=\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mKeyboardInterrupt\u001b[0m: " + ] + } + ], + "source": [ + "EPOCH_NUM = 1\n", + "\n", + "params['num_train_epochs'] = 1\n", + "checkpoint_files = {\n", + " 'config': 'bert_config.json',\n", + " 'model_weigths': 'model_{}_epoch_{}.pth'.format(\n", + " params['task_name'], EPOCH_NUM)\n", + "}\n", + "\n", + "model, result = train(model, tokenizer, params,\n", + " train_examples,\n", + " valid_examples=dev_examples,\n", + " checkpoint_files=checkpoint_files)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "converting examples: 0%| | 67/20006 [00:00<00:30, 660.73it/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "***** Running evaluation *****\n", + "Num examples: 20006\n", + "Batch size: 8\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "converting examples: 100%|██████████| 20006/20006 [00:23<00:00, 844.60it/s]\n", + "Evaluating: 28%|██▊ | 702/2501 [01:44<04:29, 6.69it/s]" + ] + } + ], + "source": [ + "result, prob_preds = evaluate(model, tokenizer, params,\n", + " dev_examples)\n", + "result" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": 
"ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.8" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/lilbert/notebooks/train_bert_swag.py b/lilbert/notebooks/train_bert_swag.py deleted file mode 100644 index 911e57e..0000000 --- a/lilbert/notebooks/train_bert_swag.py +++ /dev/null @@ -1,58 +0,0 @@ -import numpy as np -import random -import torch -from pytorch_pretrained_bert import BertForMultipleChoice -import os -from pytorch_pretrained_bert.tokenization import BertTokenizer - -from lib import data_processors, tasks -from lib.bert import BertForSequenceClassification -from lib.train_eval import train, evaluate, predict - -params = { - 'data_dir': '../../data/SWAG', - 'output_dir': '../../output', - 'cache_dir': '../../model_cache', - 'task_name': 'swag', - 'bert_model': 'bert-base-uncased', - 'max_seq_length': 12, - 'train_batch_size': 1, - 'eval_batch_size': 8, - 'learning_rate': 2e-5, - 'warmup_proportion': 0.1, - 'num_train_epochs': 1, - 'seed': 1331, - 'device': torch.device( - 'cuda' if torch.cuda.is_available() - else 'cpu') -} - -random.seed(params['seed']) -np.random.seed(params['seed']) -torch.manual_seed(params['seed']) - -processor = tasks.processors[params['task_name']]() -tokenizer = BertTokenizer.from_pretrained( - params['bert_model'], do_lower_case=True) - -train_examples = processor.get_train_examples(params['data_dir']) -dev_examples = processor.get_dev_examples(params['data_dir']) - -model = BertForMultipleChoice.from_pretrained( - params['bert_model'], - cache_dir=params['cache_dir'], num_choices=4).to(params['device']) - -EPOCH_NUM = 1 - -params['num_train_epochs'] = 1 -checkpoint_files = { - 'config': 'bert_config.json', - 'model_weigths': 'model_{}_epoch_{}.pth'.format( - params['task_name'], EPOCH_NUM) -} - -model, result = train(model, tokenizer, params, - train_examples, - valid_examples=dev_examples, - checkpoint_files=checkpoint_files) -print(result) From fd45db4ce85938235da7df54dc810551cb0551bc Mon Sep 17 00:00:00 2001 From: mponty Date: Fri, 29 Mar 2019 14:06:04 +0000 Subject: [PATCH 5/5] add evaluation in train_bert_swag.ipynb --- lilbert/notebooks/train_bert_swag.ipynb | 79 ++++++++++++++----------- 1 file changed, 46 insertions(+), 33 deletions(-) diff --git a/lilbert/notebooks/train_bert_swag.ipynb b/lilbert/notebooks/train_bert_swag.ipynb index bc4bef3..d7fd280 100644 --- a/lilbert/notebooks/train_bert_swag.ipynb +++ b/lilbert/notebooks/train_bert_swag.ipynb @@ -56,7 +56,7 @@ { "data": { "text/plain": [ - "" + "" ] }, "execution_count": 2, @@ -109,33 +109,14 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "converting examples: 100%|██████████| 73546/73546 [01:18<00:00, 937.95it/s] \n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "***** Running training *****\n", - "Num examples: 73546\n", - "Batch size: 12\n", - "Num steps: 6128\n", - "\n", - "Epoch: 1\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Iteration: 3%|▎ | 202/6129 [00:33<17:18, 5.71it/s]" + "converting examples: 6%|▌ | 4280/73546 [00:05<01:23, 830.33it/s]\n" ] }, { @@ -145,8 +126,13 @@ "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call 
last)", - "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 11\u001b[0m \u001b[0mtrain_examples\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 12\u001b[0m \u001b[0mvalid_examples\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mdev_examples\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 13\u001b[0;31m checkpoint_files=checkpoint_files)\n\u001b[0m", - "\u001b[0;32m~/projects/lilbert/lilbert/lib/train_eval.py\u001b[0m in \u001b[0;36mtrain\u001b[0;34m(model, tokenizer, params, train_examples, valid_examples, checkpoint_files)\u001b[0m\n\u001b[1;32m 83\u001b[0m \u001b[0mloss\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mbackward\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 84\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 85\u001b[0;31m \u001b[0mtr_loss\u001b[0m \u001b[0;34m+=\u001b[0m \u001b[0mloss\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mitem\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 86\u001b[0m \u001b[0mnb_tr_examples\u001b[0m \u001b[0;34m+=\u001b[0m \u001b[0minput_ids\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msize\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 87\u001b[0m \u001b[0mnb_tr_steps\u001b[0m \u001b[0;34m+=\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 11\u001b[0m \u001b[0mtrain_examples\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 12\u001b[0m \u001b[0mvalid_examples\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mdev_examples\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 13\u001b[0;31m checkpoint_files=checkpoint_files)\n\u001b[0m", + "\u001b[0;32m~/projects/lilbert/lilbert/lib/train_eval.py\u001b[0m in \u001b[0;36mtrain\u001b[0;34m(model, tokenizer, params, train_examples, valid_examples, checkpoint_files)\u001b[0m\n\u001b[1;32m 46\u001b[0m \u001b[0mtrain_examples\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 47\u001b[0m \u001b[0mtokenizer\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 48\u001b[0;31m params)\n\u001b[0m\u001b[1;32m 49\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"***** Running training *****\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 50\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"Num examples:\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtrain_examples\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/projects/lilbert/lilbert/lib/feature_processors.py\u001b[0m in \u001b[0;36mconvert_examples_to_features\u001b[0;34m(examples, tokenizer, params)\u001b[0m\n\u001b[1;32m 9\u001b[0m params['max_seq_length'], tokenizer)\n\u001b[1;32m 10\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 11\u001b[0;31m \u001b[0;32mreturn\u001b[0m 
\u001b[0mconvert_examples_to_features_swag\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mexamples\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mparams\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'max_seq_length'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtokenizer\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 12\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 13\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/projects/lilbert/lilbert/lib/feature_processors.py\u001b[0m in \u001b[0;36mconvert_examples_to_features_swag\u001b[0;34m(examples, max_seq_length, tokenizer)\u001b[0m\n\u001b[1;32m 113\u001b[0m \u001b[0;31m# able to shrink it according to ending_tokens\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 114\u001b[0m \u001b[0mcontext_tokens_choice\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcontext_tokens\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 115\u001b[0;31m \u001b[0mending_tokens\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mstart_ending_tokens\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0mtokenizer\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtokenize\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mending\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 116\u001b[0m \u001b[0;31m# Modifies `context_tokens_choice` and `ending_tokens` in\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 117\u001b[0m \u001b[0;31m# place so that the total length is less than the\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/.local/lib/python3.6/site-packages/pytorch_pretrained_bert/tokenization.py\u001b[0m in \u001b[0;36mtokenize\u001b[0;34m(self, text)\u001b[0m\n\u001b[1;32m 91\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mtokenize\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtext\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 92\u001b[0m \u001b[0msplit_tokens\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 93\u001b[0;31m \u001b[0;32mfor\u001b[0m \u001b[0mtoken\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mbasic_tokenizer\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtokenize\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtext\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 94\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0msub_token\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mwordpiece_tokenizer\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtokenize\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtoken\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 95\u001b[0m \u001b[0msplit_tokens\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mappend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msub_token\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/.local/lib/python3.6/site-packages/pytorch_pretrained_bert/tokenization.py\u001b[0m in \u001b[0;36mtokenize\u001b[0;34m(self, text)\u001b[0m\n\u001b[1;32m 178\u001b[0m \u001b[0;31m# 
characters in the vocabulary because Wikipedia does have some Chinese\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 179\u001b[0m \u001b[0;31m# words in the English Wikipedia.).\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 180\u001b[0;31m \u001b[0mtext\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_tokenize_chinese_chars\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtext\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 181\u001b[0m \u001b[0morig_tokens\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mwhitespace_tokenize\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtext\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 182\u001b[0m \u001b[0msplit_tokens\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/.local/lib/python3.6/site-packages/pytorch_pretrained_bert/tokenization.py\u001b[0m in \u001b[0;36m_tokenize_chinese_chars\u001b[0;34m(self, text)\u001b[0m\n\u001b[1;32m 227\u001b[0m \u001b[0moutput\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 228\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mchar\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mtext\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 229\u001b[0;31m \u001b[0mcp\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mord\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mchar\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 230\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_is_chinese_char\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcp\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 231\u001b[0m \u001b[0moutput\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mappend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\" \"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;31mKeyboardInterrupt\u001b[0m: " ] } @@ -169,16 +155,14 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": { - "scrolled": true - }, + "execution_count": 10, + "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "converting examples: 0%| | 67/20006 [00:00<00:30, 660.73it/s]" + "converting examples: 0%| | 72/20006 [00:00<00:27, 719.02it/s]" ] }, { @@ -194,14 +178,43 @@ "name": "stderr", "output_type": "stream", "text": [ - "converting examples: 100%|██████████| 20006/20006 [00:23<00:00, 844.60it/s]\n", - "Evaluating: 28%|██▊ | 702/2501 [01:44<04:29, 6.69it/s]" + "converting examples: 100%|██████████| 20006/20006 [00:23<00:00, 869.80it/s]\n", + "Evaluating: 100%|██████████| 2501/2501 [01:02<00:00, 40.08it/s]\n" ] + }, + { + "data": { + "text/plain": [ + "{'eval_loss': 1.3916393557536797,\n", + " 'eval_accuracy': 0.22358292512246325,\n", + " 'eval_f1_score': 0.21350037939898828}" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ - "result, prob_preds = evaluate(model, tokenizer, params,\n", - " dev_examples)\n", + "from sklearn.metrics import f1_score\n", + "from lib import metrics\n", + "\n", + "def f1_score_multiclass(true_labels, prob_preds):\n", + " 
pred_labels = np.argmax(prob_preds, axis=1)\n", + " return f1_score(true_labels, pred_labels, average='macro')\n", + "\n", + "print(\"***** Running evaluation *****\")\n", + "print(\"Num examples: \", len(dev_examples))\n", + "print(\"Batch size: \", params['eval_batch_size'])\n", + "\n", + "prob_preds = predict(model, tokenizer, params, dev_examples)\n", + "true_labels = np.array([int(example.label)\n", + " for i, example in enumerate(dev_examples)])\n", + "result = {\n", + " 'eval_loss': metrics.log_loss(true_labels, prob_preds),\n", + " 'eval_accuracy': metrics.accuracy(true_labels, prob_preds),\n", + " 'eval_f1_score': f1_score_multiclass(true_labels, prob_preds),\n", + "}\n", "result" ] },