Merge pull request #75 from thunlp/dev
fix template format & BMInf CPM (partially supported)
Showing 41 changed files with 2,155 additions and 92 deletions.
@@ -0,0 +1,11 @@
from .closed_QA import *
from .coreference import *
from .entity_typing import *
from .generation import *
from .nli import *
from .paraphrase import *
from .reading_comprehensation import *
from .relation import *
from .sentiment import *
from .summarization import *
from .topic_classification import *
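These wildcard imports re-export every task-specific processor from the package root, so downstream code can reach all of them with a single import. A minimal sketch of enumerating what the package exposes (the `processors` package name is an assumption for illustration):

# Hypothetical sketch: list the processor classes exposed at the package root.
import inspect
import processors  # assumed name of the package this __init__.py belongs to

available = [name for name, obj in vars(processors).items() if inspect.isclass(obj)]
print(available)  # e.g. ['ZhiDao', 'Mandarinograd', 'CLUEWSC2020', ...]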
@@ -0,0 +1,30 @@
from .processor import *


class ZhiDao(DataProcessor):
    """Baidu Zhidao community question answering (answer generation)."""
    def __init__(self):
        super().__init__()

    def get_examples(self, data_dir, split):
        if split != 'train':
            raise ValueError(f"unsupported split: {split}")
        path = os.path.join(data_dir, f"{split}.csv")
        examples = []
        with open(path, encoding='utf8') as f:
            reader = csv.reader(f, delimiter=",")
            for i, row in enumerate(reader):
                if i == 0: continue  # skip the CSV header row
                title, question, reply, is_best = row
                if not is_best: continue  # keep only rows flagged as best answers
                example = InputExample(
                    meta = {
                        "title": title,
                        "question": question,
                    },
                    tgt_text = reply,
                )
                examples.append(example)
        return examples

    def get_templates(self):
        # "Question: {title} {question} Answer:"
        return [
            '问题:{title} {question} 回答:',
        ]
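A minimal usage sketch for this processor, assuming a `data/zhidao` directory (the path is hypothetical) that holds the expected `train.csv`:

# Hypothetical usage: turn the first example into a prompt/target pair.
proc = ZhiDao()
examples = proc.get_examples("data/zhidao", "train")
prompt = proc.get_templates()[0].format(**examples[0].meta)
target = examples[0].tgt_text  # the best-rated reply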
@@ -0,0 +1,81 @@
from .processor import *


class Mandarinograd(CLSProcessor):
    """
    @inproceedings{bernard2020mandarinograd,
        title={Mandarinograd: A Chinese Collection of Winograd Schemas},
        author={Bernard, Timoth{\'e}e and Han, Ting},
        booktitle={Proceedings of the 12th Language Resources and Evaluation Conference},
        pages={21--26},
        year={2020}
    }
    """
    def __init__(self):
        super().__init__(
            labels_origin = [
                "0", "1",
            ],
            labels_mapped = [  # "cannot", "can"
                "不可以", "可以",
            ]
        )

    def get_examples(self, data_dir, split):
        path = os.path.join(data_dir, f"{split}.json")
        examples = []
        with open(path, encoding='utf8') as f:
            for example_json in json.load(f).values():
                example = InputExample(
                    meta = {
                        "before": example_json["text"],
                        "after": example_json["hypothesis"],
                        "options": self.labels_mapped,
                    },
                    tgt_text = self.get_label(example_json["class_id"]),
                )
                examples.append(example)
        return examples

    def get_templates(self):
        # "Text: {before} Question: after coreference resolution, can the text
        #  above be understood as "{after}"? {options}"
        return [
            '文本:{before} 问题:上述文本进行指代消解后,可以理解为 "{after}" 吗?{options}',
        ]


class CLUEWSC2020(CLSProcessor):
    """Chinese Winograd Schema Challenge task from the CLUE benchmark."""
    def __init__(self):
        super().__init__(
            labels_origin = [
                "false", "true",
            ],
            labels_mapped = [  # "no", "yes"
                "否", "是",
            ]
        )

    def get_examples(self, data_dir, split):
        path = os.path.join(data_dir, f"{split}.json")
        examples = []
        with open(path, encoding='utf8') as f:
            for line in f:
                example_json = json.loads(line)
                example = InputExample(
                    meta = {
                        "that": example_json["target"]["span1_text"],
                        "it": example_json["target"]["span2_text"],
                        "text": example_json["text"],
                        "options": self.labels_mapped,
                    },
                    tgt_text = self.get_label(example_json["label"]),
                )
                examples.append(example)
        return examples

    def get_templates(self):
        # "Text: {text} Question: in the text above, does "{it}" refer to "{that}"? {options}"
        return [
            '文本:{text} 问题:在上文中,"{it}" 指代 "{that}" 吗?{options}',
        ]
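Both classes delegate label conversion to `CLSProcessor.get_label`, which lives in `.processor` and is not part of this diff. A plausible sketch of the mapping it performs, inferred from the paired `labels_origin`/`labels_mapped` lists above (an assumption, not the repository's actual implementation):

# Assumed behavior of CLSProcessor.get_label, inferred from the paired label lists.
labels_origin = ["false", "true"]  # CLUEWSC2020
labels_mapped = ["否", "是"]        # "no", "yes"

def get_label(label):
    # Return the Chinese verbalizer at the same index as the raw dataset label.
    return labels_mapped[labels_origin.index(label)]

assert get_label("true") == "是"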
@@ -0,0 +1,214 @@
from .processor import *


class CMeEE_NER(CLSProcessor):
    """
    @inproceedings{hongying2020building,
        title={Building a Pediatric Medical Corpus: Word Segmentation and Named Entity Annotation},
        author={Hongying, Zan and Wenxin, Li and Kunli, Zhang and Yajuan, Ye and Baobao, Chang and Zhifang, Sui},
        booktitle={Workshop on Chinese Lexical Semantics},
        pages={652--664},
        year={2020},
        organization={Springer}
    }
    """
    def __init__(self):
        super().__init__(
            labels_origin = [
                "dis", "sym", "dru", "equ", "pro", "bod", "ite", "mic", "dep",
            ],
            # disease, clinical finding, drug, medical device, medical procedure,
            # body, lab test item, microorganism, department
            labels_mapped = [
                "疾病", "临床表现", "药物", "医疗设备", "医疗程序", "身体", "医学检验项目", "微生物类", "科室",
            ]
        )

    def get_examples(self, data_dir, split):
        path = os.path.join(data_dir, f"{split}.json")
        examples = []
        with open(path, encoding='utf8') as f:
            for example_json in json.load(f):
                for span in example_json["entities"]:
                    example = InputExample(
                        meta = {
                            "context": example_json["text"],
                            "entity": span["entity"],
                            "options": self.labels_mapped,
                        },
                        label = self.get_label(span["type"])
                    )
                    examples.append(example)
        return examples

    def get_templates(self):
        # "Text: {context} Question: in the text above, what is the type of
        #  entity "{entity}"? {options}"
        return [
            '文本:{context} 问题:上文中,实体“{entity}”是什么类型的? {options}',
        ]

class Resume_NER(CLSProcessor):
    """
    @inproceedings{zhang2018chinese,
        title={Chinese NER Using Lattice LSTM},
        author={Yue Zhang and Jie Yang},
        booktitle={Proceedings of the 56th Annual Meeting of the Association for Computational Linguistics (ACL)},
        year={2018}
    }
    """
    def __init__(self):
        super().__init__(
            labels_origin = [
                "NAME", "CONT", "LOC", "RACE", "PRO", "EDU", "ORG", "TITLE",
            ],
            # person name, nationality, place of origin, ethnicity, profession,
            # educational degree, organization, job title
            labels_mapped = [
                "人名", "国籍", "籍贯", "种族", "专业", "学位", "机构", "职称",
            ]
        )

    def get_examples(self, data_dir, split):
        path = os.path.join(data_dir, f"{split}.jsonline")
        examples = []
        with open(path, encoding='utf8') as f:
            for line in f:
                example_json = json.loads(line)
                for span in example_json["span_list"]:
                    example = InputExample(
                        meta = {
                            "context": "".join(example_json["tokens"]),
                            "entity": "".join(example_json["tokens"][span["start"]: span["end"]+1]),
                            "options": self.labels_mapped,
                        },
                        tgt_text = self.get_label(span["type"]),
                    )
                    examples.append(example)
        return examples

    def get_templates(self):
        return [
            '文本:{context} 问题:上文中,实体“{entity}”是什么类型的? {options}',
        ]

class Weibo_NER(CLSProcessor):
    """
    @inproceedings{peng2015ner,
        title={Named Entity Recognition for Chinese Social Media with Jointly Trained Embeddings},
        author={Peng, Nanyun and Dredze, Mark},
        booktitle={Proceedings of the Conference on Empirical Methods in Natural Language Processing (EMNLP)},
        pages={548--554},
        year={2015},
        url={https://www.aclweb.org/anthology/D15-1064/},
    }
    @inproceedings{peng2016improving,
        title={Improving Named Entity Recognition for Chinese Social Media with Word Segmentation Representation Learning},
        author={Peng, Nanyun and Dredze, Mark},
        booktitle={Proceedings of the 54th Annual Meeting of the Association for Computational Linguistics (ACL)},
        volume={2},
        pages={149--155},
        year={2016},
        url={https://www.aclweb.org/anthology/P16-2025/},
    }
    @inproceedings{he-sun-2017-f,
        title={{F}-Score Driven Max Margin Neural Network for Named Entity Recognition in {C}hinese Social Media},
        author={He, Hangfeng and Sun, Xu},
        booktitle={Proceedings of the 15th Conference of the {E}uropean Chapter of the Association for Computational Linguistics: Volume 2, Short Papers},
        month=apr,
        year={2017},
        address={Valencia, Spain},
        publisher={Association for Computational Linguistics},
        url={https://www.aclweb.org/anthology/E17-2113},
        pages={713--718},
    }
    """
    def __init__(self):
        super().__init__(
            labels_origin = [  # TODO: distinguish NAM (specific mention) from NOM (generic mention)
                "PER.NAM", "PER.NOM", "LOC.NAM", "LOC.NOM", "ORG.NAM", "ORG.NOM", "GPE.NAM", "GPE.NOM",
            ],
            # person, generic person, location, generic location, organization,
            # generic organization, geo-political entity, generic geo-political entity
            labels_mapped = [
                "人", "泛指人", "地点", "泛指地点", "机构", "泛指机构", "地理政治实体", "泛指地理政治实体",
            ]
        )

    def get_examples(self, data_dir, split):
        path = os.path.join(data_dir, f"{split}.jsonline")
        examples = []
        with open(path, encoding='utf8') as f:
            for line in f:
                example_json = json.loads(line)
                for span in example_json["span_list"]:
                    example = InputExample(
                        meta = {
                            "context": "".join(example_json["tokens"]),
                            "entity": "".join(example_json["tokens"][span["start"]: span["end"]+1]),
                            "options": self.labels_mapped,
                        },
                        tgt_text = self.get_label(span["type"]),
                    )
                    examples.append(example)
        return examples

    def get_templates(self):
        return [
            '文本:{context} 问题:上文中,实体“{entity}”是什么类型的? {options}',
        ]

class DH_MSRA(CLSProcessor):
    """MSRA Chinese NER data in BIO format: one token-tag pair per line,
    with blank lines separating sentences."""
    def __init__(self):
        super().__init__(
            labels_origin = [
                "PER", "LOC", "ORG"
            ],
            labels_mapped = [  # person, location, organization
                "人", "地点", "机构"
            ]
        )

    def get_examples(self, data_dir, split):
        if split != 'train':
            raise ValueError(f"unsupported split: {split}")
        path = os.path.join(data_dir, f"{split}.txt")
        examples = []
        with open(path, encoding='utf8') as f:
            xs, ys = [], []  # tokens and BIO tags of the current sentence
            for line in f:
                l = line.split()
                if len(l) == 0:
                    # blank line: the sentence is complete, extract its entity spans
                    i = 0
                    while i < len(xs):
                        if ys[i][0] == 'B':
                            # a span starts at i and extends over the following 'I' tags
                            j = i + 1
                            while j < len(xs):
                                if ys[j][0] != 'I': break
                                j = j + 1

                            example = InputExample(
                                meta = {
                                    "context": "".join(xs),
                                    "entity": "".join(xs[i:j]),
                                    "options": self.labels_mapped,
                                },
                                tgt_text = self.get_label(ys[i][2:]),  # drop the "B-" prefix
                            )
                            examples.append(example)

                            i = j
                        else:
                            i = i + 1

                    xs, ys = [], []
                else:
                    xs.append(l[0])
                    ys.append(l[1])
        return examples

    def get_templates(self):
        # "Text: {context} Question: in the text above, what is the type of
        #  entity "{entity}"? {options}"
        return [
            '文本:{context} 问题:上文中,实体“{entity}”是什么类型的? {options}',
        ]
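DH_MSRA decodes BIO tag sequences inline. For clarity, here is the same span extraction as a standalone helper with a quick self-check (a sketch mirroring the loop above, not code from the repository):

# BIO span extraction equivalent to the inline loop in DH_MSRA.get_examples.
def bio_spans(tags):
    # Yield (start, end, entity_type) for each B-/I- span in a BIO tag list.
    i = 0
    while i < len(tags):
        if tags[i][0] == 'B':
            j = i + 1
            while j < len(tags) and tags[j][0] == 'I':
                j += 1
            yield i, j, tags[i][2:]
            i = j
        else:
            i += 1

assert list(bio_spans(["B-PER", "I-PER", "O", "B-LOC"])) == [(0, 2, "PER"), (3, 4, "LOC")]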