feat: update nougat cleaning 3

OpenBioML · Oct 10, 2023 · f69d61f · f69d61f
1 parent d557b8c
commit f69d61f
Showing 1 changed file with 45 additions and 26 deletions.
diff --git a/data/natural/preprocess_nougat.py b/data/natural/preprocess_nougat.py
@@ -15,25 +15,33 @@ def load_mmd_from_path(path):
     return data
 
 
-rm_missing_page_fail = re.compile(r"\n\n\[MISSING_PAGE_FAIL:\d+\]")
-rm_missing_page_empty = re.compile(r"\n\n\[MISSING_PAGE_EMPTY:\d+\]")
-rm_figure_caption_start = re.compile(r"[Ff]igure \d+\.?[:\|]?\s")
-rm_fig_caption_start = re.compile(r"[Ff]ig. \d+\.?[:\|]?\s")
-rm_figure_in_brackets = re.compile(r" \([Ff]igure \d+\.?\)")
-rm_fig_in_brackets = re.compile(r" \([Ff]ig. \d+\.?\)")
-rm_figure_reference = re.compile(r", see [Ff]igure \d+")
-rm_fig_reference = re.compile(r", see [Ff]ig. \d+")
-rm_ref_single = re.compile(r"\s?\[\d+]")
-rm_ref_multi = re.compile(r"\s?\[\d+.+\d\]")
-rm_email_with_text = re.compile(r"[Ee]mail[:\s] \S*@\S*\s?")
-rm_email = re.compile(r"\S*@\S*\s?")
+rm_missing_page_fail = re.compile(r"\n\n\[MISSING_PAGE_FAIL:\d+\]"), ""
+rm_missing_page_empty = re.compile(r"\n\n\[MISSING_PAGE_EMPTY:\d+\]"), ""
+rm_missing_page_post = re.compile(r"\n\n\[MISSING_PAGE_POST\]"), ""
+rm_figure_caption_start = re.compile(r"[Ff]igure \d+\w?\.?[:\|]?\s"), ""
+rm_schema_caption_start = re.compile(r"[Ss]chema \d+\w?\.?[:\|]?\s"), ""
+rm_fig_caption_start = re.compile(r"[Ff]ig. \d+\w?\.?[:\|]?\s"), ""
+rm_figure_in_brackets = re.compile(r" \([Ff]igure \d+\w?\.?\)"), ""
+rm_fig_in_brackets = re.compile(r" \([Ff]ig. \d+\w?\.?\)"), ""
+rm_figure_reference = re.compile(r", see [Ff]igure \d+\w?"), ""
+rm_fig_reference = re.compile(r", see [Ff]ig. \d+\w?"), ""
+rm_ref_single = re.compile(r"\s?\[\d+]"), ""
+rm_ref_multi = re.compile(r"\s?\[\d+.+\d\]"), ""
+rm_email_with_text = re.compile(r"[Ee]mail[:\s] \S*@\S*\s?"), ""
+rm_email = re.compile(r"\S*@\S*\s?"), ""
+rm_incomplete_sentence_start_para = re.compile(r"\n\n[a-z].+?\.\s"), "\n\n"
+rm_incomplete_sentence_end_para = re.compile(r"\.\s[A-Z,a-z][^\.]+?[a-z][,]?\n"), ".\n"
+rm_empty_table = re.compile(r"\n\n\\begin{table}\n\n\\end{table}\nTable.+?\."), "\n"
+rm_double_asterisk = re.compile(r"\*\*"), ""
 
 
 def clean_mmd(mmd):
-    res = [
+    reg_replace = [
         rm_missing_page_fail,
         rm_missing_page_empty,
+        rm_missing_page_post,
         rm_figure_caption_start,
+        rm_schema_caption_start,
         rm_fig_caption_start,
         rm_figure_in_brackets,
         rm_fig_in_brackets,
@@ -43,9 +51,13 @@ def clean_mmd(mmd):
         rm_ref_multi,
         rm_email_with_text,
         rm_email,
+        rm_incomplete_sentence_start_para,
+        rm_incomplete_sentence_end_para,
+        rm_empty_table,
+        rm_double_asterisk,
     ]
-    for r in res:
-        mmd = r.sub("", mmd)
+    for reg, replace in reg_replace:
+        mmd = reg.sub(replace, mmd)
     return mmd
 
 
@@ -95,6 +107,7 @@ def mmd_to_html(mmd):
     "supporting information available",
     "supporting information",
     "table of contents",
+    "corresponding authors:",
     # "abbreviations",
 ]
 
@@ -116,24 +129,27 @@ def html_to_clean_text(html, verbose=False):
             if header_text.lower().find(eh) != -1:
                 cont = False
 
+        first_headers = ["abstract", "introduction"]
+        if i == 0:
+            # check if we don't find a first_headers
+            if not (any([header_text.lower().find(fh) > -1 for fh in first_headers])):
+                cont = False
+
         if cont:
             if header_text.isupper():
                 text += header_text.capitalize()  # or .title() ?
             else:
                 text += header_text
             text += "\n"
 
-            if (
-                i != 0
-            ):  # the first header comes usually with unwanted infos, e.g., author information
-                for sibling in h.next_siblings:
-                    if sibling.name is None:
-                        continue
-                    elif sibling.name.startswith("h"):
-                        break
-                    else:
-                        text += sibling.text
-                        text += "\n\n"
+            for sibling in h.next_siblings:
+                if sibling.name is None:
+                    continue
+                elif sibling.name.startswith("h"):
+                    break
+                else:
+                    text += sibling.text
+                    text += "\n\n"
             text += "\n"
 
     text = text.replace("\n\n\n", "\n\n")  # clean up line breaks
@@ -160,6 +176,9 @@ def create_jsonl_from_dir(path):
             out = {"fn": fn, "text": text}
             with open(path_jsonl, "a") as f:
                 f.write(json.dumps(out) + "\n")
+            # uncomment to diff individual files for debugging
+            # with open(path.replace(".mmd", "_.mmd"), "w") as f:
+            #     f.write(text)
 
 
 if __name__ == "__main__":