diff --git a/data/natural/preprocess_nougat.py b/data/natural/preprocess_nougat.py
index 3f5316015..28c9449bc 100644
--- a/data/natural/preprocess_nougat.py
+++ b/data/natural/preprocess_nougat.py
@@ -15,11 +15,15 @@ def load_mmd_from_path(path):
     return data
 
 
+rm_double_asterisk_start = re.compile(r"\n\*\*"), "\n## "
+rm_double_asterisk_end = re.compile(r"\*\*\n"), ""
+rm_double_asterisk = re.compile(r"\*\*"), ""
 rm_missing_page_fail = re.compile(r"\n\n\[MISSING_PAGE_FAIL:\d+\]"), ""
 rm_missing_page_empty = re.compile(r"\n\n\[MISSING_PAGE_EMPTY:\d+\]"), ""
 rm_missing_page_post = re.compile(r"\n\n\[MISSING_PAGE_POST\]"), ""
 rm_figure_caption_start = re.compile(r"[Ff]igure \d+\w?\.?[:\|]?\s"), ""
-rm_schema_caption_start = re.compile(r"[Ss]chema \d+\w?\.?[:\|]?\s"), ""
+# One pattern for both "Schema N" and "Scheme N" caption prefixes; a second assignment to this name would discard the first.
+rm_schema_caption_start = re.compile(r"[Ss]chem[ae] \d+\w?\.?[:\|]?\s"), ""
 rm_fig_caption_start = re.compile(r"[Ff]ig. \d+\w?\.?[:\|]?\s"), ""
 rm_figure_in_brackets = re.compile(r" \([Ff]igure \d+\w?\.?\)"), ""
 rm_fig_in_brackets = re.compile(r" \([Ff]ig. \d+\w?\.?\)"), ""
@@ -29,14 +33,19 @@ def load_mmd_from_path(path):
 rm_ref_multi = re.compile(r"\s?\[\d+.+\d\]"), ""
 rm_email_with_text = re.compile(r"[Ee]mail[:\s] \S*@\S*\s?"), ""
 rm_email = re.compile(r"\S*@\S*\s?"), ""
+rm_empty_table = re.compile(r"\n\n\\begin{table}\n\n\\end{table}\nTable.+?\."), "\n"
 rm_incomplete_sentence_start_para = re.compile(r"\n\n[a-z].+?\.\s"), "\n\n"
 rm_incomplete_sentence_end_para = re.compile(r"\.\s[A-Z,a-z][^\.]+?[a-z][,]?\n"), ".\n"
-rm_empty_table = re.compile(r"\n\n\\begin{table}\n\n\\end{table}\nTable.+?\."), "\n"
-rm_double_asterisk = re.compile(r"\*\*"), ""
+
+# A 19xx/20xx year directly followed by a comma; counted in create_jsonl_from_dir to skip texts with too many of them.
+year_numbers = re.compile(r"(?:19|20)\d\d,")
 
 
 def clean_mmd(mmd):
     reg_replace = [
+        rm_double_asterisk_start,
+        rm_double_asterisk_end,
+        # rm_double_asterisk,
         rm_missing_page_fail,
         rm_missing_page_empty,
         rm_missing_page_post,
@@ -51,10 +60,9 @@ def clean_mmd(mmd):
         rm_ref_multi,
         rm_email_with_text,
         rm_email,
+        rm_empty_table,
         rm_incomplete_sentence_start_para,
         rm_incomplete_sentence_end_para,
-        rm_empty_table,
-        rm_double_asterisk,
     ]
     for reg, replace in reg_replace:
         mmd = reg.sub(replace, mmd)
@@ -95,6 +103,7 @@ def mmd_to_html(mmd):
         "funding acs",
         "funding sources",
         "graphical toc entry",
+        "graphical abstract",
         "keywords",
         "note",
         "notes",
@@ -107,6 +116,7 @@ def mmd_to_html(mmd):
         "supporting information available",
         "supporting information",
         "table of contents",
+        "toc",
         "corresponding authors:",
         # "abbreviations",
     ]
@@ -172,6 +182,8 @@ def create_jsonl_from_dir(path):
             print(f"Too short text in: {fn}")
         elif text.count("Journal of") > 10:
             print(f'Too many "Journal of" in text: {fn}')
+        elif len(year_numbers.findall(text)) > 10:
+            print(f"Too many year numbers in text: {fn}")
         else:
             out = {"fn": fn, "text": text}
             with open(path_jsonl, "a") as f: