Skip to content

Commit

Permalink
feat: update nougat cleaning 3
Browse files Browse the repository at this point in the history
  • Loading branch information
MicPie committed Oct 10, 2023
1 parent d557b8c commit f69d61f
Showing 1 changed file with 45 additions and 26 deletions.
71 changes: 45 additions & 26 deletions data/natural/preprocess_nougat.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,25 +15,33 @@ def load_mmd_from_path(path):
return data


rm_missing_page_fail = re.compile(r"\n\n\[MISSING_PAGE_FAIL:\d+\]")
rm_missing_page_empty = re.compile(r"\n\n\[MISSING_PAGE_EMPTY:\d+\]")
rm_figure_caption_start = re.compile(r"[Ff]igure \d+\.?[:\|]?\s")
rm_fig_caption_start = re.compile(r"[Ff]ig. \d+\.?[:\|]?\s")
rm_figure_in_brackets = re.compile(r" \([Ff]igure \d+\.?\)")
rm_fig_in_brackets = re.compile(r" \([Ff]ig. \d+\.?\)")
rm_figure_reference = re.compile(r", see [Ff]igure \d+")
rm_fig_reference = re.compile(r", see [Ff]ig. \d+")
rm_ref_single = re.compile(r"\s?\[\d+]")
rm_ref_multi = re.compile(r"\s?\[\d+.+\d\]")
rm_email_with_text = re.compile(r"[Ee]mail[:\s] \S*@\S*\s?")
rm_email = re.compile(r"\S*@\S*\s?")
rm_missing_page_fail = re.compile(r"\n\n\[MISSING_PAGE_FAIL:\d+\]"), ""
rm_missing_page_empty = re.compile(r"\n\n\[MISSING_PAGE_EMPTY:\d+\]"), ""
rm_missing_page_post = re.compile(r"\n\n\[MISSING_PAGE_POST\]"), ""
rm_figure_caption_start = re.compile(r"[Ff]igure \d+\w?\.?[:\|]?\s"), ""
rm_schema_caption_start = re.compile(r"[Ss]chema \d+\w?\.?[:\|]?\s"), ""
rm_fig_caption_start = re.compile(r"[Ff]ig. \d+\w?\.?[:\|]?\s"), ""
rm_figure_in_brackets = re.compile(r" \([Ff]igure \d+\w?\.?\)"), ""
rm_fig_in_brackets = re.compile(r" \([Ff]ig. \d+\w?\.?\)"), ""
rm_figure_reference = re.compile(r", see [Ff]igure \d+\w?"), ""
rm_fig_reference = re.compile(r", see [Ff]ig. \d+\w?"), ""
rm_ref_single = re.compile(r"\s?\[\d+]"), ""
rm_ref_multi = re.compile(r"\s?\[\d+.+\d\]"), ""
rm_email_with_text = re.compile(r"[Ee]mail[:\s] \S*@\S*\s?"), ""
rm_email = re.compile(r"\S*@\S*\s?"), ""
rm_incomplete_sentence_start_para = re.compile(r"\n\n[a-z].+?\.\s"), "\n\n"
rm_incomplete_sentence_end_para = re.compile(r"\.\s[A-Z,a-z][^\.]+?[a-z][,]?\n"), ".\n"
rm_empty_table = re.compile(r"\n\n\\begin{table}\n\n\\end{table}\nTable.+?\."), "\n"
rm_double_asterisk = re.compile(r"\*\*"), ""


def clean_mmd(mmd):
res = [
reg_replace = [
rm_missing_page_fail,
rm_missing_page_empty,
rm_missing_page_post,
rm_figure_caption_start,
rm_schema_caption_start,
rm_fig_caption_start,
rm_figure_in_brackets,
rm_fig_in_brackets,
Expand All @@ -43,9 +51,13 @@ def clean_mmd(mmd):
rm_ref_multi,
rm_email_with_text,
rm_email,
rm_incomplete_sentence_start_para,
rm_incomplete_sentence_end_para,
rm_empty_table,
rm_double_asterisk,
]
for r in res:
mmd = r.sub("", mmd)
for reg, replace in reg_replace:
mmd = reg.sub(replace, mmd)
return mmd


Expand Down Expand Up @@ -95,6 +107,7 @@ def mmd_to_html(mmd):
"supporting information available",
"supporting information",
"table of contents",
"corresponding authors:",
# "abbreviations",
]

Expand All @@ -116,24 +129,27 @@ def html_to_clean_text(html, verbose=False):
if header_text.lower().find(eh) != -1:
cont = False

first_headers = ["abstract", "introduction"]
if i == 0:
# check if we don't find a first_headers
if not (any([header_text.lower().find(fh) > -1 for fh in first_headers])):
cont = False

if cont:
if header_text.isupper():
text += header_text.capitalize() # or .title() ?
else:
text += header_text
text += "\n"

if (
i != 0
): # the first header comes usually with unwanted infos, e.g., author information
for sibling in h.next_siblings:
if sibling.name is None:
continue
elif sibling.name.startswith("h"):
break
else:
text += sibling.text
text += "\n\n"
for sibling in h.next_siblings:
if sibling.name is None:
continue
elif sibling.name.startswith("h"):
break
else:
text += sibling.text
text += "\n\n"
text += "\n"

text = text.replace("\n\n\n", "\n\n") # clean up line breaks
Expand All @@ -160,6 +176,9 @@ def create_jsonl_from_dir(path):
out = {"fn": fn, "text": text}
with open(path_jsonl, "a") as f:
f.write(json.dumps(out) + "\n")
# uncomment to diff individual files for debugging
# with open(path.replace(".mmd", "_.mmd"), "w") as f:
# f.write(text)


if __name__ == "__main__":
Expand Down

0 comments on commit f69d61f

Please sign in to comment.