Skip to content

Commit

Permalink
feat: update nougat cleaning
Browse files Browse the repository at this point in the history
  • Loading branch information
MicPie committed Oct 9, 2023
1 parent 4ca1ac1 commit 6df4844
Showing 1 changed file with 53 additions and 17 deletions.
70 changes: 53 additions & 17 deletions data/natural/preprocess_nougat.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@
from bs4 import BeautifulSoup
from tqdm import tqdm

TEXT_CUTOFF = 0


def load_mmd_from_path(path):
with open(path) as f:
Expand All @@ -15,12 +17,13 @@ def load_mmd_from_path(path):

rm_ref_single = re.compile(r"\[\d+\]")
rm_ref_multi = re.compile(r"\[\d+.+\d\]")
rm_missing_page = re.compile(r"\n\n\[MISSING_PAGE_FAIL:\d+\]")
rm_missing_page_fail = re.compile(r"\n\n\[MISSING_PAGE_FAIL:\d+\]")
rm_missing_page_empty = re.compile(r"\n\n\[MISSING_PAGE_EMPTY:\d+\]")
rm_d = re.compile(r"\d")


def clean_mmd(mmd):
res = [rm_ref_single, rm_ref_multi, rm_missing_page]
res = [rm_ref_single, rm_ref_multi, rm_missing_page_fail, rm_missing_page_empty]
for r in res:
mmd = r.sub("", mmd)
return mmd
Expand All @@ -31,23 +34,46 @@ def mmd_to_html(mmd):


exclude_headers = [
"associated content",
"accession codes",
"acknowledgement",
"acknowledgements",
"acknowledgment",
"acknowledgments",
"additional information",
"associated content",
"author contributions",
"author information",
"corresponding author",
"author",
"authors",
"author contributions",
"notes",
"acknowledgments",
"references",
"notes",
"supporting information",
"bibliography",
"code availability",
"acknowledgements",
"author contributions",
"competing interest",
"competing interests statement",
"competing interests",
"conflict of interest",
"conflicts of interest",
"corresponding author",
"data and software availability",
"data availability",
"declaration of competing interest",
"dedication",
"disclaimer",
"financial support",
"funding acs",
"funding sources",
"acknowledgment",
"keywords",
"note",
"notes",
"orcid",
"present address",
"reference",
"references",
"supplementary material",
"supporting formation available",
"supporting information available",
"supporting information",
"table of contents",
# "abbreviations",
]


Expand All @@ -69,7 +95,10 @@ def html_to_clean_text(html, verbose=False):
cont = False

if cont:
text += header_text
if header_text.isupper():
text += header_text.capitalize() # or .title() ?
else:
text += header_text
text += "\n"

if (
Expand All @@ -84,6 +113,8 @@ def html_to_clean_text(html, verbose=False):
text += sibling.text
text += "\n\n"
text += "\n"

text = text.replace("\n\n\n", "\n\n") # clean up line breaks
return text


Expand All @@ -99,9 +130,14 @@ def create_jsonl_from_dir(path):
mmd = clean_mmd(mmd)
html = mmd_to_html(mmd)
text = html_to_clean_text(html)
out = {"fn": fn, "text": text}
with open(path_jsonl, "a") as f:
f.write(json.dumps(out) + "\n")
if len(text) <= TEXT_CUTOFF:
print(f"Too short text in: {fn}")
elif text.count("Journal of") > 10:
print(f'Too many "Journal of" in text: {fn}')
else:
out = {"fn": fn, "text": text}
with open(path_jsonl, "a") as f:
f.write(json.dumps(out) + "\n")


if __name__ == "__main__":
Expand Down

0 comments on commit 6df4844

Please sign in to comment.