feat: init preprocess_nougat.py
MicPie committed Oct 6, 2023
1 parent 9d8952a commit 4ca1ac1
Showing 2 changed files with 111 additions and 2 deletions.
4 changes: 2 additions & 2 deletions .pre-commit-config.yaml
@@ -21,13 +21,13 @@ repos:
     exclude: ^experiments/configs
 
 - repo: https://github.com/psf/black
-  rev: 23.3.0
+  rev: 23.9.1
   hooks:
   - id: black
     language_version: python3 # Should be a command that runs python3.6+
 
 - repo: https://github.com/PyCQA/flake8
-  rev: 6.0.0
+  rev: 6.1.0
   hooks:
   - id: flake8
     args: [--count, --show-source, --statistics]
109 changes: 109 additions & 0 deletions data/natural/preprocess_nougat.py
@@ -0,0 +1,109 @@
import glob
import json
import re

import markdown
from bs4 import BeautifulSoup
from tqdm import tqdm


def load_mmd_from_path(path):
    with open(path) as f:
        data = f.read()
    return data


# Patterns for stripping citation markers and Nougat failure placeholders.
rm_ref_single = re.compile(r"\[\d+\]")  # single reference, e.g., [12]
rm_ref_multi = re.compile(r"\[\d+.+\d\]")  # reference lists/ranges, e.g., [1, 2, 3]
rm_missing_page = re.compile(r"\n\n\[MISSING_PAGE_FAIL:\d+\]")
rm_d = re.compile(r"\d")  # any digit (not used below)


def clean_mmd(mmd):
    res = [rm_ref_single, rm_ref_multi, rm_missing_page]
    for r in res:
        mmd = r.sub("", mmd)
    return mmd


def mmd_to_html(mmd):
    return markdown.markdown(mmd)


exclude_headers = [
    "associated content",
    "accession codes",
    "author information",
    "corresponding author",
    "authors",
    "author contributions",
    "notes",
    "acknowledgments",
    "references",
    "notes",
    "supporting information",
    "code availability",
    "acknowledgements",
    "author contributions",
    "competing interests",
    "funding sources",
    "acknowledgment",
]


def html_to_clean_text(html, verbose=False):
    soup = BeautifulSoup(html, "html.parser")
    text = ""
    headers = soup.find_all(re.compile(r"^h\d$"))
    if verbose:
        for i, h in enumerate(headers):
            print(i, h.text)
        print()

    for i, h in enumerate(headers):
        header_text = h.text

        cont = True  # keep this section unless its header matches an excluded one
        for eh in exclude_headers:
            if header_text.lower().find(eh) != -1:
                cont = False

        if cont:
            text += header_text
            text += "\n"

            # the first header usually comes with unwanted info, e.g., author information
            if i != 0:
                for sibling in h.next_siblings:
                    if sibling.name is None:
                        continue
                    elif sibling.name.startswith("h"):
                        break
                    else:
                        text += sibling.text
                        text += "\n\n"
            text += "\n"
    return text


def create_jsonl_from_dir(path):
    print(f"{path=}")
    paths = sorted(glob.glob(path + "/*.mmd"))
    path_jsonl = path + "/out.jsonl"
    print(f"{path_jsonl=}")
    for path in (pbar := tqdm(paths)):
        fn = path.split("/")[-1].split(".mmd")[0]
        pbar.set_postfix_str(fn)
        mmd = load_mmd_from_path(path)
        mmd = clean_mmd(mmd)
        html = mmd_to_html(mmd)
        text = html_to_clean_text(html)
        out = {"fn": fn, "text": text}
        with open(path_jsonl, "a") as f:  # append one JSON line per processed file
            f.write(json.dumps(out) + "\n")


if __name__ == "__main__":
    path_base = ""  # directory containing the .mmd files to process
    create_jsonl_from_dir(path_base)
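
A minimal usage sketch (the directory name below is hypothetical, and the import assumes preprocess_nougat.py is on the Python path):

import json

from preprocess_nougat import create_jsonl_from_dir

mmd_dir = "data/natural/mmd"  # hypothetical directory of Nougat .mmd outputs
create_jsonl_from_dir(mmd_dir)  # writes one JSON line per file to mmd_dir/out.jsonl

# Read the result back to inspect the cleaned text.
with open(mmd_dir + "/out.jsonl") as f:
    records = [json.loads(line) for line in f]
if records:
    print(records[0]["fn"], len(records[0]["text"]))

Because out.jsonl is opened in append mode, rerunning the script on the same directory adds duplicate records rather than overwriting the file.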
