Skip to content

Commit

Permalink
feat: clean up exclude_headers
Browse files: browse the repository at this point in the history
  • Loading branch information
MicPie committed Oct 11, 2023
1 parent a1af563 commit 8f0ebe1
Showing 1 changed file with 3 additions and 14 deletions.
17 changes: 3 additions & 14 deletions data/natural/preprocess_nougat.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -119,24 +119,17 @@ def remove_header(mmd, header_span):
exclude_headers = [
"accession codes",
"acknowledgement",
"acknowledgements",
"acknowledgment",
"acknowledgments",
"additional files",
"additional information",
"associated content",
"author contributions",
"author information",
"author",
"authors",
"author", # incl: "author information", "corresponding author",
"bibliography",
"code availability",
"competing interest",
"competing interests statement",
"competing interests",
"conflict of interest",
"conflicts of interest",
"corresponding author",
"data and software availability",
"data availability",
"declaration of competing interest",
Expand All @@ -145,23 +138,19 @@ def remove_header(mmd, header_span):
"financial support",
"funding acs",
"funding sources",
"graphical toc entry",
"graphical toc",
"graphical abstract",
"keywords",
"note",
"notes",
"orcid",
"present address",
"reference",
"references",
"supplementary material",
"supporting formation available",
"supporting information available",
"supporting information",
"table of contents",
# "toc", # creates false positives
"corresponding authors:",
# "abbreviations",
# "toc", # creates false positives
]


Expand Down

0 comments on commit 8f0ebe1

Please sign in to comment.