Skip to content

Commit

Permalink
Filter out tables from DOCX
Browse files Browse the repository at this point in the history
  • Loading branch information
blakeNaccarato committed Aug 2, 2023
1 parent df1db22 commit 8d0de31
Show file tree
Hide file tree
Showing 5 changed files with 61 additions and 3 deletions.
4 changes: 2 additions & 2 deletions data/docs.dvc
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
outs:
- md5: 24468ff299e6b6f608ceb7e52618af0d.dir
size: 3246802
- md5: 454f0f0ed0a390bc98cda08391d71afc.dir
size: 3235049
nfiles: 11
hash: md5
path: docs
1 change: 1 addition & 0 deletions params.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ project_paths:
mpl_hide_title: plotting/hide_title.mplstyle
scripts: scripts
zotero: scripts/zotero.lua
filt: scripts/filt.py
csl: scripts/international-journal-of-heat-and-mass-transfer.csl
template: scripts/template.dotx
stage_find_contours: src/boilercv/stages/find_contours.py
Expand Down
53 changes: 53 additions & 0 deletions scripts/filt.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
"""Filter tables from Pandoc input."""

from dataclasses import dataclass
from typing import Any

from pandocfilters import toJSONFilter

# Returned by `main` in place of the current element to signal its removal.
REMOVE: list[Any] = []
"""Signals the removal of a block."""

# `check_table` appends keys here while a sequence is matching; `main` clears it
# as soon as the current element is not part of any match.
memory: list[str] = []
"""Memory of block keys for multi-block matching."""


def main(
    key: str, _value: Any, _format: str, _meta: dict[str, Any]
) -> list[Any] | None:
    """Decide whether the element currently being visited should be removed.

    This is the action handed to ``toJSONFilter``. Only ``key`` is inspected;
    the other parameters exist because the action is invoked with four
    positional arguments.

    Args:
        key: Type name of the element being visited (e.g. ``"RawBlock"``).
        _value: Contents of the element. Unused.
        _format: Third action argument supplied by pandocfilters. Unused.
        _meta: Fourth action argument supplied by pandocfilters. Unused.

    Returns:
        ``REMOVE`` when any check asks for removal, otherwise ``None``.
    """
    checks = [check_table(key)]
    # An element that continues no pattern ends any partially matched sequence
    if memory and not any(check.match for check in checks):
        memory.clear()
    if any(check.remove for check in checks):
        return REMOVE  # Remove this block
    return None


@dataclass
class Check:
    """Whether the current block is part of a match, and whether to remove it.

    Both fields default to ``True``, so a bare ``Check()`` reports a block that
    matched and should be removed.
    """

    match: bool = True
    """Whether the current block is part of a match."""

    remove: bool = True
    """Whether to remove the current block."""


# `check_table` compares incoming element keys against these entries in order.
# NOTE(review): "Str" looks like an inline element type rather than a block in
# Pandoc's AST -- confirm the third step can actually be reached.
TABLE = ["RawBlock", "Plain", "Str"]
"""Sequence of blocks representing a table."""


def check_table(key: str) -> Check:
    """Check whether ``key`` starts or continues the ``TABLE`` sequence.

    Progress is tracked in the module-level ``memory`` list. This function only
    appends to it; clearing is left to the caller.

    Args:
        key: Type name of the element being visited.

    Returns:
        ``Check()`` (matched, remove) when ``key`` is the first entry of
        ``TABLE`` or the expected follow-up to what is already in ``memory``;
        ``Check(match=False, remove=False)`` otherwise.
    """
    # First entry: always starts (or restarts) a sequence
    if key == TABLE[0]:
        memory.append(key)
        return Check()
    # Second entry: only counts directly after the first
    if memory and key == TABLE[1] and memory[-1] == TABLE[0]:
        memory.append(key)
        return Check()
    # Third entry: only counts directly after the first two; nothing is recorded
    if memory and key == TABLE[2] and memory[-2:] == TABLE[:2]:
        return Check()
    return Check(match=False, remove=False)


if __name__ == "__main__":
    # Run as a Pandoc JSON filter with `main` as the per-element action
    toJSONFilter(main)
1 change: 1 addition & 0 deletions src/boilercv/models/paths.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ class ProjectPaths(CreatePathsModel):
# ! SCRIPTS
scripts: DirectoryPath = project / "scripts"
zotero: FilePath = scripts / "zotero.lua"
filt: FilePath = scripts / "filt.py"
csl: FilePath = scripts / "international-journal-of-heat-and-mass-transfer.csl"
template: FilePath = scripts / "template.dotx"

Expand Down
5 changes: 4 additions & 1 deletion src/boilercv/pre_repro.py
Original file line number Diff line number Diff line change
Expand Up @@ -127,6 +127,7 @@ async def report(nbs: dict[Path, str]):
for kwarg, path in dict(
workdir=PARAMS.paths.md,
template=PARAMS.project_paths.template,
filt=PARAMS.project_paths.filt,
zotero=PARAMS.project_paths.zotero,
csl=PARAMS.project_paths.csl,
docx=PARAMS.paths.docx / nb.with_suffix(".docx").name,
Expand Down Expand Up @@ -185,7 +186,7 @@ async def wrapped(*args, **kwargs):

@preserve_dir
async def report_on_notebook(
workdir: str, template: str, zotero: str, csl: str, docx: str, md: str
workdir: str, template: str, filt: str, zotero: str, csl: str, docx: str, md: str
):
"""Generate a DOCX report from a notebook.
Expand All @@ -202,6 +203,8 @@ async def report_on_notebook(
" --from markdown-auto_identifiers" # Avoids bookmark pollution around Markdown headers
" --to docx" # The output format
f" --reference-doc {template}" # The template to export literature reviews to
# Custom filter to strip out dataframes
f" --filter {filt}"
# Zotero Lua filter and metadata passed to it
f" --lua-filter {zotero}" # Needs to be the one downloaded from the tutorial page https://retorque.re/zotero-better-bibtex/exporting/pandoc/#from-markdown-to-zotero-live-citations
" --metadata zotero_library:3" # Corresponds to "Nucleate pool boiling [3]"
Expand Down

0 comments on commit 8d0de31

Please sign in to comment.