diff --git a/data/docs.dvc b/data/docs.dvc index cc17c5d1..2b668514 100644 --- a/data/docs.dvc +++ b/data/docs.dvc @@ -1,6 +1,6 @@ outs: -- md5: 24468ff299e6b6f608ceb7e52618af0d.dir - size: 3246802 +- md5: 454f0f0ed0a390bc98cda08391d71afc.dir + size: 3235049 nfiles: 11 hash: md5 path: docs diff --git a/params.yaml b/params.yaml index e43f1528..fffb3710 100644 --- a/params.yaml +++ b/params.yaml @@ -11,6 +11,7 @@ project_paths: mpl_hide_title: plotting/hide_title.mplstyle scripts: scripts zotero: scripts/zotero.lua + filt: scripts/filt.py csl: scripts/international-journal-of-heat-and-mass-transfer.csl template: scripts/template.dotx stage_find_contours: src/boilercv/stages/find_contours.py diff --git a/scripts/filt.py b/scripts/filt.py new file mode 100644 index 00000000..7d597bd3 --- /dev/null +++ b/scripts/filt.py @@ -0,0 +1,53 @@ +"""Filter tables from Pandoc input.""" + +from dataclasses import dataclass +from typing import Any + +from pandocfilters import toJSONFilter + +REMOVE: list[Any] = [] +"""Signals the removal of a block.""" + +memory: list[str] = [] +"""Memory of block keys for multi-block matching.""" + + +def main( + key: str, _value: str, _format: str, _meta: dict[str, Any] +) -> list[Any] | None: + checks = [check_table(key)] + if memory and not any(c.match for c in checks): + memory.clear() + if any(c.remove for c in checks): + return REMOVE # Remove this block + + +@dataclass +class Check: + """Whether the current block is part of a match, and whether to remove it.""" + + match: bool = True + """Whether the current block is part of a match.""" + + remove: bool = True + """Whether to remove the current block.""" + + +TABLE = ["RawBlock", "Plain", "Str"] +"""Sequence of blocks representing a table.""" + + +def check_table(key: str) -> Check: + if key == TABLE[0]: + memory.append(key) + return Check() + if memory and key == TABLE[1] and memory[-1] == TABLE[0]: + memory.append(key) + return Check() + if memory and key == TABLE[2] and memory[-2:] == TABLE[:2]: + return Check() + return Check(match=False, remove=False) + + +if __name__ == "__main__": + toJSONFilter(main) diff --git a/src/boilercv/models/paths.py b/src/boilercv/models/paths.py index 23c7fa75..0e3462aa 100644 --- a/src/boilercv/models/paths.py +++ b/src/boilercv/models/paths.py @@ -36,6 +36,7 @@ class ProjectPaths(CreatePathsModel): # ! SCRIPTS scripts: DirectoryPath = project / "scripts" zotero: FilePath = scripts / "zotero.lua" + filt: FilePath = scripts / "filt.py" csl: FilePath = scripts / "international-journal-of-heat-and-mass-transfer.csl" template: FilePath = scripts / "template.dotx" diff --git a/src/boilercv/pre_repro.py b/src/boilercv/pre_repro.py index 57b377f4..10fcdf55 100644 --- a/src/boilercv/pre_repro.py +++ b/src/boilercv/pre_repro.py @@ -127,6 +127,7 @@ async def report(nbs: dict[Path, str]): for kwarg, path in dict( workdir=PARAMS.paths.md, template=PARAMS.project_paths.template, + filt=PARAMS.project_paths.filt, zotero=PARAMS.project_paths.zotero, csl=PARAMS.project_paths.csl, docx=PARAMS.paths.docx / nb.with_suffix(".docx").name, @@ -185,7 +186,7 @@ async def wrapped(*args, **kwargs): @preserve_dir async def report_on_notebook( - workdir: str, template: str, zotero: str, csl: str, docx: str, md: str + workdir: str, template: str, filt: str, zotero: str, csl: str, docx: str, md: str ): """Generate a DOCX report from a notebook. @@ -202,6 +203,8 @@ async def report_on_notebook( " --from markdown-auto_identifiers" # Avoids bookmark pollution around Markdown headers " --to docx" # The output format f" --reference-doc {template}" # The template to export literature reviews to + # Custom filter to strip out dataframes + f" --filter {filt}" # Zotero Lua filter and metadata passed to it f" --lua-filter {zotero}" # Needs to be the one downloaded from the tutorial page https://retorque.re/zotero-better-bibtex/exporting/pandoc/#from-markdown-to-zotero-live-citations " --metadata zotero_library:3" # Corresponds to "Nucleate pool boiling [3]"