diff --git a/CHANGES.rst b/CHANGES.rst index 5a13563..2237718 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -1,6 +1,12 @@ Changelog ========= +0.0.2 (2024-07-05) +------------------ + +- `from-pdf-pt` now uses `*.pdf` as default glob + + 0.0.1 (2024-05-02) ------------------ diff --git a/setup.py b/setup.py index 3463da6..e075e22 100644 --- a/setup.py +++ b/setup.py @@ -31,11 +31,11 @@ def _read(f): }, packages=find_namespace_packages(where='src'), install_requires=[ - "llm-dataset-converter>=0.2.1", + "llm-dataset-converter>=0.2.4", "pypdf", "simple-range>=0.0.3", ], - version="0.0.1", + version="0.0.2", author='Peter Reutemann', author_email='fracpete@waikato.ac.nz', entry_points={ diff --git a/src/ldc_pdf/pretrain/_pdf.py b/src/ldc_pdf/pretrain/_pdf.py index 4d12790..436c852 100644 --- a/src/ldc_pdf/pretrain/_pdf.py +++ b/src/ldc_pdf/pretrain/_pdf.py @@ -102,7 +102,7 @@ def initialize(self): Initializes the reading, e.g., for opening files or databases. """ super().initialize() - self._inputs = locate_files(self.source, input_lists=self.source_list, fail_if_empty=True) + self._inputs = locate_files(self.source, input_lists=self.source_list, fail_if_empty=True, default_glob="*.pdf") if self.page_range is None: self.page_range = "first-last" if self.combine_pages is None: