Format with black

RapidAI · Aug 24, 2023 · efa0591 · efa0591
1 parent cbc5146
commit efa0591
Show file tree

Hide file tree

Showing 5 changed files with 53 additions and 43 deletions.
diff --git a/README.md b/README.md
@@ -4,6 +4,9 @@
     <a href=""><img src="https://img.shields.io/badge/OS-Linux%2C%20Win%2C%20Mac-pink.svg"></a>
     <a href="https://pypi.org/project/rapidocr-pdf/"><img alt="PyPI" src="https://img.shields.io/pypi/v/rapidocr-pdf"></a>
     <a href="https://pepy.tech/project/rapidocr-pdf"><img src="https://static.pepy.tech/personalized-badge/rapidocr-pdf?period=total&units=abbreviation&left_color=grey&right_color=blue&left_text=Downloads"></a>
+    <a href="https://semver.org/"><img alt="SemVer2.0" src="https://img.shields.io/badge/SemVer-2.0-brightgreen"></a>
+    <a href="https://github.com/psf/black"><img src="https://img.shields.io/badge/code%20style-black-000000.svg"></a>
+    <a href="https://choosealicense.com/licenses/apache-2.0/"><img alt="GitHub" src="https://img.shields.io/github/license/RapidAI/RapidOCRPDF"></a>
 </p>
 
 - 依托于[RapidOCR](https://github.com/RapidAI/RapidOCR)仓库，快速提取PDF中文字，包括扫描版PDF、加密版PDF。

diff --git a/demo.py b/demo.py
@@ -5,6 +5,6 @@
 
 pdf_extracter = PDFExtracter()
 
-pdf_path = 'tests/test_files/image.pdf'
+pdf_path = "tests/test_files/image.pdf"
 texts = pdf_extracter(pdf_path)
 print(texts)
diff --git a/docs/docs.md b/docs/docs.md
@@ -4,6 +4,9 @@
     <a href=""><img src="https://img.shields.io/badge/OS-Linux%2C%20Win%2C%20Mac-pink.svg"></a>
     <a href="https://pypi.org/project/rapidocr-pdf/"><img alt="PyPI" src="https://img.shields.io/pypi/v/rapidocr-pdf"></a>
     <a href="https://pepy.tech/project/rapidocr-pdf"><img src="https://static.pepy.tech/personalized-badge/rapidocr-pdf?period=total&units=abbreviation&left_color=grey&right_color=blue&left_text=Downloads"></a>
+    <a href="https://semver.org/"><img alt="SemVer2.0" src="https://img.shields.io/badge/SemVer-2.0-brightgreen"></a>
+    <a href="https://github.com/psf/black"><img src="https://img.shields.io/badge/code%20style-black-000000.svg"></a>
+    <a href="https://choosealicense.com/licenses/apache-2.0/"><img alt="GitHub" src="https://img.shields.io/github/license/RapidAI/RapidOCRPDF"></a>
 </p>
 
 - Relying on [RapidOCR](https://github.com/RapidAI/RapidOCR), quickly extract text from PDF, including scanned PDF and encrypted PDF.

diff --git a/rapidocr_pdf/main.py b/rapidocr_pdf/main.py
@@ -14,25 +14,29 @@
 try:
     from rapidocr_onnxruntime import RapidOCR
 except:
-    warnings.warn("Can't find the rapidocr_onnxruntime module,"
-                  "try to import the rapidocr_openvino")
+    warnings.warn(
+        "Can't find the rapidocr_onnxruntime module,"
+        "try to import the rapidocr_openvino"
+    )
     from rapidocr_openvino import RapidOCR
 
 
-class PDFExtracter():
+class PDFExtracter:
     def __init__(self, dpi=200):
         self.dpi = dpi
         self.text_sys = RapidOCR()
         self.empyt_list = []
 
-    def __call__(self, content: Union[str, Path, bytes]) -> List[List[Union[str, str, str]]]:
+    def __call__(
+        self, content: Union[str, Path, bytes]
+    ) -> List[List[Union[str, str, str]]]:
         try:
             file_type = self.which_type(content)
         except (FileExistsError, TypeError) as e:
-            raise PDFExtracterError('The input content is empty.') from e
+            raise PDFExtracterError("The input content is empty.") from e
 
-        if file_type != 'pdf':
-            raise PDFExtracterError('The file type is not PDF format.')
+        if file_type != "pdf":
+            raise PDFExtracterError("The file type is not PDF format.")
 
         try:
             pdf_data = self.load_pdf(content)
@@ -45,26 +49,24 @@ def __call__(self, content: Union[str, Path, bytes]) -> List[List[Union[str, str
         ocr_res_dict = self.get_ocr_res(page_img_dict)
 
         final_result = {**txts_dict, **ocr_res_dict}
-        final_result = dict(sorted(final_result.items(),
-                                   key=lambda x: int(x[0])))
-        final_result = [[k, v, '1.0'] for k, v in final_result.items()]
+        final_result = dict(sorted(final_result.items(), key=lambda x: int(x[0])))
+        final_result = [[k, v, "1.0"] for k, v in final_result.items()]
         return final_result
 
     @staticmethod
     def load_pdf(pdf_content: Union[str, Path, bytes]) -> bytes:
         if isinstance(pdf_content, (str, Path)):
             if not Path(pdf_content).exists():
-                raise PDFExtracterError(f'{pdf_content} does not exist.')
+                raise PDFExtracterError(f"{pdf_content} does not exist.")
 
-            with open(pdf_content, 'rb') as f:
+            with open(pdf_content, "rb") as f:
                 data = f.read()
             return data
 
         if isinstance(pdf_content, bytes):
             return pdf_content
 
-        raise PDFExtracterError(
-            f'{type(pdf_content)} is not in [str, Path, bytes].')
+        raise PDFExtracterError(f"{type(pdf_content)} is not in [str, Path, bytes].")
 
     def extract_texts(self, pdf_data: bytes) -> Tuple[Dict, List]:
         texts, page_idxs = {}, []
@@ -95,17 +97,17 @@ def get_ocr_res(self, page_img_dict: Dict) -> Dict:
             preds, _ = self.text_sys(v)
             if preds:
                 _, rec_res, _ = list(zip(*preds))
-                ocr_res[str(k)] = '\n'.join(rec_res)
+                ocr_res[str(k)] = "\n".join(rec_res)
         return ocr_res
 
     @staticmethod
     def which_type(content: Union[bytes, str, Path]) -> str:
         if isinstance(content, (str, Path)) and not Path(content).exists():
-            raise FileExistsError(f'{content} does not exist.')
+            raise FileExistsError(f"{content} does not exist.")
 
         kind = filetype.guess(content)
         if kind is None:
-            raise TypeError(f'The type of {content} does not support.')
+            raise TypeError(f"The type of {content} does not support.")
 
         return kind.extension
 
@@ -116,8 +118,9 @@ class PDFExtracterError(Exception):
 
 def main():
     parser = argparse.ArgumentParser()
-    parser.add_argument('-path', '--file_path', type=str,
-                        help='File path, PDF or images')
+    parser.add_argument(
+        "-path", "--file_path", type=str, help="File path, PDF or images"
+    )
     args = parser.parse_args()
 
     pdf_extracter = PDFExtracter()
@@ -126,5 +129,5 @@ def main():
     print(result)
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     main()
diff --git a/setup.py b/setup.py
@@ -9,14 +9,14 @@
 
 
 def get_readme():
-    readme_path = './docs/docs.md'
-    with open(readme_path, 'r', encoding='utf-8') as f:
+    readme_path = "./docs/docs.md"
+    with open(readme_path, "r", encoding="utf-8") as f:
         readme = f.read()
     return readme
 
 
-MODULE_NAME = 'rapidocr_pdf'
-VERSION_NUM = '0.0.1'
+MODULE_NAME = "rapidocr_pdf"
+VERSION_NUM = "0.0.1"
 
 obtainer = GetPyPiLatestVersion()
 try:
@@ -25,44 +25,45 @@ def get_readme():
         VERSION_NUM = obtainer.version_add_one(latest_version)
 
     if len(sys.argv) > 2:
-        match_str = ' '.join(sys.argv[2:])
+        match_str = " ".join(sys.argv[2:])
         matched_versions = obtainer.extract_version(match_str)
         if matched_versions:
             VERSION_NUM = matched_versions
 except ValueError:
     warnings.warn(
-        f'The package {MODULE_NAME} seems to be submitting for the first time.')
+        f"The package {MODULE_NAME} seems to be submitting for the first time."
+    )
 
 sys.argv = sys.argv[:2]
 
 setuptools.setup(
     name=MODULE_NAME,
     version=VERSION_NUM,
     platforms="Any",
-    description='Tools of extracting PDF content based on RapidOCR',
+    description="Tools of extracting PDF content based on RapidOCR",
     long_description=get_readme(),
-    long_description_content_type='text/markdown',
+    long_description_content_type="text/markdown",
     author="SWHL",
     author_email="liekkaskono@163.com",
     url="https://github.com/RapidAI/RapidOCRPDF",
-    license='Apache-2.0',
+    license="Apache-2.0",
     packages=[MODULE_NAME],
-    install_requires=['filetype', 'pymupdf'],
-    keywords=['rapidocr_pdf,rapidocr_onnxruntime,ocr,onnxruntime,openvino'],
+    install_requires=["filetype", "pymupdf"],
+    keywords=["rapidocr_pdf,rapidocr_onnxruntime,ocr,onnxruntime,openvino"],
     classifiers=[
-        'Programming Language :: Python :: 3.6',
-        'Programming Language :: Python :: 3.7',
-        'Programming Language :: Python :: 3.8',
-        'Programming Language :: Python :: 3.9',
-        'Programming Language :: Python :: 3.10',
-        'Programming Language :: Python :: 3.11',
+        "Programming Language :: Python :: 3.6",
+        "Programming Language :: Python :: 3.7",
+        "Programming Language :: Python :: 3.8",
+        "Programming Language :: Python :: 3.9",
+        "Programming Language :: Python :: 3.10",
+        "Programming Language :: Python :: 3.11",
     ],
-    python_requires='>=3.6,<3.12',
+    python_requires=">=3.6,<3.12",
     entry_points={
-        'console_scripts': [f'{MODULE_NAME}={MODULE_NAME}.main:main'],
+        "console_scripts": [f"{MODULE_NAME}={MODULE_NAME}.main:main"],
     },
     extras_require={
-        'onnxruntime': ['rapidocr_onnxruntime'],
-        'openvino': ['rapidocr_openvino']
-    }
+        "onnxruntime": ["rapidocr_onnxruntime"],
+        "openvino": ["rapidocr_openvino"],
+    },
 )