Skip to content

Commit

Permalink
Format with black
Browse files Browse the repository at this point in the history
  • Loading branch information
SWHL committed Aug 24, 2023
1 parent cbc5146 commit efa0591
Show file tree
Hide file tree
Showing 5 changed files with 53 additions and 43 deletions.
3 changes: 3 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,9 @@
<a href=""><img src="https://img.shields.io/badge/OS-Linux%2C%20Win%2C%20Mac-pink.svg"></a>
<a href="https://pypi.org/project/rapidocr-pdf/"><img alt="PyPI" src="https://img.shields.io/pypi/v/rapidocr-pdf"></a>
<a href="https://pepy.tech/project/rapidocr-pdf"><img src="https://static.pepy.tech/personalized-badge/rapidocr-pdf?period=total&units=abbreviation&left_color=grey&right_color=blue&left_text=Downloads"></a>
<a href="https://semver.org/"><img alt="SemVer2.0" src="https://img.shields.io/badge/SemVer-2.0-brightgreen"></a>
<a href="https://github.com/psf/black"><img src="https://img.shields.io/badge/code%20style-black-000000.svg"></a>
<a href="https://choosealicense.com/licenses/apache-2.0/"><img alt="GitHub" src="https://img.shields.io/github/license/RapidAI/RapidOCRPDF"></a>
</p>

- 依托于[RapidOCR](https://github.com/RapidAI/RapidOCR)仓库,快速提取PDF中文字,包括扫描版PDF、加密版PDF。
Expand Down
2 changes: 1 addition & 1 deletion demo.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,6 @@

pdf_extracter = PDFExtracter()

pdf_path = 'tests/test_files/image.pdf'
pdf_path = "tests/test_files/image.pdf"
texts = pdf_extracter(pdf_path)
print(texts)
3 changes: 3 additions & 0 deletions docs/docs.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,9 @@
<a href=""><img src="https://img.shields.io/badge/OS-Linux%2C%20Win%2C%20Mac-pink.svg"></a>
<a href="https://pypi.org/project/rapidocr-pdf/"><img alt="PyPI" src="https://img.shields.io/pypi/v/rapidocr-pdf"></a>
<a href="https://pepy.tech/project/rapidocr-pdf"><img src="https://static.pepy.tech/personalized-badge/rapidocr-pdf?period=total&units=abbreviation&left_color=grey&right_color=blue&left_text=Downloads"></a>
<a href="https://semver.org/"><img alt="SemVer2.0" src="https://img.shields.io/badge/SemVer-2.0-brightgreen"></a>
<a href="https://github.com/psf/black"><img src="https://img.shields.io/badge/code%20style-black-000000.svg"></a>
<a href="https://choosealicense.com/licenses/apache-2.0/"><img alt="GitHub" src="https://img.shields.io/github/license/RapidAI/RapidOCRPDF"></a>
</p>

- Built on [RapidOCR](https://github.com/RapidAI/RapidOCR), this tool quickly extracts text from PDF files, including scanned PDFs and encrypted PDFs.
Expand Down
43 changes: 23 additions & 20 deletions rapidocr_pdf/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,25 +14,29 @@
try:
from rapidocr_onnxruntime import RapidOCR
except:
warnings.warn("Can't find the rapidocr_onnxruntime module,"
"try to import the rapidocr_openvino")
warnings.warn(
"Can't find the rapidocr_onnxruntime module,"
"try to import the rapidocr_openvino"
)
from rapidocr_openvino import RapidOCR


class PDFExtracter():
class PDFExtracter:
def __init__(self, dpi=200):
self.dpi = dpi
self.text_sys = RapidOCR()
self.empyt_list = []

def __call__(self, content: Union[str, Path, bytes]) -> List[List[Union[str, str, str]]]:
def __call__(
self, content: Union[str, Path, bytes]
) -> List[List[Union[str, str, str]]]:
try:
file_type = self.which_type(content)
except (FileExistsError, TypeError) as e:
raise PDFExtracterError('The input content is empty.') from e
raise PDFExtracterError("The input content is empty.") from e

if file_type != 'pdf':
raise PDFExtracterError('The file type is not PDF format.')
if file_type != "pdf":
raise PDFExtracterError("The file type is not PDF format.")

try:
pdf_data = self.load_pdf(content)
Expand All @@ -45,26 +49,24 @@ def __call__(self, content: Union[str, Path, bytes]) -> List[List[Union[str, str
ocr_res_dict = self.get_ocr_res(page_img_dict)

final_result = {**txts_dict, **ocr_res_dict}
final_result = dict(sorted(final_result.items(),
key=lambda x: int(x[0])))
final_result = [[k, v, '1.0'] for k, v in final_result.items()]
final_result = dict(sorted(final_result.items(), key=lambda x: int(x[0])))
final_result = [[k, v, "1.0"] for k, v in final_result.items()]
return final_result

@staticmethod
def load_pdf(pdf_content: Union[str, Path, bytes]) -> bytes:
if isinstance(pdf_content, (str, Path)):
if not Path(pdf_content).exists():
raise PDFExtracterError(f'{pdf_content} does not exist.')
raise PDFExtracterError(f"{pdf_content} does not exist.")

with open(pdf_content, 'rb') as f:
with open(pdf_content, "rb") as f:
data = f.read()
return data

if isinstance(pdf_content, bytes):
return pdf_content

raise PDFExtracterError(
f'{type(pdf_content)} is not in [str, Path, bytes].')
raise PDFExtracterError(f"{type(pdf_content)} is not in [str, Path, bytes].")

def extract_texts(self, pdf_data: bytes) -> Tuple[Dict, List]:
texts, page_idxs = {}, []
Expand Down Expand Up @@ -95,17 +97,17 @@ def get_ocr_res(self, page_img_dict: Dict) -> Dict:
preds, _ = self.text_sys(v)
if preds:
_, rec_res, _ = list(zip(*preds))
ocr_res[str(k)] = '\n'.join(rec_res)
ocr_res[str(k)] = "\n".join(rec_res)
return ocr_res

@staticmethod
def which_type(content: Union[bytes, str, Path]) -> str:
if isinstance(content, (str, Path)) and not Path(content).exists():
raise FileExistsError(f'{content} does not exist.')
raise FileExistsError(f"{content} does not exist.")

kind = filetype.guess(content)
if kind is None:
raise TypeError(f'The type of {content} does not support.')
raise TypeError(f"The type of {content} does not support.")

return kind.extension

Expand All @@ -116,8 +118,9 @@ class PDFExtracterError(Exception):

def main():
parser = argparse.ArgumentParser()
parser.add_argument('-path', '--file_path', type=str,
help='File path, PDF or images')
parser.add_argument(
"-path", "--file_path", type=str, help="File path, PDF or images"
)
args = parser.parse_args()

pdf_extracter = PDFExtracter()
Expand All @@ -126,5 +129,5 @@ def main():
print(result)


if __name__ == '__main__':
if __name__ == "__main__":
main()
45 changes: 23 additions & 22 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,14 +9,14 @@


def get_readme():
readme_path = './docs/docs.md'
with open(readme_path, 'r', encoding='utf-8') as f:
readme_path = "./docs/docs.md"
with open(readme_path, "r", encoding="utf-8") as f:
readme = f.read()
return readme


MODULE_NAME = 'rapidocr_pdf'
VERSION_NUM = '0.0.1'
MODULE_NAME = "rapidocr_pdf"
VERSION_NUM = "0.0.1"

obtainer = GetPyPiLatestVersion()
try:
Expand All @@ -25,44 +25,45 @@ def get_readme():
VERSION_NUM = obtainer.version_add_one(latest_version)

if len(sys.argv) > 2:
match_str = ' '.join(sys.argv[2:])
match_str = " ".join(sys.argv[2:])
matched_versions = obtainer.extract_version(match_str)
if matched_versions:
VERSION_NUM = matched_versions
except ValueError:
warnings.warn(
f'The package {MODULE_NAME} seems to be submitting for the first time.')
f"The package {MODULE_NAME} seems to be submitting for the first time."
)

sys.argv = sys.argv[:2]

setuptools.setup(
name=MODULE_NAME,
version=VERSION_NUM,
platforms="Any",
description='Tools of extracting PDF content based on RapidOCR',
description="Tools of extracting PDF content based on RapidOCR",
long_description=get_readme(),
long_description_content_type='text/markdown',
long_description_content_type="text/markdown",
author="SWHL",
author_email="liekkaskono@163.com",
url="https://github.com/RapidAI/RapidOCRPDF",
license='Apache-2.0',
license="Apache-2.0",
packages=[MODULE_NAME],
install_requires=['filetype', 'pymupdf'],
keywords=['rapidocr_pdf,rapidocr_onnxruntime,ocr,onnxruntime,openvino'],
install_requires=["filetype", "pymupdf"],
keywords=["rapidocr_pdf,rapidocr_onnxruntime,ocr,onnxruntime,openvino"],
classifiers=[
'Programming Language :: Python :: 3.6',
'Programming Language :: Python :: 3.7',
'Programming Language :: Python :: 3.8',
'Programming Language :: Python :: 3.9',
'Programming Language :: Python :: 3.10',
'Programming Language :: Python :: 3.11',
"Programming Language :: Python :: 3.6",
"Programming Language :: Python :: 3.7",
"Programming Language :: Python :: 3.8",
"Programming Language :: Python :: 3.9",
"Programming Language :: Python :: 3.10",
"Programming Language :: Python :: 3.11",
],
python_requires='>=3.6,<3.12',
python_requires=">=3.6,<3.12",
entry_points={
'console_scripts': [f'{MODULE_NAME}={MODULE_NAME}.main:main'],
"console_scripts": [f"{MODULE_NAME}={MODULE_NAME}.main:main"],
},
extras_require={
'onnxruntime': ['rapidocr_onnxruntime'],
'openvino': ['rapidocr_openvino']
}
"onnxruntime": ["rapidocr_onnxruntime"],
"openvino": ["rapidocr_openvino"],
},
)

0 comments on commit efa0591

Please sign in to comment.