-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathPyMuPDF
52 lines (43 loc) · 1.71 KB
/
PyMuPDF
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
import os
import fitz # PyMuPDF
def print_annotation_details(doc):
    """Print the type, form-field data and position of each annotation in *doc*.

    Args:
        doc: An iterable of page objects. Each page must expose a ``number``
            attribute (printed one-based by adding 1) and an ``annots()``
            method yielding annotation objects that have ``type`` (an
            indexable whose first two items are printed) and ``rect``.

    Returns:
        None. All output goes to standard output.
    """
    for page in doc:
        print(f"Processing Page {page.number + 1}")
        for annot in page.annots():
            annot_type = annot.type
            # NOTE(review): ``field_name`` / ``field_value`` appear to be
            # form-field (widget) attributes; objects yielded by ``annots()``
            # may not carry them. Fall back to None instead of aborting the
            # whole run with AttributeError on the first plain annotation.
            field_name = getattr(annot, "field_name", None)
            field_value = getattr(annot, "field_value", None)
            print(f"Annotation Type: {annot_type[0]} - {annot_type[1]}")
            print(f"Field Name: {field_name}")
            print(f"Field Value: {field_value}")
            print(f"Position: {annot.rect}")
            print("-" * 30)
# Path to the directory with PDF files; adjust if your PDFs live elsewhere.
pdf_dir = '.'
# Match the extension case-insensitively so names such as "REPORT.PDF" are not
# skipped, and sort because os.listdir() returns entries in arbitrary order.
pdf_files = sorted(f for f in os.listdir(pdf_dir) if f.lower().endswith('.pdf'))

# Process each PDF and print details about each annotation.
for file_name in pdf_files:
    print(f"Processing file: {file_name}")
    # The context manager closes the document even if printing raises.
    with fitz.open(os.path.join(pdf_dir, file_name)) as doc:
        print_annotation_details(doc)
    print("=" * 50)
import fitz
def extract_text_and_images(doc):
    """Print the text of every page in *doc* and save each embedded image.

    For each page the plain text is printed, then every image listed by
    ``page.get_images(full=True)`` is extracted via ``doc.extract_image``,
    its pixel dimensions are printed, and its bytes are written to
    ``image_<xref>.<ext>`` in the current working directory.

    Args:
        doc: An iterable of page objects that also provides
            ``extract_image(xref)`` returning a mapping with at least
            ``width``, ``height`` and ``image`` (raw bytes) keys.

    Returns:
        None. Output goes to standard output and to image files on disk.
    """
    for page in doc:
        # Extract text
        text = page.get_text("text")
        print("Extracted text:")
        print(text)

        # Extract images
        image_list = page.get_images(full=True)
        print("Found images:")
        for img in image_list:
            xref = img[0]  # XREF of the image
            base_image = doc.extract_image(xref)
            print(f"Image {xref} details: {base_image['width']}x{base_image['height']} pixels")

            # The bytes are written exactly as extracted, so the file must be
            # named after the extracted format rather than always ".png";
            # fall back to "png" when the result carries no "ext" entry.
            extension = base_image.get('ext', 'png')
            image_filename = f'image_{xref}.{extension}'
            with open(image_filename, 'wb') as imgfile:
                imgfile.write(base_image['image'])
# Open the PDF
file_path = 'path_to_your_pdf.pdf'  # Specify the path to your PDF
# Open the document as a context manager so it is closed once extraction
# finishes (or fails) instead of staying open for the rest of the process.
with fitz.open(file_path) as doc:
    extract_text_and_images(doc)