-
Notifications
You must be signed in to change notification settings - Fork 0
/
remove_image.py
253 lines (206 loc) · 9.78 KB
/
remove_image.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
import sys
import os
import argparse
from io import StringIO
from PyPDF2 import PdfWriter as PdfFileWriter, PdfReader as PdfFileReader
from pdfrw import PdfReader, PdfWriter, PdfDict, PdfArray, PdfName
# Command-line interface of the script.
parser = argparse.ArgumentParser(
    description='Remove images from PDFs to reduce size.')
parser.add_argument('input_pdf', help='input pdf file')
parser.add_argument('-a', '--aggressive', action='store_true',
                    help='Use aggressive strategy provided by PyPDF2 to remove images.')
parser.add_argument('-i', '--ignore-side-effects', action='store_true',
                    help='Ignore side effects of removing images. This may cause other /XObjects to be removed, if some /Image are deeply nested inside /XObjects that contains other subtypes of data.')
# The help text talks about "SIZE bytes", so expose the value under that
# name instead of argparse's auto-generated "REMOVE_SIZE" placeholder.
parser.add_argument('-r', '--remove-size', type=int, metavar='SIZE',
                    help='Force remove images with size above SIZE bytes, even if it is not displayed or referenced. Default: 100000')

# Path the processed PDF is written to.
result_filename = "pdfs/image_removed.pdf"
# Module-level settings; main() overrides them from the parsed arguments.
arg_strip_size = 100000  # size threshold in bytes used when -r is not given
arg_ignore_side_effects = False
# This is cherry-picked from a pdfrw official example.
def find_objects(source, valid_types=(PdfName.XObject,),
                 valid_subtypes=(PdfName.Image,),
                 no_follow=(PdfName.Parent,),
                 isinstance=isinstance, id=id, sorted=sorted,
                 reversed=reversed, PdfDict=PdfDict):
    '''
    Lazily yield every dictionary reachable from ``source`` whose
    ``Type`` is in ``valid_types`` and whose ``Subtype`` is in
    ``valid_subtypes``. With the defaults used here that means
    Image XObjects only.

    ``source`` is either a single dictionary or an iterable of
    objects (for example a list of pages). Keys listed in
    ``no_follow`` are not descended into.

    The walk uses an explicit work list instead of recursion, because
    some PDFs are nested deeply.

    Ordering is meant to be reproducible and roughly in document
    order: array members are visited in array order, dictionary values
    in sorted key order. Each container is visited at most once.
    '''
    containers = (PdfDict, PdfArray)

    # Accept a single dict as well as any iterable of objects.
    pending = [source] if isinstance(source, PdfDict) else list(source)
    # The work list is consumed from its tail, so store it back to front.
    pending.reverse()

    seen_ids = set()
    while pending:
        node = pending.pop()
        if not isinstance(node, containers):
            continue

        # Identity-based bookkeeping keeps shared or cyclic structures
        # from being walked more than once.
        node_id = id(node)
        if node_id in seen_ids:
            continue
        seen_ids.add(node_id)

        if isinstance(node, PdfDict):
            if node.Type in valid_types and node.Subtype in valid_subtypes:
                yield node
            children = [value for (key, value) in sorted(node.iteritems())
                        if key not in no_follow]
        else:
            # TODO: Touching the first element forces resolution of any
            #       indirect objects in the array. It may not be needed
            #       (unclear whether reversed() would cope on its own),
            #       but it is cheap enough to keep for now.
            node and node[0]
            children = node
        pending.extend(reversed(children))
def _linearize_in_place(path):
    """Run ``qpdf --linearize --replace-input`` on ``path`` (best effort)."""
    # Imported locally so this block does not depend on a file-level import.
    import subprocess

    try:
        # An argument list (no shell) keeps quotes, spaces or shell
        # metacharacters in the file name from being interpreted.
        completed = subprocess.run(
            ["qpdf", "--linearize", "--replace-input", path],
            check=False,
        )
    except FileNotFoundError:
        print("qpdf executable not found; skipping linearization.")
        return
    # qpdf reports "succeeded with warnings" through exit status 3.
    if completed.returncode not in (0, 3):
        print(f"qpdf exited with status {completed.returncode} for {path}")


def remove_image(pdf_filename, aggressive=False):
    """
    Copy the pages and metadata of a PDF into ``result_filename``.

    The input file is linearized in place with qpdf first (this rewrites
    the file given by the caller), then every page is re-written through
    PyPDF2, and finally the result file is linearized as well.

    NOTE: the PyPDF2 ``remove_images`` call is disabled in this file, so
    ``aggressive`` is currently accepted but has no effect.

    params:
        pdf_filename: path of the pdf file to process
        aggressive: reserved for the PyPDF2 image removal strategy (unused)
    """
    print("Linearizing input PDF...")
    _linearize_in_place(pdf_filename)

    # Make sure the output directory exists before opening the result file.
    output_dir = os.path.dirname(result_filename)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # The reader pulls data from the stream while pages are written, so the
    # input stays open until the output has been written completely.
    with open(pdf_filename, "rb") as input_stream:
        input_pdf = PdfFileReader(input_stream, strict=True)
        output_pdf = PdfFileWriter()
        for page in input_pdf.pages:
            output_pdf.add_page(page)
        # Disabled: output_pdf.remove_images(ignore_byte_string_object=aggressive)
        output_pdf.add_metadata(input_pdf.metadata or {})
        with open(result_filename, "wb") as output_stream:
            output_pdf.write(output_stream)

    print("Linearizing result PDF...")
    _linearize_in_place(result_filename)
def _matches_any_compare_info(obj, compare_infos, compare_keys):
    """Return True if ``obj`` equals one compare-info record on every key.

    Compare-info records use bare key names ("Width"), while the PDF
    dictionaries are keyed by PDF names ("/Width").
    """
    for compare_info in compare_infos:
        if all(("/" + key) in obj and obj["/" + key] == compare_info[key]
               for key in compare_keys):
            return True
    return False


def _scan_nested_xobjects(page_no, obj_id, obj, compare_infos, compare_keys):
    """Inspect the XObjects stored under ``obj["/Resources"]["/XObject"]``.

    Example of the structure being inspected:

        obj["/Resources"] == {
            '/ProcSet': ['/PDF', '/Text', '/ImageC', '/ImageB'],
            '/XObject': {
                '/Im0': {'/Type': '/XObject', '/Subtype': '/Image',
                         '/Width': '1103', '/Height': '955',
                         '/Length': '286609'}
            }
        }

    Returns a tuple ``(has_match, can_delete, total_length)``:
      has_match    - a nested Image XObject matches one of ``compare_infos``
      can_delete   - every nested entry is an Image XObject, so dropping
                     the container removes nothing but images
      total_length - /Length of the container plus its nested entries
    """
    total_length = int(obj["/Length"]) if "/Length" in obj else 0
    has_match = False
    can_delete = True

    resources = obj["/Resources"]
    if "/XObject" not in resources:
        return has_match, can_delete, total_length

    for nested_id, nested_obj in resources["/XObject"].items():
        if nested_obj is None:
            continue
        if "/Length" in nested_obj:
            total_length += int(nested_obj["/Length"])
        if "/Type" not in nested_obj or nested_obj["/Type"] != "/XObject":
            can_delete = False
            print(
                f"P{page_no} - encountered an XObject that contains non-XObject object", nested_obj, obj_id)
            continue
        if "/Subtype" not in nested_obj or nested_obj["/Subtype"] != "/Image":
            can_delete = False
            print(
                f"P{page_no} - encountered an XObject that contains non-Image XObject", nested_obj, obj_id)
            continue
        if _matches_any_compare_info(nested_obj, compare_infos, compare_keys):
            print(
                f"P{page_no} - found matching nested XObject {nested_id} in {obj_id}")
            has_match = True
    return has_match, can_delete, total_length


def strip_objects(pdf, heuristic_compare_info_obj_list, compare_keys=("Width", "Height", "Length")):
    """
    Remove XObject resources that match recorded image characteristics.

    For every page, each entry of ``page.Resources.XObject`` is removed
    from that dictionary when either
      * it has no /Resources of its own and equals one compare-info record
        on all ``compare_keys``, or
      * it has /Resources whose nested XObjects include a matching image,
        and either all nested entries are images or the module-level
        ``arg_ignore_side_effects`` flag is set.

    params:
        pdf: document object exposing ``pages``
        heuristic_compare_info_obj_list: iterable of dicts keyed by the
            names in ``compare_keys``
        compare_keys: attribute names (without the leading "/") that must
            all be equal for an object to count as a match

    returns: the same ``pdf`` object, modified in place
    """
    # Materialize once: the records are consulted for every object.
    compare_infos = list(heuristic_compare_info_obj_list)
    if not compare_infos:
        return pdf

    for page_no, page in enumerate(pdf.pages):
        resources = page.Resources
        xobjects = resources.XObject if resources else None
        # skip pages without resources or without any XObject
        if not xobjects:
            continue

        before_keys = list(xobjects.keys())
        has_deleted = False
        # Walk a snapshot, because entries are deleted along the way.
        for name, obj in list(xobjects.items()):
            if obj is None:
                continue
            obj_id = obj.indirect[0]

            if "/Resources" in obj:
                has_match, can_delete, total_length = _scan_nested_xobjects(
                    page_no, obj_id, obj, compare_infos, compare_keys)
                if not has_match:
                    continue
                if can_delete or arg_ignore_side_effects:
                    print(f"P{page_no} - deleting object", obj_id,
                          "of size", total_length)
                    del xobjects[name]
                    has_deleted = True
                else:
                    print(
                        f"P{page_no} - object {obj_id} has matching nested objects but cannot be deleted due to side-effects. Specify -i to ignore this.")
            elif _matches_any_compare_info(obj, compare_infos, compare_keys):
                size = obj["/Length"] if "/Length" in obj else -1
                print(f"P{page_no} - deleting object", obj_id,
                      "of size", size)
                del xobjects[name]
                has_deleted = True

        if has_deleted:
            after_keys = list(xobjects.keys())
            print(f"P{page_no} - before: {str(before_keys)}, after: {str(after_keys)}")
    return pdf
def main():
    """
    Command-line entry point.

    Parses the arguments, re-writes the input PDF into ``result_filename``
    via remove_image(), collects the characteristics of every image whose
    /Length exceeds the size threshold, strips the matching XObjects and
    saves the result back to ``result_filename``.
    """
    global arg_strip_size, arg_ignore_side_effects

    if len(sys.argv) < 2:
        parser.print_help()
        return
    args = parser.parse_args()

    # Compare against None so that an explicit "-r 0" is honoured instead
    # of silently falling back to the default threshold.
    if args.remove_size is not None:
        arg_strip_size = args.remove_size
    arg_ignore_side_effects = args.ignore_side_effects or arg_ignore_side_effects
    aggressive = args.aggressive
    pdf_filename = args.input_pdf

    print(f"Pruning images from {pdf_filename}...")
    remove_image(pdf_filename, aggressive)

    result_pdf = PdfReader(result_filename)
    # Record width/height/length of every image above the threshold; these
    # records are what strip_objects() matches page resources against.
    strip_obj_info_list = [
        {
            "Width": obj.Width,
            "Height": obj.Height,
            "Length": obj.Length,
        }
        for obj in find_objects(result_pdf.pages)
        if obj.Length and int(obj.Length) > arg_strip_size
    ]

    print(f"Stripping /Image /XObjects larger than {arg_strip_size}...")
    strip_objects(result_pdf, strip_obj_info_list)
    PdfWriter().write(result_filename, result_pdf)
    print(f"Final result PDF saved as {result_filename}")
# Run the command-line tool when executed directly; an import only prints
# a notice.
if __name__ != "__main__":
    print("This script is not intended to be imported.")
else:
    main()