-
Notifications
You must be signed in to change notification settings - Fork 1
/
image_downloader.py
85 lines (65 loc) · 2.17 KB
/
image_downloader.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
import sys, os, multiprocessing, urllib3
from PIL import Image
from io import BytesIO
from tqdm import tqdm
import gzip
from multiprocessing.pool import ThreadPool
# Silence urllib3's InsecureRequestWarning for the whole process.
# NOTE(review): nothing visible in this file turns certificate verification
# off, so this may be a leftover — confirm before removing.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
def parse_data(path='data/datasets/products/products.tsv'):
    """Read a tab-separated product file and return ``(key, url)`` pairs.

    The first column of each row is used as the key and the fourth column
    (index 3) as the image URL. Rows with fewer than four columns are kept
    with ``None`` as the URL. Blank lines are skipped.

    Args:
        path: Path to the TSV file, decoded as UTF-8.

    Returns:
        A list of ``(key, url)`` tuples in file order.
    """
    key_url_list = []
    with open(path, encoding='utf-8') as f:
        for i, l in enumerate(f):
            if i % 10000 == 0:
                print('read lines: {}'.format(i))
            # Drop the line terminator before splitting; otherwise a short
            # row such as "KEY\n" yields a key ending in a newline, which
            # later ends up inside the output filename.
            line = l.rstrip('\r\n')
            if not line:
                continue
            row = line.split('\t')
            asin = row[0]
            url = row[3].strip() if len(row) > 3 else None
            key_url_list.append((asin, url))
            # Periodic sample of the parsed row, kept from the original
            # as a sanity check of the column layout.
            if i % 1000 == 0:
                print(i, row)
    return key_url_list
def download_image(key_url, *, timeout=30.0):
    """Download one image and store it as ``<key>.jpg`` in the output dir.

    The output directory is read from ``sys.argv[2]``. The image is fetched
    with an HTTP GET, decoded with PIL, converted to RGB and written as a
    JPEG. If the target file already exists, nothing is downloaded.

    Args:
        key_url: ``(key, url)`` pair. ``key`` names the output file and
            ``url`` is the address to fetch; a falsy ``url`` is reported
            and skipped.
        timeout: Timeout in seconds passed to the urllib3 request, so a
            stalled server cannot block a worker thread indefinitely.

    Returns:
        None. Every failure is reported on stdout and swallowed so that a
        single bad image does not abort the surrounding worker pool.
    """
    outdir = sys.argv[2]
    key, url = key_url
    filename = os.path.join(outdir, '%s.jpg' % key)

    if os.path.exists(filename):
        print('Image %s already exists. Skipping download.' % filename)
        return

    # Entries without a URL cannot be fetched; report and skip up front
    # instead of letting the HTTP client fail on a None URL.
    if not url:
        print('Warning: Could not download image %s from %s' % (key, url))
        return

    http = None
    try:
        # print('Trying to get %s.' % url)
        http = urllib3.PoolManager()
        response = http.request('GET', url, timeout=timeout)
        image_data = response.data
        status = response.status
    except Exception as e:
        print(e)
        print('Warning: Could not download image %s from %s' % (key, url))
        return
    finally:
        # Release the sockets held by this per-call pool manager.
        if http is not None:
            http.clear()

    # An error page is not an image; report it as a download failure rather
    # than as a confusing parse failure further down.
    if status != 200:
        print('Warning: Could not download image %s from %s (HTTP %s)'
              % (key, url, status))
        return

    try:
        pil_image = Image.open(BytesIO(image_data))
    except Exception:
        print('Warning: Failed to parse image %s %s' % (key, url))
        return

    try:
        pil_image_rgb = pil_image.convert('RGB')
    except Exception:
        print('Warning: Failed to convert image %s to RGB' % key)
        return

    # Write to a temporary name and rename on success: a failed or
    # interrupted save must not leave a truncated .jpg behind, because the
    # exists-check above would then skip this image on every later run.
    tmp_filename = filename + '.part'
    try:
        pil_image_rgb.save(tmp_filename, format='JPEG')
        os.replace(tmp_filename, filename)
    except Exception:
        print('Warning: Failed to save image %s' % filename)
        try:
            os.remove(tmp_filename)
        except OSError:
            # Best-effort cleanup; the partial file may never have been
            # created in the first place.
            pass
        return
def run():
    """Command-line entry point: download every image listed in a TSV file.

    Expects exactly two command-line arguments: the data file handed to
    ``parse_data`` and the output directory. The directory is created if it
    is missing, and the ``(key, url)`` pairs are then fetched by a pool of
    20 worker threads while a tqdm progress bar tracks completed items.

    Raises:
        SystemExit: With status 2 when the argument count is wrong.
    """
    if len(sys.argv) != 3:
        # The input is parsed as tab-separated text, so name it as such.
        # Usage errors go to stderr and exit non-zero so that calling
        # scripts can detect the failure.
        print('Syntax: %s <products.tsv> <output_dir/>' % sys.argv[0],
              file=sys.stderr)
        sys.exit(2)
    data_file, out_dir = sys.argv[1:]

    # makedirs handles nested paths and has no exists/create race.
    os.makedirs(out_dir, exist_ok=True)

    key_url_list = parse_data(data_file)

    # The context manager shuts the worker threads down once all results
    # have been consumed, or if the loop is interrupted.
    with ThreadPool(processes=20) as pool:
        with tqdm(total=len(key_url_list)) as t:
            for _ in pool.imap_unordered(download_image, key_url_list):
                t.update(1)


if __name__ == '__main__':
    run()