-
Notifications
You must be signed in to change notification settings - Fork 0
/
imagedownload.py
executable file
·90 lines (75 loc) · 2.35 KB
/
imagedownload.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
import datetime
import gevent
import hashlib
import os
import csv
import requests
import http.client
import ssl
import time
import sys
import magic
import urllib.request
import pandas as pd
from gevent import monkey, socket
from gevent.pool import Pool
from socket import timeout
from socket import error as SocketError
from urllib.parse import urlparse
# Swap the blocking socket module for gevent's cooperative version so that
# network waits in one greenlet let the others make progress.
monkey.patch_socket()

# Greenlet pool shared by all downloads; at most 30 run at the same time.
pool = Pool(30)

# The user's word: first command-line argument. It names the target
# directory ("256_ObjectCategories/258.<word>") the images are written to.
if len(sys.argv) < 2:
    sys.exit('usage: imagedownload.py <word>')
user_word = sys.argv[1]

# Load the URL list, skipping malformed rows with a warning instead of
# aborting. `on_bad_lines` replaced `error_bad_lines` in pandas 1.3 and the
# old keyword was removed in pandas 2.0; the fallback keeps installations
# older than 1.3 working.
try:
    df = pd.read_csv('output.csv', on_bad_lines='warn')
except TypeError:
    df = pd.read_csv('output.csv', error_bad_lines=False)
# NOTE(review): read_csv treats the first line as a header by default, so if
# output.csv has no header row its first URL is silently dropped -- confirm
# against the script that produces output.csv.

# Keep only the URL column (the file is expected to have exactly two
# columns), then turn the row number into a leading "index" column so every
# row becomes an [index, url] pair.
df.columns = ['url', 'extra']
df.drop(['extra'], axis=1, inplace=True)
df.index.names = ['index']
df.reset_index(inplace=True)
url_list = df.values.tolist()
# Download the files concurrently using gevent greenlets (cooperative tasks, not OS threads)
def download_file(index, url):
    """Download one URL and keep the result only if it is a JPEG image.

    The payload is written to
    ``<cwd>/256_ObjectCategories/258.<user_word>/258.<NNNN>.jpg`` where
    ``NNNN`` is ``index + 1`` zero-padded to four digits. The saved file's
    MIME type is then sniffed with ``magic``; anything other than
    ``image/jpeg`` is deleted again.

    Args:
        index: Zero-based row number of the URL; determines the file name.
        url: Address to fetch (3 second timeout).

    Returns:
        None. On success the full path of the saved image is printed.

    Network, HTTP and filesystem errors are printed and swallowed so that a
    single bad URL never aborts the remaining downloads. The target
    directory is not created here; if it is missing, the resulting OSError
    is printed like any other failure.
    """
    filename = '258.{0}.jpg'.format(str(index + 1).zfill(4))
    full_path = '{0}/256_ObjectCategories/258.{1}/{2}'.format(
        os.getcwd(), user_word, filename)
    try:
        # Context managers guarantee the HTTP response and the output file
        # are closed even when reading or writing fails part-way.
        with urllib.request.urlopen(url, timeout=3) as response:
            data = response.read()
        with open(full_path, 'wb') as image_file:
            image_file.write(data)

        if magic.from_file(full_path, mime=True) == 'image/jpeg':
            print(full_path)
        else:
            # Remove the file that was actually written. The bare file name
            # would be resolved against the working directory and miss it.
            os.remove(full_path)
            print("{0} is a wrong file type.".format(index+1))
    except (http.client.HTTPException, OSError, ValueError) as err:
        # OSError covers urllib's URLError/HTTPError, socket errors and
        # timeouts, connection resets and file errors; HTTPException covers
        # IncompleteRead, RemoteDisconnected and related protocol failures;
        # ValueError is what urlopen raises for a malformed URL.
        print(err)
# Schedule one download greenlet per [index, url] row; pool.spawn blocks
# whenever the pool is already full, which throttles the fan-out.
jobs = [pool.spawn(download_file, index, url) for index, url in url_list]

# Wait for every greenlet to finish. Without this the main greenlet would
# print the message and exit while the last downloads are still pending.
gevent.joinall(jobs)
print('Downloaded images')