google_dl.py
#!/usr/bin/env python3
# Sample usage:
# ./google_dl.py foo bar
# ./google_dl.py -s http://example.com -f pdf foo bar
# ./google_dl.py "foo bar site:http://example.com filetype:pdf"
import os, sys
import argparse
import urllib.request
import urllib.parse
from urllib.error import URLError, HTTPError, ContentTooShortError
from xgoogle.search import GoogleSearch, SearchError
import re
import socket
import mimetypes
class GoogleDl():
    def __init__(self, query, filetypes, site, resultsperpage, maxresults, repeat):
        if filetypes:
            filetypes = re.split(",", filetypes)
            query += " filetype:" + filetypes.pop(0)
            for filetype in filetypes:
                query += " OR filetype:" + filetype
        if site:
            query += " site:" + site
        print(query)
        self.gs = GoogleSearch(query, random_agent=True, repeat=repeat)
        self.gs.results_per_page = int(resultsperpage)
        self.maxresults = int(maxresults)
        self.lastpage = False

    def getTotal(self):
        return len(self.results)

    def dlFileOld(self, url, path):
        try:
            urllib.request.urlretrieve(url, filename=path)
        except HTTPError as err:
            print("Error: %s, reason: %s." % (err.code, err.reason))
            return False
        except ContentTooShortError:
            print("Error: The downloaded data is less than the expected amount, so skipping.")
            return False
        except URLError:
            print("Error: Reading socket timed out, try again later.")
            return False
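    # dlFile replaces dlFileOld: it downloads with urllib.request.urlopen and
    # reuses the search session's User-Agent header instead of urlretrieve.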
    def dlFile(self, url, path):
        request = urllib.request.Request(url, headers={"User-Agent": self.gs.browser.get_user_agent()})
        try:
            with urllib.request.urlopen(request) as i, open(path, "wb") as o:
                o.write(i.read())
        # HTTPError and ContentTooShortError are subclasses of URLError, so
        # they must be caught before the generic URLError handler.
        except HTTPError as err:
            print("Error: %s, reason: %s." % (err.code, err.reason))
            return False
        except ContentTooShortError:
            print("Error: The downloaded data is less than the expected amount, so skipping.")
            return False
        except URLError:
            print("Error: Reading socket timed out, try again later.")
            return False
        except OSError as err:
            print("Error: %s raised when trying to save the file '%s'." % (err.strerror, err.filename))
            sys.exit(1)
    def __iter__(self):
        self.count = 0
        return self

    def __next__(self):
        if self.lastpage or self.count >= self.maxresults:
            raise StopIteration
        results = self.gs.get_results()
        # Only stop when no results are returned.
        if not results:
            self.lastpage = True
            raise StopIteration
        self.count += len(results)
        return results
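# Illustrative use of the pager above (mirrors the __main__ block below; the
# argument values are arbitrary examples):
#   downloader = GoogleDl("foo bar", "pdf", None, 50, 1000, None)
#   for results in downloader:       # one iteration per result page
#       for result in results:
#           print(result.getURL())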
def get_path_via_url(url, dest=".", dirs=False):
    # Decode the URL, take its path component and strip leading/trailing slashes.
    path = urllib.parse.unquote(urllib.parse.urlparse(url)[2]).strip("/")
    if path == "":
        pseudoPath = urllib.parse.unquote(urllib.parse.urlparse(url)[1])
    else:
        pseudoPath = ""
    # Destination dir must have a trailing slash.
    if not dest.endswith("/"):
        dest += "/"
    # Check whether the URL ends with an extension; if not, derive one from the
    # Content-Type header of a HEAD response.
    extension = re.search(r"\.(\w+)$", path)
    if extension is None or pseudoPath != "":
        request = urllib.request.Request(url, method="HEAD")
        try:
            response = urllib.request.urlopen(request)
            mimetype = re.search(r"(\w+/\w+)", dict(response.getheaders())["Content-Type"])
            if mimetype is not None:
                path += pseudoPath + (mimetypes.guess_extension(mimetype.group(1)) or ".html")
            else:
                path += pseudoPath + ".html"
        except BaseException:
            path += pseudoPath + ".html"
    if dirs:
        return dest + path
    else:
        # Return only the last element of the '/'-delimited path.
        return dest + re.split("/", path)[-1]
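# For example (illustrative values), get_path_via_url("http://example.com/docs/a.pdf",
# "downloads", True) returns "downloads/docs/a.pdf"; with dirs=False the same
# call returns "downloads/a.pdf".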
if __name__ == "__main__":
    # Parse arguments.
    parser = argparse.ArgumentParser(add_help=False)
    parser.add_argument("-?", "--help",
                        action="help", help="Show this help message and exit.")
    parser.add_argument("-v", "--verbose",
                        action="store_true", dest="verbose", help="Increase output verbosity.")
    parser.add_argument("-d", "--download-dir",
                        action="store", dest="dest", help="Directory to download files into.", default=".")
    parser.add_argument("-s", "--site",
                        action="store", dest="site", help="Site to restrict the search to.", default=None)
    parser.add_argument("-f", "--file-type",
                        action="store", dest="filetype", help="Comma-separated list of file types to download.", default=None)
    parser.add_argument("-x", "--force-directories",
                        action="store_true", dest="dirs", help="Create a hierarchy of directories based on the URL.")
    parser.add_argument("-t", "--timeout",
                        action="store", dest="timeout", help="Socket read timeout for downloading, in seconds (float).", default=None)
    parser.add_argument("-r", "--repeat",
                        action="store_true", dest="repeat", help="Set filter of search results.", default=None)
    parser.add_argument("-m", "--max-results",
                        action="store", dest="maxresults", help="Maximum number of results to scrape.", default=1000)
    parser.add_argument("-p", "--results-per-page",
                        action="store", dest="resultsperpage", help="Number of results per page.", default=50)
    parser.add_argument("query", nargs="+", help="Query to search for.")

    # Build the query string.
    args = parser.parse_args()
    query = " ".join(args.query)

    # Set the socket timeout, if one was given.
    if args.timeout:
        socket.setdefaulttimeout(float(args.timeout))
    # Download each result unless it already exists locally.
    try:
        page = GoogleDl(query, args.filetype, args.site, args.resultsperpage, args.maxresults, args.repeat)
        if args.verbose:
            print("Query: %s" % query)
        n_pages = 1
        n_results = 0
        for results in page:
            print("Trying to download results from page #%d (results %d-%d)" % (n_pages, n_results, n_results + len(results)))
            n_results += len(results)
            n = 0
            for result in results:
                n += 1
                url = result.getURL()
                path = get_path_via_url(url, args.dest, args.dirs)
                filename = os.path.basename(path)
                dirname = os.path.dirname(path)
                os.makedirs(dirname, 0o755, True)
                print("Page: %i, File: %i" % (n_pages, n))
                print("Downloading '%s' from '%s' into %s..." % (filename, url, dirname))
                if os.path.isfile(path):
                    print("File '%s' already exists, skipping." % path)
                else:
                    page.dlFile(url, path)
            n_pages += 1
        print("Done, downloaded %i files." % n_results)
    except KeyboardInterrupt:
        sys.exit()
    except SearchError as err:
        print("Search failed: %s" % err)