-
Notifications
You must be signed in to change notification settings - Fork 0
/
iscrap.py
90 lines (79 loc) · 2.65 KB
/
iscrap.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
import urllib.request
import urllib.parse
import re
import os
# Interactive web-comic scraper.
#
# Asks which site to scrape, how many random comics to fetch and which folder
# to store them in, then downloads one comic image per random page visited.

# Menu text shown by main(); the answer selects an entry of SITES.
choice = """
Which web-comic to scrap?
(1) xkcd.com
(2) smbc-comics.com
Enter an integer value:"""

# Browser-like User-Agent sent with every page request.
USER_AGENT = (
    'Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.17 (KHTML, like Gecko) '
    'Chrome/24.0.1312.27 Safari/537.17'
)

# Menu number -> (random-comic page URL, regex whose group 1 is the image src).
SITES = {
    1: (
        "https://c.xkcd.com/random/comic/",
        # \s* instead of a literal newline so "\r\n" line endings also match.
        re.compile(r'<div id="comic">\s*<img src="(.*?)"'),
    ),
    2: (
        "http://www.smbc-comics.com/random.php",
        re.compile(r'src="(.*?)" id="cc-comic"'),
    ),
}


def progress_percent(done, total):
    """Return the whole-number percentage that `done` is of `total`.

    Uses integer arithmetic so the result is exact; going through a float
    (``int(done / total * 100)``) truncates e.g. 29 of 100 down to 28.
    `total` must be non-zero.
    """
    return done * 100 // total


def fetch_page(url):
    """Download `url` and return ``(final_url, html_text)``.

    `final_url` is the address after redirects (the random-comic endpoints
    redirect to a concrete comic page). The body is decoded with the charset
    announced by the server, falling back to UTF-8; undecodable bytes are
    replaced rather than raising.

    Raises whatever ``urllib.request.urlopen`` raises on network/HTTP errors.
    """
    request = urllib.request.Request(url, headers={'User-Agent': USER_AGENT})
    # The context manager closes the connection even if reading fails.
    with urllib.request.urlopen(request) as resp:
        charset = resp.headers.get_content_charset() or "utf-8"
        return resp.geturl(), resp.read().decode(charset, errors="replace")


def find_image_urls(html, pattern, page_url):
    """Return absolute URLs for every image `pattern` captures in `html`.

    `pattern` is a compiled regex whose first group is the raw ``src`` value.
    Each value is resolved against `page_url`, so protocol-relative
    (``//host/img.png``), relative and already-absolute sources all work.
    Returns an empty list when nothing matches.
    """
    return [urllib.parse.urljoin(page_url, src) for src in pattern.findall(html)]


def save_image(image_url, folder):
    """Download `image_url` into `folder`, creating the folder if needed.

    The file is named after the last path segment of the URL.
    """
    os.makedirs(folder, exist_ok=True)
    filename = image_url.split('/')[-1]
    urllib.request.urlretrieve(image_url, os.path.join(folder, filename))


def scrape(site_url, pattern):
    """Prompt for a count and a folder, then download that many random comics.

    Any failure (bad number, network error, unwritable folder) is reported by
    printing the error followed by `site_url`; nothing is re-raised, which
    keeps the script's original best-effort behaviour.
    """
    try:
        count = int(input("Enter Number of images to download:"))
        if count < 0:
            # A negative count used to spin forever in a `while(loop)` loop.
            raise ValueError("Number of images must not be negative")
        folder = input("Enter Folder Name (New folder will be created in current dir):")
        for attempt in range(1, count + 1):
            print("Downloading " + str(progress_percent(attempt, count)) + "%...")
            page_url, html = fetch_page(site_url)
            image_urls = find_image_urls(html, pattern, page_url)
            if not image_urls:
                # Make a layout change on the site visible instead of
                # silently saving nothing.
                print("No comic image found at " + page_url)
            for image_url in image_urls:
                save_image(image_url, folder)
        print("Images Saved in " + folder + "/")
    except Exception as e:
        print(e)
        print(site_url)


def main():
    """Show the site menu and run the scraper for the selected site."""
    try:
        ch = int(input(choice))
    except ValueError:
        # Non-numeric input is just another invalid menu choice.
        print("Invalid Choice")
        return
    site = SITES.get(ch)
    if site is None:
        print("Invalid Choice")
        return
    scrape(*site)


if __name__ == "__main__":
    main()