huaban.py
# -*- coding: utf-8 -*-
'''
python 2.7.12
'''
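# Scrapes images from huaban.com's "beauty" favorites feed in three stages:
#   1. scraw_pin_ids() pages through the AJAX feed and collects pin ids;
#   2. scraw_urls() visits each pin page and extracts the image file keys;
#   3. download() fetches each key from the img.hb.aicdn.com CDN to disk.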
import os
import re
import time

import requests

def scraw_pin_ids():
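    """Page through the favorite/beauty JSON feed and return a set of pin ids.

    Each request passes the last-seen pin id as the `max` cursor, so the feed
    keeps returning older pins until an empty page ends the loop.
    """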
    pin_ids = []
    pin_id = '1068018182'  # seed pin id: paging starts from this pin
    while True:
        try:
            url = "http://huaban.com/favorite/beauty/"
            headers1 = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36',
                'Accept': 'application/json',
                'X-Request': 'JSON',
                'X-Requested-With': 'XMLHttpRequest',
            }
            params = {
                'j0l4lymf': '',
                'max': pin_id,  # cursor: the feed returns pins older than this id
                'limit': '20',
                'wfl': '1',
            }
            z1 = requests.get(url, params=params, headers=headers1)
            if z1.json()['pins']:
                for i in z1.json()['pins']:
                    pin_ids.append(i['pin_id'])
                    pin_id = pin_ids[-1]
                    print i['pin_id']
                    # with open("pin_ids.txt", 'ab') as f:
                    #     f.write(str(i['pin_id']) + "\n")
                    time.sleep(0.001)
            else:
                return set(pin_ids)
        except (requests.RequestException, ValueError, KeyError):
            continue

def scraw_urls(pin_ids):
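    """Fetch each pin page and return the set of image file keys found.

    Keys are pulled out of the embedded page JSON with a regex on
    '"key":"..."'; each key names a file on the img.hb.aicdn.com CDN.
    """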
    urls = []
    urlss = ['http://huaban.com/pins/' + str(i) + '/' for i in pin_ids]
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'
    }
    pattern = re.compile('"key":"(.*?)"', re.S)
    for url in urlss:
        try:
            z3 = requests.get(url, headers=headers)
            items = pattern.findall(z3.text)
            urls.extend(items)
            print items
            print '=' * 100
        except requests.RequestException:
            continue
    return set(urls)

def download(urls):
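    """Download each image key from the CDN into <file_path>/huaban.

    Files are saved as sequentially numbered JPEGs; responses under
    ~40 KB are skipped (presumably thumbnails or error pages).
    """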
    headers1 = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'
    }
    save_dir = os.path.join(file_path, "huaban")
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
    n = 1
    for url in set(urls):
        try:
            r = requests.get('http://img.hb.aicdn.com/' + url, headers=headers1)
            # keep only responses over ~40 KB: smaller ones are thumbnails or error pages
            if len(r.content) > 40000:
                with open(os.path.join(save_dir, str(n) + ".jpg"), 'wb') as f:
                    f.write(r.content)
                print "Image %d downloaded successfully" % n
                n += 1
                # time.sleep(3)
        except requests.RequestException:
            continue

# Directory under which the "huaban" image folder is created
file_path = r'E:\selfprogress\programming\project\pa1024\huabannnnnnn'
if __name__ == '__main__':
    pin_ids = scraw_pin_ids()
    urls = scraw_urls(pin_ids)
    download(urls)