crawler_biquge2.py
# Crawler for the Biquge novel site (www.qu.la), multithreaded version.
# Warning: concurrent requests make it easy to get your IP banned.
import requests
from bs4 import BeautifulSoup
from multiprocessing.dummy import Pool

# Titles already downloaded, shared across worker threads.
downloaded_book = []

def get_html(url):
    # Fetch a page and return its text; returns "ERROR" on any failure.
    try:
        header = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.75 Safari/537.36'}
        r = requests.get(url, timeout=30, headers=header)
        r.encoding = r.apparent_encoding  # guess the encoding from the response body
        return r.text
    except Exception:
        return "ERROR"

def get_book():
    # Collect the book lists from the different ranking boards.
    url = 'http://www.qu.la/paihangbang/'
    soup = BeautifulSoup(get_html(url), 'lxml')
    tags = soup.find_all('div', class_='index_toplist')
    novel = []
    nums = 1
    for i in tags:
        type_name = i.find('span').text  # name of the ranking board
        novel.append({'type': type_name})
        tab_id = 'tabData_' + str(nums)  # each board's book list lives in a tabData_<n> div
        novel_list = i.find('div', {'id': tab_id})
        for n in novel_list.find_all('a'):
            novel.append({n.text: 'http://www.qu.la' + n.get('href')})
        nums += 1
    return novel

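# For reference (placeholder values, not real data): get_book() returns a flat
# list that interleaves board markers and {title: url} pairs, e.g.
#   [{'type': '玄幻奇幻排行'}, {'<book title>': 'http://www.qu.la/book/<id>/'}, ...]
# download_list() below tells the two apart by checking whether the value
# ends with "排行" ("ranking").
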
def download_list(name):
    # `name` is a single-entry dict: either {'type': board_name} or {title: url}.
    global downloaded_book
    for n, h in name.items():
        if h[-2:] == "排行":
            # Board-name entry: print it as a separator (skipped for 玄幻奇幻排行).
            if h != "玄幻奇幻排行":
                print('\n\n')
                print(h)
        else:
            # Book entry: download it unless it has already been handled.
            if n not in downloaded_book:
                print("Downloading ===> %s" % n)
                downloaded_book.append(n)
                download_novel_chapter(n, h)

def download_novel_chapter(name, link):
    # Get the chapter list for one book and download every chapter.
    try:
        soup = BeautifulSoup(get_html(link), 'lxml')
        chapters = soup.find_all('div', {'id': 'list'})
        chapters = BeautifulSoup(str(chapters), 'lxml')
        chapters = chapters.find_all('a')
        for i in chapters:
            chapter_name = i.text
            chapter_link = 'http://www.qu.la' + i.get('href')
            download_novel(name, chapter_name, chapter_link)
    except Exception:
        print("Error while fetching the chapter list")

def download_novel(name, chapter_name, chapter_link):
    # Download one chapter and append it to the book's text file.
    try:
        soup = BeautifulSoup(get_html(chapter_link).replace('<br/>', '\n'), 'lxml')
        content = soup.find('div', {'id': 'content'}).text.replace('chaptererror();', '')
        with open('{}.txt'.format(name), 'a+', encoding='utf-8') as f:
            print("Downloading: %s" % chapter_name)
            f.write(chapter_name + '\n\n')
            f.write(content)
    except Exception:
        print("Error while downloading the chapter")

if __name__ == '__main__':
    pool = Pool(4)  # number of worker threads
    novels = get_book()
    pool.map(download_list, novels)
    pool.close()
    pool.join()