This repository has been archived by the owner on Sep 11, 2020. It is now read-only.
-
-
Notifications
You must be signed in to change notification settings - Fork 119
/
wqxtDownloader.py
298 lines (269 loc) · 8.77 KB
/
wqxtDownloader.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
#!/usr/bin/python3
# -*- coding: UTF-8 -*-
from utils import *
from wqxtPDF import *
import json
import logging
import jwt
import socket
from urllib import error
from io import BytesIO
from PIL import Image
class wqxtDownloader():
    """Download the page images of one book from lib-nuanxin.wqxuetang.com.

    Constructing an instance reads the JWT key from the configuration,
    checks the login state, fetches the book information, the "k" data and
    the catalogue tree, creates the download folder and loads the reference
    "invalid picture".  Calling start() downloads a page range and passes
    the image paths to wqxtPDF.

    NOTE(review): getConf, get_value, getRandom, mkdir, time and os are not
    imported by name in this module; they presumably come from
    ``from utils import *`` - confirm.
    """

    # Extension appended to every saved page image.
    fileExt = ".jpg"
    # Root folder; getFolder() appends the book id to it.
    downloadFolder = "books/IMG"
    # Arguments handed to getRandom() for the pause after a successful download.
    sleepRange = {
        "start": 10,
        "end": 20,
        "precision": 1
    }
    errorConfig = {
        "times": 5,       # a page is abandoned once its error count exceeds this
        "sleep": 5,       # seconds to pause after an invalid picture
        "httpSleep": 20   # seconds to pause after a URL/HTTP error
    }

    def __init__(self, bid):
        """Collect everything needed to download the book identified by *bid*.

        Raises:
            NoLoginError: propagated from isLogin().
            BIDError: propagated from initread().
        """
        self.bid = bid
        self.jwt_key = self.getJwtKey()
        self.login = self.isLogin()
        bookInfo = self.initread()
        self.name = bookInfo['name']
        self.page = int(bookInfo['pages'])
        self.kData = self.getK()
        folder = self.getFolder()
        self.createFolder(folder)
        self.folder = folder
        self.catatree = self.getCatatree()
        self.invalidpic = self.getInvalidPicInfo()

    def _fetchJson(self, url):
        """GET *url* and return the UTF-8 response body parsed as JSON.

        The response is closed even when reading or decoding fails.
        """
        # presumably get_value("urllib") returns the urllib package - confirm
        curl = get_value("urllib")
        with curl.request.urlopen(url) as response:
            data = response.read().decode("UTF-8")
        return json.loads(data)

    @staticmethod
    def _tokenToText(token):
        """Return an encoded JWT as str.

        PyJWT 1.x returns bytes from jwt.encode() while 2.x returns str, so
        only bytes are decoded.
        """
        if isinstance(token, bytes):
            return token.decode(encoding='utf-8')
        return token

    def getJwtKey(self):
        """Return the 'jwt_key' option of the 'wqxt' configuration section."""
        conf = getConf()
        return conf.get('wqxt', 'jwt_key')

    def getBaseUrl(self, page):
        """Return the image URL of *page* without the ``k`` query parameter."""
        return "https://lib-nuanxin.wqxuetang.com/page/img/{}/{}".format(self.bid, str(page))

    def initread(self):
        """Fetch the book information and return its 'data' member.

        __init__ reads the 'name' and 'pages' keys of the returned mapping.
        The original author documented these response keys: canread,
        upperlimit, bid, toshelf, name, title, pages, coverurl, volume_list,
        ismultivolumed, lastpage, last_volume, price, sellprice,
        canreadpages, uid, textbook.

        Raises:
            BIDError: when the server answers with errcode 8003.
        """
        url = "https://lib-nuanxin.wqxuetang.com/v1/read/initread?bid={}".format(self.bid)
        bookInfo = self._fetchJson(url)
        # A missing book is reported as
        # {"data": [], "errcode": 8003, "errmsg": "<message>"}.
        if bookInfo["errcode"] == 8003:
            raise BIDError(bookInfo["errmsg"])
        return bookInfo['data']

    def getCatatree(self):
        """Fetch the catalogue tree and return the 'data' member of the response."""
        url = "https://lib-nuanxin.wqxuetang.com/v1/book/catatree?bid={}".format(self.bid)
        cataTree = self._fetchJson(url)
        return cataTree['data']

    def getBookInfo(self):
        """Placeholder; not implemented.

        The original author noted the endpoint
        https://lib-nuanxin.wqxuetang.com/page/size/?bid= with a response like
        {"data":{"bid":"","d":{"w":"524.16","h":"737.28"},"isocr":false},"errcode":0,"errmsg":"success"}
        """
        pass

    def getK(self):
        """Fetch the "k" data and return its 'data' member re-serialised as a JSON string."""
        url = "https://lib-nuanxin.wqxuetang.com/v1/read/k?bid={}".format(self.bid)
        kInfo = self._fetchJson(url)
        return json.dumps(kInfo['data'])

    def getPageUrl(self, page):
        """Return the full image URL of *page*, including a freshly signed ``k`` parameter."""
        return self.getBaseUrl(page) + "?k=" + self.generateKparmas(page)

    def generateKparmas(self, page):
        """Return the HS256-signed JWT used as the ``k`` query parameter of *page*."""
        curTime = str(int(time.time()))
        # "t" is the second-resolution timestamp with "000" appended
        # (presumably milliseconds - confirm against the web client).
        time_sq3 = curTime + "000"
        jwt_data = {
            "p": page,
            "t": time_sq3,
            "b": self.bid,
            "w": 1000,
            "k": self.kData,
            "iat": curTime
        }
        jwt_enc = jwt.encode(jwt_data, self.jwt_key, algorithm='HS256')
        return self._tokenToText(jwt_enc)

    def start(self, args):
        """Download a page range and generate the PDF from it.

        Args:
            args: sequence selecting the range.  Empty: pages 1..self.page.
                One item: pages 1..args[0].  Two or more: pages
                args[0]..args[1].  Items are converted with int().

        Raises:
            TooManyRetry: when the error count of one page exceeds
                errorConfig['times'].
        """
        lNumber = len(args)
        if lNumber == 0:
            start, end = 1, self.page
        elif lNumber == 1:
            start, end = 1, args[0]
        else:
            start, end = args[0], args[1]
        start = int(start)
        end = int(end)

        # Total number of pages requested.
        countNum = end - start + 1
        # 1-based progress counter used in the log messages.
        downloadTimes = 1
        # Image paths of this run, in page order.
        pageLists = []
        bookName = self.name
        bid = self.bid
        maxErrors = self.errorConfig['times']
        logging.info("{}开始下载{},共 {} 页".format(str(bid), bookName, str(countNum)))

        for page in range(start, end + 1):
            url = self.getPageUrl(page)
            path = self.getImgPath(page)
            errorTimes = 0
            while True:
                try:
                    downloadPage = self.downloadImage(url, path)
                    pageLists.append(path)
                    if downloadPage:
                        # Pause a random time after every real download.
                        sleepRange = self.sleepRange
                        ts = getRandom(sleepRange['start'], sleepRange['end'], sleepRange['precision'])
                        logging.info("{}下载成功 第{}页({}/{}) 随机{}s".format(str(bid), page, str(downloadTimes), str(countNum), str(ts)))
                        time.sleep(ts)
                    else:
                        # The image file already exists on disk.
                        logging.warning("{}跳过下载 第{}页({}/{})".format(str(bid), page, str(downloadTimes), str(countNum)))
                    downloadTimes += 1
                    break
                except socket.timeout:
                    errorTimes += 1
                    logging.error("{}下载超时 第{}页({}/{}) 正在重试第{}次".format(str(bid), page, str(downloadTimes), str(countNum), str(errorTimes)))
                except InvalidPictureError:
                    errorTimes += 1
                    sleepTime = self.errorConfig['sleep']
                    logging.error("{} 获取到了失败的图片,暂停{}秒 第{}页({}/{}) 正在重试第{}次".format(str(bid), str(sleepTime), page, str(downloadTimes), str(countNum), str(errorTimes)))
                    self.kData = self.getK()       # fetch k again
                    url = self.getPageUrl(page)    # rebuild the URL with the new k
                    time.sleep(sleepTime)
                except error.URLError:
                    # HTTPError is a subclass of URLError, so it is handled here too.
                    errorTimes += 1
                    httpSleepTime = self.errorConfig['httpSleep']
                    logging.error("{} 发生了严重错误,暂停{}秒 第{}页({}/{}) 正在重试第{}次".format(str(bid), str(httpSleepTime), page, str(downloadTimes), str(countNum), str(errorTimes)))
                    self.kData = self.getK()       # fetch k again
                    url = self.getPageUrl(page)    # rebuild the URL with the new k
                    time.sleep(httpSleepTime)
                # Only reached after a failed attempt; success leaves via break.
                if errorTimes > maxErrors:
                    raise TooManyRetry

        # PDF
        name = "_".join([self.name, str(start), str(end)])
        catatree = self.catatree
        # TODO: when only part of the book is downloaded, catatree still
        # needs to be adjusted to the selected range.
        pdf = wqxtPDF(bid, name, lNumber, start, end, catatree, pageLists)
        pdf.generatePDF()

    def getFolder(self):
        """Return "<downloadFolder>/<bid>"; *bid* is stringified so an int id works too."""
        return "/".join([self.downloadFolder, str(self.bid)])

    def createFolder(self, folder):
        """Call mkdir(folder) and log whether it reported success."""
        mKStatus = mkdir(folder)
        if mKStatus:
            logging.info("成功创建文件夹 {}".format(folder))
        else:
            logging.warning("失败创建文件夹 {}".format(folder))

    def downloadImage(self, url, path):
        """Download *url* to *path* unless *path* already exists.

        Returns:
            True when the image was downloaded and saved, False when *path*
            already existed and nothing was fetched.

        Raises:
            InvalidPictureError: when the first 10000 bytes of the payload
                equal those of the reference invalid picture, or the payload
                is at most 5 bytes long.
        """
        curl = get_value("urllib")
        if os.path.exists(path):
            return False

        headers = {
            "referer": "https://lib-nuanxin.wqxuetang.com/read/pdf/{}".format(self.bid)
        }
        requestPer = curl.request.Request(url=url, headers=headers)
        with curl.request.urlopen(requestPer, timeout=10) as response:
            data = response.read()

        compareNBytes = 10000
        if data[0:compareNBytes] == self.invalidpic[0:compareNBytes]:
            raise InvalidPictureError
        if len(data) <= 5:
            raise InvalidPictureError

        if data[:4] == b'\xff\xd8\xff\xe0':
            # Payload starts with the JPEG signature checked here: store it unchanged.
            with open(path, "wb") as f:
                f.write(data)
        else:
            # Anything else is re-encoded through PIL.
            self.img_converter(data, path)
        return True

    def img_converter(self, in_img, path):
        """Decode the image bytes *in_img*, convert to RGB and save to *path*."""
        with BytesIO(in_img) as img:
            origin_img = Image.open(img)
            jpg_img = origin_img.convert('RGB')
            jpg_img.save(path)

    def getImgPath(self, page):
        """Return "<folder>/<page><fileExt>" for *page*."""
        return "{folder}/{page}{fileExt}".format(folder=self.folder, page=str(page), fileExt=self.fileExt)

    def getInvalidPicInfo(self):
        """Return the bytes of 'invalid_pic.jpg' (path relative to the working directory)."""
        with open('invalid_pic.jpg', 'rb') as f:
            return f.read()

    def isLogin(self):
        """Return True when the checklogin endpoint answers with errcode 0.

        Raises:
            NoLoginError: for any other errcode, carrying the server's errmsg.
        """
        url = "https://lib-nuanxin.wqxuetang.com/v1/login/checklogin"
        userInfo = self._fetchJson(url)
        if userInfo['errcode'] == 0:
            return True
        raise NoLoginError(userInfo['errmsg'])
class BIDError(Exception):
    """Raised when the book information cannot be fetched for a book id.

    Creating the exception logs a critical message; *errmsg* is kept on the
    instance and is what str() returns.
    """

    def __init__(self, errmsg):
        super().__init__(errmsg)
        logging.critical("获取图书内容失败,图书编号错误!")
        self.errmsg = errmsg

    def __str__(self):
        return self.errmsg
class NoLoginError(Exception):
    """Raised when the remote server reports that the user is not logged in.

    Creating the exception logs a critical message; *errmsg* is kept on the
    instance and is what str() returns.
    """

    def __init__(self, errmsg):
        super().__init__(errmsg)
        logging.critical("远程服务器返回尚未登录,检查是否成功登录或cookies是否设置正确")
        self.errmsg = errmsg

    def __str__(self):
        return self.errmsg
class InvalidPictureError(Exception):
    """Raised when a downloaded page image is not usable; str() is a fixed message."""

    def __str__(self):
        return "获取到了无效的图片"
class TooManyRetry(Exception):
    """Raised when the retry limit is exceeded.

    Takes no arguments; creating the exception logs a critical message and
    str() is a fixed message.
    """

    def __init__(self):
        super().__init__()
        logging.critical("重试次数过多,程序终止,请尝试重新执行main.py")

    def __str__(self):
        return "重试次数超出设定次数"