# 中国联通招标采购信息爬取.py
# Crawler for bidding and procurement notices on the China Unicom tendering site
import os, time, re, redis, pymysql
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from lxml import etree
from city_data import get_city_dict
from Regular_Expression import regularExpression
# os.system('cd /Users/杰/AppData/Local/Google/Chrome/Application')
# os.system('chrome.exe --remote-debugging-port=9222 -user-data-dir="c:/selenium/automationprofile"')
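# The commented-out lines above hint at driving an already-running Chrome via
# its remote-debugging port. A minimal sketch of that attach step, assuming
# Chrome was started with --remote-debugging-port=9222; the function name and
# default address are illustrative, not part of the original code:
def attach_to_running_chrome(address="127.0.0.1:9222"):
    attach_options = Options()
    attach_options.add_experimental_option("debuggerAddress", address)
    return webdriver.Chrome(options=attach_options)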
class China_unicom(object):
    def __init__(self):
        self.city_dict = get_city_dict()
        self.pattern01 = r'服务区域:(.*?)<'
        self.pattern02 = r'服务地点:(.*?)<'
        self.pattern03 = r'服务地址:(.*?)<'
        self.pattern04 = r'采.*?购.*?人(.*?)<'
        self.pattern05 = r'比.*?选.*?人:(.*?)<'
        self.pattern08 = r'地址:(.*?)<'
        self.pattern06 = r'联系地址:(.*?)<'
        self.pattern07 = r'详细地址:(.*?)<'
        # Regular-expression patterns to try, in order, when locating an address
        self.pattern_list = [self.pattern01, self.pattern02, self.pattern03, self.pattern04,
                             self.pattern08, self.pattern05, self.pattern06, self.pattern07]
        self.base_url = 'http://www.chinaunicombidding.cn'
        self.chrome_options = Options()
        # self.chrome_options.add_experimental_option("debuggerAddress", "127.0.0.1:9222")
        self.chrome_options.add_argument('disable-infobars')
        self.chrome_options.add_experimental_option('excludeSwitches', ['enable-automation'])
        self.going_to_crawl_bid = '{}&type=1'
        self.going_to_crawl_result = '{}&type=2'
        self.going_to_crawl_single = '{}&type=3'
        self.duplicate_part = 'http://www.chinaunicombidding.cn/jsp/cnceb/web/info1/infoList.jsp?page='
        self.conn = pymysql.connect(host='0.0.0.0',
                                    user='root',
                                    password='jiayou875',
                                    database='zb_data',
                                    # database='test_demo',
                                    port=3306,
                                    charset='utf8')
        self.cur = self.conn.cursor()
        pool = redis.ConnectionPool(host='0.0.0.0', port=6379, db=15)
        self.r = redis.Redis(connection_pool=pool)
        # Convert the current timestamp to local time
        time_local = time.localtime(int('%.0f' % time.time()))
        # Format as a date string, e.g. 2016-05-05 (full form would be "%Y-%m-%d %H:%M:%S")
        self.dt = time.strftime("%Y-%m-%d", time_local)
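    # A minimal helper sketch of the date-to-epoch conversion that
    # upload_items() performs inline below; the method name is illustrative
    # and not part of the original code.
    @staticmethod
    def _date_to_epoch(date_str):
        # e.g. "2016-05-05" -> seconds since the epoch (exact value depends on the local timezone)
        return int(time.mktime(time.strptime(date_str, "%Y-%m-%d")))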
    def upload_items(self, items):
        if items['addr_id'] == '':
            items['addr_id'] = '100'
        try:
            if items['addr_id'] != '' and items['title'] != '' and items['url'] != '' \
                    and items['intro'] != '' and items['web_time'] != '':
                items['web_time'] = int(time.mktime(time.strptime(items['web_time'], "%Y-%m-%d")))
                # Upload to the production table
                sql = "INSERT INTO ztb_py_data (catid,title,style,addtime,adddate,areaid,linkurl,content) VALUES ('%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s');" % (
                    items['type_id'], items['title'], items['source_name'], items['time'],
                    items['web_time'], items['addr_id'],
                    items['url'], pymysql.escape_string(items['intro']))
                time.sleep(0.1)
                self.cur.execute(sql)
                self.conn.commit()
                # Count this source's uploads for today in redis
                self.r.hincrby(self.dt, items['source_name'])
            else:
                # Incomplete records go to the error table with status 3
                try:
                    items['web_time'] = int(time.mktime(time.strptime(items['web_time'], "%Y-%m-%d")))
                except Exception:
                    pass
                sql = "INSERT INTO ztb_error_infos (catid,title,style,addtime,adddate,areaid,status,linkurl,content) VALUES ('%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s');" % (
                    items['type_id'], items['title'], items['source_name'], items['time'],
                    items['web_time'], items['addr_id'], 3, items['url'],
                    pymysql.escape_string(items['intro']))
                self.cur.execute(sql)
                self.conn.commit()
        except Exception as e:
            print("Data upload failed")
            print(items['title'])
            print(items['url'])
            print(e)
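    # A hedged alternative sketch, not called anywhere: the same INSERT written
    # with parameterized placeholders so the driver handles escaping instead of
    # pymysql.escape_string(). Table and column names are taken from the query
    # in upload_items() above.
    def upload_items_param(self, items):
        sql = ("INSERT INTO ztb_py_data "
               "(catid,title,style,addtime,adddate,areaid,linkurl,content) "
               "VALUES (%s, %s, %s, %s, %s, %s, %s, %s)")
        self.cur.execute(sql, (items['type_id'], items['title'], items['source_name'],
                               items['time'], items['web_time'], items['addr_id'],
                               items['url'], items['intro']))
        self.conn.commit()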
    def get_response(self, resource, driver):
        list_content = etree.HTML(resource)
        list_url = list_content.xpath('//div[@id="div1"]//tr[@height="35px"]')
        print(len(list_url))
        for each_tr in list_url:
            items = {}
            items['intro'] = ''
            items['addr_id'] = ''
            items['title'] = ''
            items['url'] = ''
            items['web_time'] = ''
            items["time"] = '%.0f' % time.time()
            items['title'] = each_tr.xpath('.//span/@title')[0].strip()
            # Extract the article path from the onclick handler and join it with
            # the site prefix to build the real article URL
            article_url = each_tr.xpath('.//span/@onclick')[0]
            items['url'] = self.base_url + re.search(r'window\.open\("(.*?)","",', article_url, re.S).group(1)
            print(items['url'])
            js = 'window.open("{}")'.format(items['url'])  # opens a new tab, not a new window
            time.sleep(0.01)
            try:
                driver.execute_script(js)
                driver.implicitly_wait(30)
            except Exception:
                # On failure, switch to the newest tab and reload it
                driver.switch_to.window(driver.window_handles[-1])
                driver.refresh()
            now_handle = driver.current_window_handle
            driver.switch_to.window(driver.window_handles[driver.window_handles.index(now_handle) + 1])
            # Publication date
            items['web_time'] = each_tr.xpath('.//td/following-sibling::td/text()')[0].strip()
            # Region text, mapped to a city id via the city table below
            try:
                items['address'] = each_tr.xpath('.//td[3]/text()')[0].strip()
            except Exception:
                pass
            dirty_article = driver.page_source
            try:
                time.sleep(0.01)
                dirty_article = re.search(r'<body.*?>(.*?)</body></html>', str(dirty_article), re.S).group(1)
                dirty_article = re.sub(r'href="', 'href="http://www.chinaunicombidding.cn', dirty_article, flags=re.S)
                dirty_article = re.sub(r'<iframe.*?>.*?</iframe>', '', dirty_article, flags=re.S)
                # Strip junk markup from the article body
                clean_article = re.sub(regularExpression, ' ', dirty_article)
                items["intro"] = clean_article
            except Exception as e:
                print(e)
            # Titles containing award/failure keywords indicate a result notice,
            # so they are filed under category 38257
            if ('中标' in items['title'] or '成交' in items['title'] or '结果' in items['title']
                    or '失败' in items['title'] or '流标' in items['title'] or '候选人' in items['title']
                    or '中选人' in items['title'] or '作废' in items['title'] or '终止' in items['title']):
                items['type_id'] = '38257'
            elif ('更正' in items['title'] or '变更' in items['title'] or '答疑' in items['title']
                    or '澄清' in items['title'] or '补充' in items['title'] or '延期' in items['title']):
                items['type_id'] = '38256'
            else:
                items['type_id'] = '38255'
            try:
                for each_city in self.city_dict:
                    if each_city in items['address']:
                        items['addr_id'] = self.city_dict[each_city]
                        break
            except Exception:
                pass
            # If no city was found in the address column, fall back to the title
            if items['addr_id'] == '':
                for each_city in self.city_dict:
                    if each_city in items['title']:
                        items['addr_id'] = self.city_dict[each_city]
                        break
            # As a last resort, search the article body with the pattern list
            if items['addr_id'] == '':
                for each_pattern in self.pattern_list:
                    try:
                        search_text = re.search(each_pattern, dirty_article, re.S).group(1)
                        for city_name in self.city_dict:
                            if city_name in search_text:
                                items['addr_id'] = self.city_dict[city_name]
                                break
                    except Exception:
                        continue
                    if items['addr_id'] != '':
                        break
            items["source_name"] = '中国联通采购与招标网'
            self.upload_items(items)
            # Close the article tab and return to the listing tab
            driver.close()
            time.sleep(0.01)
            driver.switch_to.window(now_handle)
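    # A self-contained sketch of the title classification above, restructured
    # with keyword tuples and any(). The method name is illustrative and not
    # part of the original code; the keyword lists are copied verbatim from
    # the branches in get_response().
    @staticmethod
    def classify_title(title):
        result_words = ('中标', '成交', '结果', '失败', '流标', '候选人', '中选人', '作废', '终止')
        change_words = ('更正', '变更', '答疑', '澄清', '补充', '延期')
        if any(word in title for word in result_words):
            return '38257'  # award / result notices
        if any(word in title for word in change_words):
            return '38256'  # corrections, clarifications, postponements
        return '38255'      # ordinary tender announcements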
    def crawl_first_page(self, driver):
        driver.get('http://www.chinaunicombidding.cn/jsp/cnceb/web/info1/infoList.jsp?page=1&type=1')
        driver.implicitly_wait(25)
        self.get_response(driver.page_source, driver)
    def run(self):
        all_list_pages = []
        all_list_pages.extend([self.going_to_crawl_bid.format(i) for i in range(1, 20)])
        all_list_pages.extend([self.going_to_crawl_result.format(i) for i in range(1, 7)])
        all_list_pages.extend([self.going_to_crawl_single.format(i) for i in range(1, 3)])
        print(len(all_list_pages))
        while all_list_pages:
            print(all_list_pages)
            driver = webdriver.Chrome(executable_path='C:/Program Files (x86)/Google/Chrome/Application/chromedriver.exe',
                                      options=self.chrome_options)
            time.sleep(1)
            current_page = all_list_pages.pop(0)
            js = 'window.open("{}")'.format(self.duplicate_part + current_page)  # opens a new tab, not a new window
            time.sleep(1)
            driver.execute_script(js)
            time.sleep(2)
            driver.implicitly_wait(20)
            time.sleep(1)
            driver.switch_to.window(driver.window_handles[1])
            print(time.localtime(time.time()))
            # An <input near the top of the page means we got the verification
            # page instead of the listing, so re-queue this page and retry
            if '<input' in driver.page_source[:500]:
                print(current_page)
                all_list_pages.append(current_page)
            else:
                print('success')
                self.get_response(driver.page_source, driver)
            driver.quit()
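# A hedged helper sketch for the scheduler in __main__ below: compute how long
# to sleep until the next HH:MM instead of polling every iteration. The
# function is illustrative only; the original polling loop is kept as-is.
def seconds_until(hour, minute):
    import datetime
    now = datetime.datetime.now()
    target = now.replace(hour=hour, minute=minute, second=0, microsecond=0)
    if target <= now:
        target += datetime.timedelta(days=1)
    return (target - now).total_seconds()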
if __name__ == '__main__':
    # Run the crawler twice a day, at 11:35:00 and 20:37:00
    while True:
        try:
            current_time = time.localtime(time.time())
            if current_time.tm_hour == 11 and current_time.tm_min == 35 and current_time.tm_sec == 0:
                c = China_unicom()
                c.run()
            elif current_time.tm_hour == 20 and current_time.tm_min == 37 and current_time.tm_sec == 0:
                c = China_unicom()
                c.run()
        except Exception:
            pass
        # Sleep briefly so the polling loop does not spin at 100% CPU
        time.sleep(0.5)
    # Manual one-off run:
    # c = China_unicom()
    # c.run()