loader.py

"""
Parser module for bot.
Created on 03.03.2021
@author: Ruslan Dolovanyuk
"""
import os
import aiohttp
import asyncio
import multiprocessing as mp
from itertools import repeat
from bs4 import BeautifulSoup
class URL:
    BASE = 'https://smsta.ru/m/sms/'
    FUN = 'm_fun'
    LOVE = 'm_love'
    MISS = 'm_miss'
    EROTIC = 'm_erotic'

    def __call__(self):
        urls = [
            ''.join([URL.BASE, URL.FUN]),
            ''.join([URL.BASE, URL.LOVE]),
            ''.join([URL.BASE, URL.MISS]),
            ''.join([URL.BASE, URL.EROTIC]),
        ]
        return urls


async def load(client, url):
    """Fetch a single URL; return the raw body or None on a non-200 status."""
    result = None
    async with client.get(url) as response:
        if response.status == 200:
            result = await response.read()
    return result


async def get_pages(urls, pages):
    """Load pages from site."""
    async with aiohttp.ClientSession() as client:
        tasks = [load(client, url) for url in urls]
        results = await asyncio.gather(*tasks)
        for result in results:
            pages.append(result)
        await asyncio.sleep(0.3)


def parse(page):
    """Parse posts, entries and pagination links from a loaded page."""
    bs = BeautifulSoup(page, 'html.parser')
    result = {'posts': [], 'entries': [], 'urls': []}
    content = bs.find('div', {'id': 'content'})
    for post in content.find_all('div', {'class': 'post_03'}):
        result['posts'].append(post.find('div').text.strip())
    for entry in content.find('div', {'class': 'entry'}).find_all('p'):
        for span in entry.find_all('span'):
            span.decompose()
        result['entries'].append(entry.text.strip())
    for link in content.find('nav', {'class': 'list_00'}).find('div', {'class': 'list_03'}).find_all('a'):
        result['urls'].append(link.get('href'))
    return result
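

# For reference, parse() expects markup roughly like the illustrative sketch
# below; the element names and classes come from the selectors above, while
# the surrounding structure, sample text and sample href are assumptions,
# not a copy of the real site:
#
#   <div id="content">
#       <div class="post_03"><div>Short SMS text...</div></div>
#       <div class="entry">
#           <p><span>rating widget (stripped)</span>Long SMS text...</p>
#       </div>
#       <nav class="list_00">
#           <div class="list_03"><a href="m_fun-2">2</a></div>
#       </nav>
#   </div>
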
async def get_data():
    """Load, parse and return data from site."""
    data = []
    pages = []
    urls = URL()
    # First pass: download and parse the four section index pages.
    await get_pages(urls(), pages)
    with mp.Pool(processes=mp.cpu_count()) as pool:
        results = pool.map(parse, pages)
    data.extend([item for result in results for item in result['posts']])
    data.extend([item for result in results for item in result['entries']])
    pages.clear()
    # Second pass: follow the pagination links collected on the first pass.
    urls = [''.join([URL.BASE, url]) for result in results for url in result['urls']]
    await get_pages(urls, pages)
    with mp.Pool(processes=mp.cpu_count()) as pool:
        results = pool.map(parse, pages)
    data.extend([item for result in results for item in result['posts']])
    data.extend([item for result in results for item in result['entries']])
    return data


async def worker(data):
    """Collect all scraped items into the given list."""
    result = await get_data()
    for item in result:
        data.append(item)


def main():
    data = []
    asyncio.run(worker(data))
    # Note: the 'debug' directory must already exist, otherwise open() raises.
    with open(os.path.join('debug', 'compliments.log'), 'w', encoding='utf-8') as fout:
        for item in data:
            fout.write(f'{item}\n***\n')


if __name__ == '__main__':
    main()
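

# Minimal usage sketch: when the bot already runs its own event loop, the
# coroutine can be awaited directly instead of going through asyncio.run().
# The handler name below is an assumption for illustration, not part of
# this module:
#
#   async def refresh_compliments(self):
#       self.compliments = await get_data()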