#!/usr/bin/python
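"""Seosnap cachewarmer CLI.

Wraps the Seosnap Scrapy spider and service client behind a small click CLI.
Example invocations (a sketch; the exact entry point is an assumption and
depends on how the project is installed, e.g. running this file directly):

    ./crawl.py load 1
    ./crawl.py cache 1,2 --recache=false --use_queue=true
    ./crawl.py sync 1,2,3
    ./crawl.py clean 1
"""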
import os, sys, logging
import requests
import click
from functools import reduce
from dotenv import load_dotenv
from scrapy.cmdline import execute
from scrapy.crawler import CrawlerProcess
from scrapy.utils.log import configure_logging
from scrapy.utils.project import get_project_settings
from seosnap_cachewarmer.service import SeosnapService
from seosnap_cachewarmer.spider import SeosnapSpider
# from seosnap_cachewarmer.tagSpider import SeosnapTagSpider
from seosnap_cachewarmer.state import SeosnapState
from seosnap_cachewarmer.sitemap import SeoSnapSitemapRefresher
import xml.etree.ElementTree as ET

# Load environment variables from the .env file next to this script.
load_dotenv(dotenv_path=os.path.join(os.path.dirname(__file__), '.env'))

# Write logs to logs/cachewarmer.log in the project root; the log level comes
# from the CACHEWARMER_LOG_LEVEL environment variable.
configure_logging(install_root_handler=False)
logging.basicConfig(
    filename=os.path.join(os.path.dirname(os.path.dirname(__file__)), 'logs/cachewarmer.log'),
    level=os.getenv('CACHEWARMER_LOG_LEVEL')
)

@click.group()
def cli():
    pass


@cli.command()
@click.argument('website_ids')
def load(website_ids: str, **args):
    """Run the Seosnap spider once per website id via the Scrapy CLI."""
    for website_id in website_ids.split(','):
        click.echo(f'Loading website: {website_id}')
        # Forward any extra keyword arguments to the spider as "-a key=value" pairs;
        # the empty initializer keeps reduce() from failing when there are none.
        arg_tokens = reduce(lambda x, y: x + y, [['-a', f'{k}={v}'] for k, v in args.items()], [])
        execute(argv=[sys.argv[0], 'crawl', 'Seosnap', '-a', f'website_id={website_id}'] + arg_tokens)

@cli.command()
@click.argument('website_ids')
@click.option('--follow_next', type=bool, default=True, help='Follow rel-next links if enabled')
@click.option('--recache', type=bool, default=True, help='Recache all pages instead of only pages that have not been cached yet')
@click.option('--clean_old_pages_after', type=bool, default=False, help='Remove all pages whose updated_at is older than the start of the finished crawl')
@click.option('--use_queue', type=bool, default=False, help='Cache URLs from the queue instead of the sitemap')
@click.option('--mobile', type=bool, default=False, help='Whether a mobile version should be rendered instead')
def cache(website_ids: str, **args):
    """Warm the cache for the given websites using the Seosnap spider."""
    try:
        settings = get_project_settings()
        process = CrawlerProcess(settings)
        # Queue a crawl for every website id, then run them in a single Scrapy process.
        for website_id in website_ids.split(','):
            process.crawl(
                SeosnapSpider,
                website_id=website_id,
                **args
            )
        process.start()
    except Exception as e:
        click.echo(str(e), err=True)

@cli.command()
@click.argument('website_ids')
def sync(website_ids: str, *args, **kwargs):
    """Synchronise cached pages with the Seosnap service for the given websites."""
    click.echo('Start sync')
    for website_id in website_ids.split(','):
        service = SeosnapService()
        service.sync_pages(website_id)


@cli.command()
@click.argument('website_ids')
def redooldqueue(website_ids: str, *args, **kwargs):
    """Requeue outdated pages for the given websites."""
    click.echo('Start redo')
    for website_id in website_ids.split(','):
        service = SeosnapService()
        service.queue_old_redo(website_id)


@cli.command()
@click.argument('website_ids')
def clean(website_ids: str):
    """Empty the cachewarmer queue for the given websites."""
    service = SeosnapService()
    for website_id in website_ids.split(','):
        service.clean_queue(int(website_id))
        logging.info(f'Cleaned the queue for website: {website_id}')
    click.echo(f'Cleaned the queue for websites: {website_ids}')


if __name__ == '__main__':
    cli()