forked from nicolas17/covid-ar-json
-
Notifications
You must be signed in to change notification settings - Fork 0
/
request.py
42 lines (34 loc) · 1.24 KB
/
request.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
# SPDX-FileCopyrightText: 2020 Nicolás Alvarez <nicolas.alvarez@gmail.com>
#
# SPDX-License-Identifier: MIT
import requests
import logging
import datetime
import re
from bs4 import BeautifulSoup
# Shared HTTP session: reuses one connection pool across requests and
# identifies this scraper with a contact address (polite scraping).
sess = requests.session()
# Set only the User-Agent key rather than replacing the whole headers
# dict, so the session keeps requests' default headers
# (Accept-Encoding, Connection, Accept, ...).
sess.headers['User-Agent'] = 'CovidParser/0.1 (+nicolas.alvarez+covid@gmail.com)'
def date_from_header(header_text):
    """Parse the report date out of a 'Reporte Diario' section header.

    Expects headers shaped like 'Reporte Diario Vespertino / 15-04-2020 (...)'
    with the date in day-month-year order (the qualifier word is optional).

    Returns a datetime.date, or None when the header does not match.
    """
    # Raw string: '\w' and '\d' in a plain literal are invalid escape
    # sequences and raise SyntaxWarning on modern Python.
    match = re.match(r'Reporte Diario (?:\w+ )?/ (\d+)-(\d+)-(\d+) \(.*\)', header_text)
    if not match:
        return None  # explicit, instead of falling off the end
    day, month, year = (int(g) for g in match.groups())
    return datetime.date(year, month, day)
def get_pdfs():
    """Yield (url, date) for each report PDF linked on the daily-report page.

    date is a datetime.date parsed from the section header, or None when
    the header text does not match the expected format.
    """
    # The minute-of-the-hour query string discourages stale cached copies.
    page_url = ("https://www.argentina.gob.ar/coronavirus/informe-diario"
                "?cache-bust=%d" % datetime.datetime.now().minute)
    resp = sess.get(page_url)
    logging.info("Parsing HTML page")
    soup = BeautifulSoup(resp.content, 'html.parser')
    anchors = soup.find('div', class_='downloads').find_all('a')
    logging.info("Found {} links".format(len(anchors)))
    for anchor in anchors:
        # The date lives in the <p> header of the anchor's parent element.
        report_date = date_from_header(anchor.parent.p.text)
        yield (anchor["href"], report_date)
def download_file(url, fd):
    """Download url into fd, an already-open binary-mode file object.

    The body is written in 4 KiB chunks and fd is flushed before
    returning; fd is not closed (the caller owns it).
    """
    # stream=True defers fetching the body until iter_content() pulls it;
    # without it requests buffers the whole response in memory first,
    # defeating the chunked write loop below.
    resp = sess.get(url, stream=True)
    for chunk in resp.iter_content(chunk_size=4096):
        fd.write(chunk)
    fd.flush()