-
Notifications
You must be signed in to change notification settings - Fork 2
/
justel_lib.py
186 lines (160 loc) · 5.71 KB
/
justel_lib.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
import re
import os
import time
import logging
import requests
from slugify import slugify
from datetime import datetime
from dateutil.relativedelta import relativedelta
from bs4 import BeautifulSoup as bs
from reg_lib import (
RE_CLEANUP,
RE_FORMATS,
RE_KEY_MASKS,
)
logger = logging.getLogger(__name__)
def format_text(text, clean=False):
    """Reformat *text* by applying the RE_FORMATS substitution rules.

    Args:
        text: raw document text to transform.
        clean: when True, additionally run the RE_CLEANUP rules after
            the formatting pass.

    Returns:
        The transformed text.
    """
    rule_sets = [RE_FORMATS]
    if clean:
        rule_sets.append(RE_CLEANUP)
    for rules in rule_sets:
        for pattern, replacement in rules:
            text = re.sub(pattern, replacement, text, flags=re.M)
    return text
def meta2md(data, dump=False):
    """Render a metadata dict as a Markdown front-matter block.

    Args:
        data: mapping of metadata keys to values; list values are
            rendered as a comma-separated string.
        dump: when True, also print the rendered block to stdout.

    Returns:
        The front-matter string: opens with '---' and closes with a
        blank line, '---' and a trailing newline.
    """
    lines = ['---']
    for key, value in data.items():
        # isinstance instead of `type(v) is list` so list subclasses
        # are formatted the same way.
        if isinstance(value, list):
            lines.append(f'{key}: {", ".join(value)}')
        else:
            lines.append(f'{key}: {value}')
    lines.append('\n---\n')
    # Join once; the original joined twice when dump=True.
    result = "\n".join(lines)
    if dump:
        print(result)
    return result
def meta_get(raw_text, html_text, data=None):
    """Extract document metadata from the raw text of a Justel page.

    For each (key, mask) pair in RE_KEY_MASKS, the first line of
    *raw_text* matching the mask supplies that key's value (the
    regex's 'value' group, with ':' removed and whitespace stripped).
    All 10-character document codes found in the text other than the
    document's own 'number' are collected under 'updated'.

    Args:
        raw_text: plain-text content of the metadata table.
        html_text: corresponding HTML node; currently unused here but
            kept for interface compatibility.
        data: optional dict to populate in place; a fresh dict is
            created when omitted.

    Returns:
        The populated metadata dict.
    """
    # Fix: the original used a mutable default (`data={}`), so metadata
    # leaked between successive calls that relied on the default.
    if data is None:
        data = {}
    for key, mask in RE_KEY_MASKS:
        for line in raw_text.splitlines():
            matches = re.match(mask, line)
            if matches:
                data[key] = matches['value'].replace(':', '').strip()
                break
    # setdefault guarantees 'number' exists (None) even when no mask
    # matched it, while excluding the document's own code from 'updated'.
    data['updated'] = [
        x.group() for x in re.finditer(r'([0-9AB]{10})', raw_text)
        if x.group() != data.setdefault('number', None)
    ]
    return data
def soup2meta(soup, meta_input=None):
    """Extract the metadata dict from a parsed Justel HTML document.

    Locates the second top-level table in <body>, flattens its second
    <th> cell to text (turning <br> tags into newlines so line
    structure is preserved), and delegates parsing to meta_get().

    Args:
        soup: BeautifulSoup document of the Justel page.
        meta_input: optional dict to populate; a fresh dict is created
            when omitted.

    Returns:
        The metadata dict produced by meta_get().
    """
    # Fix: the original used a mutable default (`meta_input={}`),
    # sharing one dict across all default-argument calls.
    if meta_input is None:
        meta_input = {}
    meta_table = soup.find('body').findChildren('table', recursive=False)[1]
    content = meta_table.findAll('th')[1]
    for br in content.find_all('br'):
        br.replace_with("\n")
    raw_text = content.getText()
    return meta_get(raw_text, content, meta_input)
def soup2md(soup, clean=False, meta_input=None, do_return_meta=False):
    """Convert a parsed Justel HTML document to Markdown.

    Builds a front-matter block from the page metadata, then extracts
    the body text from the fourth top-level table in <body> (with <br>
    converted to newlines) and runs it through format_text().

    Args:
        soup: BeautifulSoup document of the Justel page.
        clean: forwarded to format_text() to enable the cleanup pass.
        meta_input: optional dict seeded into metadata extraction; a
            fresh dict is created when omitted.
        do_return_meta: when True, also return the metadata dict.

    Returns:
        The Markdown string, or (markdown, meta) when do_return_meta.
    """
    # Fix: the original used a mutable default (`meta_input={}`),
    # so metadata accumulated across calls using the default.
    if meta_input is None:
        meta_input = {}
    # Get meta
    meta = soup2meta(soup, meta_input)
    text_meta = meta2md(meta)
    # Get text
    table = soup.find('body').findChildren('table', recursive=False)[3]
    for br in soup.find_all('br'):
        br.replace_with("\n")
    raw_text = table.getText()
    text = format_text(raw_text, clean)
    if do_return_meta:
        return f"{text_meta}{text}", meta
    else:
        return f"{text_meta}{text}"
def justel_urls(startDate, endDate=False, interval='month', dtype='loi'):
    """Yield ELI index URLs covering the requested date range.

    Args:
        startDate: inclusive start date as 'YYYY-MM-DD'.
        endDate: inclusive end date as 'YYYY-MM-DD'; a falsy value
            means "now".
        interval: step granularity — 'day', 'year' or 'month'
            (unknown values fall back to 'month' with a warning).
        dtype: document-type path segment (e.g. 'loi').

    Yields:
        One index URL per step; year-only URLs for 'year', year/month
        for 'month', year/month/day for 'day'.
    """
    begin = datetime.strptime(startDate, '%Y-%m-%d')
    end = datetime.strptime(endDate, '%Y-%m-%d') if endDate else datetime.now()
    if interval == 'day':
        step = relativedelta(days=1)
    elif interval == 'year':
        step = relativedelta(years=1)
    else:
        if interval != 'month':
            logger.warning("Date interval %s is not in supported list (day, year or month), applying default value (month)", interval)
        step = relativedelta(months=1)
    base_mask = "http://www.ejustice.just.fgov.be/eli/{dtype}/{year}"
    cursor = begin
    while cursor <= end:
        url = base_mask.format(year=cursor.year, dtype=dtype)
        if interval != 'year':
            url += "/{:02d}".format(cursor.month)
            if interval == 'day':
                url += "/{:02d}".format(cursor.day)
        yield url
        cursor += step
def justel_eli_scan(url):
    """Fetch an ELI index page and collect its 'Justel' document links.

    The request is attempted up to 4 times with a 2-second pause
    between retries (redirects are not followed).

    Args:
        url: ELI index page URL.

    Returns:
        (ok, links): ok is False on a non-200 response, server-side
        SQL error, a missing page, or a parse failure; links is the
        (possibly empty) list of hrefs of anchors whose text matches
        'Justel'.
    """
    links = []
    for attempt in range(4):
        r = requests.get(url, allow_redirects=False)
        if r.status_code == 200:
            break
        # Fix: the original slept once more after the final failed
        # attempt; only sleep when another retry will happen.
        if attempt < 3:
            time.sleep(2)
    if r.status_code != 200:
        logger.warning("Bad response code: %s", r.status_code)
        return False, links
    if 'error select' in r.text:
        logger.warning('SQL Error on server')
        return False, links
    if '<center><FONT SIZE=6>Aide ELI ' in r.text:
        logger.warning('Page does not exist')
        return False, links
    try:
        soup = bs(r.text, 'html5lib')
        # Fix: raw string — '\W' in a plain literal is an invalid
        # escape sequence (SyntaxWarning on modern Python).
        for link in soup.find_all('a', text=re.compile(r'\W*Justel\W*')):
            links.append(link.get('href'))
    except Exception as exc:
        logger.exception(exc)
        return False, links
    return True, links
def justel_doc_scan(url):
    """Fetch a Justel document page and return its raw HTML.

    The request is attempted up to 4 times with a 2-second pause
    between retries (redirects are not followed). Pages that report —
    in French or Dutch — that the text or consolidated version is
    unavailable are treated as failures.

    Args:
        url: document page URL.

    Returns:
        (ok, html): html is the page body on success, '' otherwise.
    """
    for attempt in range(4):
        r = requests.get(url, allow_redirects=False)
        if r.status_code == 200:
            break
        # Fix: the original slept once more after the final failed
        # attempt; only sleep when another retry will happen.
        if attempt < 3:
            time.sleep(2)
    if r.status_code != 200:
        logger.warning("Bad response code: %s", r.status_code)
        return False, ""
    # Known failure markers (FR/NL variants) and the warning each emits;
    # the marker strings must match the server output byte-for-byte.
    failure_markers = (
        ('<center><FONT SIZE=6>Aide ELI ', 'Page does not exist'),
        ('(NOTE : pas de texte disponible)', 'Text not available'),
        ('(NOTA : geen tekst beschikbaar)', 'Text not available'),
        ('La version intégrale et consolidée de ce texte n\'est pas disponible.',
         'Consolidated version not available'),
        ('De geconsolideerde versie van deze tekst is niet beschikbaar.',
         'Consolidated version not available'),
    )
    for marker, message in failure_markers:
        if marker in r.text:
            logger.warning(message)
            return False, ""
    return True, r.text
def store_md(output_dir, md, meta):
    """Write a rendered Markdown document to disk under *output_dir*.

    The file lands in a subdirectory named after the first four
    characters of the document number, with a filename built from the
    slugified type, publication date, language, slugified title and
    document number.

    Args:
        output_dir: base output directory (subdirectories are created
            as needed).
        md: the Markdown content to write.
        meta: metadata dict; requires 'dtype', 'pubDate' (DD-MM-YYYY),
            'lang', 'title' and 'number'; 'subTitle' is optional.

    Returns:
        The absolute path of the written file.
    """
    basepath = os.path.abspath(output_dir)
    slug_type = slugify(meta['dtype'], max_length=12, word_boundary=True)
    # Normalize DD-MM-YYYY -> YYYY-MM-DD so filenames sort chronologically.
    pub_date = datetime.strptime(meta['pubDate'], '%d-%m-%Y').strftime('%Y-%m-%d')
    if meta.get('subTitle'):
        slug_title = slugify(meta['title'] + '-' + meta['subTitle'],
                             max_length=60, word_boundary=True)
    else:
        slug_title = slugify(meta['title'], max_length=25, word_boundary=True)
    filename = f"{slug_type}_{pub_date}_{meta['lang']}_{slug_title}_{meta['number']}.md"
    filepath = os.path.join(basepath, meta['number'][:4], filename)
    os.makedirs(os.path.dirname(filepath), exist_ok=True)
    # Fix: explicit UTF-8 — the original relied on the platform locale,
    # which corrupts the accented French/Dutch legal text on some systems.
    with open(filepath, "w", encoding="utf-8") as f:
        f.write(md)
    return filepath