-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcheck_links_is-enes-site.py
103 lines (80 loc) · 2.79 KB
/
check_links_is-enes-site.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
import sys
import urllib.error as uerr
from urllib.request import Request, urlopen
from bs4 import BeautifulSoup
# Announce the target up front so the log output is self-describing.
print(
    "Checking links for the IS-ENES3 site at "
    "https://is.enes.org"
)

# Links that are skipped entirely by the checker.
ignore_links = [
    "http://foundation.zurb.com/",  # comes from the Phlow theme
    "https://mpimet.mpg.de/en/home.html",  # this actually works
]

# HTTP status codes (as strings) that count as a broken link.
error_codes = ["404"]
# mainfunc
def _check_site(site, site_prefix):
    """Check that the links on a site, and on its own sub-pages, are alive.

    Every absolute link found on ``site`` is tested. Links that contain
    ``site_prefix`` are additionally crawled one level deep and the links
    found on those pages are tested too. Each distinct URL is tested once.

    Args:
        site: URL of the page whose links are checked.
        site_prefix: substring identifying links that belong to the site
            itself; only those pages are crawled for further links.

    Raises:
        ValueError: if ``site`` yields no links at all, or if a crawled
            sub-page yields no absolute links.
        SystemExit: with status 1 if any tested link reported an error.
    """
    error_counts = []

    main_links = _get_main_links(site)
    if not main_links:
        raise ValueError("Main links empty; site may be down!")

    # Drop duplicates (keeping page order) so no URL is fetched twice.
    top_links = list(dict.fromkeys(_filter(main_links)))
    # Every top-level link is tested by the outer loop below, so seeding the
    # set with them keeps the inner loop from testing them a second time.
    seen = set(top_links)

    for sub_link in top_links:
        # The link tester may return None for ignored links, hence `or []`.
        major = _test_link(sub_link) or []
        error_counts.extend(major)

        # Only pages that belong to the site are crawled one level deeper.
        if site_prefix not in sub_link:
            continue
        # The page itself is already reported as broken; requesting it again
        # to parse it would raise an uncaught HTTPError and abort the run.
        if major:
            continue

        page_links = _filter(_get_main_links(sub_link))
        if not page_links:
            raise ValueError(
                f"No links found at {sub_link}; page may be down!"
            )

        for link in page_links:
            if link in seen:
                continue
            seen.add(link)
            error_counts.extend(_test_link(link) or [])

    if error_counts:
        print("Found errors, check output!")
        sys.exit(1)
def _get_main_links(site):
    """Return the ``href`` value of every ``<a>`` tag on a page.

    Args:
        site: URL of the page to download and parse.

    Returns:
        A list with one entry per anchor, in document order. Entries are
        whatever ``Tag.get('href')`` yields, so anchors without an ``href``
        presumably contribute ``None``; no filtering is done here.
    """
    req = Request(site)
    # Parse inside the ``with`` so the HTTP response is always closed.
    with urlopen(req) as page:
        soup = BeautifulSoup(page, 'html.parser')
    return [anchor.get('href') for anchor in soup.find_all('a')]
def _filter(links_list):
    """Keep only the usable absolute links from a list of hrefs.

    An entry survives when it is not ``None``, is longer than four
    characters and starts with ``"http"``. Order is preserved.
    """
    def _is_wanted(href):
        return (
            href is not None
            and len(href) > 4
            and href.startswith("http")
        )

    return [href for href in links_list if _is_wanted(href)]
def _test_link(link):
    """Open a link and report whether it answers with a listed error code.

    Args:
        link: absolute URL to request.

    Returns:
        A list of error strings: one entry when the request raises an
        ``HTTPError`` whose status code is in ``error_codes``, otherwise an
        empty list. Links in ``ignore_links`` are not requested and also
        give an empty list.

    Only ``HTTPError`` is handled here; any other exception raised while
    opening the link propagates to the caller.
    """
    # Return a list in every case so callers can treat the result uniformly.
    if link in ignore_links:
        return []

    error_count = []
    try:
        # The context manager closes the response once the status is read.
        with urlopen(link) as response:
            code = response.getcode()
    except uerr.HTTPError as exc:
        # Compare the numeric status rather than searching the message text,
        # which could match a listed code by accident or more than once.
        if str(exc.code) in error_codes:
            error_count.append(str(exc))
            print(f"ERROR: {link} gives {exc}")
        else:
            # Not counted as a failure, but no longer dropped silently.
            print(f"WARNING: {link} gives {exc}")
    else:
        if code != 200:
            print(f"WARNING: {link} accessible but code is not 200!")

    return error_count
# Run the check.
# `site` is the page whose links are checked; `site_prefix` is the prefix
# a secondary link must contain to be crawled as well.
_check_site(site="https://is.enes.org/", site_prefix="https://is.enes.org/")