Skip to content

Commit

Permalink
Merge pull request #8 from 1tayH/fix/invalid-url-parsing
Browse files Browse the repository at this point in the history
Filter out malformed hosts
  • Loading branch information
1tayH authored Jul 19, 2018
2 parents 0a0d50f + 713cb7c commit 969070c
Showing 1 changed file with 7 additions and 2 deletions.
9 changes: 7 additions & 2 deletions noisy.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,12 @@ def _normalize_link(link, root_url):
:param root_url: the URL the DOM was loaded from
:return: absolute link
"""
parsed_url = urlparse(link)
try:
parsed_url = urlparse(link)
except ValueError:
# urlparse raises ValueError for URLs whose host part contains an unmatched
# '[' or ']' character, because it assumes they are malformed IPv6 URLs
return None
parsed_root_url = urlparse(root_url)

# '//' means keep the current protocol used to access this URL
Expand Down Expand Up @@ -104,7 +109,7 @@ def _should_accept_url(self, url):
:param url: full url to be checked
:return: boolean of whether or not the url should be accepted and potentially visited
"""
return self._is_valid_url(url) and not self._is_blacklisted(url)
return url and self._is_valid_url(url) and not self._is_blacklisted(url)

def _extract_urls(self, body, root_url):
"""
Expand Down

0 comments on commit 969070c

Please sign in to comment.