From a097298e14ab0c79389ee1e445fc9f6c7dfc153f Mon Sep 17 00:00:00 2001 From: Frankie Robertson Date: Thu, 11 Feb 2021 11:51:14 +0200 Subject: [PATCH] Skip unparsable urls --- newspaper/extractors.py | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/newspaper/extractors.py b/newspaper/extractors.py index 962554014..d3ef8526d 100644 --- a/newspaper/extractors.py +++ b/newspaper/extractors.py @@ -466,7 +466,10 @@ def get_meta_img_url(self, article_url, doc): top_meta_image = try_one or try_two or try_three or try_four if top_meta_image: - return urljoin(article_url, top_meta_image) + try: + return urljoin(article_url, top_meta_image) + except ValueError: + pass return '' def get_meta_type(self, doc): @@ -573,8 +576,15 @@ def get_img_urls(self, article_url, doc): img_tags = self.parser.getElementsByTag(doc, **img_kwargs) urls = [img_tag.get('src') for img_tag in img_tags if img_tag.get('src')] - img_links = set([urljoin(article_url, url) - for url in urls]) + + img_links = set() + for url in urls: + try: + joined_url = urljoin(article_url, url) + except ValueError: + continue + else: + img_links.add(joined_url) return img_links def get_first_img_url(self, article_url, top_node): @@ -585,7 +595,10 @@ def get_first_img_url(self, article_url, top_node): node_images = self.get_img_urls(article_url, top_node) node_images = list(node_images) if node_images: - return urljoin(article_url, node_images[0]) + try: + return urljoin(article_url, node_images[0]) + except ValueError: + pass return '' def _get_urls(self, doc, titles):