From 094ee84ca0806d790e0bfc276d6d027a3df319b3 Mon Sep 17 00:00:00 2001
From: Nikhil Dhandre
Date: Sun, 2 Aug 2020 16:42:35 +0530
Subject: [PATCH] Fix link parsing
---
linkstatus/linkstatus.py | 7 +++--
linkstatus/parser.py | 53 +++++--------------------------------
setup.cfg | 2 +-
tests/data.yaml | 18 ++++++++++---
tests/data/markdown_file.md | 8 +++---
5 files changed, 28 insertions(+), 60 deletions(-)
diff --git a/linkstatus/linkstatus.py b/linkstatus/linkstatus.py
index fa80773..4a1b1c6 100644
--- a/linkstatus/linkstatus.py
+++ b/linkstatus/linkstatus.py
@@ -6,7 +6,6 @@
import pkg_resources
import requests
-from linkstatus.parser import link_validator
from linkstatus.parser import parse_file
CONTEXT_SETTINGS = dict(help_option_names=["-h", "--help"])
@@ -28,12 +27,13 @@ def link_status(link, timeout=5):
status_code = requests.get(link, headers=headers, timeout=timeout).status_code
except requests.exceptions.SSLError:
status_code = requests.get(link, verify=False, headers=headers, timeout=timeout).status_code
+ except requests.exceptions.MissingSchema:
+ status_code = "Schema missing try with http/https"
except Exception: # noqa
# TODO: include exception in logging
status_code = None
- pass
- return status_code == 200, status_code
+ return status_code == requests.codes.ok, status_code
def all_files(source, recursive=False):
@@ -80,7 +80,6 @@ def main(source, recursive, timeout, retry):
for f in files:
links = parse_file(f)
- links = link_validator(links)
if links:
click.echo(click.style("Links in File: '{}'".format(f), bg="blue", fg="white"))
diff --git a/linkstatus/parser.py b/linkstatus/parser.py
index 1c40a5e..9bd6212 100644
--- a/linkstatus/parser.py
+++ b/linkstatus/parser.py
@@ -1,30 +1,21 @@
-import re
from collections import namedtuple
-import markdown
+from urlextract import URLExtract
-REGULAR_EXP = r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+"
+LINKS = namedtuple("LINKS", ["line", "urls", "skip"])
-LINKS = namedtuple("LINKS", ["line", "urls", "skip", "valid"])
+EXTRACTOR = URLExtract()
def parse_line(line):
"""Parse links from line/string
Args:
- string: data string
+ line: data line
Returns:
list of links
"""
- string = line.strip()
- html_format = markdown.markdown(string, output_format="html")
- links = re.findall(REGULAR_EXP, html_format)
-
- # TODO: Improve regex to remove this workaround for trailing </p> or </li>
- links = [
- l.replace("</a>", "").replace("</p>", "").replace("</li>", "").replace(")", "")
- for l in links
- ]
+ links = EXTRACTOR.find_urls(line)
return links
@@ -41,37 +32,5 @@ def parse_file(file_path):
line_links = parse_line(line)
if line_links:
skip = True if "noqa" in line else False
- links.append(LINKS(line=line_number + 1, urls=line_links, skip=skip, valid=False))
+ links.append(LINKS(line=line_number + 1, urls=line_links, skip=skip))
return links
-
-
-def link_validator(links_list):
- """Validate link
- Args:
- links_list: List of links.
-
- Return:
- Named tuple of the valid and invalid links.
- """
- validated_list = []
-
- regex = re.compile(
- r"^(?:http|ftp)s?://" # http:// or https://
- r"(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|"
- # for domain
- r"localhost|" # localhost...
- r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})" # ...or ip
- r"(?::\d+)?" # optional port
- r"(?:/?|[/?]\S+)$",
- re.IGNORECASE,
- )
-
- for link in links_list:
- urls = []
- for i in link.urls:
- if re.match(regex, i):
- urls.append(i)
- else:
- validated_list.append(LINKS(line=link.line, urls=[i], valid=False, skip=True))
- validated_list.append(LINKS(line=link.line, urls=urls, skip=False, valid=True))
- return validated_list
diff --git a/setup.cfg b/setup.cfg
index a615a41..1b68c82 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -35,8 +35,8 @@ zip_safe = False
setup_requires = setuptools_scm
install_requires =
click
- markdown
requests
+ urlextract
include_package_data = True
python_requires = >=3.6
diff --git a/tests/data.yaml b/tests/data.yaml
index f778728..1a0dcb0 100644
--- a/tests/data.yaml
+++ b/tests/data.yaml
@@ -18,9 +18,12 @@ data:
status: False
line: 'L8'
markdown_file.md:
- 'https://github.com/adam-p/markdown-here/wiki/Markdown-Cheatsheet':
+ 'https://github.com/adam-p/markdown-here/wiki/Markdown-Cheatsheet#links':
status: True
line: 'L4'
+ 'www.google.com':
+ status: False
+ line: 'L6'
'http://www.google.com':
status: True
line: 'L8'
@@ -33,16 +36,25 @@ data:
'http://www.example.com':
status: True
line: 'L24'
+ 'https://www.mozilla.org':
+ status: True
+ line: 'L28'
+ 'http://slashdot.org':
+ status: True
+ line: 'L30'
+ 'http://www.foo.com':
+ status: True
+ line: 'L32'
'https://github.com/pythonpune/linkcheck':
status: False
line: 'L34'
'https://github.com//pythonpune/':
status: True
line: 'L39'
- 'http://:':
+ 'http://localhost:8080':
status: False
line: 'L41'
- 'https://:/pages':
+ 'https://localhost:8080/foo':
status: False
line: 'L43'
recursive:
diff --git a/tests/data/markdown_file.md b/tests/data/markdown_file.md
index 912d842..eca62d9 100644
--- a/tests/data/markdown_file.md
+++ b/tests/data/markdown_file.md
@@ -29,7 +29,7 @@ Some text to show that the reference links can follow later.
[1]: http://slashdot.org
-[link text itself]: http://www.example.com
+[link text itself]: http://www.foo.com
[broken link](https://github.com/pythonpune/linkcheck)
@@ -38,8 +38,6 @@ Some text to show that the reference links can follow later.
https://github.com//pythonpune/
-http://:
+http://localhost:8080
-https://:/pages
-
-file:///etc/hosts
+https://localhost:8080/foo