From 094ee84ca0806d790e0bfc276d6d027a3df319b3 Mon Sep 17 00:00:00 2001
From: Nikhil Dhandre <nik.digitronik@live.com>
Date: Sun, 2 Aug 2020 16:42:35 +0530
Subject: [PATCH] Fix link parsing

---
 linkstatus/linkstatus.py    |  7 +++--
 linkstatus/parser.py        | 53 +++++--------------------------------
 setup.cfg                   |  2 +-
 tests/data.yaml             | 18 ++++++++++---
 tests/data/markdown_file.md |  8 +++---
 5 files changed, 28 insertions(+), 60 deletions(-)

diff --git a/linkstatus/linkstatus.py b/linkstatus/linkstatus.py
index fa80773..4a1b1c6 100644
--- a/linkstatus/linkstatus.py
+++ b/linkstatus/linkstatus.py
@@ -6,7 +6,6 @@
 import pkg_resources
 import requests
 
-from linkstatus.parser import link_validator
 from linkstatus.parser import parse_file
 
 CONTEXT_SETTINGS = dict(help_option_names=["-h", "--help"])
@@ -28,12 +27,13 @@ def link_status(link, timeout=5):
         status_code = requests.get(link, headers=headers, timeout=timeout).status_code
     except requests.exceptions.SSLError:
         status_code = requests.get(link, verify=False, headers=headers, timeout=timeout).status_code
+    except requests.exceptions.MissingSchema:
+        status_code = "Schema missing try with http/https"
     except Exception:  # noqa
         # TODO: include exception in logging
         status_code = None
-        pass
 
-    return status_code == 200, status_code
+    return status_code == requests.codes.ok, status_code
 
 
 def all_files(source, recursive=False):
@@ -80,7 +80,6 @@ def main(source, recursive, timeout, retry):
 
     for f in files:
         links = parse_file(f)
-        links = link_validator(links)
 
         if links:
             click.echo(click.style("Links in File: '{}'".format(f), bg="blue", fg="white"))
diff --git a/linkstatus/parser.py b/linkstatus/parser.py
index 1c40a5e..9bd6212 100644
--- a/linkstatus/parser.py
+++ b/linkstatus/parser.py
@@ -1,30 +1,21 @@
-import re
 from collections import namedtuple
 
-import markdown
+from urlextract import URLExtract
 
-REGULAR_EXP = r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+"
+LINKS = namedtuple("LINKS", ["line", "urls", "skip"])
 
-LINKS = namedtuple("LINKS", ["line", "urls", "skip", "valid"])
+EXTRACTOR = URLExtract()
 
 
 def parse_line(line):
     """Parse links from line/string
 
     Args:
-        string: data string
+        line: data line
 
     Returns:
         list of links
     """
-    string = line.strip()
-    html_format = markdown.markdown(string, output_format="html")
-    links = re.findall(REGULAR_EXP, html_format)
-
-    # TODO: Improve regex to remove this workaround for trailing <br> or </p>
-    links = [
-        l.replace("<br>", "").replace("<p>", "").replace("</p>", "").replace(")", "")
-        for l in links
-    ]
+    links = EXTRACTOR.find_urls(line)
     return links
 
@@ -41,37 +32,5 @@ def parse_file(file_path):
             line_links = parse_line(line)
             if line_links:
                 skip = True if "noqa" in line else False
-                links.append(LINKS(line=line_number + 1, urls=line_links, skip=skip, valid=False))
+                links.append(LINKS(line=line_number + 1, urls=line_links, skip=skip))
     return links
-
-
-def link_validator(links_list):
-    """Validate link
-    Args:
-        links_list: List of links.
-
-    Return:
-        Named tuple of the valid and invalid links.
-    """
-    validated_list = []
-
-    regex = re.compile(
-        r"^(?:http|ftp)s?://"  # http:// or https://
-        r"(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|"
-        # for domain
-        r"localhost|"  # localhost...
-        r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})"  # ...or ip
-        r"(?::\d+)?"  # optional port
-        r"(?:/?|[/?]\S+)$",
-        re.IGNORECASE,
-    )
-
-    for link in links_list:
-        urls = []
-        for i in link.urls:
-            if re.match(regex, i):
-                urls.append(i)
-            else:
-                validated_list.append(LINKS(line=link.line, urls=[i], valid=False, skip=True))
-        validated_list.append(LINKS(line=link.line, urls=urls, skip=False, valid=True))
-    return validated_list
diff --git a/setup.cfg b/setup.cfg
index a615a41..1b68c82 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -35,8 +35,8 @@ zip_safe = False
 setup_requires = setuptools_scm
 install_requires =
     click
-    markdown
    requests
+    urlextract
 include_package_data = True
 python_requires = >=3.6
 
diff --git a/tests/data.yaml b/tests/data.yaml
index f778728..1a0dcb0 100644
--- a/tests/data.yaml
+++ b/tests/data.yaml
@@ -18,9 +18,12 @@ data:
       status: False
       line: 'L8'
   markdown_file.md:
-    'https://github.com/adam-p/markdown-here/wiki/Markdown-Cheatsheet':
+    'https://github.com/adam-p/markdown-here/wiki/Markdown-Cheatsheet#links':
       status: True
       line: 'L4'
+    'www.google.com':
+      status: False
+      line: 'L6'
     'http://www.google.com':
       status: True
       line: 'L8'
@@ -33,16 +36,25 @@
     'http://www.example.com':
       status: True
       line: 'L24'
+    'https://www.mozilla.org':
+      status: True
+      line: 'L28'
+    'http://slashdot.org':
+      status: True
+      line: 'L30'
+    'http://www.foo.com':
+      status: True
+      line: 'L32'
     'https://github.com/pythonpune/linkcheck':
       status: False
       line: 'L34'
     'https://github.com//pythonpune/':
       status: True
       line: 'L39'
-    'http://:':
+    'http://localhost:8080':
       status: False
       line: 'L41'
-    'https://:/pages':
+    'https://localhost:8080/foo':
       status: False
       line: 'L43'
   recursive:
diff --git a/tests/data/markdown_file.md b/tests/data/markdown_file.md
index 912d842..eca62d9 100644
--- a/tests/data/markdown_file.md
+++ b/tests/data/markdown_file.md
@@ -29,7 +29,7 @@ Some text to show that the reference links can follow later.
 
 [1]: http://slashdot.org
 
-[link text itself]: http://www.example.com
+[link text itself]: http://www.foo.com
 
 [broken link](https://github.com/pythonpune/linkcheck)
 
@@ -38,8 +38,6 @@ Some text to show that the reference links can follow later.
 
 https://github.com//pythonpune/
 
-http://:
+http://localhost:8080
 
-https://:/pages
-
-file:///etc/hosts
+https://localhost:8080/foo