Skip to content

Commit

Permalink
Merge pull request #41 from digitronik/fix_parsing
Browse files Browse the repository at this point in the history
Fix link parsing
  • Loading branch information
amolkahat authored Aug 2, 2020
2 parents 3a0119c + 094ee84 commit 7d34e51
Show file tree
Hide file tree
Showing 5 changed files with 28 additions and 60 deletions.
7 changes: 3 additions & 4 deletions linkstatus/linkstatus.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@
import pkg_resources
import requests

from linkstatus.parser import link_validator
from linkstatus.parser import parse_file

CONTEXT_SETTINGS = dict(help_option_names=["-h", "--help"])
Expand All @@ -28,12 +27,13 @@ def link_status(link, timeout=5):
status_code = requests.get(link, headers=headers, timeout=timeout).status_code
except requests.exceptions.SSLError:
status_code = requests.get(link, verify=False, headers=headers, timeout=timeout).status_code
except requests.exceptions.MissingSchema:
status_code = "Schema missing try with http/https"
except Exception: # noqa
# TODO: include exception in logging
status_code = None
pass

return status_code == 200, status_code
return status_code == requests.codes.ok, status_code


def all_files(source, recursive=False):
Expand Down Expand Up @@ -80,7 +80,6 @@ def main(source, recursive, timeout, retry):

for f in files:
links = parse_file(f)
links = link_validator(links)
if links:
click.echo(click.style("Links in File: '{}'".format(f), bg="blue", fg="white"))

Expand Down
53 changes: 6 additions & 47 deletions linkstatus/parser.py
Original file line number Diff line number Diff line change
@@ -1,30 +1,21 @@
import re
from collections import namedtuple

import markdown
from urlextract import URLExtract

REGULAR_EXP = r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+"
LINKS = namedtuple("LINKS", ["line", "urls", "skip"])

LINKS = namedtuple("LINKS", ["line", "urls", "skip", "valid"])
EXTRACTOR = URLExtract()


def parse_line(line):
"""Parse links from line/string
Args:
string: data string
line: data line
Returns:
list of links
"""
string = line.strip()
html_format = markdown.markdown(string, output_format="html")
links = re.findall(REGULAR_EXP, html_format)

# TODO: Improve regex to remove this workaround for trailing </p> or </li>
links = [
l.replace("</p>", "").replace("</li>", "").replace("</a>", "").replace(")", "")
for l in links
]
links = EXTRACTOR.find_urls(line)
return links


Expand All @@ -41,37 +32,5 @@ def parse_file(file_path):
line_links = parse_line(line)
if line_links:
skip = True if "noqa" in line else False
links.append(LINKS(line=line_number + 1, urls=line_links, skip=skip, valid=False))
links.append(LINKS(line=line_number + 1, urls=line_links, skip=skip))
return links


def link_validator(links_list):
"""Sort the URLs of parsed link entries into well-formed and malformed ones.

Every URL is matched against a pattern that requires an http, https, ftp
or ftps scheme, followed by a domain name, ``localhost`` or four
dot-separated groups of 1-3 digits, then an optional ``:port`` and an
optional path or query. Matching is case-insensitive.

Args:
links_list: Iterable of entries exposing ``line`` and ``urls``
attributes (main() passes in the result of parse_file).
Return:
List of LINKS named tuples. Each URL that fails the pattern gets an
entry of its own with ``valid=False`` and ``skip=True``; URLs that match
are collected into an entry with ``valid=True`` and ``skip=False``.
"""
validated_list = []

regex = re.compile(
r"^(?:http|ftp)s?://" # scheme: http, https, ftp or ftps
r"(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|"
# dot-separated domain labels followed by a top-level label
r"localhost|" # localhost...
r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})" # ...or four dotted digit groups (octet range is not checked)
r"(?::\d+)?" # optional port
r"(?:/?|[/?]\S+)$", # optional trailing slash, or "/" or "?" followed by non-whitespace
re.IGNORECASE,
)

# NOTE(review): indentation is not preserved in this view; presumably the
# valid-URL entry at the bottom is appended once per input entry, after the
# inner loop over its URLs - confirm against the repository.
for link in links_list:
urls = []
for i in link.urls:
# re.match accepts the already-compiled pattern object.
if re.match(regex, i):
urls.append(i)
else:
# A malformed URL is reported on its own and flagged to be skipped.
validated_list.append(LINKS(line=link.line, urls=[i], valid=False, skip=True))
# NOTE(review): skip is hard-coded to False here, so the input entry's own
# skip value is not carried over - confirm this is intended.
validated_list.append(LINKS(line=link.line, urls=urls, skip=False, valid=True))
return validated_list
2 changes: 1 addition & 1 deletion setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -35,8 +35,8 @@ zip_safe = False
setup_requires = setuptools_scm
install_requires =
click
markdown
requests
urlextract
include_package_data = True
python_requires = >=3.6

Expand Down
18 changes: 15 additions & 3 deletions tests/data.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -18,9 +18,12 @@ data:
status: False
line: 'L8'
markdown_file.md:
'https://github.com/adam-p/markdown-here/wiki/Markdown-Cheatsheet':
'https://github.com/adam-p/markdown-here/wiki/Markdown-Cheatsheet#links':
status: True
line: 'L4'
'www.google.com':
status: False
line: 'L6'
'http://www.google.com':
status: True
line: 'L8'
Expand All @@ -33,16 +36,25 @@ data:
'http://www.example.com':
status: True
line: 'L24'
'https://www.mozilla.org':
status: True
line: 'L28'
'http://slashdot.org':
status: True
line: 'L30'
'http://www.foo.com':
status: True
line: 'L32'
'https://github.com/pythonpune/linkcheck':
status: False
line: 'L34'
'https://github.com//pythonpune/':
status: True
line: 'L39'
'http://<hostname>:<port>':
'http://localhost:8080':
status: False
line: 'L41'
'https://<hostname>:<port>/pages':
'https://localhost:8080/foo':
status: False
line: 'L43'
recursive:
Expand Down
8 changes: 3 additions & 5 deletions tests/data/markdown_file.md
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ Some text to show that the reference links can follow later.

[1]: http://slashdot.org

[link text itself]: http://www.example.com
[link text itself]: http://www.foo.com

[broken link](https://github.com/pythonpune/linkcheck)

Expand All @@ -38,8 +38,6 @@ Some text to show that the reference links can follow later.

https://github.com//pythonpune/

http://<hostname>:<port>
http://localhost:8080

https://<hostname>:<port>/pages

file:///etc/hosts
https://localhost:8080/foo

0 comments on commit 7d34e51

Please sign in to comment.