-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathurl_extractor.py
39 lines (30 loc) · 978 Bytes
/
url_extractor.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
"""
Utility functions for URL extraction and validation.
This module contains helper functions to identify and extract URLs.
"""
import re
from urllib.parse import urlparse
def is_url(string) -> bool:
    """
    Check if a string is a well-formatted URL.

    A value is treated as a URL when ``urlparse`` finds both a scheme
    (e.g. ``https``) and a network location (e.g. ``example.com``) in it.

    Args:
        string (str): The input string to be checked. Bytes-like input is
            handed to ``urlparse`` as well; any other type is rejected.

    Returns:
        bool: True if the string is a valid URL, False otherwise.
    """
    # urlparse only understands str and bytes-like objects; other values
    # (for example an int or a list) make it raise AttributeError, so they
    # are reported as "not a URL" instead of crashing the caller.
    if not isinstance(string, (str, bytes, bytearray)):
        return False
    try:
        result = urlparse(string)
    except ValueError:
        # Raised for malformed input such as an unterminated IPv6 literal.
        return False
    return bool(result.scheme and result.netloc)
# Candidate matcher: optional http(s) scheme, one or more dot-terminated
# labels, then a tail that may carry a port, path, query and fragment.
_URL_CANDIDATE_PATTERN = re.compile(
    r'(?:https?://)?(?:[\w/\-?=%.]+\.)+[\w/\-&?=%.:#~+@]+'
)

# Sentence punctuation that the tail of the pattern can swallow when a URL
# is the last thing in a clause ("see https://example.com.").
_TRAILING_PUNCTUATION = '.?:'


def extract_urls(text_content) -> list:
    """
    Extract valid URLs from the given text content.

    Candidates are found with a permissive regular expression, trimmed of
    trailing sentence punctuation, and then kept only if ``is_url`` accepts
    them. Because ``is_url`` requires a scheme, candidates without a
    leading ``http://`` or ``https://`` are discarded.

    Args:
        text_content (str): The input text content. Empty or ``None``
            input yields an empty list.

    Returns:
        list: A list of unique, valid URLs found in the content, in order
        of first appearance.
    """
    if not text_content:
        return []
    # Trim before validating and de-duplicating so that "https://a.com" and
    # "https://a.com." collapse into a single entry.
    candidates = (
        match.rstrip(_TRAILING_PUNCTUATION)
        for match in _URL_CANDIDATE_PATTERN.findall(text_content)
    )
    # dict.fromkeys keeps the first occurrence of each URL and preserves order.
    return list(dict.fromkeys(url for url in candidates if is_url(url)))