Correctly tokenize URLs in angle brackets (#27)

tsproisl · Feb 9, 2024 · fb788e6
1 parent cb1d001
commit fb788e6
Show file tree

Hide file tree

Showing 4 changed files with 10 additions and 3 deletions.
diff --git a/CHANGES.txt b/CHANGES.txt
@@ -1,5 +1,9 @@
 # CHANGELOG #
 
+## Version 2.4.1, 2024-02-09 ##
+
+- Fix issue #27 (URLs in angle brackets).
+
 ## Version 2.4.0, 2023-12-23 ##
 
 - New feature: SoMaJo can output character offsets for tokens,

diff --git a/pyproject.toml b/pyproject.toml
@@ -11,7 +11,7 @@
 #    a new release
 [project]
 name = "SoMaJo"
-version = "2.4.0"
+version = "2.4.1"
 description = "A tokenizer and sentence splitter for German and English web and social media texts."
 readme = "README.md"
 requires-python = ">=3.8"

diff --git a/src/somajo/tokenizer.py b/src/somajo/tokenizer.py
@@ -88,8 +88,8 @@ def __init__(self, split_camel_case=False, token_classes=False, extra_info=False
         self.email = re.compile(r"\b[\w.%+-]+(?:@| \[at\] )[\w.-]+(?:\.| \[?dot\]? )\p{L}{2,}\b")
         # simple regex for urls that start with http or www
         # no square brackets, angle brackets or spaces in URL: [^][<> ]
-        self.simple_url_with_brackets = re.compile(r'\b(?:(?:https?|ftp|svn)://|(?:https?://)?www\.)[^][ ]+?\(\S*?\)[^][ ]*(?=$|[\'. "!?,;])', re.IGNORECASE)
-        self.simple_url = re.compile(r'\b(?:(?:https?|ftp|svn)://|(?:https?://)?www\.)[^][ ]+[^][\'. "!?,;:()]', re.IGNORECASE)
+        self.simple_url_with_brackets = re.compile(r'\b(?:(?:https?|ftp|svn)://|(?:https?://)?www\.)[^][<> ]+?\(\S*?\)[^][<> ]*(?=$|[\'. "!?,;])', re.IGNORECASE)
+        self.simple_url = re.compile(r'\b(?:(?:https?|ftp|svn)://|(?:https?://)?www\.)[^][<> ]+[^][<>\'. "!?,;:()]', re.IGNORECASE)
         self.doi = re.compile(r'\bdoi:10\.\d+/\S+', re.IGNORECASE)
         self.doi_with_space = re.compile(r'(?<=\bdoi: )10\.\d+/\S+', re.IGNORECASE)
         # regex for ISBNs adapted from:

diff --git a/tests/test_tokenizer.py b/tests/test_tokenizer.py
@@ -1040,6 +1040,9 @@ def test_emails_urls_26(self):
     def test_emails_urls_27(self):
         self._equal("link: [Linktext „viel“ Text](https://other_link.com).", "link : [ Linktext „ viel “ Text ] ( https://other_link.com ) .")
 
+    def test_emails_urls_28(self):
+        self._equal("link: <https://one_link.com>.", "link : < https://one_link.com > .")
+
 
 class TestAbbreviations(TestTokenizer):
     def test_abbreviations_01(self):