Skip to content

Commit

Permalink
Correctly tokenize URLs in angle brackets (#27)
Browse files Browse the repository at this point in the history
  • Loading branch information
Thomas Proisl committed Feb 9, 2024
1 parent cb1d001 commit fb788e6
Show file tree
Hide file tree
Showing 4 changed files with 10 additions and 3 deletions.
4 changes: 4 additions & 0 deletions CHANGES.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,9 @@
# CHANGELOG #

## Version 2.4.1, 2024-02-09 ##

- Fix issue #27 (URLs in angle brackets).

## Version 2.4.0, 2023-12-23 ##

- New feature: SoMaJo can output character offsets for tokens,
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
# a new release
[project]
name = "SoMaJo"
version = "2.4.0"
version = "2.4.1"
description = "A tokenizer and sentence splitter for German and English web and social media texts."
readme = "README.md"
requires-python = ">=3.8"
Expand Down
4 changes: 2 additions & 2 deletions src/somajo/tokenizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,8 +88,8 @@ def __init__(self, split_camel_case=False, token_classes=False, extra_info=False
self.email = re.compile(r"\b[\w.%+-]+(?:@| \[at\] )[\w.-]+(?:\.| \[?dot\]? )\p{L}{2,}\b")
# simple regex for urls that start with http or www
# no square brackets, angle brackets, or spaces in URL: [^][<> ]
self.simple_url_with_brackets = re.compile(r'\b(?:(?:https?|ftp|svn)://|(?:https?://)?www\.)[^][ ]+?\(\S*?\)[^][ ]*(?=$|[\'. "!?,;])', re.IGNORECASE)
self.simple_url = re.compile(r'\b(?:(?:https?|ftp|svn)://|(?:https?://)?www\.)[^][ ]+[^][\'. "!?,;:()]', re.IGNORECASE)
self.simple_url_with_brackets = re.compile(r'\b(?:(?:https?|ftp|svn)://|(?:https?://)?www\.)[^][<> ]+?\(\S*?\)[^][<> ]*(?=$|[\'. "!?,;])', re.IGNORECASE)
self.simple_url = re.compile(r'\b(?:(?:https?|ftp|svn)://|(?:https?://)?www\.)[^][<> ]+[^][<>\'. "!?,;:()]', re.IGNORECASE)
self.doi = re.compile(r'\bdoi:10\.\d+/\S+', re.IGNORECASE)
self.doi_with_space = re.compile(r'(?<=\bdoi: )10\.\d+/\S+', re.IGNORECASE)
# regex for ISBNs adapted from:
Expand Down
3 changes: 3 additions & 0 deletions tests/test_tokenizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -1040,6 +1040,9 @@ def test_emails_urls_26(self):
def test_emails_urls_27(self):
self._equal("link: [Linktext „viel“ Text](https://other_link.com).", "link : [ Linktext „ viel “ Text ] ( https://other_link.com ) .")

def test_emails_urls_28(self):
self._equal("link: <https://one_link.com>.", "link : < https://one_link.com > .")


class TestAbbreviations(TestTokenizer):
def test_abbreviations_01(self):
Expand Down

0 comments on commit fb788e6

Please sign in to comment.