clean email, url

voicegain · Jul 14, 2021 · 01af462 · 01af462
1 parent ea51052
commit 01af462
Show file tree

Hide file tree

Showing 3 changed files with 87 additions and 2 deletions.
diff --git a/setup.py b/setup.py
@@ -8,11 +8,11 @@
 
 setuptools.setup(
     name="transcribe-compare",
-    version="0.2.3",
+    version="0.2.4",
     author="Huishen Zhan, Kuo Zhang, Jacek Jarmulak",
     author_email="huishen@voicegain.ai, kuo@voicegain.ai, jacek@voicegain.ai",
     description="Voicegain Compare transcription",
-    download_url='https://github.com/voicegain/transcription-compare/archive/0.2.3.tar.gz',
+    download_url='https://github.com/voicegain/transcription-compare/archive/0.2.4.tar.gz',
     long_description=long_description,
     long_description_content_type="text/markdown",
     packages=setuptools.find_packages(),

diff --git a/transcription_compare/tokenizer/special_token_utils.py b/transcription_compare/tokenizer/special_token_utils.py
@@ -0,0 +1,66 @@
+from string import punctuation
+
+URL_ = [".com", ".edu", ".net", ".org", ".ai"]
+
+def process_email(word):
+    """
+    Spell out an email-like token as words.
+
+    Trailing punctuation is stripped first. Because "@" is itself part of
+    ``string.punctuation``, a token that merely ends in "@" (e.g. "test3@")
+    loses it here and is not treated as an email.
+    If "@" is still present afterwards, "@" becomes " at ", "." becomes
+    " dot " and "-" becomes " dash "; e.g. "a@b.com." -> "a at b dot com".
+
+    :param word: the token to inspect
+    :return: the spelled-out string if the stripped token contains "@",
+             otherwise None
+    """
+    # Remove trailing punctuation such as a sentence-final "." before testing.
+    word = word.rstrip(punctuation)
+    if "@" in word:
+        word = word.replace("@", " at ").replace(".", " dot ").replace("-", " dash ")
+        return word
+    return None
+
+
+def process_url(word):
+    # Example input: www.google.com/help
+    """
+    Spell out a URL-like token as words.
+
+    0. Trailing punctuation is stripped first, e.g. "google.com." becomes
+       "google.com". A trailing "/" with nothing after it is stripped too,
+       so "computerhope.com/" is handled like "computerhope.com".
+    1. A token counts as a URL when, for one of the suffixes in URL_
+       (".com", ".edu", ".net", ".org", ".ai"), it either
+       - ends with that suffix, or
+       - contains that suffix immediately followed by "/".
+    2. Rewrite rules: "." -> " dot " and "-" -> " dash "; when the token
+       matched through the suffix + "/" rule, "/" -> " slash " as well.
+
+    NOTE(review): suffixes are tried in URL_ order and, for each suffix, the
+    ends-with test runs before the suffix + "/" test. A token such as
+    "a.org/b.com" therefore matches the ends-with rule for ".com" first and
+    its "/" is left unreplaced. Confirm whether that is intended.
+
+    :param word: the token to inspect
+    :return: the spelled-out string if the token matched a rule above,
+             otherwise None
+    """
+    word = word.rstrip(punctuation)
+    for one_url in URL_:
+        # Compare the tail of the token (same length as the suffix) to the suffix.
+        if word[-len(one_url):] == one_url:
+            word = word.replace(".", " dot ").replace("-", " dash ")
+            return word
+        # Suffix followed by a path: also spell out the slashes.
+        if one_url + "/" in word:
+            word = word.replace(".", " dot ").replace("/", " slash ").replace("-", " dash ")
+            return word
+
+    return None
+
+
+def process_and(word): # e.g. "AT&T" -> "AT and T"
+    """
+    Spell out ampersands in a token.
+
+    Every "&" is replaced with " and " (surrounded by spaces).
+
+    :param word: the token to inspect
+    :return: the rewritten string if the token contains "&", otherwise None
+    """
+    if "&" in word:
+        word = word.replace("&", " and ")
+        return word
+
+    return None
+
+
+# # test = ["im@haha.com1.?haha.", "test2@gmail", "test3@",
+# #  "www.google.com", "computerhope.com/", "vg.ai",
+# #  "haha.net/hello", "at&t"]
+# test = ["test3@",
+#  "www.google.com", "computerhope.com/",
+#  "haha.net/hello"]
+# methods = [process_email, process_url, process_and]
+# for one in test:
+#     for method in methods:
+#         print("one, ", one)
+#         print("method", method, method(one), "\n")
+# # test3@
diff --git a/transcription_compare/tokenizer/word_tokenizer.py b/transcription_compare/tokenizer/word_tokenizer.py
@@ -1,5 +1,6 @@
 from .abstract_tokenizer import AbstractTokenizer
 from nltk.tokenize import word_tokenize
+from .special_token_utils import *
 from ..tokens import Token
 import re
 import string
@@ -36,6 +37,24 @@ def tokenize(self, token_string, brackets_list=None, to_lower=False, remove_punc
         :return:split token_string
         """
 
+        split_tokens = token_string.split()
+        methods = [process_email, process_url, process_and]
+        new_tokens = []
+        for token in split_tokens:
+            updated = False
+            for method in methods:
+                updated_word = method(token)
+                if updated_word:
+                    new_tokens.append(updated_word)
+                    # print("method", method)
+                    # print("updated_word", token, updated_word)
+                    updated = True
+                    break
+            if not updated:
+                new_tokens.append(token)
+
+        token_string = " ".join(new_tokens)
+
         def clean_words_dont_have_brackets(s):
             # do punctuation or lower
             # print('exclude_brackets_word', s)