Skip to content

Commit

Permalink
clean email, url
Browse files Browse the repository at this point in the history
  • Loading branch information
HannaHUp committed Jul 14, 2021
1 parent ea51052 commit 01af462
Show file tree
Hide file tree
Showing 3 changed files with 87 additions and 2 deletions.
4 changes: 2 additions & 2 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,11 @@

setuptools.setup(
name="transcribe-compare",
version="0.2.3",
version="0.2.4",
author="Huishen Zhan, Kuo Zhang, Jacek Jarmulak",
author_email="huishen@voicegain.ai, kuo@voicegain.ai, jacek@voicegain.ai",
description="Voicegain Compare transcription",
download_url='https://github.com/voicegain/transcription-compare/archive/0.2.3.tar.gz',
download_url='https://github.com/voicegain/transcription-compare/archive/0.2.4.tar.gz',
long_description=long_description,
long_description_content_type="text/markdown",
packages=setuptools.find_packages(),
Expand Down
66 changes: 66 additions & 0 deletions transcription_compare/tokenizer/special_token_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
from string import punctuation

# Domain suffixes that process_url looks for when deciding whether a token is a URL.
URL_ = [".com", ".edu", ".net", ".org", ".ai"]

def process_email(word):
    """Rewrite an e-mail-like token into its spoken form.

    All trailing punctuation characters are stripped first (not just a
    final "."), so a token whose only "@" sits in that trailing run is not
    treated as an e-mail address.

    :param word: token to inspect
    :return: the token with "@" -> " at ", "." -> " dot " and "-" -> " dash "
        when it still contains "@" after stripping; otherwise None
    """
    trimmed = word.rstrip(punctuation)
    if "@" not in trimmed:
        return None

    spoken = trimmed
    for symbol, spoken_form in (("@", " at "), (".", " dot "), ("-", " dash ")):
        spoken = spoken.replace(symbol, spoken_form)
    return spoken


def process_url(word):
    """Rewrite a URL-like token into its spoken form.

    Trailing punctuation is stripped first, so "google.com." is handled as
    "google.com". The token is treated as a URL when it either

    * ends with one of the suffixes listed in ``URL_`` ("www.google.com"), or
    * contains one of those suffixes immediately followed by "/"
      ("haha.net/hello").

    In the spoken form "." becomes " dot " and "-" becomes " dash ".
    "/" becomes " slash " whenever a suffix followed by "/" is present.
    That decision is made independently of the ends-with check, so the
    result no longer depends on the order of the entries in ``URL_``
    (previously "a.edu/x.com" kept its "/" while "a.com/x.edu" did not).

    :param word: token to inspect
    :return: the spoken form, or None when the token does not look like a URL
    """
    word = word.rstrip(punctuation)

    # str.endswith needs a tuple (not a list) to test several suffixes at once.
    ends_with_suffix = word.endswith(tuple(URL_))
    has_path = any(suffix + "/" in word for suffix in URL_)
    if not (ends_with_suffix or has_path):
        return None

    spoken = word.replace(".", " dot ")
    if has_path:
        # Only speak the slash when it follows a recognised domain suffix,
        # e.g. a scheme such as "http://" on its own is left untouched.
        spoken = spoken.replace("/", " slash ")
    return spoken.replace("-", " dash ")


def process_and(word):
    """Spell out every ampersand in a token, e.g. "AT&T" -> "AT and T".

    :param word: token to inspect
    :return: the token with each "&" replaced by " and ", or None when the
        token contains no "&"
    """
    return word.replace("&", " and ") if "&" in word else None


# # test = ["im@haha.com1.?haha.", "test2@gmail", "test3@",
# # "www.google.com", "computerhope.com/", "vg.ai",
# # "haha.net/hello", "at&t"]
# test = ["test3@",
# "www.google.com", "computerhope.com/",
# "haha.net/hello"]
# methods = [process_email, process_url, process_and]
# for one in test:
# for method in methods:
# print("one, ", one)
# print("method", method, method(one), "\n")
# # test3@
19 changes: 19 additions & 0 deletions transcription_compare/tokenizer/word_tokenizer.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from .abstract_tokenizer import AbstractTokenizer
from nltk.tokenize import word_tokenize
from .special_token_utils import *
from ..tokens import Token
import re
import string
Expand Down Expand Up @@ -36,6 +37,24 @@ def tokenize(self, token_string, brackets_list=None, to_lower=False, remove_punc
:return:split token_string
"""

split_tokens = token_string.split()
methods = [process_email, process_url, process_and]
new_tokens = []
for token in split_tokens:
updated = False
for method in methods:
updated_word = method(token)
if updated_word:
new_tokens.append(updated_word)
# print("method", method)
# print("updated_word", token, updated_word)
updated = True
break
if not updated:
new_tokens.append(token)

token_string = " ".join(new_tokens)

def clean_words_dont_have_brackets(s):
# do punctuation or lower
# print('exclude_brackets_word', s)
Expand Down

0 comments on commit 01af462

Please sign in to comment.