# Tokenization.py
import re
import nltk
from gensim.utils import tokenize
# from gensim.summarization.textcleaner import split_sentences
from spacy.lang.en import English
from keras.preprocessing.text import text_to_word_sequence
# nltk.download()
paragraph = """Tokenization is one of the first steps in any NLP pipeline. Tokenization is nothing but splitting the raw text into small chunks of words or sentences, called tokens. If the text is split into words, it's called 'Word Tokenization', and if it's split into sentences, it's called 'Sentence Tokenization'. Generally, a space is used to perform word tokenization, and characters such as periods, exclamation points and newlines are used for sentence tokenization."""
"""Tokenization Using NLTK"""
print('Tokenization Using NLTK')
# Tokenizing words
words = nltk.word_tokenize(paragraph)
print(words)
# Tokenizing sentences
sentences = nltk.sent_tokenize(paragraph)
print(sentences)
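# Optional sketch (not part of the original script): NLTK also provides alternative
# word tokenizers, e.g. a regular-expression based one. The pattern below is an
# illustrative choice and simply keeps runs of word characters, dropping punctuation.
from nltk.tokenize import RegexpTokenizer
regexp_tokenizer = RegexpTokenizer(r'\w+')
print(regexp_tokenizer.tokenize(paragraph))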
text = """There are multiple ways we can perform tokenization on given text data. We can choose any method based on the language, library and purpose of modeling."""
"""Tokenization Using Python's Inbuilt Method"""
print("\nTokenization Using Python's Inbuilt Method")
# Word Tokenization
tokens = text.split() # Split text by whitespace
print(tokens)
# Sentence Tokenization
tokens = text.split('. ') # split the given text by full stop (.)
print(tokens)
"""Tokenization Using Regular Expressions (RegEx)"""
print('\nTokenization Using Regular Expressions (RegEx)')
# Word Tokenization
tokens = re.findall(r'\w+', text)  # raw string avoids the invalid-escape warning on newer Python versions
print(tokens)
# Sentence Tokenization
tokens_sent = re.compile('[.!?] ').split(text)
print(tokens_sent)
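# Optional variant (an assumption about intent, not in the original script): split on the
# whitespace that follows sentence-ending punctuation, so the punctuation stays attached
# to each sentence instead of being consumed by the split.
tokens_sent_keep_punct = re.split(r'(?<=[.!?])\s+', text)
print(tokens_sent_keep_punct)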
"""Tokenization Using spaCy"""
print('\nTokenization Using spaCy')
# Word Tokenization
nlp = English() # Load English tokenizer
my_doc = nlp(text)
token_list = []
for token in my_doc:
    token_list.append(token.text)
print(token_list)
# Sentence Tokenization
nlp = English()  # Load English tokenizer
nlp.add_pipe('sentencizer')  # Add the rule-based sentence boundary detector (spaCy 3.x API; spaCy 2.x used create_pipe)
doc = nlp(text)
sentence_list = []
for sentence in doc.sents:
    sentence_list.append(sentence.text)
print(sentence_list)
"""Tokenization using Keras"""
print('\nTokenization using Keras')
# Word Tokenization
tokens = text_to_word_sequence(text)
print(tokens)
# Sentence Tokenization
# With split='.' and a restricted filter list, the text is only broken at sentence-ending characters
tokens = text_to_word_sequence(text, split=".", filters="!.\n")
print(tokens)
# With the default filters, every punctuation character is mapped to the split character first,
# so the text is also broken at commas and other punctuation
tokens = text_to_word_sequence(text, split=".")
print(tokens)
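# Optional sketch (an extension, not in the original script): Keras' Tokenizer class builds a
# word-to-index vocabulary on top of the same word splitting and encodes text as index sequences.
from keras.preprocessing.text import Tokenizer
keras_tokenizer = Tokenizer()
keras_tokenizer.fit_on_texts([text])                # learn the vocabulary from the text
print(keras_tokenizer.word_index)                   # word -> integer index mapping
print(keras_tokenizer.texts_to_sequences([text]))   # text encoded as sequences of indices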
"""Tokenization using Gensim"""
print('\nTokenization using Gensim')
# Word Tokenization
tokens = list(tokenize(text))
print(tokens)
# Sentence Tokenization
# gensim.summarization.textcleaner.split_sentences was removed in gensim 4.0,
# so the calls below only work with gensim < 4.0
# sentences = list(split_sentences(text))
# print(sentences)
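# Optional fallback sketch (an assumption, not part of the original script): for word tokens,
# gensim's simple_preprocess returns lowercased, de-punctuated tokens; for sentences, one of
# the tokenizers shown above (e.g. NLTK's sent_tokenize) can be used instead.
from gensim.utils import simple_preprocess
print(simple_preprocess(text))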