You signed in with another tab or window. Reload to refresh your session.You signed out in another tab or window. Reload to refresh your session.You switched accounts on another tab or window. Reload to refresh your session.Dismiss alert
6 frames
in preprocess(data)
3 data = remove_punctuation(data) #remove comma seperately
4 data = remove_apostrophe(data)
----> 5 data = remove_stop_words(data)
6 data = convert_numbers(data)
7 data = stemming(data)
in remove_stop_words(data)
1 def remove_stop_words(data):
2 stop_words = stopwords.words('english')
----> 3 words = word_tokenize(str(data))
4 new_text = ""
5 for w in words:
/usr/local/lib/python3.6/dist-packages/nltk/tokenize/init.py in word_tokenize(text, language, preserve_line)
126 :type preserver_line: bool
127 """
--> 128 sentences = [text] if preserve_line else sent_tokenize(text, language)
129 return [token for sent in sentences
130 for token in _treebank_word_tokenizer.tokenize(sent)]
/usr/local/lib/python3.6/dist-packages/nltk/tokenize/init.py in sent_tokenize(text, language)
92 :param language: the model name in the Punkt corpus
93 """
---> 94 tokenizer = load('tokenizers/punkt/{0}.pickle'.format(language))
95 return tokenizer.tokenize(text)
96
/usr/local/lib/python3.6/dist-packages/nltk/data.py in load(resource_url, format, cache, verbose, logic_parser, fstruct_reader, encoding)
832
833 # Load the resource.
--> 834 opened_resource = _open(resource_url)
835
836 if format == 'raw':
/usr/local/lib/python3.6/dist-packages/nltk/data.py in open(resource_url)
950
951 if protocol is None or protocol.lower() == 'nltk':
--> 952 return find(path, path + ['']).open()
953 elif protocol.lower() == 'file':
954 # urllib might not use mode='rb', so handle this one ourselves:
When I run the following code:
processed_text = []
processed_title = []
for i in dataset[:N]:
file = open(i[0], 'r', encoding="utf8", errors='ignore')
text = file.read().strip()
file.close()
I get the following error:
LookupError Traceback (most recent call last)
in ()
8 file.close()
9
---> 10 processed_text.append(word_tokenize((preprocess(text))))
11 processed_title.append(word_tokenize((preprocess(i[1]))))
6 frames
in preprocess(data)
3 data = remove_punctuation(data) #remove comma seperately
4 data = remove_apostrophe(data)
----> 5 data = remove_stop_words(data)
6 data = convert_numbers(data)
7 data = stemming(data)
in remove_stop_words(data)
1 def remove_stop_words(data):
2 stop_words = stopwords.words('english')
----> 3 words = word_tokenize(str(data))
4 new_text = ""
5 for w in words:
/usr/local/lib/python3.6/dist-packages/nltk/tokenize/init.py in word_tokenize(text, language, preserve_line)
126 :type preserver_line: bool
127 """
--> 128 sentences = [text] if preserve_line else sent_tokenize(text, language)
129 return [token for sent in sentences
130 for token in _treebank_word_tokenizer.tokenize(sent)]
/usr/local/lib/python3.6/dist-packages/nltk/tokenize/init.py in sent_tokenize(text, language)
92 :param language: the model name in the Punkt corpus
93 """
---> 94 tokenizer = load('tokenizers/punkt/{0}.pickle'.format(language))
95 return tokenizer.tokenize(text)
96
/usr/local/lib/python3.6/dist-packages/nltk/data.py in load(resource_url, format, cache, verbose, logic_parser, fstruct_reader, encoding)
832
833 # Load the resource.
--> 834 opened_resource = _open(resource_url)
835
836 if format == 'raw':
/usr/local/lib/python3.6/dist-packages/nltk/data.py in open(resource_url)
950
951 if protocol is None or protocol.lower() == 'nltk':
--> 952 return find(path, path + ['']).open()
953 elif protocol.lower() == 'file':
954 # urllib might not use mode='rb', so handle this one ourselves:
/usr/local/lib/python3.6/dist-packages/nltk/data.py in find(resource_name, paths)
671 sep = '*' * 70
672 resource_not_found = '\n%s\n%s\n%s\n' % (sep, msg, sep)
--> 673 raise LookupError(resource_not_found)
674
675
LookupError:
Resource punkt not found.
Please use the NLTK Downloader to obtain the resource:
Searched in:
- '/root/nltk_data'
- '/usr/share/nltk_data'
- '/usr/local/share/nltk_data'
- '/usr/lib/nltk_data'
- '/usr/local/lib/nltk_data'
- '/usr/nltk_data'
- '/usr/lib/nltk_data'
- ''
This happens even though I imported
nltk
and ran nltk.download('punkt').
What should I do?
The text was updated successfully, but these errors were encountered: