#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sun Feb 18 17:49:26 2018

@author: LEIHAO
"""
from collections import Counter

from nltk.corpus import stopwords
from textblob import TextBlob


### Functions

# Convert a Penn Treebank POS tag to a simplified WordNet POS tag.
def posWN(posTB):
    if posTB.startswith('J'):
        return 'a'  # adjective
    elif posTB.startswith('V'):
        return 'v'  # verb
    elif posTB.startswith('N'):
        return 'n'  # noun
    elif posTB.startswith('R'):
        return 'r'  # adverb
    elif posTB.startswith('A'):
        return 's'  # satellite adjective (no Penn tag starts with 'A')
    else:
        return ''   # no WordNet equivalent


# POS function: WordNet POS tags for tokens longer than three characters.
def pos(blob):
    posW = [posWN(tag) for w, tag in blob.pos_tags if len(w) > 3]
    return posW


# Tokenizer function: keep tokens longer than three characters (as Word objects).
def token(blob):
    tok = [w for w, _ in blob.pos_tags if len(w) > 3]
    return tok


# Lemmatizer function: attach lowercased lemmas to article.lem.
def lem(article):
    tokn = len(article.tok)
    posn = len(article.pos)
    if tokn == posn:
        words = article.tok
    else:
        # Token and POS lists are out of sync: re-tokenize from the blob so the
        # same length filter is applied to both (assumes article.pos was built
        # from the same blob).
        words = [w for w, _ in article.blob.pos_tags if len(w) > 3]
        tokn = len(words)
    lems = []
    for j in range(tokn):
        if article.pos[j] == '':
            # No WordNet POS tag: try verb and noun lemmas and keep whichever
            # shortened the word more; keep the word itself on a tie.
            verb = words[j].lemmatize('v')
            noun = words[j].lemmatize('n')
            if len(verb) == len(noun):
                lems.append(words[j])
            elif len(verb) < len(noun):
                lems.append(verb)
            else:
                lems.append(noun)
        else:
            lems.append(words[j].lemmatize(article.pos[j]))
    article.lem = [w.lower() for w in lems]
    return article


# Counter and stopword remover: (frequency, lemma) pairs, most frequent first,
# with stopwords dropped.
def cntr(lems, stpw):
    cnt = Counter(lems)
    cnt = [(cnt[entry], entry) for entry in cnt]
    cnt.sort(reverse=True)
    wordfreq = [pair for pair in cnt if pair[1] not in stpw]
    return wordfreq
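

### Usage sketch. A minimal, hedged example of how these functions appear meant
# to fit together; it is not part of the original pipeline. The `article` object
# that lem() expects is assumed here to be a SimpleNamespace with .blob, .tok,
# and .pos attributes (how it is built in the original workflow is not shown).
# Running this requires the NLTK data textblob relies on (punkt, the averaged
# perceptron tagger, wordnet) plus the stopwords corpus.
if __name__ == '__main__':
    from types import SimpleNamespace

    text = "The quick brown foxes were jumping over the lazy dogs repeatedly."
    article = SimpleNamespace(blob=TextBlob(text))
    article.tok = token(article.blob)        # Word objects longer than 3 chars
    article.pos = pos(article.blob)          # matching WordNet POS tags
    article = lem(article)                   # lowercased lemmas in article.lem
    stpw = set(stopwords.words('english'))
    print(cntr(article.lem, stpw))           # (frequency, lemma) pairs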