-
Notifications
You must be signed in to change notification settings - Fork 10
/
FileCreationWithBigrams.py
181 lines (153 loc) · 5.59 KB
/
FileCreationWithBigrams.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
"Creating File using Bigrams"
import re
import nltk
import string
import enchant
import operator
from nltk.corpus import stopwords
from collections import OrderedDict
from textblob import TextBlob, Word
from textblob import Blobber
from textblob.taggers import NLTKTagger
# Contraction -> expansion table used to normalize review text before
# tokenization (e.g. "can't" -> "cannot", "they're" -> "they are").
apostropheList = {"n't" : "not","aren't" : "are not","can't" : "cannot","couldn't" : "could not","didn't" : "did not","doesn't" : "does not", \
"don't" : "do not","hadn't" : "had not","hasn't" : "has not","haven't" : "have not","he'd" : "he had","he'll" : "he will", \
"he's" : "he is","I'd" : "I had","I'll" : "I will","I'm" : "I am","I've" : "I have","isn't" : "is not","it's" : \
"it is","let's" : "let us","mustn't" : "must not","shan't" : "shall not","she'd" : "she had","she'll" : "she will", \
"she's" : "she is", "shouldn't" : "should not","that's" : "that is","there's" : "there is","they'd" : "they had", \
"they'll" : "they will", "they're" : "they are","they've" : "they have","we'd" : "we had","we're" : "we are","we've" : "we have", \
"weren't" : "were not", "what'll" : "what will","what're" : "what are","what's" : "what is","what've" : "what have", \
"where's" : "where is","who'd" : "who had", "who'll" : "who will","who're" : "who are","who's" : "who is","who've" : "who have", \
"won't" : "will not","wouldn't" : "would not", "you'd" : "you had","you'll" : "you will","you're" : "you are","you've" : "you have"}
# NLTK English stopword list; stopwords are dropped from candidate phrases.
stopWords = stopwords.words("english")
# Punctuation characters to strip from phrases; underscore is deliberately kept.
exclude = set(string.punctuation)
exclude.remove("_")
# Regex matching URLs.  NOTE(review): not referenced anywhere in this file --
# presumably used elsewhere in the project; confirm before removing.
linkPtrn = re.compile("^(https?:\/\/)?([\da-z\.-]+)\.([a-z\.]{2,6})([\/\w \.-]*)*\/?$")
#English vocabulary
# NOTE(review): enchVocab and vocabList are also unreferenced in this file --
# likely consumed by sibling modules; verify against callers.
enchVocab = enchant.Dict("en_US")
vocabList = set(w.lower() for w in nltk.corpus.words.words())
def fileCreation(reviewContent, filename):
    """Count every two-word noun phrase across all review sentences, then
    hand the counts to filterAdj for cleanup and file rewriting.

    Parameters:
        reviewContent: list of reviews, each a list of sentence strings.
        filename: path of the annotated review file, forwarded to filterAdj.
    """
    phrasesDict = dict()
    # Iterate the sentences directly instead of indexing via range(len(...)).
    for review in reviewContent:
        for sentence in review:
            # TextBlob extracts candidate noun phrases from the sentence.
            for phrase in TextBlob(sentence).noun_phrases:
                # Keep bigrams only; .get avoids the double-lookup if/else.
                if len(phrase.split()) == 2:
                    phrasesDict[phrase] = phrasesDict.get(phrase, 0) + 1
    # Calling filterAdj function
    filterAdj(phrasesDict, filename)
def filterAdj(phrasesDict, filename):
    """Clean and threshold the candidate bigrams, then rewrite the review
    file to 'modified.txt' with every matched bigram fused into one token.

    Parameters:
        phrasesDict: dict mapping two-word noun phrases -> occurrence count
                     (built by fileCreation).
        filename: path to the annotated review file ("[+][t]"/"[-][t]" mark
                  titles, "##" prefixes each opinion sentence).
    """

    def _write_with_bigrams(tokens, out, bigrams):
        # Walk the (word, tag) pairs; when two consecutive NN/JJ-tagged words
        # form a known bigram, write them joined as a single token and skip
        # the second word; otherwise write the word unchanged.
        e = 0
        n = len(tokens)
        while e < n:
            word, tag = tokens[e]
            e += 1
            # BUGFIX: the original tested `x[i][1] == "NN" or "JJ"`, which is
            # always truthy ("JJ" is a non-empty string) and never filtered
            # by tag at all; use real membership tests instead.
            if e < n and tag in ("NN", "JJ") and tokens[e][1] in ("NN", "JJ"):
                candidate = word + " " + tokens[e][0]
                # Checking if consecutive nouns we found out are in noun phrases
                if candidate in bigrams:
                    out.write(candidate.replace(' ', ''))
                    out.write(" ")
                    e += 1
                    continue
            out.write(word)
            out.write(" ")

    # Most-frequent phrases first (ordering is cosmetic; counts drive filtering).
    phrasesDict = OrderedDict(sorted(phrasesDict.items(),
                                     key=operator.itemgetter(1), reverse=True))
    newPhrases = dict()
    exclude = set(string.punctuation)
    exclude.remove("_")
    # Hoisted: the original reloaded stopwords.words("english") for every
    # single word inside the loop below; a set also gives O(1) membership.
    stopSet = set(stopwords.words("english"))
    for line_words, count in phrasesDict.items():
        # Preprocessing text: expand contractions, strip punctuation, drop
        # stray single letters, then lemmatize the surviving content words.
        line_words = ' '.join(apostropheList.get(word, word)
                              for word in line_words.split())
        line_words = ''.join(ch for ch in line_words if ch not in exclude)
        line_words = re.sub(r' [a-z][$]? ', ' ', line_words)
        words = [Word(word).lemmatize() for word in line_words.split()
                 if word not in stopSet and not word.isdigit() and len(word) > 2]
        line_words = ' '.join(words)
        # Keep only phrases that are still exactly two words after cleanup,
        # merging counts of phrases that normalize to the same bigram.
        if len(line_words.strip(" ").split()) == 2:
            newPhrases[line_words] = newPhrases.get(line_words, 0) + count
    # Applying threshold to bigrams: keep those seen at least 3 times.
    # A set replaces the original list for O(1) lookups in the hot loop.
    nouns1 = {key for key, value in newPhrases.items() if value >= 3}

    reviewTitle = []
    reviewContent = []
    # Reading the original file
    with open(filename) as src:
        review = []
        for line in src:
            if line[:6] == "[+][t]":
                if review:
                    reviewContent.append(review)
                    review = []
                reviewTitle.append(line.split("[+][t]")[1].rstrip("\r\n"))
            elif line[:6] == "[-][t]":
                if review:
                    reviewContent.append(review)
                    review = []
                reviewTitle.append(line.split("[-][t]")[1].rstrip("\r\n"))
            elif "##" in line:
                # Everything after each "##" marker is an opinion sentence.
                parts = line.split("##")
                for part in parts[1:]:
                    review.append(part.rstrip("\r\n"))
        reviewContent.append(review)

    tb = Blobber(pos_tagger=NLTKTagger())  # NLTK tagger
    # Writing to a file.  `with` guarantees the handle is closed -- the
    # original opened 'modified.txt' and never closed it (resource leak).
    with open('modified.txt', 'w') as out:
        for a in range(len(reviewContent)):
            out.write("[t]")
            # Finding bigrams in title
            _write_with_bigrams(tb(reviewTitle[a]).tags, out, nouns1)
            out.write("\r\n")
            # Finding bigrams in review sentences
            for sentence in reviewContent[a]:
                out.write("##")
                _write_with_bigrams(tb(sentence).tags, out, nouns1)
                out.write(".\r\n")