'''
Contains the main implementation of a generic Classifier for tweets.
Reads in raw training data and stores the resulting model counts in
a pickle file.
'''
import csv
import re
import nltk
import os
import cPickle as pickle
import random
class Classifier:
    '''
    A generic sentiment classifier for tweets.
    rawfname -> name of the file containing raw training data
    force -> True if the user wants to overwrite existing classifier data
    grams -> list of n-gram sizes to use:
             1 for unigrams, 2 for bigrams, 3 for trigrams, and so on,
             so [1, 2] means unigrams + bigrams
    '''
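    # Illustrative instantiation (the file name is an assumption):
    #   c = Classifier('trainingdata.csv', force=True, grams=[1, 2])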
def __init__(self, rawfname, *args, **kargs):
self.rawfname = rawfname
self.force = kargs.get("force", False)
self.numgrams = kargs.get("grams", [1])
# create modelfname using numgrams variable
# e.g if self.numgrams = [1,2], then
# self.modelfname = 'model1-2.dat'
        self.modelfname = "model%s.dat" % "-".join(str(g) for g in self.numgrams)
# weight to use in self.weightedProb
self.weight = kargs.get("weight", 0.00005)
# The number of lines to train on. Use during development
# to train on only a small chunk of the training set
self.filesubset = kargs.get("filesubset", "all")
# counts of tweets in each class
# [x,y] where
# x -> number of tweets in negative class
# y -> number of tweets in positive class
self.tweetcounts = [0, 0]
# counts of feature/class combinations
# stores (feature) => [x, y] where
# x -> number of times feature appears in negative class
# y -> number of times feature appears in positive class
self.ftweetcounts = {}
def incFC(self, f, c):
'''
Increment count of a feature/class pair
'''
self.ftweetcounts.setdefault(f, [0, 0])
self.ftweetcounts[f][c] += 1
def incC(self, c):
'''
Increment count of a class
'''
self.tweetcounts[c] += 1
def getFC(self, f, c):
'''
        Return the number of times a feature has appeared in a class
'''
if f in self.ftweetcounts:
return float(self.ftweetcounts[f][c])
return 0.0
def getC(self, c):
'''
        Return the number of tweets in a class
'''
return float(self.tweetcounts[c])
def getTotal(self):
'''
        Return the total number of tweets across both classes
'''
return sum(self.tweetcounts)
def getFeatures(self, item):
        '''
        Return the set of features in a tweet.
        Each feature has weight 1: even if the word 'obama' appears
        more than 10 times in a tweet, it is counted only once.
        '''
        tokenized = nltk.word_tokenize(item)
        flist = []
        for gram in self.numgrams:
            # slide a window of length <gram> over the token list
            for i in range(len(tokenized) - gram + 1):
                flist.append(" ".join(tokenized[i:i + gram]))
        return set(flist)
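    # Illustrative sketch: with grams=[1, 2], the tweet "i love obama"
    # yields {'i', 'love', 'obama', 'i love', 'love obama'}
    # (assuming nltk tokenizes it into three words).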
def train(self, c, item):
'''
Trains the classifier using item (for now, just text) on a
specific class
c -> class (number)
'''
features = self.getFeatures(item)
for f in features:
self.incFC(f, c)
self.incC(c)
def trainClassifier(self):
'''
Trains the classifier based on tweets in self.modelfname
Stores the resulting data structures in a pickle file
'''
        if self.force and os.path.exists(self.modelfname):
            os.remove(self.modelfname)
        elif os.path.exists(self.modelfname):
            grams, tweetcounts, ftweetcounts = pickle.load(
                open(self.modelfname, "rb")
            )
            # only reuse the stored model if it was built with the same
            # n-grams; otherwise retrain from scratch rather than piling
            # new counts on top of stale ones
            if grams == self.numgrams:
                self.tweetcounts, self.ftweetcounts = tweetcounts, ftweetcounts
                print "Model retrieved from '%s'" % self.modelfname
                return
f = open(self.rawfname)
r = csv.reader(f, delimiter=',', quotechar='"')
# get 0th column -> '0' if negative (class 0), '4' if positive (class 1)
# get 5th column -> contains text of tweet
stripped = [(0 if line[0] == '0' else 1,
re.sub(r'[,.]', r'',
line[-1].lower().strip())) for line in r]
# Only train on lines 0 -> <last_line> of the training set
last_line = len(stripped) if self.filesubset == "all" else self.filesubset
for each in stripped[:last_line]:
self.train(each[0], each[1])
# store Classifier training data
pickle.dump([self.numgrams, self.tweetcounts, self.ftweetcounts],
open(self.modelfname, "wb")
)
print "Model stored in '%s'" % self.modelfname
f.close()
    def getSampleTweets(self, n, pct_pos=.5):
'''
Return <n> tweets from the training set where <pct_pos> of the tweets
have positive sentiment and (1 - <pct_pos>) have negative sentiment
'''
        # fixed seed so repeated calls return the same sample
        random.seed(10)
numpos, numneg = 0, 0
targetpos, targetneg = int(n * pct_pos), int(n * (1 - pct_pos))
# Should have <n> lines in the end
sample = []
f = open(self.rawfname)
r = csv.reader(f, delimiter=',', quotechar='"')
# get 0th column -> '0' if negative (class 0), '4' if positive (class 1)
# get 5th column -> contains text of tweet
stripped = [(0 if line[0] == '0' else 1,
re.sub(r'[,.]', r'',
line[-1].lower().strip())) for line in r]
random.shuffle(stripped)
i = 0
# Read through the shuffled list of examples until there are
# <targetpos> positive tweets and <targetneg> negative tweets
# in our sample
        while (numpos < targetpos or numneg < targetneg) and i < len(stripped):
curtweet = stripped[i]
if curtweet[0] == 0 and numneg < targetneg:
numneg += 1
sample.append(curtweet)
elif curtweet[0] == 1 and numpos < targetpos:
numpos += 1
sample.append(curtweet)
i += 1
return sample
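    # For example, getSampleTweets(8, pct_pos=.25) returns 2 positive
    # and 6 negative tweets drawn from a shuffled copy of the training set.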
def probFC(self, f, c):
'''
        Return Prob(feature | class): the fraction of tweets in class <c>
        that contain feature <f>
'''
if self.getC(c) == 0:
return 0
return self.getFC(f, c)/self.getC(c)
def probC(self, c):
'''
        Return the prior probability Prob(class)
'''
return self.getC(c)/self.getTotal()
def setWeight(self, w):
'''
Set weight to use in classifier
'''
self.weight = w
def weightedProb(self, f, c, ap=0.5):
'''
        Smoothing method:
        Start with an assumed probability (ap) for each word in each class,
        then return a weighted average of the real probability (probFC)
        and the assumed probability.
        A weight of 1.0 means ap counts as much as one observed word.
        Bayesian in nature:
        For example, the word 'dude' might not be in the corpus initially,
        so with a weight of 1.0,
        P('dude' | class=0) = 0.5 and P('dude' | class=1) = 0.5.
        Then, once we find one 'dude' in a positive tweet,
        P('dude' | class=0) = 0.25 and P('dude' | class=1) = 0.75.
'''
# calculate current probability
real = self.probFC(f, c)
# count number of times this feature has appeared in all categories
        totals = sum(self.getFC(f, cl) for cl in (0, 1))
# calculate weighted average
return ((self.weight * ap) + (totals * real))/(self.weight + totals)
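    # Worked check of the docstring's example (assumes a single positive
    # tweet, so probFC('dude', 1) = 1.0): with weight=1.0, ap=0.5, and
    # totals=1, class 1 gives (1.0*0.5 + 1*1.0) / (1.0 + 1) = 0.75 and
    # class 0 (real=0.0) gives (1.0*0.5 + 1*0.0) / (1.0 + 1) = 0.25.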
def classify(self, text):
'''
Return 0 if negative; Return 1 if positive
'''
        raise NotImplementedError("You must subclass 'Classifier' to classify tweets")
def __repr__(self):
return "Classifier info: (weight=%s, grams=%s)" % (self.weight, self.numgrams)