-
Notifications
You must be signed in to change notification settings - Fork 0
/
pb_classifiers.py
164 lines (146 loc) · 6.52 KB
/
pb_classifiers.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
####################################################################
# Licence: Creative Commons (see COPYRIGHT) #
# Authors: Nikolaos Pappas, Georgios Katsimpras #
# {nik0spapp, gkatsimpras}@gmail.com #
# Supervisor: Efstathios Stamatatos #
# stamatatos@aegean.gr #
# University of the Aegean #
# Department of Information and Communication Systems Engineering #
# Information Management Track (MSc) #
# Karlovasi, Samos #
# Greece #
####################################################################
import os
import sys
import random
from terminal_colors import Tcolors
# Append "<cwd>/../" to the module search path.
# NOTE(review): os.path.abspath("") resolves against the current working
# directory, not this file's location - confirm that is the intended anchor.
sys.path.append(os.path.abspath("") + "/../")
class PbSubj:
    """
    PbSubj: Pattern-Based subjective sentence classifier.

    A sentence is classified as subjective with the probability of the
    top-matched pattern among a list of patterns strongly associated with
    subjectivity. These patterns are selected using two thresholds: a
    learned pattern is kept when its frequency is at least t1 and its
    subjective probability is at least t2.

    Each learned pattern is a dict read through the keys 'freq', 'prob',
    'display' (the surface text searched for in the sentence) and 'type'
    ("subj", "dobj" or "np").
    """

    # Substrings of a POS tag that make a token count as a noun/pronoun
    # neighbour of a pattern.
    _NOMINAL_TAG_MARKERS = ("NN", "NP", "PR")

    def __init__(self, tagger, debug=False):
        """
        tagger: object whose tag(sentence) returns (word, tag) pairs.
        debug:  when true, selected patterns and decisions are printed.
        """
        self.tagger = tagger
        # Patterns learned from the pattern learner
        self.learned_patterns = {}
        # Strong subjective patterns
        self.ss_patterns = {}
        # (pattern, stats) pairs sorted by descending probability; None
        # until select_strong_subjective_patterns() has run.
        self.sorted_ss_patterns = None
        self.t1_threshold = 5 # 3
        self.t2_threshold = 1 # 0.9
        # When more strong patterns than this are selected, t1 is raised.
        self.pl_threshold = 25
        # Number of tags inspected next to a matched pattern.
        self.limit = 1
        self.debug = debug

    def classify(self, sentence):
        """
        Classify sentence based on the probabilities of the patterns
        strongly associated with subjectivity.

        Returns a (found, subjective, objective) tuple. When no pattern
        matches (including when the classifier has not been trained yet)
        the result is (False, False, False). Otherwise exactly one of
        subjective/objective is True, drawn at random with the matched
        pattern's probability.
        """
        found = False
        matched_pattern = None

        # POS tagging; a missing tag is normalised to the empty string so
        # the substring checks below never see None.
        words = []
        tags = []
        for (word, tag) in self.tagger.tag(sentence):
            words.append(word)
            tags.append("" if tag is None else tag)

        # An untrained classifier has no sorted patterns: nothing matches.
        for (_pattern, value) in (self.sorted_ss_patterns or []):
            display = value['display']
            pattern_type = value['type']
            try:
                pos_in_sentence = sentence.find(display)
            except (TypeError, UnicodeError):
                # Best effort: an unsearchable display text (wrong type or
                # mixed encodings) is treated as "not in the sentence".
                pos_in_sentence = -1
            if pos_in_sentence < 0:
                continue
            matched_pattern = value
            if pattern_type == "subj":
                found = self.search_for_subject(display, words, tags)
            elif pattern_type in ("dobj", "np"):
                found = self.search_for_object(display, words, tags)
            if found:
                break

        if not found:
            return found, False, False

        if self.debug: print(matched_pattern)
        # NOTE(review): this reseeds the global RNG on every call, so a
        # caller cannot make the draw reproducible with random.seed(n).
        # Kept because existing behaviour depends on it.
        random.seed()
        prob = matched_pattern['prob']
        subjective = random.uniform(0, 1) <= prob
        if self.debug:
            print("Probability: " + str(prob if subjective else 1 - prob))
        return found, subjective, not subjective

    def find_needle_in_haystack(self, needle, haystack):
        """
        Return every index at which the sequence needle occurs as a
        contiguous run inside the sequence haystack (empty list if none).

        An empty needle yields no matches instead of matching everywhere.
        """
        size = len(needle)
        if size == 0:
            return []
        # A match cannot start after len(haystack) - size.
        return [i for i in range(len(haystack) - size + 1)
                if haystack[i:i + size] == needle]

    def _has_nominal_tag(self, tags, start):
        """
        Tell whether any of the self.limit tags beginning at index start
        contains one of the noun/pronoun markers.
        """
        # max() keeps a non-positive limit from turning into a
        # wrap-around slice end.
        window = tags[start:start + max(self.limit, 0)]
        return any(marker in tag
                   for tag in window
                   for marker in self._NOMINAL_TAG_MARKERS)

    def search_for_object(self, pattern, words, tags):
        """
        Return True when the first occurrence of pattern in words is
        directly followed (within self.limit tokens) by a noun/pronoun tag.
        """
        pattern_words = pattern.split()
        positions = self.find_needle_in_haystack(pattern_words, words)
        if not positions:
            return False
        return self._has_nominal_tag(tags, positions[0] + len(pattern_words))

    def search_for_subject(self, pattern, words, tags):
        """
        Return True when the first occurrence of pattern in words has a
        noun/pronoun tag in the window of self.limit tokens that starts one
        token before the pattern.

        A pattern that opens the sentence has no preceding token and
        therefore never matches.
        """
        pattern_words = pattern.split()
        positions = self.find_needle_in_haystack(pattern_words, words)
        if not positions:
            return False
        start = positions[0] - 1
        if start < 0:
            # Without this guard tags[-1:] would inspect the LAST token of
            # the sentence instead of a (non-existent) preceding one.
            return False
        return self._has_nominal_tag(tags, start)

    def select_strong_subjective_patterns(self):
        """
        Select the patterns strongly associated with subjectivity using
        the thresholds t1 and t2.

        Rebuilds self.ss_patterns and self.sorted_ss_patterns (descending
        probability), removes some low-frequency entries from
        self.learned_patterns, and raises t1 by one when more than
        self.pl_threshold patterns were selected.
        """
        self.ss_patterns = {}
        # Floor division keeps the integer result the original Python 2
        # expression produced for integer thresholds.
        prune_below = (self.t1_threshold * 3) // 4
        # Iterate over a snapshot of the keys: entries are deleted inside
        # the loop, which is an error on a live dict view.
        for pattern in list(self.learned_patterns):
            stats = self.learned_patterns[pattern]
            freq = stats['freq']
            prob = stats['prob']
            if freq >= self.t1_threshold and prob >= self.t2_threshold:
                self.ss_patterns[pattern] = stats
            # delete some patterns with low frequency and probability for efficiency
            elif 5 < freq < prune_below:
                del self.learned_patterns[pattern]

        sorted_ss = sorted(self.ss_patterns.items(),
                           key=lambda x: x[1]['prob'], reverse=True)
        self.sorted_ss_patterns = sorted_ss

        # Build the coloured report only when it is going to be printed.
        if self.debug:
            for (s, v) in sorted_ss:
                title = (Tcolors.OKGREEN+s+Tcolors.ENDC+" ").ljust(70,'-')
                pbs = str(v['freq'])+"/" + Tcolors.CYAN + str(v['prob']) + Tcolors.ENDC
                print(title + "------------> " + pbs)
            print("")

        if len(sorted_ss) > self.pl_threshold:
            self.t1_threshold += 1

    def train(self, learned_patterns):
        """
        Train classifier with the learned patterns derived from
        the pattern learner.

        learned_patterns is stored by reference and may be pruned in place
        by select_strong_subjective_patterns().
        """
        self.learned_patterns = learned_patterns
        self.select_strong_subjective_patterns()