Skip to content

Commit

Permalink
start UnigramModel
Browse files Browse the repository at this point in the history
  • Loading branch information
arminZolfaghari committed Jul 18, 2021
1 parent b1d0733 commit 50a2c69
Show file tree
Hide file tree
Showing 3 changed files with 77 additions and 20 deletions.
4 changes: 2 additions & 2 deletions BigramModel.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,6 @@ def __init__(self, train_pos_set, train_neg_set, lambda_arr, epsilon, cut_down,
self.count_unary_train_neg_dict = {}
self.count_binary_train_pos_dict = {}
self.count_binary_train_neg_dict = {}
# self.number_words_in_neg = 0
# self.number_words_in_pos = 0
self.cut_down = cut_down
self.cut_above = cut_above

Expand Down Expand Up @@ -170,7 +168,9 @@ def recognize_sentence(self, sentence):

# calculate sentence probability to recognize better probability
prob_given_sentence_is_negative = self.calculate_sentence_probability(sentence, "negative")
print("pos prob", prob_given_sentence_is_negative)
prob_given_sentence_is_positive = self.calculate_sentence_probability(sentence, "positive")
print("neg prob", prob_given_sentence_is_positive)
if prob_given_sentence_is_positive > prob_given_sentence_is_negative:
return "positive"
elif prob_given_sentence_is_positive < prob_given_sentence_is_negative:
Expand Down
2 changes: 1 addition & 1 deletion Main.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ def do_test(dict_sentences_with_label, model):
pre_process(negative_train_set), pre_process(negative_test_set)

# create bigram model object
lambda_arr = [0.2, 0.3, 0.5] # [h0, h1, h2]
lambda_arr = [0.2, 0.4, 0.4] # [h0, h1, h2]
epsilon = 0.2
cut_down = 2
cut_above= 10
Expand Down
91 changes: 74 additions & 17 deletions UnigramModel.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,26 +12,40 @@ def get_words_array(sentence):

class UnigramModel():

def __init__(self, train_pos_set, train_neg_set, lambda_arr, epsilon, cut_down, cut_above):
    """Store training corpora and model hyper-parameters.

    Args:
        train_pos_set: iterable of positive-class training sentences.
        train_neg_set: iterable of negative-class training sentences.
        lambda_arr: interpolation weights for probability smoothing.
            NOTE(review): commented as [h0, h1] here, but
            calculate_conditional_probability unpacks three values
            [h0, h1, h2] — confirm the intended length.
        epsilon: smoothing floor used for unseen words.
        cut_down: words with count <= cut_down are pruned (do_alpha_cut).
        cut_above: the cut_above most frequent words are pruned
            (remove_from_above).
    """
    self.train_positive_set = train_pos_set
    self.train_negative_set = train_neg_set
    self.lambda_arr = lambda_arr  # [h0, h1] => weights for probability
    self.epsilon = epsilon
    # word -> occurrence count, per class; filled by create_unary_words_dict
    self.count_unary_train_pos_dict = {}
    self.count_unary_train_neg_dict = {}
    # totals of all word counts per class; filled by calculate_number_words
    self.number_words_in_neg = 0
    self.number_words_in_pos = 0
    self.cut_down = cut_down
    self.cut_above = cut_above

# cut from down: prune rare words whose count is <= cut_down
def do_alpha_cut(self):
    """Remove low-frequency words from both per-class count dictionaries."""
    # Rebuild with comprehensions instead of deleting keys while iterating
    # (the pre-commit version iterated .keys() and deleted in the same loop,
    # which raises RuntimeError in Python 3).
    self.count_unary_train_pos_dict = {
        word: count
        for word, count in self.count_unary_train_pos_dict.items()
        if count > self.cut_down
    }
    self.count_unary_train_neg_dict = {
        word: count
        for word, count in self.count_unary_train_neg_dict.items()
        if count > self.cut_down
    }

# cut from above: drop the cut_above most frequent words from both dicts
def remove_from_above(self):
    """Remove the `cut_above` highest-count words from each class dict.

    Keeps the remaining entries sorted by descending count, matching the
    original side effect of reassigning the sorted dicts.

    Bug fixed: the old loop did `del d[list(d)[i]]` for i in
    range(cut_above) — the dict shrinks after every delete while i keeps
    growing, so it removed the words at sorted positions 0, 2, 4, ...
    (every other top word) and raised IndexError when the dict held fewer
    than ~2*cut_above entries. Slicing off the head of the sorted items
    removes exactly the top cut_above words and tolerates small dicts.
    """
    ordered_pos = sorted(self.count_unary_train_pos_dict.items(),
                         key=lambda item: item[1], reverse=True)
    ordered_neg = sorted(self.count_unary_train_neg_dict.items(),
                         key=lambda item: item[1], reverse=True)
    self.count_unary_train_pos_dict = dict(ordered_pos[self.cut_above:])
    self.count_unary_train_neg_dict = dict(ordered_neg[self.cut_above:])

def create_unary_words_dict(self):
for sentence in self.train_positive_set:
sentence = get_words_array(sentence)
Expand All @@ -51,33 +65,76 @@ def create_unary_words_dict(self):
else:
self.count_unary_train_neg_dict[word] = 1

self.do_alpha_cut()
self.remove_from_above()
self.calculate_number_words() # to calculate numbers of all words




# calculate number of all words in dictionary (the M in p(w) = count(w)/M)
def calculate_number_words(self):
    """Cache the total word count of each class dictionary.

    Sets self.number_words_in_pos / self.number_words_in_neg from the
    pruned count dictionaries. Uses sum() over the values view; the
    pre-commit loop `for key, value in dict:` raised ValueError because
    iterating a dict yields keys, not (key, value) pairs.
    """
    self.number_words_in_pos = sum(self.count_unary_train_pos_dict.values())
    self.number_words_in_neg = sum(self.count_unary_train_neg_dict.values())

# calculate p(w) = count(w)/M (M: total word count of that class)
def calculate_unary_probability(self, word, dataset_mode):
    """Return the unigram probability of `word` in the given class.

    Args:
        word: the word to look up.
        dataset_mode: "positive" or "negative"; any other value yields 0.

    Returns:
        count(word)/M for the selected class, or 0 when the word is not
        in that class's dictionary (or the mode is unknown).
    """
    if dataset_mode == "positive":
        counts = self.count_unary_train_pos_dict
        total = self.number_words_in_pos
    elif dataset_mode == "negative":
        counts = self.count_unary_train_neg_dict
        total = self.number_words_in_neg
    else:
        return 0  # unknown mode: keep the original "no probability" result
    if word not in counts:
        return 0  # unseen word; avoids dividing when total could be 0
    return counts[word] / total

# interpolated smoothing: p(w2|w1) = h2*p(w2|w1) + h1*p(w2) + h0*epsilon
def calculate_conditional_probability(self, word1, word2, dataset_mode):
    """Return the smoothed conditional probability of word2 given word1."""
    h0, h1, h2 = self.lambda_arr
    bigram_term = h2 * self.calculate_simple_conditional_probability(
        word1, word2, dataset_mode)
    unigram_term = h1 * self.calculate_unary_probability(word2, dataset_mode)
    smoothing_term = h0 * self.epsilon
    return bigram_term + unigram_term + smoothing_term

def calculate_sentence_probability(self, sentence, dataset_mode):
    """Return the product of conditional probabilities over word pairs.

    Multiplies p(w_i | w_{i-1}) for every consecutive pair in the
    sentence; a sentence with fewer than two words yields 1.
    """
    words = get_words_array(sentence)
    product = 1
    for previous_word, current_word in zip(words, words[1:]):
        product *= self.calculate_conditional_probability(
            previous_word, current_word, dataset_mode)
    return product

# start learning: build the word-count dictionaries from the training sets
def learning(self):
    # Builds the unigram counts (which also applies the cut-down/cut-above
    # pruning and caches the per-class word totals).
    self.create_unary_words_dict()
    # NOTE(review): create_binary_words_dict is not defined anywhere in the
    # visible UnigramModel code — it appears copied from BigramModel.
    # Confirm it exists, otherwise this call raises AttributeError.
    self.create_binary_words_dict()

# recognize sentence is positive or negative
def recognize_sentence(self, sentence):
    """Classify a sentence by comparing its probability under each class.

    Returns:
        "positive" / "negative" for whichever class model assigns the
        higher sentence probability, or "equal" on an exact tie.

    Bugs fixed: removed the stray `get_words_array()` call (it was missing
    its required `sentence` argument, so it raised TypeError, and its
    result was never used), and swapped the debug print labels, which
    previously printed the negative probability as "pos prob" and vice
    versa.
    """
    # calculate sentence probability under each class model
    prob_given_sentence_is_negative = self.calculate_sentence_probability(sentence, "negative")
    print("neg prob", prob_given_sentence_is_negative)
    prob_given_sentence_is_positive = self.calculate_sentence_probability(sentence, "positive")
    print("pos prob", prob_given_sentence_is_positive)
    if prob_given_sentence_is_positive > prob_given_sentence_is_negative:
        return "positive"
    elif prob_given_sentence_is_positive < prob_given_sentence_is_negative:
        return "negative"
    else:
        return "equal"

0 comments on commit 50a2c69

Please sign in to comment.