Skip to content

Commit

Permalink
start UnigramModel
Browse files Browse the repository at this point in the history
  • Loading branch information
arminZolfaghari committed Jul 18, 2021
1 parent b1d0733 commit 50a2c69
Show file tree
Hide file tree
Showing 3 changed files with 77 additions and 20 deletions.
4 changes: 2 additions & 2 deletions BigramModel.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,6 @@ def __init__(self, train_pos_set, train_neg_set, lambda_arr, epsilon, cut_down,
self.count_unary_train_neg_dict = {}
self.count_binary_train_pos_dict = {}
self.count_binary_train_neg_dict = {}
# self.number_words_in_neg = 0
# self.number_words_in_pos = 0
self.cut_down = cut_down
self.cut_above = cut_above

Expand Down Expand Up @@ -170,7 +168,9 @@ def recognize_sentence(self, sentence):

# calculate sentence probability to recognize better probability
prob_given_sentence_is_negative = self.calculate_sentence_probability(sentence, "negative")
print("pos prob", prob_given_sentence_is_negative)
prob_given_sentence_is_positive = self.calculate_sentence_probability(sentence, "positive")
print("neg prob", prob_given_sentence_is_positive)
if prob_given_sentence_is_positive > prob_given_sentence_is_negative:
return "positive"
elif prob_given_sentence_is_positive < prob_given_sentence_is_negative:
Expand Down
2 changes: 1 addition & 1 deletion Main.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ def do_test(dict_sentences_with_label, model):
pre_process(negative_train_set), pre_process(negative_test_set)

# create bigram model object
lambda_arr = [0.2, 0.3, 0.5] # [h0, h1, h2]
lambda_arr = [0.2, 0.4, 0.4] # [h0, h1, h2]
epsilon = 0.2
cut_down = 2
cut_above= 10
Expand Down
91 changes: 74 additions & 17 deletions UnigramModel.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,26 +12,40 @@ def get_words_array(sentence):

class UnigramModel():

def __init__(self, train_pos_set, train_neg_set, lambda_arr, epsilon, cut_down, cut_above):
    """Store training corpora and model hyper-parameters.

    Args:
        train_pos_set: iterable of positive-class training sentences.
        train_neg_set: iterable of negative-class training sentences.
        lambda_arr: interpolation weights for probability smoothing.
            NOTE(review): commented as [h0, h1] here, but
            calculate_conditional_probability unpacks three values
            [h0, h1, h2] — confirm the intended length.
        epsilon: smoothing floor used for unseen words.
        cut_down: words with count <= cut_down are pruned (do_alpha_cut).
        cut_above: the cut_above most frequent words are pruned
            (remove_from_above).
    """
    self.train_positive_set = train_pos_set
    self.train_negative_set = train_neg_set
    self.lambda_arr = lambda_arr  # [h0, h1] => weights for probability
    self.epsilon = epsilon
    # word -> occurrence count, per class; filled by create_unary_words_dict
    self.count_unary_train_pos_dict = {}
    self.count_unary_train_neg_dict = {}
    # totals of all word counts per class; filled by calculate_number_words
    self.number_words_in_neg = 0
    self.number_words_in_pos = 0
    self.cut_down = cut_down
    self.cut_above = cut_above

# cut from down: prune rare words whose count is <= cut_down
def do_alpha_cut(self):
    """Remove low-frequency words from both per-class count dictionaries."""
    # Rebuild with comprehensions instead of deleting keys while iterating
    # (the pre-commit version iterated .keys() and deleted in the same loop,
    # which raises RuntimeError in Python 3).
    self.count_unary_train_pos_dict = {
        word: count
        for word, count in self.count_unary_train_pos_dict.items()
        if count > self.cut_down
    }
    self.count_unary_train_neg_dict = {
        word: count
        for word, count in self.count_unary_train_neg_dict.items()
        if count > self.cut_down
    }

# cut from above: drop the cut_above most frequent words from both dicts
def remove_from_above(self):
    """Remove the `cut_above` highest-count words from each class dict.

    Keeps the remaining entries sorted by descending count, matching the
    original side effect of reassigning the sorted dicts.

    Bug fixed: the old loop did `del d[list(d)[i]]` for i in
    range(cut_above) — the dict shrinks after every delete while i keeps
    growing, so it removed the words at sorted positions 0, 2, 4, ...
    (every other top word) and raised IndexError when the dict held fewer
    than ~2*cut_above entries. Slicing off the head of the sorted items
    removes exactly the top cut_above words and tolerates small dicts.
    """
    ordered_pos = sorted(self.count_unary_train_pos_dict.items(),
                         key=lambda item: item[1], reverse=True)
    ordered_neg = sorted(self.count_unary_train_neg_dict.items(),
                         key=lambda item: item[1], reverse=True)
    self.count_unary_train_pos_dict = dict(ordered_pos[self.cut_above:])
    self.count_unary_train_neg_dict = dict(ordered_neg[self.cut_above:])

def create_unary_words_dict(self):
for sentence in self.train_positive_set:
sentence = get_words_array(sentence)
Expand All @@ -51,33 +65,76 @@ def create_unary_words_dict(self):
else:
self.count_unary_train_neg_dict[word] = 1

self.do_alpha_cut()
self.remove_from_above()
self.calculate_number_words() # to calculate numbers of all words




# calculate number of all words in dictionary (the M in p(w) = count(w)/M)
def calculate_number_words(self):
    """Cache the total word count of each class dictionary.

    Sets self.number_words_in_pos / self.number_words_in_neg from the
    pruned count dictionaries. Uses sum() over the values view; the
    pre-commit loop `for key, value in dict:` raised ValueError because
    iterating a dict yields keys, not (key, value) pairs.
    """
    self.number_words_in_pos = sum(self.count_unary_train_pos_dict.values())
    self.number_words_in_neg = sum(self.count_unary_train_neg_dict.values())

# calculate p(w) = count(w)/M (M: total word count of that class)
def calculate_unary_probability(self, word, dataset_mode):
    """Return the unigram probability of `word` in the given class.

    Args:
        word: the word to look up.
        dataset_mode: "positive" or "negative"; any other value yields 0.

    Returns:
        count(word)/M for the selected class, or 0 when the word is not
        in that class's dictionary (or the mode is unknown).
    """
    if dataset_mode == "positive":
        counts = self.count_unary_train_pos_dict
        total = self.number_words_in_pos
    elif dataset_mode == "negative":
        counts = self.count_unary_train_neg_dict
        total = self.number_words_in_neg
    else:
        return 0  # unknown mode: keep the original "no probability" result
    if word not in counts:
        return 0  # unseen word; avoids dividing when total could be 0
    return counts[word] / total

# interpolated smoothing: p(w2|w1) = h2*p(w2|w1) + h1*p(w2) + h0*epsilon
def calculate_conditional_probability(self, word1, word2, dataset_mode):
    """Return the smoothed conditional probability of word2 given word1."""
    h0, h1, h2 = self.lambda_arr
    bigram_term = h2 * self.calculate_simple_conditional_probability(
        word1, word2, dataset_mode)
    unigram_term = h1 * self.calculate_unary_probability(word2, dataset_mode)
    smoothing_term = h0 * self.epsilon
    return bigram_term + unigram_term + smoothing_term

def calculate_sentence_probability(self, sentence, dataset_mode):
    """Return the product of conditional probabilities over word pairs.

    Multiplies p(w_i | w_{i-1}) for every consecutive pair in the
    sentence; a sentence with fewer than two words yields 1.
    """
    words = get_words_array(sentence)
    product = 1
    for previous_word, current_word in zip(words, words[1:]):
        product *= self.calculate_conditional_probability(
            previous_word, current_word, dataset_mode)
    return product

# start learning: build the word-count dictionaries from the training sets
def learning(self):
    # Builds the unigram counts (which also applies the cut-down/cut-above
    # pruning and caches the per-class word totals).
    self.create_unary_words_dict()
    # NOTE(review): create_binary_words_dict is not defined anywhere in the
    # visible UnigramModel code — it appears copied from BigramModel.
    # Confirm it exists, otherwise this call raises AttributeError.
    self.create_binary_words_dict()

# recognize sentence is positive or negative
def recognize_sentence(self, sentence):
    """Classify a sentence by comparing its probability under each class.

    Returns:
        "positive" / "negative" for whichever class model assigns the
        higher sentence probability, or "equal" on an exact tie.

    Bugs fixed: removed the stray `get_words_array()` call (it was missing
    its required `sentence` argument, so it raised TypeError, and its
    result was never used), and swapped the debug print labels, which
    previously printed the negative probability as "pos prob" and vice
    versa.
    """
    # calculate sentence probability under each class model
    prob_given_sentence_is_negative = self.calculate_sentence_probability(sentence, "negative")
    print("neg prob", prob_given_sentence_is_negative)
    prob_given_sentence_is_positive = self.calculate_sentence_probability(sentence, "positive")
    print("pos prob", prob_given_sentence_is_positive)
    if prob_given_sentence_is_positive > prob_given_sentence_is_negative:
        return "positive"
    elif prob_given_sentence_is_positive < prob_given_sentence_is_negative:
        return "negative"
    else:
        return "equal"

0 comments on commit 50a2c69

Please sign in to comment.