# svm_with_word2vec.py

# -*- coding: utf-8 -*-
"""SVM+word2vec.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1CRasmyaAHAne-E8fBj4r2BK1SCVW8p7W
"""

# Notebook-only shell magic, kept as a comment because a leading `!` is a
# SyntaxError when this file is run as a plain Python script.
# Install the dependency from a shell before running:
#     pip install contractions

import pandas as pd
from torch.utils.data import Dataset

import contractions
import re
import string
import numpy as np
from gensim.models import Word2Vec
def text_preprocessing(df):
    """Derive a cleaned ``new_text`` column from the raw ``text`` column.

    Each text is passed through ``contractions.fix`` (with ``slang=True``),
    lowercased, stripped of punctuation, stripped of every token that
    contains a digit, and finally has runs of spaces collapsed to one space.

    :param df: raw dataframe with a ``text`` column; it is modified in place
    :return: the same dataframe, now carrying the ``new_text`` column
    """
    # (pattern, replacement) pairs, applied in exactly this order.
    substitutions = (
        (re.compile('[%s]' % re.escape(string.punctuation)), ''),
        (re.compile(r'\w*\d\w*'), ''),
        (re.compile(' +'), ' '),
    )

    cleaned = df['text'].apply(lambda raw: contractions.fix(raw, slang=True))
    cleaned = cleaned.str.lower()
    for pattern, replacement in substitutions:
        cleaned = cleaned.apply(
            lambda text, p=pattern, r=replacement: p.sub(r, text)
        )

    df['new_text'] = cleaned
    return df

"""Preprocess the original data. Note that the data files must be placed in the same folder as this notebook."""

# Integer class id for each sentiment target found in the results file.
label_dict = {'negative': 0, 'neutral': 1, 'positive': 2}

# Both CSV files are read from the working directory; adjust the paths if the
# data files live elsewhere.
texts_df = pd.read_csv("train_data.csv")
labels_df = pd.read_csv("train_results.csv")

# Translate every textual target into its class id (unknown targets raise KeyError).
texts_df['label'] = labels_df['target'].map(lambda target: label_dict[target])
texts_df_new = text_preprocessing(texts_df)

# NOTE: text_preprocessing removes every punctuation character, '.' included,
# so this split leaves each cleaned text as a one-element list.
tmp_corpus = texts_df_new['new_text'].map(lambda text: text.split('.'))

"""Construct the corpus using the existing data"""

from tqdm import tqdm
# Flatten the per-document sentence lists into a single list of token lists.
# Sentences that split into no tokens are still appended (as empty lists).
corpus = []
for sentences in tqdm(tmp_corpus):
    corpus.extend(sentence.split() for sentence in sentences)

# Basic corpus statistics.
num_of_sentences = len(corpus)
num_of_words = sum(len(tokens) for tokens in corpus)

print('Num of sentences - %s'%(num_of_sentences))
print('Num of words - %s'%(num_of_words))

"""Training our own word2vec model"""

# Hyper-parameters forwarded to the Word2Vec constructor further down.
# Forwarded as `size=`.
size = 100
# Forwarded as `window=`.
window_size = 2 # sentences weren't too long, so (original note is truncated; presumably a small window suffices)
# Forwarded as `iter=`.
epochs = 5
# Forwarded as `min_count=`.
min_count = 2
# Forwarded as `workers=`.
workers = 4
import random
# shuffle corpus
def shuffle_corpus(sentences):
    """Return a new list with the items of ``sentences`` in random order.

    The input itself is left untouched; only the copy is shuffled.

    :param sentences: iterable of sentences
    :return: shuffled list containing the same items
    """
    reordered = [*sentences]
    random.shuffle(reordered)
    return reordered
# Build the gensim Word2Vec model from the corpus with the hyper-parameters
# defined above.
# NOTE(review): `size=` and `iter=` are the gensim 3.x keyword names; gensim 4
# renamed them to `vector_size=` and `epochs=` - confirm the installed version.
model = Word2Vec(
    corpus,
    sg=1,
    window=window_size,
    size=size,
    min_count=min_count,
    workers=workers,
    iter=epochs,
    sample=0.01,
)

# Update the vocabulary from one reshuffled copy of the corpus ...
model.build_vocab(
    sentences=shuffle_corpus(corpus),
    update=True,
)

# ... then run two more epochs over a second, independently reshuffled copy.
model.train(
    sentences=shuffle_corpus(corpus),
    epochs=2,
    total_examples=model.corpus_count,
)

"""Save the word2vec model"""

# Persist the trained model under the relative path 'w2v_model_100'.
model.save('w2v_model_100')
# To reuse the saved model instead of retraining, load it back with:
# from gensim.models import Word2Vec
# model = Word2Vec.load('w2v_model_100')

"""Use the word2vec model to convert our sentences into vectors"""

def FunctionText2Vec(corpus, w2v_model=None):
    """Turn each tokenized sentence into one vector by summing its word vectors.

    :param corpus: iterable of sentences, each an iterable of word tokens
    :param w2v_model: trained Word2Vec model used for the lookups; when
        omitted, the module-level ``model`` is used
    :return: list with one entry per sentence, in input order; each entry is
        a list of floats whose length is the model's vector size. Words the
        model does not know are skipped, so a sentence without any known
        word yields an all-zero vector.
    """
    if w2v_model is None:
        w2v_model = model
    keyed_vectors = w2v_model.wv
    # Take the dimensionality from the model instead of hard-coding 100, so
    # the function keeps working when the embedding size is changed.
    dimension = keyed_vectors.vector_size

    vector_Data = []
    for sentence in corpus:
        # Start from zeros and accumulate the vectors of the known words.
        vector_sentence = np.zeros(dimension)
        for word in sentence:
            # Membership is tested on the keyed vectors themselves: this works
            # on gensim 3.x and 4.x, whereas `.vocab` no longer exists in 4.x.
            if word in keyed_vectors:
                vector_sentence += keyed_vectors[word]
        vector_Data.append(vector_sentence.tolist())

    return vector_Data
# Vectorize every token list in `corpus`; all_data[i] is the vector for corpus[i].
all_data = FunctionText2Vec(corpus)

import pickle
def save_model(clf, filename='trained_model_100.pkl'):
    """Pickle ``clf`` to ``filename``.

    :param clf: object to serialize (e.g. a fitted classifier)
    :param filename: path of the output file; an existing file is overwritten
    :return: None
    """
    # The context manager guarantees the handle is flushed and closed, even
    # if pickling raises part-way through.
    with open(filename, 'wb') as model_file:
        pickle.dump(clf, model_file)

"""Train the new SVM on the vectors generated by the word2vec model"""

from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
def train_svm(Train_X, Train_Y, Test_X, Test_Y):
    """Fit a linear SVM, print its accuracy on the test split, and pickle it.

    :param Train_X: feature vectors used for fitting
    :param Train_Y: labels aligned with ``Train_X``
    :param Test_X: feature vectors used for evaluation
    :param Test_Y: labels aligned with ``Test_X``
    :return: None; the fitted pipeline is written to
        'svm_trained_model_trigram_100.pkl' through ``save_model``
    """
    # The classifier is the pipeline's only stage; the count-vectorizer and
    # TF-IDF stages are not part of it.
    svm_step = LinearSVC(loss='hinge', C=1.0, class_weight='balanced')
    classifier = Pipeline([('clf-svm', svm_step)])
    classifier.fit(Train_X, Train_Y)

    # Accuracy is the fraction of test predictions that equal the true label.
    predictions = classifier.predict(Test_X)
    accuracy = np.mean(predictions == Test_Y)
    print(f'Test accurary of SVM model is: {accuracy}')

    save_model(classifier, 'svm_trained_model_trigram_100.pkl')

# Positional 90/10 split: the first 90% of the samples train the model, the
# remaining 10% evaluate it (no shuffling is done here).
train_data_len = int(len(all_data) * 0.9)
print(train_data_len)
print(texts_df_new.columns)

# Pick the labels by column name instead of by position (`iloc[:, 2]`), so the
# split cannot silently grab another column when the CSV layout differs.
# NOTE: slicing all_data and the labels at the same offset assumes all_data
# holds exactly one vector per dataframe row, in row order.
labels = texts_df_new['label']
Train_X, Test_X = all_data[:train_data_len], all_data[train_data_len:]
Train_Y, Test_Y = labels.iloc[:train_data_len], labels.iloc[train_data_len:]
train_svm(Train_X, Train_Y, Test_X, Test_Y)