# lstm.py

# -*- coding: utf-8 -*-
"""LSTM

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1TQrXnkFU9cCVjdtX49DU1cNqcAmVUCFv
"""

!pip install contractions
!pip install lime

import pandas as pd
from torch.utils.data import Dataset
import contractions
import re
import string
from sklearn.model_selection import train_test_split
import pandas as pd
import os
import pickle
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences
def text_preprocessing(df):
    """Clean the raw ``text`` column and store the result in ``new_text``.

    Each text goes through these steps in order: contractions and slang are
    expanded with ``contractions.fix``, the text is lower-cased, every
    character of ``string.punctuation`` is removed, every token containing a
    digit is removed, and runs of spaces are collapsed to a single space.

    :param df: raw dataframe with a ``text`` column of strings; it is modified
        in place (the ``new_text`` column is added or overwritten)
    :return: the same dataframe object, now carrying ``new_text``
    """
    # Compile the patterns once per call instead of rebuilding them per row.
    punctuation_re = re.compile('[%s]' % re.escape(string.punctuation))
    digit_token_re = re.compile(r'\w*\d\w*')
    space_run_re = re.compile(' +')

    def _clean(text):
        # Expand contractions before stripping punctuation: the apostrophes
        # are still present at this point.
        text = contractions.fix(text, slang=True).lower()
        text = punctuation_re.sub('', text)
        text = digit_token_re.sub('', text)
        return space_run_re.sub(' ', text)

    df['new_text'] = df['text'].apply(_clean)
    return df

"""Preprocess the original data. Note that the data files must be placed in the same folder as this notebook."""

# Integer class id for each sentiment label.
label_dict = {'negative': 0, 'neutral': 1, 'positive': 2}

# Both paths are relative to the working directory; adjust them to wherever
# the data files are stored.
texts_df = pd.read_csv("train_data.csv")
labels_df = pd.read_csv("train_results.csv")

# Translate every textual target into its integer class id.
texts_df['label'] = labels_df['target'].apply(lambda target: label_dict[target])
texts_df_new = text_preprocessing(texts_df)

# One indicator column per class id present in the data.
label_Y = pd.get_dummies(texts_df_new['label']).values

# Hold out 10% of the samples; the fixed seed keeps the split reproducible.
X_train_ori, X_test_ori, Y_train, Y_test = train_test_split(
    texts_df_new['new_text'],
    label_Y,
    test_size=0.10,
    random_state=42,
)

"""Tokenize the texts into integer sequences and pad each sequence to a length of 100"""

# Vocabulary cap handed to the tokenizer as ``num_words``.
MAX_NB_WORDS = 50000
# Length passed to pad_sequences as ``maxlen`` for every token sequence.
MAX_SEQUENCE_LENGTH = 100
# Size of each word-embedding vector.
EMBEDDING_DIM = 100

# Raw string: the filter set contains a literal backslash, and "\]" in a
# normal string literal is an invalid escape sequence. The resulting
# characters are unchanged.
tokenizer = Tokenizer(
    num_words=MAX_NB_WORDS,
    filters=r'!"#$%&()*+,-./:;<=>?@[\]^_`{|}~',
    lower=True,
)
# The vocabulary is learned from the training split only.
tokenizer.fit_on_texts(X_train_ori.values)
word_index = tokenizer.word_index

# Keep the raw texts as plain arrays; they are reused later for explanations.
X_train_ori = X_train_ori.values
X_test_ori = X_test_ori.values

# Encode each text as a sequence of token ids, then bring every sequence to
# MAX_SEQUENCE_LENGTH.
X_train = pad_sequences(tokenizer.texts_to_sequences(X_train_ori), maxlen=MAX_SEQUENCE_LENGTH)
X_test = pad_sequences(tokenizer.texts_to_sequences(X_test_ori), maxlen=MAX_SEQUENCE_LENGTH)

print('Found %s unique tokens.' % len(word_index))
print(X_train.shape, Y_train.shape)
print(X_test.shape, Y_test.shape)

"""Construct the LSTM model"""

from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from keras.optimizers import Adam
from keras.losses import CategoricalCrossentropy
from keras.metrics import CategoricalAccuracy

# Embedding -> spatial dropout -> LSTM -> softmax over the 3 classes.
model = Sequential([
    Embedding(MAX_NB_WORDS, EMBEDDING_DIM, input_length=X_train.shape[1]),
    SpatialDropout1D(0.2),
    LSTM(100, dropout=0.2, recurrent_dropout=0.2),
    # One output unit per sentiment class.
    Dense(3, activation='softmax'),
])
model.compile(
    optimizer=Adam(learning_rate=1e-3),
    loss=CategoricalCrossentropy(),
    metrics=[CategoricalAccuracy()],
)
# summary() prints the table itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the output.
model.summary()

"""Train the model, evaluating on the validation set after every epoch"""

epochs = 4
batch_size = 3200
from keras.callbacks import EarlyStopping

# The held-out split doubles as the validation data monitored during training.
# NOTE(review): with patience=3 and only 4 epochs, early stopping can hardly
# shorten the run - confirm these values are intended.
# NOTE(review): Keras documents use_multiprocessing as applying to
# generator / Sequence inputs; it presumably has no effect on in-memory
# arrays - confirm before relying on it.
history = model.fit(
    X_train,
    Y_train,
    epochs=epochs,
    batch_size=batch_size,
    validation_data=(X_test, Y_test),
    use_multiprocessing=True,
    callbacks=[
        EarlyStopping(monitor='val_loss', patience=3, min_delta=0.0001),
    ],
)

# The first two entries of the evaluation result are reported as loss and accuracy.
accr = model.evaluate(X_test, Y_test)
print('Test set\n  Loss: {:0.3f}\n  Accuracy: {:0.3f}'.format(*accr[:2]))

"""Save the trained model"""

# Persist with pickle because the loading code later in this script uses
# pickle.load; the context manager closes the file handle once it is written.
with open('LSTM_trained_model.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)

"""Use LIME to explain the model's predictions"""

from keras.engine.training import data_adapter
from sklearn.pipeline import Pipeline
# Only unpickle files this script wrote itself: pickle.load can execute
# arbitrary code from an untrusted file. The context manager closes the
# handle after reading.
with open('LSTM_trained_model.pkl', 'rb') as model_file:
    loaded_model = pickle.load(model_file)
class PipeStep(object):
    """Pipeline step that turns raw texts into padded token-id sequences.

    Fitting is a no-op: ``fit`` returns the step unchanged and ``transform``
    delegates to the two wrapped objects.

    :param func1: tokenizer-like object exposing ``texts_to_sequences(texts)``
    :param func2: padding callable invoked as ``func2(sequences, maxlen=...)``
    :param maxlen: sequence length passed to ``func2``; when ``None`` the
        module-level ``MAX_SEQUENCE_LENGTH`` is used
    """

    def __init__(self, func1, func2, maxlen=None):
        self.func1 = func1
        self.func2 = func2
        self.maxlen = maxlen

    def fit(self, *args, **kwargs):
        """Do nothing and return ``self``; accepts and ignores any arguments."""
        return self

    def transform(self, X):
        """Tokenize ``X`` with ``func1`` and pad the result with ``func2``.

        :param X: iterable of raw text strings
        :return: whatever ``func2`` returns for the tokenized sequences
        """
        # The global is looked up only when no explicit length was given, so
        # the step also works where MAX_SEQUENCE_LENGTH is not defined.
        maxlen = MAX_SEQUENCE_LENGTH if self.maxlen is None else self.maxlen
        return self.func2(self.func1.texts_to_sequences(X), maxlen=maxlen)

# Chain text -> padded token ids -> model so that raw strings can be scored
# with a single predict() call.
tokenizer_step = PipeStep(tokenizer, pad_sequences)
pipeline_LSTM = Pipeline(steps=[
    ("tokenize", tokenizer_step),
    ("model", loaded_model),
])

# Smoke test: score the first held-out text.
print(pipeline_LSTM.predict([X_test_ori[0]]))

from lime.lime_text import LimeTextExplainer
# class_names = ['negative', 'neutral', 'positive']
# label_dict is declared in class-id order, so position i holds the name of class i.
class_names = list(label_dict)
print(class_names)
explainer = LimeTextExplainer(class_names=class_names)
idx = 1
sample_text = X_test_ori[idx]
exp = explainer.explain_instance(
    sample_text,
    pipeline_LSTM.predict,
    num_features=6,
    labels=[0, 2],
)
print('Document id: %d' % idx)

# List the words LIME weights most heavily for classes 0 and 2.
print("original text:", sample_text)
print('Explanation for class %s' % class_names[0])
print('\n'.join(map(str, exp.as_list(label=0))))
print()
print('Explanation for class %s' % class_names[2])
print('\n'.join(map(str, exp.as_list(label=2))))

# Render the highlighted text in the notebook, class 2 first and then class 0.
for label in (2, 0):
    exp.show_in_notebook(text=sample_text, labels=(label,))

"""Use the trained model to predict the test data"""

import numpy as np
import pickle
def test_lstm(test_data_path, model_path, output_path='results.csv'):
    """Predict a class for every text in a CSV file and save the predictions.

    The texts are cleaned with ``text_preprocessing``, encoded with the
    module-level ``tokenizer`` and ``MAX_SEQUENCE_LENGTH``, and classified by
    the model unpickled from ``model_path``.

    :param test_data_path: path of a CSV file with a ``text`` column
    :param model_path: path of the pickled trained model
    :param output_path: destination CSV file; defaults to ``results.csv``
    :return: None; the predictions are written to ``output_path`` with the
        columns ``id`` (row position) and ``target`` (predicted class id)
    """
    # NOTE: pickle.load can execute arbitrary code - only load trusted files.
    with open(model_path, 'rb') as model_file:
        loaded_model = pickle.load(model_file)

    texts_df = text_preprocessing(pd.read_csv(test_data_path))
    test_data = texts_df['new_text'].values

    # Score with the model loaded from model_path rather than the global
    # pipeline_LSTM, so that the model_path argument actually takes effect.
    sequences = pad_sequences(
        tokenizer.texts_to_sequences(test_data),
        maxlen=MAX_SEQUENCE_LENGTH,
    )
    pred = loaded_model.predict(sequences)
    pred_label = np.argmax(pred, axis=1)

    df = pd.DataFrame({'target': pred_label})
    df['id'] = df.index
    df.to_csv(output_path, columns=['id', 'target'], index=False)

# Score test_data.csv with the pickled model; test_lstm writes the predictions to results.csv.
test_lstm("test_data.csv",'LSTM_trained_model.pkl')