-
Notifications
You must be signed in to change notification settings - Fork 0
/
FakeNewsClassifierUsingLSTM.py
137 lines (111 loc) · 4.22 KB
/
FakeNewsClassifierUsingLSTM.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
import re
import numpy as np
import pandas as pd
import tensorflow as tf
from keras.preprocessing.text import one_hot
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding
from keras.layers import LSTM
from keras.layers import Dense
from keras.layers import Dropout
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
# Log the TensorFlow version in use.
print(tf.__version__)
"""Dataset: https://www.kaggle.com/c/fake-news/data"""
# Using Word Embedding Techniques with LSTM
"""Steps:
1. Dataset
2. Independent and Dependent Features
3. Cleaning the Data i) Stemming, ii) Stopwords
4. Fix a sentence length to fix the input
5. One Hot representation, Embedding Layer
6. LSTM Neural Network"""
# Load the training data and print a quick overview: the first rows, the
# shape, and the number of missing values per column.
df = pd.read_csv('Datasets/fake-news/train.csv')
print(df.head())
print(df.shape)
print(df.isnull().sum())
# Drop every row that has a missing value in any column. dropna() keeps the
# original row labels, so the index may contain gaps from here on.
df = df.dropna()
# Independent features: every column except the target.
X = df.drop('label', axis=1)
# Dependent feature: the target column.
y = df['label']
print(X.shape, y.shape)
# Vocabulary size
vocab_size = 5000
"""OneHot Representation"""
# Work on a copy so that later in-place changes to `messages` leave X alone.
messages = X.copy()
# Show the first remaining title. Positional access is required here:
# messages['title'][0] is a *label* lookup and raises KeyError whenever the
# row labelled 0 was removed by dropna() above.
print(messages['title'].iloc[0])
# Renumber the rows 0..n-1. reset_index() is called without drop=True, so the
# previous labels are kept as a regular column.
messages.reset_index(inplace=True)
# Dataset Preprocessing
stemmer = PorterStemmer()
# Build the stop-word lookup once. Evaluating stopwords.words('english')
# inside the comprehension repeated that call for every single word, and a
# set gives constant-time membership tests instead of a linear list scan.
english_stopwords = set(stopwords.words('english'))
# Matches every character that is not an ASCII letter.
non_letters = re.compile('[^a-zA-Z]')
corpus = []
for title in messages['title']:
    # Replace non-letters with spaces, lowercase, and split on whitespace.
    tokens = non_letters.sub(' ', title).lower().split()
    # Drop stop words, stem what is left, and rejoin into a single string.
    review = ' '.join(
        stemmer.stem(word) for word in tokens if word not in english_stopwords
    )
    corpus.append(review)
print(corpus[0])
# Encode every cleaned title as a list of integer word indices, using
# vocab_size as the size of the hashing space.
# NOTE(review): per the Keras docs, one_hot uses Python's built-in hash by
# default, which is not stable between interpreter runs - confirm whether
# reproducible indices matter for this experiment.
one_hot_rep = [one_hot(sentence, vocab_size) for sentence in corpus]
# print(one_hot_rep)
print(one_hot_rep[0])
"""Embedding Representation"""
# Fixed input length for the network; shorter sequences are padded at the
# front ('pre') up to this length.
sent_length = 20
embedded_docs = pad_sequences(
    one_hot_rep,
    maxlen=sent_length,
    padding='pre',
)
print(embedded_docs)
print(embedded_docs[0])
print(len(embedded_docs), y.shape)
# Features and labels as plain NumPy arrays.
X_final = np.array(embedded_docs)
y_final = np.array(y)
print(X_final.shape, y_final.shape)
# Train Test Split: hold out 33% of the rows, with a fixed random seed.
X_train, X_test, y_train, y_test = train_test_split(
    X_final,
    y_final,
    random_state=42,
    test_size=0.33,
)
"""Creating model"""
# Length of the vector the Embedding layer produces for each word index.
embedding_vector_features = 40  # features representation
# Baseline network: Embedding -> LSTM(100) -> single sigmoid unit.
model = Sequential()
model.add(Embedding(vocab_size, embedding_vector_features, input_length=sent_length))
model.add(LSTM(100))
# One sigmoid output; it is thresholded below to obtain a binary label.
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the table itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the output.
model.summary()
"""Model Training"""
# NOTE(review): the held-out test split is also passed as validation data, so
# the validation metrics shown during training are not from a separate set.
model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=10, batch_size=64)
"""Performance Metrics And Accuracy"""
y_pred = model.predict(X_test)
# Threshold the sigmoid outputs at 0.5 to get 0/1 predictions. (A multi-class
# model would use np.argmax(y_pred, axis=-1) instead.)
y_prediction = (y_pred > 0.5).astype("int32")
matrix = confusion_matrix(y_test, y_prediction)
scores = accuracy_score(y_test, y_prediction)
print(matrix)
print(scores)
"""Adding Dropout"""
# Creating model: Embedding -> Dropout(0.3) -> LSTM(100) -> Dropout(0.3) ->
# single sigmoid unit.
embedding_vector_features = 40
model = Sequential()
model.add(Embedding(vocab_size, embedding_vector_features, input_length=sent_length))
model.add(Dropout(0.3))
model.add(LSTM(100))
model.add(Dropout(0.3))
# One sigmoid output; it is thresholded below to obtain a binary label.
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the table itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the output.
model.summary()
# NOTE(review): the held-out test split is also passed as validation data, so
# the validation metrics shown during training are not from a separate set.
model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=10, batch_size=64)
# Performance Metrics And Accuracy
y_pred = model.predict(X_test)
# Threshold the sigmoid outputs at 0.5 to get 0/1 predictions. (A multi-class
# model would use np.argmax(y_pred, axis=-1) instead; a different cut-off
# such as 0.6 can be tried here when tuning against the ROC curve.)
y_prediction = (y_pred > 0.5).astype("int32")
matrix = confusion_matrix(y_test, y_prediction)
print(matrix)
scores = accuracy_score(y_test, y_prediction)
print('Accuracy:', scores)
report = classification_report(y_test, y_prediction)
print(report)