# -*- coding: utf-8 -*-
"""
Author: Grace Milner
Machine learning model to classify tweets as relevant (1) or irrelevant (0) based on their text content.
The following model configurations can be changed manually by the user before each run:
(1) Classifier -> Logistic Regression or SVM
(2) Pre-processing steps (none, no lemmatisation, or full)
(3) n-gram range (unigram or unigram + bigram)
(4) Vectoriser (BoW or BoW with TFIDF weighting)
The final section provides the option to test the prediction success per R type when all R types are classified together.
The optimal configuration for each R type can be chosen and the results compared.
E.g. it can be used to compare the recall of R1 tweets when classified individually versus when classified
within the combined R123 dataset (all relevance types together).
"""
# Loading in libraries
import os
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn import metrics
# =============================================================================
# INITIAL TEXT PROCESSING
# =============================================================================
# Reading in the data
os.chdir(r"C:\VUB\Thesis\Data\Tweets\For model")  # raw string so the backslashes are not treated as escape sequences
df = pd.read_csv("R123_new.csv")
# Look at top 5 records
print(df.head())
# Look at shape of df
print(df.shape)
# A value count shows how many tweets in this dataset are classed as relevant
print(df["Relevance"].value_counts())
# Import required libraries
import spacy
import string
# string includes a list of punctuation marks
import spacy.lang.en.stop_words
# provides spaCy's list of common English stop words
# Create list of punctuation marks
punctuations = string.punctuation
# Adding TM to punctuation (emerged as an issue during testing)
punctuations += "™"
# Create list of stop words
stop_words = spacy.lang.en.stop_words.STOP_WORDS
# Loading SpaCy's English NLP model (small version, no word vectors)
nlp = spacy.load('en_core_web_sm')
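# Note (added): en_core_web_sm is not bundled with spaCy and must be installed
# once beforehand, e.g. with:  python -m spacy download en_core_web_sm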
# Create a custom tokeniser function:
# takes a sentence (or tweet) as input and splits it into tokens; the optional
# steps below can additionally remove stop words, lowercase the tokens and
# perform lemmatisation.
def spacy_tokenizer(sentence):
    # Process the sentence into a spaCy Doc object with linguistic annotations (gives insights into grammatical structure).
    # Disabling unneeded parts of the pipeline for now (parser and entity recognition).
    mytokens = nlp(sentence, disable=['parser', 'ner'])
    # Lemmatising each token if required (uncomment when needed).
    # (spaCy uses '-PRON-' as the lemma of all personal pronouns, e.g. 'I', 'me'.)
    # Converts each token to its lemmatised version, lowercased and stripped of whitespace; tokens lemmatised to '-PRON-' are dropped.
    #mytokens = [ word.lemma_.lower().strip() for word in mytokens if word.lemma_ != "-PRON-"]
    # Removing stop words and punctuation (uncomment when needed).
    # (This removes any token which contains any punctuation character, not just pure punctuation, so it also removes usernames and links.)
    #mytokens = [ word for word in mytokens if word not in stop_words and not any([x in word for x in punctuations])]
    # When the pre-processing steps above are disabled, the tokens need to be converted to strings manually:
    mytokens = [str(token) for token in mytokens]
    # Return the pre-processed list of tokens
    return mytokens
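# A sketch of the "full" pre-processing configuration described in the header,
# written as a separate function so it can be swapped in for spacy_tokenizer in
# the vectorisers below. The name spacy_tokenizer_full is an assumption (not in
# the original script); it simply runs the commented-out steps above in order,
# assuming the spaCy 2.x convention of '-PRON-' as the lemma for pronouns.
def spacy_tokenizer_full(sentence):
    mytokens = nlp(sentence, disable=['parser', 'ner'])
    # Lemmatise, lowercase and strip whitespace; tokens lemmatised to '-PRON-' are dropped.
    mytokens = [word.lemma_.lower().strip() for word in mytokens if word.lemma_ != "-PRON-"]
    # Drop stop words and any token containing punctuation (also removes usernames and links).
    mytokens = [word for word in mytokens if word not in stop_words and not any(x in word for x in punctuations)]
    return mytokens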
##### Vectorising Data #####
# Converting the data into a machine-readable format.
# Using Bag of Words (BoW) to convert text to numeric (vector) format
# (generates a matrix which records the occurrence of words within each document).
# Generating the vectoriser using CountVectorizer from scikit-learn.
# Uses our custom tokeniser, and defines the n-gram range (combinations of adjacent words).
# Creates a BoW matrix where each unique word is given a column and each text sample (tweet) a row.
# Each cell holds the number of times the word occurs in the tweet (pass binary=True for 0/1 indicators instead).
bow_vector = CountVectorizer(tokenizer = spacy_tokenizer, ngram_range = (1,1))
# Unigrams used here, so the lower and upper bounds of the n-gram range are both 1. Could change to (2, 2) to use only bigrams.
# Important to note: the n-gram range changes the extracted vocabulary (the feature set), not the corpus itself.
# Using Term Frequency-Inverse Document Frequency (TF-IDF) to weight the BoW output.
# Higher TF-IDF value --> term is more important to the document.
# If using this weighted version, it needs to replace bow_vector in the pipeline below.
tfidf_vector = TfidfVectorizer(tokenizer = spacy_tokenizer, ngram_range = (1,2)) # again, the n-gram range can be changed
# =============================================================================
# TRAIN AND TEST DATASETS
# =============================================================================
# Generating training data to train model and test dataset to test model performance
from sklearn.model_selection import train_test_split
X = df['Tweet'] # The features we want to analyse
ylabels = df['Relevance'] # The labels/target variables, relevance
# Splitting up data using sklearn
# Takes input features, labels, and test size as arguments.
# Test size gives the percentage split, e.g. 0.3 -> 70% training data and 30% test data.
# The random_state argument is optional; it seeds the internal random number generator
# which decides how the data are split. random_state=1 means each run of the code gives
# the same training/test split.
X_train, X_test, y_train, y_test = train_test_split(X, ylabels, test_size = 0.3, random_state = 1)
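# Optional alternative (added, not in the original configuration): if the
# relevance classes are imbalanced, stratify keeps the class proportions equal
# across the training and test sets.
# X_train, X_test, y_train, y_test = train_test_split(
#     X, ylabels, test_size=0.3, random_state=1, stratify=ylabels)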
# =============================================================================
# GENERATING MODEL - LOGISTIC REGRESSION
# =============================================================================
classifier_lr = LogisticRegression()
pipe_lr = Pipeline([
    ("vectorizer", tfidf_vector),
    ("classifier", classifier_lr)
])
pipe_lr.fit(X_train, y_train)
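# Optional sanity check (added): inspect a few learned features to confirm the
# tokeniser and n-gram range behave as expected. get_feature_names_out()
# requires scikit-learn >= 1.0; older versions use get_feature_names().
print(pipe_lr.named_steps["vectorizer"].get_feature_names_out()[:20])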
# Predicting on the test dataset (the lr suffix denotes logistic regression)
predicted_lr = pipe_lr.predict(X_test)
# Model Accuracy
print("Logistic Regression Accuracy:", metrics.accuracy_score(y_test, predicted_lr))
# Model Precision
print("Logistic Regression Precision:", metrics.precision_score(y_test, predicted_lr))
# Model Recall
print("Logistic Regression Recall:", metrics.recall_score(y_test, predicted_lr))
# F1 Score
print("Logistic Regression F1:", metrics.f1_score(y_test, predicted_lr))
# =============================================================================
# GENERATING MODEL - SVM
# =============================================================================
classifier_svm = svm.SVC(kernel='rbf')
pipe_svm = Pipeline([
    ("vectorizer", tfidf_vector),
    ("classifier", classifier_svm)
])
pipe_svm.fit(X_train, y_train)
# Predicting the response for test dataset
predicted_svm = pipe_svm.predict(X_test)
# Model Accuracy
print("SVM Accuracy:", metrics.accuracy_score(y_test, predicted_svm))
# Model Precision
print("SVM Precision:", metrics.precision_score(y_test, predicted_svm))
# Model Recall
print("SVM Recall:", metrics.recall_score(y_test, predicted_svm))
# F1 Score
print("SVM F1:", metrics.f1_score(y_test, predicted_svm))
# =============================================================================
# OUTPUT DATAFRAME
# =============================================================================
# Creating a new dataframe for predictions
output_df = pd.DataFrame(X_test)
output_df["Actually Relevant"] = y_test
output_df["R1"] = df.loc[X_test.index, "R1"] # used the loc function to match the indices in the input dataset
output_df["R2"] = df.loc[X_test.index, "R2"]
output_df["R3"] = df.loc[X_test.index, "R3"]
output_df["LR Predictions"] = predicted_lr
output_df["LR Correct"] = output_df["Actually Relevant"] == output_df["LR Predictions"]
output_df["SVM Predictions"] = predicted_svm
output_df["SVM Correct"] = output_df["Actually Relevant"] == output_df["SVM Predictions"]
print(output_df.head())
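# Optionally persist the predictions for inspection outside Python (added
# convenience; the output filename here is an assumption):
# output_df.to_csv("R123_predictions.csv", index=False)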
# =============================================================================
# CALCULATING METRICS (for chosen configuration)
# =============================================================================
# This section can be used to test the prediction success per R type when all R types are classified together.
# The example here tests R1-type tweets against the SVM model run as configured above.
# Filtering the output dataframe to R1 tweets and calculating metrics (focussing on the SVM results for the above configuration)
filtered_df = output_df[output_df["R1"] == 1]
# Calculate metrics using filtered data
accuracy = metrics.accuracy_score(filtered_df["Actually Relevant"], filtered_df["SVM Predictions"])
precision = metrics.precision_score(filtered_df["Actually Relevant"], filtered_df["SVM Predictions"])
recall = metrics.recall_score(filtered_df["Actually Relevant"], filtered_df["SVM Predictions"])
f1_score = metrics.f1_score(filtered_df["Actually Relevant"], filtered_df["SVM Predictions"])
print("Metrics for R1:")
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1_score)