#SA_COVID-19.R
rm(list = ls())
#Useful packages
library(tidyr)
library(tidyverse)
#Load the rtweet library
library(rtweet)
library(tm)
#These packages will be useful for later on in the analysis:
library(SentimentAnalysis)
library(syuzhet)
library(wordcloud)
library(dplyr) #Data manipulation (also included in the tidyverse package)
#install.packages("tidytext") #run once if not already installed
library(tidytext) #Text mining
#Visualizations!
library(ggplot2) #Visualizations (also included in the tidyverse package)
library(knitr) #Create nicely formatted output tables
library(circlize) #Visualizations - chord diagram
library(memery) #Memes - images with plots
library(magick) #Memes - images with plots (image_read)
library(yarrr) #Pirate plot
library(radarchart) #Visualizations
library(igraph) #ngram network diagrams
library(ggraph) #ngram network diagrams
#Authorize the Twitter API to extract tweet data; rtweet prompts for
#authorization on first use if no token has been set up (see the sketch below)
#We are going to extract English-language #COVID19 tweet data
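#A minimal token-setup sketch, assuming an app registered on the Twitter
#developer portal; every key/secret below is a placeholder, not a real credential:
#token <- create_token(
#  app = "my_covid_app", #hypothetical app name
#  consumer_key = "YOUR_CONSUMER_KEY",
#  consumer_secret = "YOUR_CONSUMER_SECRET",
#  access_token = "YOUR_ACCESS_TOKEN",
#  access_secret = "YOUR_ACCESS_SECRET"
#)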
tweets_COVID <- search_tweets("#COVID19", n = 1000, include_rts = TRUE, lang = "en") #the search_tweets() function allows us to query tweet data
head(tweets_COVID)
names(tweets_COVID)
#We need to build a corpus:
library(tm)
corpus <- iconv(tweets_COVID$text, to = "utf-8-mac") #"utf-8-mac" applies on macOS; use "UTF-8" on other platforms
corpus<-Corpus(VectorSource(corpus))
inspect(corpus[1:5])
#We need to clean this data:
corpus <- tm_map(corpus, content_transformer(tolower))
inspect(corpus[1:5])
corpus<- tm_map(corpus, removePunctuation)
inspect(corpus[1:5])
corpus<- tm_map(corpus, removeNumbers)
inspect(corpus[1:5])
cleanset <- tm_map(corpus, removeWords, stopwords('english'))
inspect(cleanset[1:5])
cleanset <- tm_map(cleanset, removeWords, c('covid', 'coronavirus', 'and', 'the'))
removeURL <- function(x) gsub('http[[:alnum:]]*', '',x)
cleanset <- tm_map(cleanset, content_transformer(removeURL))
inspect(cleanset[1:5])
cleanset <- tm_map(cleanset, stripWhitespace)
inspect(cleanset[1:10])
#Term document matrix to turn this data into structured data:
tdm <- TermDocumentMatrix(cleanset)
tdm
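#While tdm is still a sparse TermDocumentMatrix, findAssocs() can surface terms
#correlated with a word of interest; 'vaccine' is only an illustrative term that
#may or may not appear in your sample:
#findAssocs(tdm, terms = "vaccine", corlimit = 0.25)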
tdm <- as.matrix(tdm)
tdm[1:10,1:20]
#Bar plot
w <- rowSums(tdm)
w <- subset(w, w>=25)
barplot(w,
        las = 2,
        col = rainbow(50))
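#The same frequencies as a ggplot2 bar chart, a sketch built on the w vector above:
ggplot(data.frame(term = names(w), freq = w),
       aes(x = reorder(term, freq), y = freq)) +
  geom_col() +
  coord_flip() +
  labs(x = NULL, y = "Frequency")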
#Library word cloud
library(wordcloud)
library(RColorBrewer)
#Word frequency (rows of the TDM are terms, so sum across rows)
word_frequency <- sort(rowSums(tdm),
                       decreasing = TRUE)
df_frequency <- data.frame(word = names(word_frequency),
                           freq = word_frequency)
head(df_frequency)
#Simple word cloud
wordcloud(df_frequency$word,
          df_frequency$freq)
#Alternative way to produce a word cloud
sort(rowSums(tdm), decreasing = TRUE)
set.seed(222)
wordcloud(words = df_frequency$word,
          freq = df_frequency$freq,
          max.words = 150)
wordcloud(df_frequency$word,
          df_frequency$freq,
          max.words = 10, min.freq = 1,
          random.order = FALSE,
          family = "Helvetica", font = 3)
library(wordcloud2)
#Letter cloud:
letterCloud(df_frequency,
            word = "SA",
            size = 1)
library(topicmodels)
dtm_tweets <- t(tdm) #LDA() expects documents as rows, so transpose the TDM
dtm_tweets <- dtm_tweets[rowSums(dtm_tweets) > 0, ] #drop documents emptied by cleaning
#Create a topic model with 5 topics
topicmodl_5 <- LDA(dtm_tweets, k = 5)
#Select and view the top 10 terms in the topic model
top_10terms <- terms(topicmodl_5,10)
top_10terms
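#topics() returns the most likely topic for each document; a quick look at how
#the documents spread across the 5 topics:
table(topics(topicmodl_5))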
#Create a topic model with 4 topics
topicmodl_4 <- LDA(dtm_tweets, k = 4)
#Select and view the top 6 terms in the topic model
top_6terms <- terms(topicmodl_4, 6)
top_6terms
#Sentiment Analysis:
library(syuzhet)
library(lubridate)
library(ggplot2)
library(scales)
library(dplyr)
library(reshape2)
tweets <- iconv(tweets_COVID$text, to = 'utf-8-mac')
#Obtain sentiment scores:
sa.value <- get_nrc_sentiment(tweets)
#Provide score for every emotion:
view(sa.value)
get_nrc_sentiment('negative')
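#get_nrc_sentiment() accepts any character vector; a tiny made-up example:
get_nrc_sentiment("the outbreak is scary but the recovery news is hopeful")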
barplot(colSums(sa.value),
        las = 2,
        col = rainbow(10),
        ylab = 'Count',
        main = 'Sentiment Scores for #COVID19 Tweets')
#Load the qdapRegex library:
library(qdapRegex)
#Extract tweet text from the pre-loaded dataset
twt_txt <- tweets_COVID$text
head(twt_txt)
#Remove URLs from the tweet text and view the output
twt_txt_url <- rm_twitter_url(twt_txt)
head(twt_txt_url)
#Replace special characters, punctuation, & numbers with spaces
twt_txt_chrs <- gsub("[^A-Za-z]"," " , twt_txt_url)
#View text after replacing special characters, punctuation, & numbers
head(twt_txt_chrs)
#We can see that URLs have been removed and special characters, punctuation, & numbers have been replaced with spaces in the text
#Convert the cleaned text in "twt_gsub" to a text corpus and view the output
twt_gsub <- twt_txt_chrs
head(twt_gsub)
#install.packages("tm")
library(tm)
twt_corpus <- twt_gsub %>%
VectorSource() %>%
Corpus()
head(twt_corpus$content)
#Convert the corpus to lowercase
twt_corpus_lwr <- tm_map(twt_corpus, content_transformer(tolower))
#View the corpus after converting to lowercase
head(twt_corpus_lwr$content)
twt_corpus <- tm_map(twt_corpus_lwr, removeWords, stopwords('english'))
inspect(twt_corpus[1:5])
#Removing custom stop words:
twt_corpus <- tm_map(twt_corpus, removeWords, c('the', 'this', 'can'))
#install.packages("qdap", INSTALL_opts = "--no-multiarch")
library(RColorBrewer)
library(wordcloud)
#Plot a word cloud of the top 60 terms
wordcloud(twt_corpus, max.words = 60,
          random.order = FALSE, colors = brewer.pal(8, "Dark2"))
#Creating a document term matrix (DTM) from the tweet corpus above
dtm_COVID <- DocumentTermMatrix(twt_corpus)
dtm_COVID
#Let's find the sum of word counts in each document
rowTotals <- apply(dtm_COVID, 1, sum)
head(rowTotals)
#We then need to select rows with a row total greater than zero
dtm_COV_new <- dtm_COVID[rowTotals > 0, ]
dtm_COV_new
#install.packages("topicmodels")
library(topicmodels)
#Create a topic model with 5 topics
topicmodl_5 <- LDA(dtm_COV_new, k = 5)
#Select and view the top 10 terms in the topic model
top_10terms <- terms(topicmodl_5,10)
top_10terms
#Create a topic model with 4 topics
topicmodl_4 <- LDA(dtm_COV_new, k = 4)
#Select and view the top 6 terms in the topic model
top_6terms <- terms(topicmodl_4, 6)
top_6terms
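#To compare the two models, perplexity() gives a rough goodness-of-fit measure
#(lower is better); a sketch evaluated on the same DTM the models were fit to:
perplexity(topicmodl_5, dtm_COV_new)
perplexity(topicmodl_4, dtm_COV_new)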
#Sentiment Analysis
library(syuzhet)
sa.value <- get_nrc_sentiment(tweets_COVID$text)
view(sa.value)
#Sum the sentiment scores across all tweets
score <- colSums(sa.value)
#Convert to a data frame
score_df <- data.frame(score)
score_df
#Convert the row names into a 'sentiment' column:
sa.score <- cbind(sentiment = row.names(score_df),
                  score_df, row.names = NULL)
print(sa.score)
#SA plot
ggplot(data = sa.score, aes(x = sentiment, y = score, fill = sentiment)) +
  geom_bar(stat = "identity") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
sc_name <- table(tweets_COVID$screen_name)
head(sc_name)
# Sort the table in descending order of tweet counts
sc_name_sort <- sort(sc_name, decreasing = TRUE)
# View top 6 users and tweet frequencies
head(sc_name_sort)
names(sc_name_sort) #Look for unusual or suspicious twitter handles (possible bots?)
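#A quick visual of the ten most active accounts, sketched from sc_name_sort:
barplot(head(sc_name_sort, 10), las = 2, ylab = 'Tweet count')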
# Create a data frame of tweet text and retweet counts
rtwt <- tweets_COVID[,c("retweet_count","text")]
# Sort data frame based on descending order of retweet counts
rtwt_sort <- arrange(rtwt, desc(retweet_count))
#install.packages("data.table")
library(data.table)
#data.table's unique() removes duplicate tweets from the data frame, keyed on the text column
rtwt_unique <- unique(as.data.table(rtwt_sort), by = "text")
rtwt_unique
#Most popular tweets:
names(rtwt_unique)
head(rtwt_unique)
tweets_COV19<-search_tweets("#COVID19", n = 1000, include_rts = TRUE, lang = "en")
# Clean the data
text <- str_c(tweets_COV19$text, collapse = " ") #collapse with a space so adjacent tweets don't merge into one word
library(qdapRegex)
# continue cleaning the text
text <- text %>%
  rm_twitter_url() %>%                  # Remove URLs
  rm_url() %>%
  str_remove_all("#\\S+") %>%           # Remove any hashtags
  str_remove_all("@\\S+") %>%           # Remove any @ mentions
  removeWords(stopwords("english")) %>% # Remove common words (a, the, it etc.)
  removeNumbers() %>%
  stripWhitespace() %>%
  removeWords(c("amp"))                 # Remove 'amp' left over from HTML-escaped ampersands
# Convert the data into a summary table
textCorpus <-
  Corpus(VectorSource(text)) %>%
  TermDocumentMatrix() %>%
  as.matrix()
textCorpus <- sort(rowSums(textCorpus), decreasing = TRUE)
textCorpus <- data.frame(word = names(textCorpus), freq = textCorpus, row.names = NULL)
head(textCorpus)
# build wordcloud
wordcloud <- wordcloud2(data = textCorpus, minRotation = 0, maxRotation = 0, ellipticity = 0.6)
wordcloud
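#wordcloud2 returns an htmlwidget, so it can be saved as a standalone page,
#assuming the htmlwidgets package is installed (the filename is arbitrary):
#htmlwidgets::saveWidget(wordcloud, "covid_wordcloud.html", selfcontained = FALSE)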