# cooc_analysis.R
options(stringsAsFactors = FALSE)
library(quanteda)
library(dplyr)
library(stringr)
# Note: adjust the path to match your local directory structure!
textdata <- read.csv("data_git/all_articles2.csv", sep = ";")
colnames(textdata) <- c('doc_id', 'id', 'url', 'date', 'title', 'text')
# Keep only articles that mention corona, covid or sars-cov-2 in the text or
# title, and drop articles with an empty text
keep_idx <- c()
for (i in 1:nrow(textdata)) {
  word_counts <- sum(str_count(tolower(textdata$text[i]), c("corona", "covid", "sars-cov-2")))
  word_counts_title <- sum(str_count(tolower(textdata$title[i]), c("corona", "covid", "sars-cov-2")))
  if (((word_counts + word_counts_title) < 1) | (textdata$text[i] == "")) {
    keep_idx <- append(keep_idx, FALSE)
  } else {
    keep_idx <- append(keep_idx, TRUE)
  }
}
textdata_new <- textdata[keep_idx, ]
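# A vectorized alternative to the loop above (a sketch; str_detect() tests for
# presence rather than counting, which is equivalent here since we only check
# whether the total count is at least 1):
# keep_idx <- (str_detect(tolower(textdata$text), "corona|covid|sars-cov-2") |
#                str_detect(tolower(textdata$title), "corona|covid|sars-cov-2")) &
#   textdata$text != ""
# textdata_new <- textdata[keep_idx, ]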
# Optional: run the co-occurrence analysis on a subset of the data only,
# e.g. articles from 12/2019 - 12/2020 or from 01/2021 - 03/2022
#subset_textdata <- textdata_new %>% filter(date >= "2019-12-01" & date < "2021-01-01")
#subset_textdata <- textdata_new %>% filter(date >= "2021-01-01" & date < "2022-04-01")
# alternatively: subset_textdata <- textdata[which(textdata$date > "2019-12-01" & textdata$date < "2020-07-01"), ]
# Create corpus from subset_textdata
#guardian_corpus <- corpus(subset_textdata$text, docnames = subset_textdata$doc_id,
#docvars = data.frame(year = substr(subset_textdata$date,0,4)))
# Create corpus
guardian_corpus <- corpus(textdata_new$text, docnames = textdata_new$doc_id,
                          docvars = data.frame(year = substr(textdata_new$date, 1, 4)))
# separate sentences
corpus_sentences <- corpus_reshape(guardian_corpus, to = "sentences")
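# Optional check: each article should expand into many sentence-level
# documents; ndoc() counts the documents in a quanteda corpus
# ndoc(guardian_corpus)
# ndoc(corpus_sentences)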
# Build a dictionary of lemmas
lemma_data <- read.csv("resources/baseform_en.tsv", encoding = "UTF-8")
# read an extended stop word list
stopwords_extended <- readLines("resources/stopwords_en.txt", encoding = "UTF-8")
# Preprocessing of the corpus of sentences
corpus_tokens <- corpus_sentences %>%
  tokens(remove_punct = TRUE, remove_numbers = TRUE, remove_symbols = TRUE) %>% # remove_separators = TRUE is the default
  tokens_tolower() %>%
  tokens_replace(lemma_data$inflected_form, lemma_data$lemma,
                 valuetype = "fixed") %>%
  tokens_remove(pattern = stopwords_extended, padding = TRUE) %>%
  tokens_remove("")
minimumFrequency <- 50
# Create DTM, prune vocabulary and set binary values for
# presence/absence of types
binDTM <- corpus_tokens %>%
dfm() %>%
dfm_trim(min_docfreq = minimumFrequency) %>%
dfm_weight("boolean")
# memory.limit(48000) # Windows-only; a no-op in recent R versions
# Due to vocabulary pruning, the binDTM may contain empty rows (sentences
# without any remaining terms), so we remove those sentence documents.
# Note that binDTM rows are sentences, not articles, so this selection
# cannot be applied to the article-level metadata.
sel_idx <- rowSums(binDTM) > 0
binDTM <- binDTM[sel_idx, ]
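# Optional check: how many empty sentence rows were dropped, and the size of
# the remaining DTM
# sum(!sel_idx)
# dim(binDTM)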
# Display the most frequent terms
topfeatures(binDTM, 100, decreasing = TRUE)
# Delete noisy terms from the DTM: terms that contain "â", "œ", "@" or "."
# (mostly encoding artifacts, handles and abbreviations)
terms_to_delete <- colnames(binDTM)[grepl("â|œ|@|\\.", colnames(binDTM))]
binDTM <- binDTM[,!(colnames(binDTM) %in% terms_to_delete)]
# Read in the source code for the co-occurrence calculation
source("calculateCoocStatistics.R")
# Number of co-occurrences to display for a concept
numberOfCoocs <- 15
# The term whose co-occurrences are to be measured
coocTerm <- "covid-19" # alternatively: "vaccine"
coocs <- calculateCoocStatistics(coocTerm, binDTM, measure = "LOGLIK")
# Optional: display the numberOfCoocs most significant co-occurrences
#print(coocs[1:numberOfCoocs])
resultGraph <- data.frame(from = character(), to = character(),
                          sig = numeric(0))
# The temporary graph object has the same structure as resultGraph
tmpGraph <- data.frame(from = character(), to = character(),
                       sig = numeric(0))
# Fill the data frame so that it has the correct number of rows
tmpGraph[1:numberOfCoocs, 3] <- coocs[1:numberOfCoocs]
# Enter the search term into the first column of all rows
tmpGraph[, 1] <- coocTerm
# Enter the co-occurring terms into the second column
tmpGraph[, 2] <- names(coocs)[1:numberOfCoocs]
# Set the significance values
tmpGraph[, 3] <- coocs[1:numberOfCoocs]
# Attach the triples to resultGraph
resultGraph <- rbind(resultGraph, tmpGraph)
# Iterate over the numberOfCoocs most significant co-occurrences
# of the search term
for (i in 1:numberOfCoocs) {
  # Calculate the co-occurrences for term i from the search
  # term's co-occurrence list
  newCoocTerm <- names(coocs)[i]
  coocs2 <- calculateCoocStatistics(newCoocTerm, binDTM, measure = "LOGLIK")
  # print(coocs2[1:10]) # optional: inspect the top co-occurrences
  # Structure of the temporary graph object
  tmpGraph <- data.frame(from = character(), to = character(),
                         sig = numeric(0))
  tmpGraph[1:numberOfCoocs, 3] <- coocs2[1:numberOfCoocs]
  tmpGraph[, 1] <- newCoocTerm
  tmpGraph[, 2] <- names(coocs2)[1:numberOfCoocs]
  tmpGraph[, 3] <- coocs2[1:numberOfCoocs]
  # Append the result to the result graph (without the first row)
  resultGraph <- rbind(resultGraph, tmpGraph[2:nrow(tmpGraph), ])
}
# Optional: inspect a random sample of edges from resultGraph
#resultGraph[sample(nrow(resultGraph), 6), ]
library(igraph)
# set seed for graph plot
set.seed(1)
# Create the graph object as an undirected graph
graphNetwork <- graph_from_data_frame(resultGraph, directed = FALSE)
# Optional: identify all nodes with fewer than 2 edges ...
#verticesToRemove <- V(graphNetwork)[degree(graphNetwork) < 2]
# ... and remove those vertices from the graph
#graphNetwork <- delete_vertices(graphNetwork, verticesToRemove)
# Assign colors to nodes (search term blue, others orange)
V(graphNetwork)$color <- ifelse(V(graphNetwork)$name == coocTerm, 'cornflowerblue', 'orange')
# Set edge colors
E(graphNetwork)$color <- adjustcolor("DarkGray", alpha.f = .5)
# scale significance between 1 and 10 for edge width
E(graphNetwork)$width <- scales::rescale(E(graphNetwork)$sig, to = c(1, 10))
# Curve the edges slightly
E(graphNetwork)$curved <- 0.15
# Size the nodes by their degree (log-scaled, then rescaled to the range 5-15)
V(graphNetwork)$size <- scales::rescale(log(degree(graphNetwork)), to = c(5, 15))
# Set the plot margins
par(mai = c(0, 0, 1, 0))
# Final plot
plot(
  graphNetwork,
  layout = layout_with_fr, # force-directed layout
  #main = paste(coocTerm, ' Graph'),
  vertex.label.family = "sans",
  vertex.label.cex = 0.8, # font size of node labels
  vertex.shape = "circle",
  vertex.label.dist = 0.5, # move node labels slightly away from the nodes
  vertex.frame.color = adjustcolor("darkgray", alpha.f = .5),
  vertex.label.color = 'black', # color of node labels
  vertex.label.font = 2, # bold node labels
  vertex.label = V(graphNetwork)$name # node labels
)
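# Optional: write the network plot to a PNG file instead of the active device
# (a sketch; filename and dimensions are illustrative)
# png("cooc_graph_covid-19.png", width = 1200, height = 1200, res = 150)
# plot(graphNetwork, layout = layout_with_fr, vertex.label = V(graphNetwork)$name)
# dev.off()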
###########################################################################
# Some tests with raw co-occurrence counts
# Matrix multiplication yields the co-occurrence counts
coocCounts <- t(binDTM) %*% binDTM
as.matrix(coocCounts[202:205, 202:205])
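# The same counts can also be looked up by term name (a sketch; assumes
# "vaccine" survived vocabulary pruning)
# as.matrix(coocCounts[c("covid-19", "vaccine"), c("covid-19", "vaccine")])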
coocTerm <- "covid-19"
k <- nrow(binDTM)
ki <- sum(binDTM[, coocTerm])
kj <- colSums(binDTM)
names(kj) <- colnames(binDTM)
kij <- coocCounts[coocTerm, ]
########## MI: log(k * kij / (ki * kj)) ##########
mutualInformationSig <- log(k * kij/(ki * kj))
mutualInformationSig <- mutualInformationSig[order(mutualInformationSig,
decreasing = TRUE)]
########## Dice: 2 * kij / (ki + kj) ##############
dicesig <- 2 * kij/(ki + kj)
dicesig <- dicesig[order(dicesig, decreasing = TRUE)]
########## Log Likelihood ###################
logsig <- 2 * ((k * log(k)) - (ki * log(ki)) - (kj * log(kj)) +
(kij * log(kij)) + (k - ki - kj + kij) * log(k - ki - kj +
kij) + (ki - kij) * log(ki - kij) + (kj - kij) * log(kj -
kij) - (k - ki) * log(k - ki) - (k - kj) * log(k - kj))
logsig <- logsig[order(logsig, decreasing = TRUE)]
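# Optional sanity check (a sketch): the hand-computed log-likelihood ranking
# should essentially match the output of calculateCoocStatistics() for the
# same term (the search term itself can produce NaN here and sorts last)
# head(logsig, 10)
# head(calculateCoocStatistics(coocTerm, binDTM, measure = "LOGLIK"), 10)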
# Put all significance statistics into one data frame
resultOverView <- data.frame(names(sort(kij, decreasing = T)[1:10]),
sort(kij, decreasing = T)[1:10], names(mutualInformationSig[1:10]),
mutualInformationSig[1:10], names(dicesig[1:10]), dicesig[1:10],
names(logsig[1:10]), logsig[1:10], row.names = NULL)
colnames(resultOverView) <- c("Freq-terms", "Freq", "MI-terms",
"MI", "Dice-Terms", "Dice", "LL-Terms", "LL")
print(resultOverView)
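# Optional: export the overview table (a sketch; the filename is illustrative;
# write.csv2() uses ";" as the separator, matching the input data)
# write.csv2(resultOverView, "cooc_overview_covid-19.csv", row.names = FALSE)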