Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Updated to accept CSV files #4

Open
wants to merge 4 commits into
base: Dev
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Binary file modified .DS_Store
Binary file not shown.
Binary file added R/.DS_Store
Binary file not shown.
63 changes: 63 additions & 0 deletions R/create_category_matrix.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
#' Generates a matrix for lexicon Category results
#'
#' Creates a table listing the total matches for each Category in the lexicon
#' for each row in the corpus. The table can be pasted as additional columns in
#' the corpus, which allows for using filter and sorting functions to explore
#' the data. For example, to find all rows in an analysed corpus that match a
#' specific lexicon category or combination of categories and how many times.
#'
#' The first four columns of \code{lecat_result} are treated as lexicon
#' metadata; every column from the fifth onwards is treated as a per-row count
#' column and is summed within each Category.
#'
#' @param lecat_result data frame output from the
#'   \link[lecat]{run_lecat_analysis} function. Must contain a \code{Category}
#'   column and at least five columns in total.
#' @param inShiny If inShiny is TRUE then shiny based notifications will be
#'   shown, otherwise a text progress bar is printed to the console.
#'
#' @return Passing the output of the \link[lecat]{run_lecat_analysis} function
#'   will return a data frame with one row per count column of
#'   \code{lecat_result}. The first column, \code{"Row ID"}, holds the count
#'   column name; the remaining columns (one per unique Category) hold the
#'   summed matches. All columns are character, because the identifiers and the
#'   sums are bound into a single matrix before conversion.
create_category_matrix <-
  function(lecat_result, inShiny = FALSE) {
    # columns 1:4 are metadata, so at least one count column must follow them
    stopifnot(is.data.frame(lecat_result), ncol(lecat_result) >= 5)

    # compare on character values so factor input yields labels, not codes
    category_column <- as.character(lecat_result$Category)
    categories <- unique(category_column)

    # unique() returns a vector, so the count is length(), not nrow()
    n <- length(categories)

    # drop = FALSE keeps a data frame even when there is a single count column
    count_cols <- lecat_result[, 5:ncol(lecat_result), drop = FALSE]

    # subtotal of every count column over the rows belonging to one category
    # (a sum above 0 means that corpus row hit at least one query term)
    tally_category <- function(category) {
      # which() discards NA comparisons, like subset() does
      rows <- which(category_column == category)
      colSums(count_cols[rows, , drop = FALSE])
    }

    if (inShiny) {
      shiny::withProgress(message = 'Generating category matrix', detail = 'This is fairly fast', value = 0, {
        totals <- lapply(categories, function(category) {
          shiny::incProgress(1 / n, detail = paste('Category:', category))
          tally_category(category)
        })
      })
    } else {
      # txtProgressBar requires max > min, so start at 0 and skip it entirely
      # when there are no categories to report on
      if (n > 0) {
        pb <- utils::txtProgressBar(min = 0, max = n, initial = 0)
        on.exit(close(pb), add = TRUE)
      }
      totals <- lapply(seq_along(categories), function(k) {
        utils::setTxtProgressBar(pb, k)
        tally_category(categories[[k]])
      })
    }

    # first column is the count column names, then one column per category;
    # binding once avoids growing the matrix inside the loop
    result <- do.call(cbind, c(list(colnames(count_cols)), totals))

    # now let's add the column names so humans can read them
    colnames(result) <- c("Row ID", categories)

    # now return the result as a dataframe
    as.data.frame(result, stringsAsFactors = FALSE)
  }
227 changes: 115 additions & 112 deletions R/create_unique_total_diagnostics.R
Original file line number Diff line number Diff line change
Expand Up @@ -8,39 +8,108 @@
#' @return Passing the output of the \link[lecat]{run_lecat_analysis} function will return a data frame with Type,
#' Category, Queries and Column_examined columns. In the output the unique and total occurrences of Types, Category and Query
#' are reported in the format Term(total occurrences, unique occurrences).
create_unique_total_diagnostics <- function(lecat_result, inShiny = FALSE){

# TODO: keep counts as a matrix for efficiency reasons
# currently much code assumes it's a dataframe,
# so this will take a while

# preallocate results dataframe
result <- data.frame(Type = rep(NaN, length(unique(lecat_result$Type))),
Category = NaN,
Queries = NaN,
Column_examined = NaN,
stringsAsFactors = FALSE)
# iterators
i <- 1
result_i <- 1


count <- function(x) {
totals <- colSums(x, na.rm = TRUE)
paste('(', sum(totals), ',', sum(totals > 0), ')', sep = '')
}
create_unique_total_diagnostics <-
function(lecat_result, inShiny = FALSE) {
# TODO: keep counts as a matrix for efficiency reasons
# currently much code assumes it's a dataframe,
# so this will take a while

# preallocate results dataframe
result <-
data.frame(
Type = rep(NaN, length(unique(
lecat_result$Type
))),
Category = NaN,
Queries = NaN,
Column_examined = NaN,
stringsAsFactors = FALSE
)
# iterators
i <- 1
result_i <- 1
n <- length(unique(lecat_result$Type))
category_i <- 1

count <- function(x) {
totals <- colSums(x, na.rm = TRUE)
paste('(', sum(totals), ',', sum(totals > 0), ')', sep = '')
}

n <- length(unique(lecat_result$Type))
category_i <- 1
if (inShiny) {
shiny::withProgress(message = 'Generating diagnostics', value = 0, {
# Loop though types
for (type in unique(lecat_result$Type)) {
shiny::incProgress(1 / n, detail = paste("Category", category_i))

if (inShiny) {
shiny::withProgress(message = 'Generating diagnostics', value = 0, {
# types, categories and queries
these_types_categories_queries <-
lecat_result[lecat_result$Type == type, 1:4]

# Loop though types
for (type in unique(lecat_result$Type)) {
# pass frequencies to count function
type_string <-
paste(type,
count(lecat_result[lecat_result$Type == type, 5:ncol(lecat_result)])
, sep = '')

shiny::incProgress(1/n, detail = paste("Type", category_i))
# loop though categories in type
for (category in unique(these_types_categories_queries$Category)) {
category_i <- category_i + 1

# categories and queries
these_categories_queries <-
lecat_result[lecat_result$Type == type &
lecat_result$Category == category, 1:4]

# pass frequencies to count function
category_string <-
paste(category,
count(lecat_result[lecat_result$Type == type &
lecat_result$Category == category, 5:ncol(lecat_result)]),
sep = '')

# preallocate query string
query_strings <- ''

# for each query in our category data
for (query in unique(these_categories_queries$Query)) {
# pass frequencies to count function
query_string <-
paste(query,
count(lecat_result[lecat_result$Type == type &
lecat_result$Category == category &
lecat_result$Query == query, 5:ncol(lecat_result)]),
sep = '')

# add query string entry to existing query strings
query_strings <- paste(query_strings, query_string)

}

# write result to preallocated dataframe
result[result_i, ] <- data.frame(
Type = type_string,
Category = category_string,
Queries = query_strings,
Column_examined = unique(these_categories_queries$Column_examined),
stringsAsFactors = FALSE
)
# incriment result counter
result_i <- result_i + 1
}
# incriment our i variable
i <- i + 1
}
})
} else {
# Create progress bar
pb <-
utils::txtProgressBar(min = 1,
max = length(unique(lecat_result$Type)),
initial = 1)

# Loop though types
for (type in unique(lecat_result$Type)) {
# types, categories and queries
these_types_categories_queries <-
lecat_result[lecat_result$Type == type, 1:4]
Expand All @@ -53,33 +122,34 @@ create_unique_total_diagnostics <- function(lecat_result, inShiny = FALSE){

# loop though categories in type
for (category in unique(these_types_categories_queries$Category)) {

category_i <- category_i + 1

# categories and queries
these_categories_queries <-
lecat_result[lecat_result$Type == type & lecat_result$Category == category, 1:4]
lecat_result[lecat_result$Type == type &
lecat_result$Category == category, 1:4]

# pass frequencies to count function
category_string <-
paste(category,
count(
lecat_result[lecat_result$Type == type & lecat_result$Category == category, 5:ncol(lecat_result)]
),
count(lecat_result[lecat_result$Type == type &
lecat_result$Category == category, 5:ncol(lecat_result)]),
sep = '')

# preallocate query string
query_strings <- ''

# for each query in our category data
for (query in unique(these_categories_queries$Query)) {
# increment the progress bar
utils::setTxtProgressBar(pb, i)

# pass frequencies to count function
query_string <-
paste(query,
count(
lecat_result[lecat_result$Type == type & lecat_result$Category == category & lecat_result$Query == query, 5:ncol(lecat_result)]
),
count(lecat_result[lecat_result$Type == type &
lecat_result$Category == category &
lecat_result$Query == query, 5:ncol(lecat_result)]),
sep = '')

# add query string entry to existing query strings
Expand All @@ -88,87 +158,20 @@ create_unique_total_diagnostics <- function(lecat_result, inShiny = FALSE){
}

# write result to preallocated dataframe
result[result_i,] <- data.frame(Type = type_string,
Category = category_string,
Queries = query_strings,
Column_examined = unique(these_categories_queries$Column_examined),
stringsAsFactors = FALSE)
result[result_i, ] <- data.frame(
Type = type_string,
Category = category_string,
Queries = query_strings,
Column_examined = unique(these_categories_queries$Column_examined),
stringsAsFactors = FALSE
)
# incriment result counter
result_i <- result_i + 1
}
# incriment our i variable
i <- i + 1
}
})
} else {
# Create progress bar
pb <- utils::txtProgressBar(min = 1, max = length(unique(lecat_result$Type)), initial = 1)

# Loop though types
for (type in unique(lecat_result$Type)) {

# types, categories and queries
these_types_categories_queries <-
lecat_result[lecat_result$Type == type, 1:4]

# pass frequencies to count function
type_string <-
paste(type,
count(lecat_result[lecat_result$Type == type, 5:ncol(lecat_result)])
, sep = '')

# loop though categories in type
for (category in unique(these_types_categories_queries$Category)) {

category_i <- category_i + 1

# categories and queries
these_categories_queries <-
lecat_result[lecat_result$Type == type & lecat_result$Category == category, 1:4]

# pass frequencies to count function
category_string <-
paste(category,
count(
lecat_result[lecat_result$Type == type & lecat_result$Category == category, 5:ncol(lecat_result)]
),
sep = '')

# preallocate query string
query_strings <- ''

# for each query in our category data
for (query in unique(these_categories_queries$Query)) {

# incriment the progress bar
utils::setTxtProgressBar(pb, i)

# pass frequencies to count function
query_string <-
paste(query,
count(
lecat_result[lecat_result$Type == type & lecat_result$Category == category & lecat_result$Query == query, 5:ncol(lecat_result)]
),
sep = '')

# add query string entry to existing query strings
query_strings <- paste(query_strings, query_string)

}

# write result to preallocated dataframe
result[result_i,] <- data.frame(Type = type_string,
Category = category_string,
Queries = query_strings,
Column_examined = unique(these_categories_queries$Column_examined),
stringsAsFactors = FALSE)
# incriment result counter
result_i <- result_i + 1
}
# incriment our i variable
i <- i + 1
close(pb)
}
close(pb)
result
}
result
}
Loading