diff --git a/.DS_Store b/.DS_Store index e32f155..e19e0fb 100644 Binary files a/.DS_Store and b/.DS_Store differ diff --git a/R/.DS_Store b/R/.DS_Store new file mode 100644 index 0000000..5008ddf Binary files /dev/null and b/R/.DS_Store differ diff --git a/R/create_category_matrix.R b/R/create_category_matrix.R new file mode 100644 index 0000000..096a7ad --- /dev/null +++ b/R/create_category_matrix.R @@ -0,0 +1,63 @@ +#' Generates a matrix for lexicon Category results +#' +#' Creates a table listing the total matches for each Category in the lexicon for each row in the corpus. +#' The table can be pasted as additional columns in the corpus, which allows for using filter and sorting functions to explore the data. +#' For example, it can be used to find all rows in an analysed corpus that match a specific lexicon category, or combination of categories, and how many times. +#' +#' @param lecat_result data frame output from the \link[lecat]{run_lecat_analysis} function +#' @param inShiny If inShiny is TRUE then shiny based notifications will be shown +#' +#' @return Passing the output of the \link[lecat]{run_lecat_analysis} function will return a data frame with a Row ID column and one column per lexicon Category +create_category_matrix <- + function(lecat_result, inShiny = FALSE) { + # iterators + categories <- unique(lecat_result$Category) + i <- 1 + n <- length(categories) + + # start building the result matrix by adding row numbers (which are column names in the lecat_result data frame, from the fifth column) as the first column + result <- + cbind(colnames(lecat_result[, c(5:ncol(lecat_result))])) + + if (inShiny) { + shiny::withProgress(message = 'Generating category matrix', detail = 'This is fairly fast', value = 0, { + + # go through each category + for (x in categories) { + + # increment progress bar + shiny::incProgress(1/n, detail = paste('Category:', x)) + i <- i + 1 + + # subset the rows that match the category + mm <- subset(lecat_result, Category == x) + + # start adding columns to the result dataframe (subtotals for ID columns - if a tweet has hit any of the query terms for the category the sum will be more than 0) + result <- cbind(result, colSums(mm[, c(5:ncol(mm))])) + } + }) + } else { + pb <- utils::txtProgressBar( + min = 1, + max = length(categories), + initial = 1 + ) + # go through each category + for (x in categories) { + # increment progress bar + utils::setTxtProgressBar(pb, i) + i <- i + 1 + # subset the rows that match the category + mm <- subset(lecat_result, Category == x) + # start adding columns to the result dataframe (subtotals for ID columns - if a tweet has hit any of the query terms for the category the sum will be more than 0) + result <- cbind(result, colSums(mm[, c(5:ncol(mm))])) + } + close(pb) + } + # now let's add the column names so humans can read them + colnames(result) <- c("Row ID", categories) + + # now return the result as a dataframe + as.data.frame(result, stringsAsFactors = FALSE) + + } diff --git a/R/create_unique_total_diagnostics.R b/R/create_unique_total_diagnostics.R index a49e497..770a29a 100644 --- a/R/create_unique_total_diagnostics.R +++ b/R/create_unique_total_diagnostics.R @@ -8,39 +8,108 @@ #' @return Passing the output of the \link[lecat]{run_lecat_analysis} function will return a data frame with Type, #' Category, Queries and Column_examined columns. In the output the unique and total occurrences of Types, Category and Query #' are reported in the format Term(total occurrences, unique occurrences).
-create_unique_total_diagnostics <- function(lecat_result, inShiny = FALSE){ - - # TODO: keep counts as a matrix for efficiency reasons - # currently much code assumes it's a dataframe, - # so this will take a while - - # preallocate results dataframe - result <- data.frame(Type = rep(NaN, length(unique(lecat_result$Type))), - Category = NaN, - Queries = NaN, - Column_examined = NaN, - stringsAsFactors = FALSE) - # iterators - i <- 1 - result_i <- 1 - - - count <- function(x) { - totals <- colSums(x, na.rm = TRUE) - paste('(', sum(totals), ',', sum(totals > 0), ')', sep = '') - } +create_unique_total_diagnostics <- + function(lecat_result, inShiny = FALSE) { + # TODO: keep counts as a matrix for efficiency reasons + # currently much code assumes it's a dataframe, + # so this will take a while + + # preallocate results dataframe + result <- + data.frame( + Type = rep(NaN, length(unique( + lecat_result$Type + ))), + Category = NaN, + Queries = NaN, + Column_examined = NaN, + stringsAsFactors = FALSE + ) + # iterators + i <- 1 + result_i <- 1 + n <- length(unique(lecat_result$Type)) + category_i <- 1 + + count <- function(x) { + totals <- colSums(x, na.rm = TRUE) + paste('(', sum(totals), ',', sum(totals > 0), ')', sep = '') + } - n <- length(unique(lecat_result$Type)) - category_i <- 1 + if (inShiny) { + shiny::withProgress(message = 'Generating diagnostics', value = 0, { + # Loop through types + for (type in unique(lecat_result$Type)) { + shiny::incProgress(1 / n, detail = paste("Category", category_i)) - if (inShiny) { - shiny::withProgress(message = 'Generating diagnostics', value = 0, { + # types, categories and queries + these_types_categories_queries <- + lecat_result[lecat_result$Type == type, 1:4] - # Loop though types - for (type in unique(lecat_result$Type)) { + # pass frequencies to count function + type_string <- + paste(type, + count(lecat_result[lecat_result$Type == type, 5:ncol(lecat_result)]) + , sep = '') - shiny::incProgress(1/n, detail = paste("Type", category_i)) + # loop through categories in type + for (category in unique(these_types_categories_queries$Category)) { + category_i <- category_i + 1 + + # categories and queries + these_categories_queries <- + lecat_result[lecat_result$Type == type & + lecat_result$Category == category, 1:4] + + # pass frequencies to count function + category_string <- + paste(category, + count(lecat_result[lecat_result$Type == type & + lecat_result$Category == category, 5:ncol(lecat_result)]), + sep = '') + # preallocate query string + query_strings <- '' + + # for each query in our category data + for (query in unique(these_categories_queries$Query)) { + # pass frequencies to count function + query_string <- + paste(query, + count(lecat_result[lecat_result$Type == type & + lecat_result$Category == category & + lecat_result$Query == query, 5:ncol(lecat_result)]), + sep = '') + + # add query string entry to existing query strings + query_strings <- paste(query_strings, query_string) + + } + + # write result to preallocated dataframe + result[result_i, ] <- data.frame( + Type = type_string, + Category = category_string, + Queries = query_strings, + Column_examined = unique(these_categories_queries$Column_examined), + stringsAsFactors = FALSE + ) + # increment result counter + result_i <- result_i + 1 + } + # increment our i variable + i <- i + 1 + } + }) + } else { + # Create progress bar + pb <- + utils::txtProgressBar(min = 1, + max = length(unique(lecat_result$Type)), + initial = 1) + + # Loop through types + for (type in
unique(lecat_result$Type)) { # types, categories and queries these_types_categories_queries <- lecat_result[lecat_result$Type == type, 1:4] @@ -53,19 +122,18 @@ create_unique_total_diagnostics <- function(lecat_result, inShiny = FALSE){ # loop though categories in type for (category in unique(these_types_categories_queries$Category)) { - category_i <- category_i + 1 # categories and queries these_categories_queries <- - lecat_result[lecat_result$Type == type & lecat_result$Category == category, 1:4] + lecat_result[lecat_result$Type == type & + lecat_result$Category == category, 1:4] # pass frequencies to count function category_string <- paste(category, - count( - lecat_result[lecat_result$Type == type & lecat_result$Category == category, 5:ncol(lecat_result)] - ), + count(lecat_result[lecat_result$Type == type & + lecat_result$Category == category, 5:ncol(lecat_result)]), sep = '') # preallocate query string @@ -73,13 +141,15 @@ create_unique_total_diagnostics <- function(lecat_result, inShiny = FALSE){ # for each query in our category data for (query in unique(these_categories_queries$Query)) { + # increment the progress bar + utils::setTxtProgressBar(pb, i) # pass frequencies to count function query_string <- paste(query, - count( - lecat_result[lecat_result$Type == type & lecat_result$Category == category & lecat_result$Query == query, 5:ncol(lecat_result)] - ), + count(lecat_result[lecat_result$Type == type & + lecat_result$Category == category & + lecat_result$Query == query, 5:ncol(lecat_result)]), sep = '') # add query string entry to existing query strings @@ -88,87 +158,20 @@ create_unique_total_diagnostics <- function(lecat_result, inShiny = FALSE){ } # write result to preallocated dataframe - result[result_i,] <- data.frame(Type = type_string, - Category = category_string, - Queries = query_strings, - Column_examined = unique(these_categories_queries$Column_examined), - stringsAsFactors = FALSE) + result[result_i, ] <- data.frame( + Type = type_string, + Category = category_string, + Queries = query_strings, + Column_examined = unique(these_categories_queries$Column_examined), + stringsAsFactors = FALSE + ) # incriment result counter result_i <- result_i + 1 } # incriment our i variable i <- i + 1 } - }) - } else { - # Create progress bar - pb <- utils::txtProgressBar(min = 1, max = length(unique(lecat_result$Type)), initial = 1) - - # Loop though types - for (type in unique(lecat_result$Type)) { - - # types, categories and queries - these_types_categories_queries <- - lecat_result[lecat_result$Type == type, 1:4] - - # pass frequencies to count function - type_string <- - paste(type, - count(lecat_result[lecat_result$Type == type, 5:ncol(lecat_result)]) - , sep = '') - - # loop though categories in type - for (category in unique(these_types_categories_queries$Category)) { - - category_i <- category_i + 1 - - # categories and queries - these_categories_queries <- - lecat_result[lecat_result$Type == type & lecat_result$Category == category, 1:4] - - # pass frequencies to count function - category_string <- - paste(category, - count( - lecat_result[lecat_result$Type == type & lecat_result$Category == category, 5:ncol(lecat_result)] - ), - sep = '') - - # preallocate query string - query_strings <- '' - - # for each query in our category data - for (query in unique(these_categories_queries$Query)) { - - # incriment the progress bar - utils::setTxtProgressBar(pb, i) - - # pass frequencies to count function - query_string <- - paste(query, - count( - lecat_result[lecat_result$Type == 
type & lecat_result$Category == category & lecat_result$Query == query, 5:ncol(lecat_result)] - ), - sep = '') - - # add query string entry to existing query strings - query_strings <- paste(query_strings, query_string) - - } - - # write result to preallocated dataframe - result[result_i,] <- data.frame(Type = type_string, - Category = category_string, - Queries = query_strings, - Column_examined = unique(these_categories_queries$Column_examined), - stringsAsFactors = FALSE) - # incriment result counter - result_i <- result_i + 1 - } - # incriment our i variable - i <- i + 1 + close(pb) } - close(pb) + result } - result -} diff --git a/R/run_lecat_analysis.R b/R/run_lecat_analysis.R index c063b93..f09db1d 100644 --- a/R/run_lecat_analysis.R +++ b/R/run_lecat_analysis.R @@ -1,98 +1,100 @@ -#' Searches for queries in a corpus using a specific regular expression +#' Searches for queries in a corpus using a specific regular expression. Uses \link[lecat]{run_search}. #' #' Each corpus element is checked for the presence of a query. The process is repeated for multiple queries. The result is a table of queries and number of matches for each corpus row. #' #' @param lexicon Lexicon dataframe as parsed by the \link[lecat]{parse_lexicon} function #' @param corpus Corpus dataframe containing search columns present in the searches dataframe #' @param searches Data frame with the columns 'Type' and 'Column'. Queries in each Type will be located in the corresponding corpus Column -#' @param id Column name to use for identifying differing corpus samples (e.g., YouTube video id). Autogenerated if no id is provided. #' @param regex_expression String regular expression defining search pattern. Defaults to searching for the query term with non word characters either side or at the beginning or end of string. Look behind and look ahead impede characters outside the query term to be matched (correctly finds emoji or any other non-word query terms) #' @param inShiny If inShiny is TRUE then shiny based notifications will be shown #' @param case_sensitive If case_sensitive is TRUE then the search will be case sensitive #' @param advanced_mode If advanced_mode is TRUE then the search will not apply a regex, instead it will assume all query terms are well-formed regex patterns (this covers query terms with no regex pattern at all). #' -#' @return run_lecat_analysis returns a data frame containing the lexicon, the corresponding search column for the query type and the frequency of terms by corpus id +#' @return run_lecat_analysis uses \link[lecat]{run_search} to return a data frame containing the lexicon, the corresponding search column for the query type, and the frequency of terms by corpus row (columns representing corpus rows are named using the values of the corpus id column, if present) +run_lecat_analysis <- + function(lexicon, + corpus, + searches, + regex_expression = '(?<=\\W|^)query(?=\\W|$)', + inShiny = FALSE, + case_sensitive = FALSE, + advanced_mode = FALSE) { + # start building output + result <- + data.frame( + Type = rep(NaN, nrow(lexicon)), + Category = rep(NaN, nrow(lexicon)), + Query = rep(NaN, nrow(lexicon)), + Column_examined = rep(NaN, nrow(lexicon)), + stringsAsFactors = FALSE + ) - # DONT MAKE EVERYTHING LOWER CASE! USE REGEX CASE SENSITIVE MODIFIER INSTEAD (IN RUN_SEARCH FUNCTION)...
- # convert everything to lower case if not case sensitive - # if(!case_sensitive) { - # lexicon$Queries <- stringr::str_to_lower(lexicon$Queries) + counts_df <- + as.data.frame(matrix( + data = rep(NaN, nrow(lexicon) * nrow(corpus)), + nrow = nrow(lexicon), + ncol = nrow(corpus) + )) - # turn tibble all lower case - # corpus <- corpus %>% - # dplyr::mutate_all(.funs = stringr::str_to_lower) + result <- cbind(result, counts_df, stringsAsFactors = FALSE) - # set each column in the corpus to lower case - #for (col_name in names(corpus)) { - # corpus[, col_name] <- lapply(X = corpus[, col_name], FUN = stringr::str_to_lower) - #} - # } - - # Create custom ID - message('Creating ID column') - corpus$auto_id_column <- as.character(1:nrow(corpus)) - id <- 'auto_id_column' - - out <- NULL - - # output dataframe - result <- - data.frame( - Type = rep(NaN, nrow(lexicon)), - Category = rep(NaN, nrow(lexicon)), - Query = rep(NaN, nrow(lexicon)), - Column_examined = rep(NaN, nrow(lexicon)), - stringsAsFactors = FALSE - ) - - counts_df <- - as.data.frame(matrix( - data = rep(NaN, nrow(lexicon) * nrow(corpus)), - nrow = nrow(lexicon), - ncol = nrow(corpus) - )) - - result <- cbind(result, counts_df, stringsAsFactors = FALSE) - - if (inShiny) { - n <- nrow(lexicon) - shiny::withProgress(message = 'Searching corpus', value = 0, { - for (i in 1:nrow(lexicon)) { - shiny::incProgress(1/n, detail = paste("query", i)) - this_search_column <- searches$Column[lexicon$Type[i] == searches$Type] - #out <- rbind(out, - shiny::showNotification(paste('Query:', lexicon$Queries[i])) - result[i,] <- run_search(corpus[,this_search_column], - lexicon$Queries[i], - regex_expression, lexicon$Type[i], - lexicon$Category[i], - corpus[,id], - this_search_column, - case_sensitive, - advanced_mode) - #) - } - }) - } else { - pb <- utils::txtProgressBar(min = 1, max = nrow(lexicon), initial = 1) - for (i in 1:nrow(lexicon)) { - utils::setTxtProgressBar(pb, i) - this_search_column <- searches$Column[lexicon$Type[i] == searches$Type] - #out <- rbind(out, - result[i,] <- run_search(strings = corpus[,this_search_column], - query = lexicon$Queries[i], - regex = regex_expression, - type = lexicon$Type[i], - category = lexicon$Category[i], - ids = corpus[,id], - column = this_search_column, case_sensitive, - advanced_mode) - #) - } - close(pb) - } + if (inShiny) { + n <- nrow(lexicon) + shiny::withProgress(message = 'Searching corpus', + detail = 'This may take a while...', + value = 0, + { + # go through each row in the long lexicon + for (i in 1:nrow(lexicon)) { + shiny::incProgress(1 / n, detail = paste("Query", i, lexicon$Queries[i])) + # set the search column for this type, as specified in the lookup table + this_search_column <- + searches$Column[lexicon$Type[i] == searches$Type] + # search for this query term + result[i, ] <- run_search( + corpus[, this_search_column], + lexicon$Queries[i], + regex_expression, + lexicon$Type[i], + lexicon$Category[i], + this_search_column, + case_sensitive, + advanced_mode + ) + } + }) + } else { + pb <- + utils::txtProgressBar(min = 1, + max = nrow(lexicon), + initial = 1) + # go through each row in the long lexicon + for (i in 1:nrow(lexicon)) { + utils::setTxtProgressBar(pb, i) + # set the search column for this type, as specified in the lookup table + this_search_column <- + searches$Column[lexicon$Type[i] == searches$Type] + + # search for this query term + result[i, ] <- + run_search( + strings = corpus[,
this_search_column], + query = lexicon$Queries[i], + regex = regex_expression, + type = lexicon$Type[i], + category = lexicon$Category[i], + column = this_search_column, + case_sensitive, + advanced_mode + ) + } + close(pb) + } - result -} + # If the corpus contains an id column (matched case-insensitively), use its values to name the count columns in the raw matrix (after the named lexicon columns) + id_col <- which(tolower(names(corpus)) == "id") + if (length(id_col) == 1) { + names(result)[5:ncol(result)] <- corpus[[id_col]] + } + result + } diff --git a/R/run_search.R b/R/run_search.R index a1422e7..09034a1 100644 --- a/R/run_search.R +++ b/R/run_search.R @@ -1,30 +1,30 @@ -#' Searches a series of strings for a specific query. Used internally by run_lecat_analysis +#' Searches a series of strings for a specific query. Used internally by \link[lecat]{run_lecat_analysis} #' #' Search through a series for queries based on a regex query. #' -#' @param strings Vector of strings to search through. Strings are from the chosen corpus. -#' @param query String query term to search for in the strings. Taken from the chosen lexicon. +#' @param strings Vector of strings to search through. The values of the corresponding search column from the chosen corpus for a given lexicon type, as specified in the lookup table. +#' @param query String query term to search for in the strings. Derived from lexicon. #' @param regex String regular expression defining the search pattern. Defaults to searching for the query term with non word characters either side or at the beginning or end of string. Look behind and look ahead impede characters outside the query term to be matched (correctly finds emoji or any other non-word query terms) #' @param type String query type. Derived from lexicon type. -#' @param category String query type. Derived from lexicon type. -#' @param ids id of the string +#' @param category String query category. Derived from lexicon category. #' @param column String column name examined included in the returned dataframe #' @param case_sensitive If case_sensitive is TRUE then the search will be case sensitive #' @param advanced_mode If advanced_mode is TRUE then the search will not apply a regex, instead it will assume all query terms are well-formed regex patterns (this covers query terms with no regex pattern at all). #' -#' @return dataframe with counts of the query in each string +#' @return dataframe with counts of the query in each string.
run_search <- function(strings, query, regex = "(?<=\\W|^)query(?=\\W|$)", type, category, - ids, column, case_sensitive, advanced_mode) { + # if not on advanced mode, prepare the query by escaping special characters and adding the global regex if (!advanced_mode) { + # correctly add backslash to any special character for regex search {}[]()|?$^*+.\ query <- stringr::str_replace_all(query, "([{}\\[\\]()|?$^*+.\\\\])", "\\\\\\$1") @@ -36,12 +36,12 @@ run_search <- replacement = query) } else { + # the query term should already come as a regex pattern this_pattern <- query } # count matches of the regex - applies case sensitivity as per user selection - # counts <- stringr::str_count(string = strings, pattern = this_pattern) counts <- lapply( strings, @@ -59,10 +59,12 @@ run_search <- stringsAsFactors = FALSE ) + # make strings into a data frame if (!is.data.frame(strings)) { strings <- as.data.frame(strings) } + # create a matrix with the results that can be appended to the overall result counts_df <- as.data.frame(matrix( data = as.numeric(unlist(counts)), diff --git a/inst/shiny-apps/lecat-app/server.R b/inst/shiny-apps/lecat-app/server.R index 3daddb0..93a6830 100644 --- a/inst/shiny-apps/lecat-app/server.R +++ b/inst/shiny-apps/lecat-app/server.R @@ -57,8 +57,8 @@ function(input, output, session) { example_corpus <- data.frame( id = c(1, 2, 3), title = c( - ' James Tripp talks about LE-CAT', - ' Noortje Marres interview', + 'James Tripp talks about LE-CAT', + 'Noortje Marres interview', 'New iphone' ), description = c( @@ -138,11 +138,22 @@ function(input, output, session) { # try to load the lexicon file and parse it tryCatch({ - # read excel file - x <- - readxl::read_excel(input$lecat_lexicon_file$datapath) - - shiny::showNotification('Loading lexicon: reading excel file', type = 'message') + # read file + if (input$lecat_lexicon_file$type == "text/csv") { + x <- + read.csv( + input$lecat_lexicon_file$datapath, + stringsAsFactors = FALSE, + header = TRUE, + sep = ",", + quote = "\"", + fileEncoding = "UTF-8", + skipNul = TRUE + ) + } else { + x <- + readxl::read_excel(input$lecat_lexicon_file$datapath) + } # Make sure there's at least one query at_least_one_query <- sum(grepl( @@ -157,11 +168,14 @@ function(input, output, session) { tryCatch({ data$lecat_lexicon <- parse_lexicon(x) data$lexicon_loaded <- TRUE - shiny::showNotification('Loading lexicon: Parsed to long format', type = 'message') + shiny::showNotification('Lexicon loaded: Parsed to long format', type = 'message') }, error = function(e) { # return an error if parsing fails - shiny::showNotification('Lexicon not loaded: Parsing error, check all categories in the lexicon have at least one query term', type = 'error') + shiny::showNotification( + 'Lexicon not loaded: Parsing error, check all categories in the lexicon have at least one query term', + type = 'error' + ) }) } else { if (!at_least_one_query) { @@ -189,16 +203,27 @@ function(input, output, session) { req(input$lecat_corpus_file) tryCatch({ - data$lecat_corpus <- - readxl::read_excel(input$lecat_corpus_file$datapath) + if (input$lecat_corpus_file$type == "text/csv") { + data$lecat_corpus <- + read.csv( + input$lecat_corpus_file$datapath, + stringsAsFactors = FALSE, + header = TRUE, + sep = ",", + quote = "\"", + fileEncoding = "UTF-8", + skipNul = TRUE + ) + } else { + data$lecat_corpus <- + readxl::read_excel(input$lecat_corpus_file$datapath) + } data$corpus_loaded <- TRUE }, error = function(e) { - # return a safeError if a parsing error occurs - #stop(safeError(e)) +
# notify the user if a parsing error occurs shiny::showNotification('Corpus not loaded: error', type = 'error') - #safeError(e) }) }) @@ -207,8 +232,21 @@ function(input, output, session) { req(input$lecat_lookup_table_file) tryCatch({ - lookup_table <- - readxl::read_excel(input$lecat_lookup_table_file$datapath) + if (input$lecat_lookup_table_file$type == "text/csv") { + lookup_table <- + read.csv( + input$lecat_lookup_table_file$datapath, + stringsAsFactors = FALSE, + header = TRUE, + sep = ",", + quote = "\"", + fileEncoding = "UTF-8", + skipNul = TRUE + ) + } else { + lookup_table <- + readxl::read_excel(input$lecat_lookup_table_file$datapath) + } # check if lookup table has the Type and Column column names if (('Type' %in% names(lookup_table)) & @@ -223,7 +261,7 @@ function(input, output, session) { # count and remove incomplete cases n_incomplete_cases <- sum(!complete.cases(lookup_table)) lookup_table <- - lookup_table[complete.cases(lookup_table), ] + lookup_table[complete.cases(lookup_table),] # notify the user of incomplete cases shiny::showNotification(paste( @@ -264,7 +302,7 @@ function(input, output, session) { # Check if the lexicon and lookup types are the same if (mean(lookup_table_types %in% lexicon_types) == 1) { - shiny::showNotification('Running LE-CAT analysis') + shiny::showNotification('Running LE-CAT analysis...') tryCatch({ # Run the analysis @@ -287,17 +325,24 @@ function(input, output, session) { # Create diagnostic summary and assign into reactive value data$lecat_diagnostics <- - create_unique_total_diagnostics(x, inShiny = TRUE) - + create_unique_total_diagnostics(data$lecat_raw_result, inShiny = TRUE) shiny::showNotification('Diagnostics generated') + # Notify user + shiny::showNotification('Generating category matrix. Please wait...') + + # Create category matrix and assign into reactive value + data$lecat_category_matrix <- + create_category_matrix(data$lecat_raw_result, inShiny = TRUE) + shiny::showNotification('Category matrix generated') + # Set flag indicating analysis complete for UI data$lecat_analysis_complete <- TRUE }, error = function(e) { - # return a safeError if a searching error occurs + # notify the user if an analysis error occurs shiny::showNotification( - 'Analysis not completed: Check your lexicon query terms. If using Advanced Regex Mode, check all query terms are well-formed regex patterns', + 'Analysis not completed: If using Advanced Regex Mode, check all query terms are well-formed regex patterns', type = 'error' ) }) @@ -312,7 +357,7 @@ function(input, output, session) { # Event run when generate_network_button is pressed ---- observeEvent(input$lecat_generate_network_button, { - shiny::showNotification('Generating cooccurrence table and network graph') + shiny::showNotification('Generating co-occurrence table and network graph.
Please wait...') # Create the network x <- create_cooccurrence_graph(data$lecat_raw_result, @@ -355,9 +400,10 @@ function(input, output, session) { switch( input$lecat_output, "raw" = data$lecat_raw_result, - "cotable" = data$lecat_cotable, "diagnostics" = data$lecat_diagnostics, - "network" = data$lecat_network + "category_matrix" = data$lecat_category_matrix, + "network" = data$lecat_network, + "cotable" = data$lecat_cotable ) }) diff --git a/inst/shiny-apps/lecat-app/ui.R b/inst/shiny-apps/lecat-app/ui.R index cffd464..1d57212 100644 --- a/inst/shiny-apps/lecat-app/ui.R +++ b/inst/shiny-apps/lecat-app/ui.R @@ -1,7 +1,7 @@ # Define UI for LE-CAT app ---- fluidPage( # App title ---- - titlePanel("LE-CAT (Helena's modified version)"), + titlePanel("LE-CAT (development version)"), # Sidebar layout with a input and output definitions ---- sidebarLayout( @@ -13,35 +13,35 @@ fluidPage( "Choose example file:", choices = c("Lexicon", "Corpus", "Lookup_Table") ), - # Download button for examples files ---- downloadButton("download_lecat_example", "Download"), hr(), - tags$h5('Input'), + tags$h5('Input files'), + tags$p('Excel preferred (.xls, .xlsx), Comma Separated Values possible (.csv - MUST use comma , as separator and double quotes " as delimiter)'), - # File input for lexicon excel file ---- + # File input for lexicon file ---- fileInput( "lecat_lexicon_file", - "Choose Lexicon File (xlsx format)", + "Upload Lexicon File", multiple = FALSE, - accept = ".xlsx" + accept = c(".xlsx", ".xls", ".csv") ), - # File input for corpus excel file ---- + # File input for corpus file ---- fileInput( "lecat_corpus_file", - "Choose Corpus File (xlsx format)", + "Upload Corpus File", multiple = FALSE, - accept = ".xlsx" + accept = c(".xlsx", ".xls", ".csv") ), - # File input for lookup table excel file ---- + # File input for lookup table file ---- fileInput( "lecat_lookup_table_file", - "Choose Lookup Table File (xlsx format)", + "Upload Lookup Table File", multiple = FALSE, - accept = ".xlsx" + accept = c(".xlsx", ".xls", ".csv") ), # Conditional panel displayed when lecat is ready ---- @@ -100,7 +100,7 @@ fluidPage( selectInput( "lecat_output", "Choose output file:", - choices = c("raw", 'diagnostics', "cotable", "network") + choices = c("raw", 'diagnostics', "cotable", "network", "category_matrix") ), downloadButton("download_lecat_output", "Download") ) @@ -158,14 +158,14 @@ fluidPage( br(), p( - 'LE-CAT requires three files to work. All of these should be Excel (.xlsx) files.' + 'LE-CAT requires three files to work. All of these should be Excel (.xlsx, .xls) or Comma Separated Values (.csv) files. Excel is safer. If using CSV, please make sure the file uses comma , as separator and double quotes " as delimiter.' ), tags$ul( tags$li( 'A lexicon. The file contains queries you wish to search for in the corpus and the associated category and type.' ), - tags$li('A corpus. The dataset in which to find your queries.'), + tags$li('A corpus. The dataset in which to find your queries. Should contain an ID column.'), tags$li( 'A lookup table. A table instructing LE-CAT which column of the corpus to search for query types.' ), strong('Walkthrough'), tags$ol( tags$li( - 'Download the example Lexicon, Corpus and Lookup Table. Modify these files to suite your needs.' + 'Download the example Lexicon, Corpus and Lookup Table. Modify these files to suit your needs.' ), tags$li( 'Upload the modified files to LE-CAT by clicking Browse and choosing the file.
The Analysis section will appear once all three files are uploaded.' ), tags$li( 'Search the corpus for your queries by clicking on "Run LE-CAT Analysis". The LE-CAT diagnostics will be shown in the Data tab once the analysis is complete.' ), tags$li( - 'LE-CAT offers co-occurence analysis. You can investigate how often queries, types or categories co-occur. Select your desired level of co-occurrence then click "Calculate co-occurence". A co-occurence table and network file are now available to download.' + 'LE-CAT offers co-occurrence analysis. You can investigate how often categories co-occur (NB. calculation of co-occurrence of query terms is in development). Select your desired level of co-occurrence then click "Calculate co-occurrence". A co-occurrence table and network file are now available to download.' ), tags$li( - 'All the output files are available to download. Click on the drop down menu to select a file and then press Download.' + 'All the output files are available to download (NB. cotable and network files will only be available after clicking the "Calculate co-occurrence" button). Click on the drop down menu to select a file and then press Download.' ) ) ), diff --git a/man/create_category_matrix.Rd b/man/create_category_matrix.Rd new file mode 100644 index 0000000..33da6b6 --- /dev/null +++ b/man/create_category_matrix.Rd @@ -0,0 +1,21 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/create_category_matrix.R +\name{create_category_matrix} +\alias{create_category_matrix} +\title{Generates a matrix for lexicon Category results} +\usage{ +create_category_matrix(lecat_result, inShiny = FALSE) +} +\arguments{ +\item{lecat_result}{data frame output from the \link[lecat]{run_lecat_analysis} function} +\item{inShiny}{If inShiny is TRUE then shiny based notifications will be shown} +} +\value{ +Passing the output of the \link[lecat]{run_lecat_analysis} function will return a data frame with a Row ID column and one column per lexicon Category +} +\description{ +Creates a table listing the total matches for each Category in the lexicon for each row in the corpus. +The table can be pasted as additional columns in the corpus, which allows for using filter and sorting functions to explore the data. +For example, it can be used to find all rows in an analysed corpus that match a specific lexicon category, or combination of categories, and how many times. +} diff --git a/man/run_lecat_analysis.Rd b/man/run_lecat_analysis.Rd index 7cfc5e6..0cf5d9c 100644 --- a/man/run_lecat_analysis.Rd +++ b/man/run_lecat_analysis.Rd @@ -2,13 +2,12 @@ % Please edit documentation in R/run_lecat_analysis.R \name{run_lecat_analysis} \alias{run_lecat_analysis} -\title{Searches for queries in a corpus using a specific regular expression} +\title{Searches for queries in a corpus using a specific regular expression. Uses \link[lecat]{run_search}.} \usage{ run_lecat_analysis( lexicon, corpus, searches, - id = NaN, regex_expression = "(?<=\\\\W|^)query(?=\\\\W|$)", inShiny = FALSE, case_sensitive = FALSE, advanced_mode = FALSE ) } \arguments{ \item{lexicon}{Lexicon dataframe as parsed by the \link[lecat]{parse_lexicon} function} \item{corpus}{Corpus dataframe containing search columns present in the searches dataframe} \item{searches}{Data frame with the columns 'Type' and 'Column'. Queries in each Type will be located in the corresponding corpus Column} -\item{id}{Column name to use for identifying differing corpus samples (e.g., YouTube video id). Autogenerated if no id is provided.} \item{regex_expression}{String regular expression defining search pattern. Defaults to searching for the query term with non word characters either side or at the beginning or end of string.
Look behind and look ahead impede characters outside the query term to be matched (correctly finds emoji or any other non-word query terms)} \item{inShiny}{If inShiny is TRUE then shiny based notifications will be shown} \item{case_sensitive}{If case_sensitive is TRUE then the search will be case sensitive} \item{advanced_mode}{If advanced_mode is TRUE then the search will not apply a regex, instead it will assume all query terms are well-formed regex patterns (this covers query terms with no regex pattern at all).} } \value{ -run_lecat_analysis returns a data frame containing the lexicon, the corresponding search column for the query type and the frequency of terms by corpus id +run_lecat_analysis uses \link[lecat]{run_search} to return a data frame containing the lexicon, the corresponding search column for the query type, and the frequency of terms by corpus row (columns representing corpus rows are named using the values of the corpus id column, if present) } \description{ Each corpus element is checked for the presence of a query. The process is repeated for multiple queries. The result is a table of queries and number of matches for each corpus row. } diff --git a/man/run_search.Rd b/man/run_search.Rd index 7d9bf0b..5b47623 100644 --- a/man/run_search.Rd +++ b/man/run_search.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/run_search.R \name{run_search} \alias{run_search} -\title{Searches a series of strings for a specific query. Used internally by run_lecat_analysis} +\title{Searches a series of strings for a specific query. Used internally by \link[lecat]{run_lecat_analysis}} \usage{ run_search( strings, query, regex = "(?<=\\\\W|^)query(?=\\\\W|$)", type, category, - ids, column, case_sensitive, advanced_mode ) } \arguments{ -\item{strings}{Vector of strings to search through. Strings are from the chosen corpus.} +\item{strings}{Vector of strings to search through. The values of the corresponding search column from the chosen corpus for a given lexicon type, as specified in the lookup table.} -\item{query}{String query term to search for in the strings. Taken from the chosen lexicon.} +\item{query}{String query term to search for in the strings. Derived from lexicon.} \item{regex}{String regular expression defining the search pattern. Defaults to searching for the query term with non word characters either side or at the beginning or end of string. Look behind and look ahead impede characters outside the query term to be matched (correctly finds emoji or any other non-word query terms)} \item{type}{String query type. Derived from lexicon type.} -\item{category}{String query type. Derived from lexicon type.} - -\item{ids}{id of the string} +\item{category}{String query category. Derived from lexicon category.} \item{column}{String column name examined included in the returned dataframe} \item{case_sensitive}{If case_sensitive is TRUE then the search will be case sensitive} \item{advanced_mode}{If advanced_mode is TRUE then the search will not apply a regex, instead it will assume all query terms are well-formed regex patterns (this covers query terms with no regex pattern at all).} } \value{ -dataframe with counts of the query in each string +dataframe with counts of the query in each string. } \description{ Search through a series for queries based on a regex query.
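A note on the default search pattern, since the template is easy to misread: the literal string 'query' inside regex_expression is a placeholder. run_search escapes any regex special characters in the lexicon term and substitutes the term into the template before counting matches; with advanced_mode = TRUE the term is instead used verbatim as the pattern. Below is a minimal sketch of that expansion in plain stringr, using a made-up term; the template itself is the package default shown in this diff.

library(stringr)

# Default template from run_lecat_analysis; 'query' is the placeholder.
template <- "(?<=\\W|^)query(?=\\W|$)"

# A hypothetical lexicon term (no regex special characters, so no escaping needed).
term <- "LE-CAT"

# Substitute the term into the template, as run_search does internally.
pattern <- str_replace(template, "query", term)

# case_sensitive = FALSE corresponds to a case-insensitive match; the
# lookbehind/lookahead block partial-word hits, so 'LE-CATS' would not match.
str_count("I use LE-CAT daily. le-cat!", regex(pattern, ignore_case = TRUE))
#> [1] 2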
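For reviewers who want to exercise this branch end to end, here is a minimal sketch of the reworked pipeline. The lexicon is written directly in the long format the functions consume (Type, Category, Queries, as used throughout this diff), skipping parse_lexicon; the corpus mirrors the app's example data and carries the lower-case id column that now names the count columns. All data values are illustrative.

library(lecat)

# Long-format lexicon: one query term per row.
lexicon <- data.frame(
  Type     = c("Person", "Device"),
  Category = c("Researcher", "Phone"),
  Queries  = c("Noortje", "iphone"),
  stringsAsFactors = FALSE
)

# Corpus with an id column; its values become the count column names.
corpus <- data.frame(
  id    = c(1, 2, 3),
  title = c("James Tripp talks about LE-CAT",
            "Noortje Marres interview",
            "New iphone"),
  stringsAsFactors = FALSE
)

# Lookup table: which corpus column to search for each query Type.
searches <- data.frame(
  Type   = c("Person", "Device"),
  Column = c("title", "title"),
  stringsAsFactors = FALSE
)

raw <- run_lecat_analysis(lexicon, corpus, searches)

# Term(total occurrences, unique occurrences) summaries per Type, Category and Query.
diagnostics <- create_unique_total_diagnostics(raw)

# Row ID plus one column per Category; paste alongside the corpus to filter
# and sort rows by category matches.
category_matrix <- create_category_matrix(raw)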