Skip to content

Commit

Permalink
updates for sentence_similarity
Browse files Browse the repository at this point in the history
  • Loading branch information
AlexChristensen committed Aug 2, 2024
1 parent b618f5f commit eeabe1f
Show file tree
Hide file tree
Showing 11 changed files with 375 additions and 13 deletions.
6 changes: 3 additions & 3 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
Package: transforEmotion
Title: Sentiment Analysis for Text, Image and Video using Transformer Models
Version: 0.1.5
Date: 2024-01-28
Version: 0.1.6
Date: 2024-08-02
Authors@R: c(person("Alexander", "Christensen", email = "alexpaulchristensen@gmail.com",
role = "aut", comment = c(ORCID = "0000-0002-9798-7037")),
person("Hudson", "Golino", email = "hfg9s@virginia.edu", role = "aut",
Expand All @@ -15,5 +15,5 @@ Encoding: UTF-8
Imports: dplyr, googledrive, LSAfun, Matrix, methods, pbapply, remotes, reticulate
Suggests: knitr, markdown, rmarkdown, rstudioapi, testthat (>= 3.0.0)
VignetteBuilder: knitr
RoxygenNote: 7.3.1
RoxygenNote: 7.3.2
Config/testthat/edition: 3
1 change: 1 addition & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ export(nlp_scores)
export(plot_sim_emotions)
export(punctuate)
export(rag)
export(sentence_similarity)
export(setup_gpu_modules)
export(setup_miniconda)
export(setup_modules)
Expand Down
7 changes: 7 additions & 0 deletions NEWS
Original file line number Diff line number Diff line change
@@ -1,3 +1,10 @@
# Changes in version 0.1.6

o ADD: `sentence_similarity` to apply {sentence-transformers} to get similarity between texts

o FIX: error in `transformer_scores` when the argument 'device_map' does not exist (solution: ignore device)


# Changes in version 0.1.5

o ADD: basic retrieval-augmented generation in the `rag` function (includes LLAMA-2, Mistral-7B, Orca-2, Phi-2, TinyLLAMA)
Expand Down
2 changes: 1 addition & 1 deletion R/rag.R
Original file line number Diff line number Diff line change
Expand Up @@ -121,7 +121,7 @@ rag <- function(
"accumulate", "compact", "no_text",
"refine", "simple_summarize", "tree_summarize"
), similarity_top_k = 5,
device = c("auto", "cpu"), keep_in_env = TRUE,
device = c("auto", "cpu", "cuda"), keep_in_env = TRUE,
envir = 1, progress = TRUE
)
{
Expand Down
247 changes: 247 additions & 0 deletions R/sentence_similarity.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,247 @@
#' Sentence Similarity
#'
#' @description Uses sentence similarity pipelines from \href{https://huggingface.co}{huggingface}
#' to compute cosine similarities between \code{text} and \code{comparison_text}
#'
#' @param text Character vector or list.
#' Text in a vector or list data format
#'
#' @param comparison_text Character vector or list.
#' Text in a vector or list data format
#'
#' @param transformer Character.
#' Specific sentence similarity transformer
#' to be used.
#' Defaults to \code{"all_minilm_l6"} (see \href{https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2}{huggingface})
#'
#' Also allows any sentence similarity models with a pipeline
#' from \href{https://huggingface.co/models?pipeline_tag=sentence-similarity}{huggingface}
#' to be used by using the specified name (e.g., \code{"typeform/distilbert-base-uncased-mnli"}; see Examples)
#'
#' @param device Character.
#' Whether to use CPU or GPU for inference.
#' Defaults to \code{"auto"} which will use
#' GPU over CPU (if CUDA-capable GPU is setup).
#' Set to \code{"cpu"} to perform over CPU
#'
#' @param preprocess Boolean.
#' Should basic preprocessing be applied?
#' Includes making lowercase, keeping only alphanumeric characters,
#' removing escape characters, removing repeated characters,
#' and removing white space.
#' Defaults to \code{FALSE}.
#' Transformers generally are OK without preprocessing and handle
#' many of these functions internally, so setting to \code{TRUE}
#' will not change performance much
#'
#' @param keep_in_env Boolean.
#' Whether the classifier should be kept in your global environment.
#' Defaults to \code{TRUE}.
#' By keeping the classifier in your environment, you can skip
#' re-loading the classifier every time you run this function.
#' \code{TRUE} is recommended
#'
#' @param envir Numeric.
#' Environment for the classifier to be saved for repeated use.
#' Defaults to the global environment
#'
#' @return Returns a \emph{n} x \emph{m} similarity matrix where \emph{n} is length of \code{text} and \emph{m} is the length of \code{comparison_text}
#'
#' @author Alexander P. Christensen <alexpaulchristensen@gmail.com>
#'
#' @examples
#' # Load data
#' data(neo_ipip_extraversion)
#'
#' # Example text
#' text <- neo_ipip_extraversion$friendliness[1:5]
#'
#' \dontrun{
#' # Example with defaults
#' sentence_similarity(
#'  text = text, comparison_text = text
#' )
#'
#' # Example with model from 'sentence-transformers'
#' sentence_similarity(
#'  text = text, comparison_text = text,
#'  transformer = "sentence-transformers/all-mpnet-base-v2"
#' )
#'
#'}
#'
#' @export
#'
# Sentence Similarity
# Updated 02.08.2024
sentence_similarity <- function(
    text, comparison_text,
    transformer = c("all_minilm_l6"),
    device = c("auto", "cpu", "cuda"),
    preprocess = FALSE, keep_in_env = TRUE, envir = 1
)
{

  # Check that input of 'text' argument is in the
  # appropriate format for the analysis
  non_text_warning(text) # see utils-transforEmotion.R for function

  # Check for comparison text
  if(missing(comparison_text)){
    stop(
      # FIX: message previously contained '\a' (alert character) where an
      # escaped quote was intended
      "Comparison text to compute similarity must be specified using the 'comparison_text' argument (e.g., `comparison_text = c(\"a similar sentence\", \"a random sentence\")`)\n",
      call. = FALSE
    )
  }

  # Check for transformer
  if(missing(transformer)){
    transformer <- "all_minilm_l6"
  }

  # Check for multiple transformers
  if(length(transformer) > 1){
    stop("Only one transformer model can be used at a time.\n\nSelect one of the default models or select a model from huggingface: <https://huggingface.co/models?pipeline_tag=sentence-similarity>\n")
  }

  # Set device
  if(missing(device)){
    device <- "auto"
  }else{device <- tolower(match.arg(device))}

  # Check for classifiers in environment
  if(exists(transformer, envir = as.environment(envir))){
    classifier <- get(transformer, envir = as.environment(envir))
  }else{

    # Run setup for modules
    setup_modules()

    # Import 'sentence-transformers' module
    message("Importing sentence-transformers module...")
    sentence_transformers <- reticulate::import("sentence_transformers")

    # Check for custom transformer
    if(transformer %in% c("all_minilm_l6")){

      # Load pipeline
      # FIX: 'device = device' was previously inside the switch() call,
      # making it a (dead) switch alternative rather than an argument to
      # SentenceTransformer(), so the device setting was silently ignored
      classifier <- sentence_transformers$SentenceTransformer(
        switch(
          transformer,
          "all_minilm_l6" = "sentence-transformers/all-MiniLM-L6-v2"
        ),
        device = device
      )

    }else{

      # Custom pipeline from huggingface
      # Try to catch non-existing pipelines
      pipeline_catch <- try(
        classifier <- sentence_transformers$SentenceTransformer(
          transformer, device = device
        ), silent = TRUE
      )

      # Errors
      if(inherits(pipeline_catch, "try-error")){

        # Model exists but no pipeline
        if(isTRUE(grepl("Tokenizer class", pipeline_catch))){

          stop(
            paste(
              "Transformer model '",
              transformer,
              "' exists but does not have a working pipeline yet.\n\nTry a default model or select a model from huggingface: <https://huggingface.co/models?pipeline_tag=sentence-similarity>\n",
              sep = ""
            ), call. = FALSE
          )

        }else if(isTRUE(grepl("device_map", pipeline_catch))){

          # Try again without device
          pipeline_catch <- try(
            classifier <- sentence_transformers$SentenceTransformer(transformer), silent = TRUE
          )

          # FIX: surface the error if the retry without a device also fails
          # (previously 'classifier' could be left undefined)
          if(inherits(pipeline_catch, "try-error")){
            stop(pipeline_catch, call. = FALSE)
          }

        }else{
          stop(pipeline_catch, call. = FALSE)
        }

      }

    }

  }

  # Load into environment
  if(isTRUE(keep_in_env)){

    # Keep transformer module in environment
    # FIX: only assign the module when it was imported in this call; when
    # the classifier was re-used from 'envir', 'sentence_transformers' does
    # not exist locally and the assign() would error
    if(exists("sentence_transformers", inherits = FALSE)){
      assign(
        x = "sentence_transformers",
        value = sentence_transformers,
        envir = as.environment(envir)
      )
    }

    # Keep classifier in environment
    assign(
      x = transformer,
      value = classifier,
      envir = as.environment(envir)
    )
  }

  # Basic preprocessing
  if(isTRUE(preprocess)){
    text <- preprocess_text( # Internal function. See `utils-transforEmotion`
      text,
      remove_stop = FALSE # Transformers will remove stop words
    )
  }

  # Message
  message("Obtaining similarities...")

  # Combine sentences so a single encode() call embeds both sets
  sentences <- c(text, comparison_text)

  # Get embeddings
  embeddings <- classifier$encode(sentences)

  # Lengths of each text set
  text_length <- length(text)
  comparison_length <- length(comparison_text)

  # Set up matrix (rows = 'text', columns = 'comparison_text')
  similarity_matrix <- matrix(
    0, nrow = text_length, ncol = comparison_length,
    dimnames = list(text, comparison_text)
  )

  # Populate similarity matrix
  # ('comparison_text' embeddings start at row 'text_length + 1')
  for(i in seq_len(text_length)){
    for(j in seq_len(comparison_length)){

      # Compute cosine
      similarity_matrix[i,j] <- cosine(
        embed1 = embeddings[i,],
        embed2 = embeddings[j + text_length,]
      )

    }
  }

  # Return similarities
  return(similarity_matrix)

}

#' @noRd
# Cosine similarity between two embedding vectors ----
# Returns crossprod(embed1, embed2) scaled by the product of the vector
# norms, i.e., a 1 x 1 matrix containing the cosine of the angle between them
# Updated 02.08.2024
cosine <- function(embed1, embed2)
{

  # Inner product of the two embeddings
  numerator <- crossprod(embed1, embed2)

  # Product of the embeddings' Euclidean norms
  denominator <- sqrt(crossprod(embed1) * crossprod(embed2))

  # Cosine similarity
  return(numerator / denominator)

}
6 changes: 3 additions & 3 deletions R/setup_modules.R
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
#' @export
#'
# Install modules
# Updated 03.02.2024
# Updated 02.08.2024
setup_modules <- function()
{

Expand All @@ -18,8 +18,8 @@ setup_modules <- function()
"accelerate==0.29.3", "llama-index==0.10.30", "nltk==3.8.1",
"opencv-python", "pandas==2.1.3", "pypdf==4.0.1",
"pytube==15.0.0", "pytz==2024.1", "qdrant-client==1.8.2",
"sentencepiece==0.2.0", "tensorflow==2.14.1", "torch==2.1.1",
"transformers==4.35.0"
"sentencepiece==0.2.0", "sentence-transformers==2.7.0",
"tensorflow==2.14.1", "torch==2.1.1", "transformers==4.35.0"
)

# Determine whether any modules need to be installed
Expand Down
15 changes: 12 additions & 3 deletions R/transformer_scores.R
Original file line number Diff line number Diff line change
Expand Up @@ -141,15 +141,15 @@
#' @export
#'
# Transformer Scores
# Updated 23.01.2024
# Updated 02.08.2024
transformer_scores <- function(
text, classes, multiple_classes = FALSE,
transformer = c(
"cross-encoder-roberta",
"cross-encoder-distilroberta",
"facebook-bart"
),
device = c("auto", "cpu"),
device = c("auto", "cpu", "cuda"),
preprocess = FALSE, keep_in_env = TRUE, envir = 1
)
{
Expand Down Expand Up @@ -221,7 +221,7 @@ transformer_scores <- function(
)

# Errors
if(any(class(pipeline_catch) == "try-error")){
if(is(pipeline_catch, "try-error")){

# Model exists but no pipeline
if(isTRUE(grepl("Tokenizer class", pipeline_catch))){
Expand All @@ -235,6 +235,15 @@ transformer_scores <- function(
), call. = FALSE
)

}else if(isTRUE(grepl("device_map", pipeline_catch))){

# Try again without device
pipeline_catch <- try(
classifier <- transformers$pipeline(
"zero-shot-classification", model = transformer
), silent = TRUE
)

}else{
stop(pipeline_catch, call. = FALSE)
}
Expand Down
9 changes: 8 additions & 1 deletion man/MASS_mvrnorm.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion man/rag.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading

0 comments on commit eeabe1f

Please sign in to comment.