
ariExtra immigration #46

Merged
merged 51 commits into from
Oct 17, 2023
Changes from 25 commits
70adb10
Resolve
Jul 7, 2023
d80cb3b
Add `download_gs_file()` and related functions from ariExtra
Jun 2, 2023
a678cc2
Update documentation
Jun 2, 2023
9533715
Add `pdf_to_pngs()` from ariExtra
Jun 2, 2023
6524a07
Stylistic changes
Jun 2, 2023
7cbd032
Resolve
Jul 7, 2023
8226531
Resolve
Jul 7, 2023
4b45b2f
Resolve
Jul 7, 2023
25f8bd1
Set defaults for 'model_name' and 'vocoder_name'
Jun 12, 2023
1ac3054
Use `cli_alert_warning()`
Jun 16, 2023
0cdff11
Documentation
Jun 16, 2023
0532e89
Comment code
Jul 10, 2023
86ff441
Updates
Jul 10, 2023
cbe2af8
Merge pull request #48 from jhudsl/main
seankross Jul 10, 2023
2131ede
Document model_name and vocoder_name argument in `ari_spin()`
Jul 10, 2023
22b7f13
Add `check_png_urls()`
Jul 10, 2023
98b30bd
`pad_wav()` Documentation
Jul 10, 2023
aa693cf
Use \dontrun{} around `pad_wav()` example
Jul 10, 2023
e8e495f
Documentation for `ari_spin()`
Jul 11, 2023
66f9a64
Add `pptx_to_pdf()`
Aug 15, 2023
6864eec
Add `sys_type()` and `os_type()`
Aug 28, 2023
86fd00f
Don't need `fix_soffice_library_path()`
Aug 29, 2023
60e6f25
get rid of text2speech specific code
Aug 31, 2023
96c5765
Fix `ari_narrate()` so we can get rid of text2speech
Aug 31, 2023
8ff859e
Put all the ffmpeg related arguments into a list called `ffmpeg_args`
Sep 1, 2023
63eaaae
Syntax fix
Sep 1, 2023
48d3cdf
`ari_burn_subtitles()`: Fix destination of output in system command
Sep 1, 2023
0c34a4e
Document `ari_subtitles()` and `ari_burn_subtitles()`
Sep 5, 2023
c820e21
Fix dependency issue
Sep 8, 2023
e259d89
Remove download_gs_file.R and pptx_notes.R
howardbaik Sep 14, 2023
9b595ef
Fix merge conflicts when git pull-ing from ariExtra-immigration branch
howardbaik Sep 14, 2023
1923a9b
Merge branch 'ariExtra-immigration' of https://github.com/jhudsl/ari …
howardbaik Sep 14, 2023
45119d5
Run `document()`
howardbaik Sep 14, 2023
0183e3f
Get rid of unnecessary Imports in DESCRIPTION
howardbaik Oct 9, 2023
be9b52c
Put progress bar back into `ari_spin()`
howardbaik Oct 9, 2023
35c1a60
Get rid of `print()`
howardbaik Oct 10, 2023
e6711ba
progress_bar
howardbaik Oct 10, 2023
74ad489
Create `coqui_args()`
howardbaik Oct 10, 2023
bbb8f02
Replace `tts_engine_args` with `coqui_args()`
howardbaik Oct 10, 2023
674c9cf
Made final changes to code
howardbaik Oct 16, 2023
21be0ae
Passed R CMD CHECK
howardbaik Oct 16, 2023
12f517a
Merge branch 'ariExtra-immigration' into burn-subtitles
howardbaik Oct 16, 2023
b745ff3
Get rid of default argument for `output_video`
howardbaik Oct 16, 2023
cbbeff2
Create `set_ffmpeg_args()`
howardbaik Oct 16, 2023
1b0b1eb
Build ffmpeg command to supply to `system()`
howardbaik Oct 16, 2023
6366f44
Resolve merge conflicts with `ariExtra-immigration` branch
howardbaik Oct 16, 2023
42ba661
Documentation stuff
howardbaik Oct 16, 2023
e14f959
More Documentation stuff
howardbaik Oct 16, 2023
00658f1
Merge pull request #53 from jhudsl/reduce-arguments
howardbaik Oct 16, 2023
b647663
Merge pull request #52 from jhudsl/burn-subtitles
howardbaik Oct 16, 2023
d286fde
Fix documentation
howardbaik Oct 17, 2023
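Several commits above describe consolidating the ffmpeg-related arguments into a single list (`ffmpeg_args`, commit 8ff859e; `set_ffmpeg_args()`, commit cbbeff2) and building the ffmpeg command passed to `system()` (commit 1b0b1eb). The actual implementation is not visible in this view; the following is a minimal illustrative sketch of that pattern, with assumed option names and defaults.

```r
# Hypothetical sketch only: gather ffmpeg options into one list and build a
# command string for system(). The real set_ffmpeg_args() signature and
# defaults are not shown in this PR view.
ffmpeg_args <- list(
  video_codec = "libx264",  # assumed default for illustration
  audio_codec = "aac",      # assumed default for illustration
  frames_per_second = 5
)

build_ffmpeg_command <- function(input_txt, audio, output, args) {
  # -f concat consumes a text file listing the image frames and durations
  paste(
    "ffmpeg -y",
    "-f concat -safe 0 -i", shQuote(input_txt),
    "-i", shQuote(audio),
    "-c:v", args$video_codec,
    "-c:a", args$audio_codec,
    "-r", args$frames_per_second,
    shQuote(output)
  )
}

# system(build_ffmpeg_command("frames.txt", "narration.wav",
#                             "out.mp4", ffmpeg_args))
```

Keeping every ffmpeg flag in one list means a single argument can travel from `ari_spin()` down to the `system()` call, rather than threading many individual parameters through each function.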
4 changes: 4 additions & 0 deletions DESCRIPTION
@@ -20,7 +20,11 @@ Depends:
R (>= 3.1.0)
Imports:
cli,
docxtractr,
hms,
httr,
jsonlite,
pdftools,
progress,
purrr,
rmarkdown,
1 change: 0 additions & 1 deletion NAMESPACE
@@ -26,7 +26,6 @@ export(set_video_codec)
export(video_codec_encode)
importFrom(cli,cli_alert_info)
importFrom(hms,hms)
importFrom(progress,progress_bar)
importFrom(purrr,compose)
importFrom(purrr,discard)
importFrom(purrr,map)
65 changes: 37 additions & 28 deletions R/ari_narrate.R
@@ -1,4 +1,4 @@
#' Create a video from slides and a script
#' Generate video from slides and a script
#'
#' \code{ari_narrate} creates a video from a script written in markdown and HTML
#' slides created with \code{\link[rmarkdown]{rmarkdown}} or a similar package.
@@ -44,46 +44,48 @@
#' @export
#' @examples
#' \dontrun{
#'
#' #
#' ari_narrate(system.file("test", "ari_intro_script.md", package = "ari"),
#' system.file("test", "ari_intro.html", package = "ari"),
#' voice = "Joey"
#' )
#' system.file("test", "ari_intro.html", package = "ari"))
#' }
ari_narrate <- function(script, slides,
output = tempfile(fileext = ".mp4"),
voice = text2speech::tts_default_voice(service = service),
service = "amazon",
tts_engine = text2speech::tts,
tts_engine_args = list(service = "coqui",
voice = NULL,
model_name = "tacotron2-DDC_ph",
vocoder_name = "ljspeech/univnet"),
tts_engine_auth = text2speech::tts_auth,
capture_method = c("vectorized", "iterative"),
subtitles = FALSE, ...,
subtitles = FALSE,
verbose = FALSE,
audio_codec = get_audio_codec(),
video_codec = get_video_codec(),
cleanup = TRUE) {
auth <- text2speech::tts_auth(service = service)
cleanup = TRUE,
...) {
# Authentication for Text-to-Speech Engines
auth <- tts_engine_auth(service = tts_engine_args$service)
# Stop message
if (!auth) {
stop(paste0(
"It appears you're not authenticated with ",
service, ". Make sure you've ",
tts_engine_args$service, ". Make sure you've ",
"set the appropriate environmental variables ",
"before you proceed."
))
}


# Check capture_method
capture_method <- match.arg(capture_method)
if (!(capture_method %in% c("vectorized", "iterative"))) {
stop('capture_method must be either "vectorized" or "iterative"')
}

# Output directory, path to script
output_dir <- normalizePath(dirname(output))
script <- normalizePath(script)
if (file_ext(script) %in% c("Rmd", "rmd") & missing(slides)) {
tfile <- tempfile(fileext = ".html")
slides <- rmarkdown::render(input = script, output_file = tfile)
}

# Slides
if (file.exists(slides)) {
slides <- normalizePath(slides)
if (.Platform$OS.type == "windows") {
@@ -92,52 +94,59 @@ ari_narrate <- function(script, slides,
slides <- paste0("file://localhost", slides)
}
}
# Check if script and output_dir exists
stopifnot(
file.exists(script),
dir.exists(output_dir)
)

# Convert script to html and get text
if (file_ext(script) %in% c("Rmd", "rmd")) {
paragraphs <- parse_html_comments(script)
} else {
html_path <- file.path(output_dir, paste0("ari_script_", grs(), ".html"))
html_path <- file.path(output_dir, paste0("ari_script_", get_random_string(), ".html"))
if (cleanup) {
on.exit(unlink(html_path, force = TRUE), add = TRUE)
}
render(script, output_format = html_document(), output_file = html_path)
rmarkdown::render(script, output_format = rmarkdown::html_document(), output_file = html_path)
paragraphs <- map_chr(
html_text(html_nodes(read_html(html_path), "p")),
rvest::html_text(rvest::html_nodes(xml2::read_html(html_path), "p")),
function(x) {
gsub("\u2019", "'", x)
}
)
}

# Path to images
slide_nums <- seq_along(paragraphs)
img_paths <- file.path(
output_dir,
paste0(
"ari_img_",
slide_nums, "_",
grs(), ".jpeg"
get_random_string(), ".jpeg"
)
)

# Take screenshot
if (capture_method == "vectorized") {
webshot(url = paste0(slides, "#", slide_nums), file = img_paths, ...)
webshot::webshot(url = paste0(slides, "#", slide_nums), file = img_paths, ...)
} else {
for (i in slide_nums) {
webshot(url = paste0(slides, "#", i), file = img_paths[i], ...)
webshot::webshot(url = paste0(slides, "#", i), file = img_paths[i], ...)
}
}

if (cleanup) {
on.exit(walk(img_paths, unlink, force = TRUE), add = TRUE)
}

# Pass along ari_spin()
ari_spin(
images = img_paths, paragraphs = paragraphs,
output = output, voice = voice,
service = service, subtitles = subtitles,
verbose = verbose, cleanup = cleanup
output = output,
tts_engine = tts_engine,
tts_engine_args = tts_engine_args,
tts_engine_auth = tts_engine_auth,
subtitles = subtitles,
verbose = verbose,
cleanup = cleanup
)
}
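The new `ari_narrate()` signature above replaces the old `voice`/`service` pair with a `tts_engine_args` list. A usage sketch based on that signature, assuming the Coqui TTS backend is installed and authenticated via text2speech (the bundled test files are taken from the package's existing examples):

```r
# Sketch of calling the updated ari_narrate() with the tts_engine_args
# interface introduced in this diff. Requires a working Coqui TTS setup;
# the defaults shown mirror the function's own defaults above.
library(ari)

ari_narrate(
  script = system.file("test", "ari_intro_script.md", package = "ari"),
  slides = system.file("test", "ari_intro.html", package = "ari"),
  tts_engine_args = list(
    service      = "coqui",
    voice        = NULL,
    model_name   = "tacotron2-DDC_ph",
    vocoder_name = "ljspeech/univnet"
  ),
  subtitles = TRUE
)
```

Because `tts_engine`, `tts_engine_args`, and `tts_engine_auth` are all arguments, a different text-to-speech backend can in principle be swapped in without changing `ari_narrate()` itself.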
94 changes: 39 additions & 55 deletions R/ari_spin.R
@@ -1,30 +1,23 @@
#' Create a video from images and text
#' Generate video from images and text
#'
#' Given equal length vectors of paths to images (preferably \code{.jpg}s
#' or \code{.png}s) and strings which will be
#' synthesized by
#' \href{https://aws.amazon.com/polly/}{Amazon Polly} or
#' any other synthesizer available in
#' \code{\link[text2speech]{tts}}, this function creates an
#' synthesized by a text-to-speech engine, this function creates an
#' \code{.mp4} video file where each image is shown with
#' its corresponding narration. This function uses \code{\link{ari_stitch}} to
#' create the video.
#'
#' This function needs to connect to
#' \href{https://aws.amazon.com/}{Amazon Web Services} in order to create the
#' narration. You can find a guide for accessing AWS from R
#' \href{http://seankross.com/2017/05/02/Access-Amazon-Web-Services-in-R.html}{here}.
#' For more information about how R connects
#' to Amazon Polly see the \code{aws.polly} documentation
#' \href{https://github.com/cloudyr/aws.polly}{here}.
#'
#' @param images A vector of paths to images.
#' @param paragraphs A vector strings that will be spoken by Amazon Polly.
#' @param output A path to the video file which will be created.
#' @param voice The voice you want to use. See
#' \code{\link[text2speech]{tts_voices}} for more information
#' about what voices are available.
#' @param service speech synthesis service to use,
#' @param model_name (Coqui TTS only) Deep Learning model for Text-to-Speech
#' Conversion
#' @param vocoder_name (Coqui TTS only) Voice coder used for speech coding and
#' transmission
#' @param service Speech synthesis service to use,
#' passed to \code{\link[text2speech]{tts}},
#' Either \code{"amazon"}, \code{"microsoft"}, or \code{"google"}.
#' @param subtitles Should a \code{.srt} file be created with subtitles? The
@@ -33,7 +26,7 @@
#' \code{.srt}.
#' @param duration a vector of numeric durations for each audio
#' track. See \code{\link{pad_wav}}
#' @param ... additional arguments to \code{\link{ari_stitch}}
#' @param ... Additional arguments to voice_engine
#' @param tts_args list of arguments to pass to \code{\link{tts}}
#' @param key_or_json_file access key or JSON file to pass to
#' \code{\link{tts_auth}} for authorization
@@ -43,7 +36,6 @@
#' @importFrom text2speech tts_auth tts tts_default_voice
#' @importFrom tuneR bind Wave
#' @importFrom purrr map reduce
#' @importFrom progress progress_bar
#' @importFrom tools file_path_sans_ext
#' @importFrom cli cli_alert_info
#' @export
@@ -57,115 +49,107 @@
#' "Welcome to my very interesting lecture.",
#' "Here are some fantastic equations I came up with."
#' )
#' ari_spin(slides, sentences, voice = "Joey")
#' ari_spin(slides, sentences)
#' }
#'
ari_spin <- function(images, paragraphs,
output = tempfile(fileext = ".mp4"),
voice = text2speech::tts_default_voice(service = service),
service = ifelse(have_polly(), "amazon", "google"),
tts_engine = text2speech::tts,
tts_engine_args = list(service = "coqui",
voice = NULL,
model_name = "tacotron2-DDC_ph",
vocoder_name = "ljspeech/univnet"),
tts_engine_auth = text2speech::tts_auth,
subtitles = FALSE,
duration = NULL,
tts_args = NULL,
key_or_json_file = NULL,
...) {
verbose = FALSE,
cleanup = TRUE) {
# Check for ffmpeg
ffmpeg_exec()

# Argument checks
auth <- text2speech::tts_auth(
service = service,
auth <- tts_engine_auth(
service = tts_engine_args$service,
key_or_json_file = key_or_json_file
)
if (!auth) {
stop(paste0(
"It appears you're not authenticated with ",
service, ". Make sure you've ",
tts_engine_args$service, ". Make sure you've ",
"set the appropriate environmental variables ",
"before you proceed."
))
}

# Create file path to output
stopifnot(length(images) > 0)
images <- normalizePath(images)
output_dir <- normalizePath(dirname(output))

# Paragraphs
if (length(paragraphs) == 1) {
if (file.exists(paragraphs)) {
paragraphs <- readLines(paragraphs, warn = FALSE)
paragraphs <- paragraphs[!paragraphs %in% ""]
}
}

# Paragraphs: Check for semicolons
semi_colon <- trimws(paragraphs) == ";"
if (any(semi_colon)) {
warning(paste0(
"Some paragraphs are simply a semicolon - ",
"likely needs to be replaced or slide removed!"
))
}
# Check for arguments
stopifnot(
length(paragraphs) > 0,
identical(length(images), length(paragraphs)),
all(file.exists(images)),
dir.exists(output_dir)
)
# End of Argument checks

# Setup objects to populate in for-loop with tts()
wave_objects <- vector(mode = "list", length = length(paragraphs))
par_along <- seq_along(paragraphs)
paragraphs_along <- seq_along(paragraphs)
ideal_duration <- rep(NA, length(paragraphs))

pb <- progress_bar$new(
format = "Fetching Narration [:bar] :percent",
total = length(par_along)
)
# Iterate through arguments used in tts()
for (i in par_along) {
args <- tts_args
args$text <- paragraphs[i]
args$voice <- voice
args$service <- service
for (ii in paragraphs_along) {
args <- tts_engine_args
args$text <- paragraphs[ii]
args$bind_audio <- TRUE
# coqui+ari doesn't work with mp3
if (service == "coqui") {
if (tts_engine_args$service == "coqui") {
args$output_format <- "wav"
args$voice <- NULL
}
wav <- do.call(text2speech::tts, args = args)
wav <- do.call(tts_engine, args = args)
wav <- reduce(wav$wav, bind)
wav <- pad_wav(wav, duration = duration[i])
ideal_duration[i] <- length(wav@left) / wav@samp.rate
wave_objects[[i]] <- wav
pb$tick()
}
if (service == "coqui") {
cli::cli_alert_info("Coqui TTS does not support MP3 format; will produce a WAV audio output.")
wav <- pad_wav(wav, duration = duration[ii])
ideal_duration[ii] <- length(wav@left) / wav@samp.rate
wave_objects[[ii]] <- wav
}

# Burn subtitles
if (subtitles) {
sub_file <- paste0(file_path_sans_ext(output), ".srt")
ari_subtitles(paragraphs, wave_objects, sub_file)
}

print("Audio succesfully converted...............")
# Create a video from images and audio
res <- ari_stitch(images, wave_objects, output, ...)

res <- ari_stitch(images, wave_objects, output)
# Collect output
args <- list(...)
args <- list()
cleanup <- args$cleanup
if (is.null(cleanup)) {
cleanup <- TRUE
}
if (!cleanup) {
attr(res, "wavs") <- wave_objects
}
attr(res, "voice") <- voice
attr(res, "voice") <- tts_engine_args$voice
if (subtitles) {
attr(res, "subtitles") <- sub_file
}
attr(res, "service") <- service
attr(res, "service") <- tts_engine_args$service
return(res)
}
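The updated `ari_spin()` likewise drops `voice` and `service` in favor of `tts_engine_args`. A usage sketch mirroring the `\dontrun{}` example in this diff, assuming Coqui TTS authentication succeeds (the image file names here are illustrative, since the diff truncates the example's `slides` definition):

```r
# Sketch of the updated ari_spin() call. Paths are illustrative; any equal
# length vectors of image paths and narration strings will do.
library(ari)

slides <- c("slide_1.png", "slide_2.png")  # hypothetical image paths
sentences <- c(
  "Welcome to my very interesting lecture.",
  "Here are some fantastic equations I came up with."
)
ari_spin(slides, sentences, subtitles = TRUE)
```

With the Coqui defaults in `tts_engine_args`, the audio is synthesized as WAV (Coqui does not support MP3 output, as the removed `cli_alert_info()` call noted) and stitched to the images via `ari_stitch()`.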
