Merge pull request #20 from JBGruber/improved-rss

improve rss reader (close #19)

JBGruber authored Jul 13, 2024
2 parents a0d6f44 + fa5507b commit 700c074

Showing 7 changed files with 106 additions and 43 deletions.
2 changes: 1 addition & 1 deletion DESCRIPTION

@@ -1,7 +1,7 @@
 Package: paperboy
 Title: Comprehensive Collection of News Media Scrapers
 Version: 0.0.6.9000
-Date: 2024-05-31
+Date: 2024-07-13
 Authors@R:
     person(given = "Johannes B.",
            family = "Gruber",
5 changes: 3 additions & 2 deletions R/collect.R

@@ -102,6 +102,8 @@ pb_collect <- function(urls,
 
   if (nrow(out) > 0) {
 
+    class(out$content_raw) <- "html_content"
+
     out <- tibble::add_column(
       out,
       domain = adaR::ada_get_domain(out$expanded_url),
@@ -122,7 +124,7 @@ pb_collect <- function(urls,
     if (verbose) cli::cli_progress_step("Parsing RSS feeds")
     cont <- cont[rss]
     class(cont) <- "html_content"
-    rss_links <- pb_collect_rss(cont)
+    rss_links <- pb_collect_rss(cont)$link
     rss_out <- pb_collect(
       rss_links,
       collect_rss = FALSE,
@@ -148,7 +150,6 @@ pb_collect <- function(urls,
   if (verbose) cli::cli_progress_done()
   attr(out, "paperboy_collected_at") <- Sys.time()
   attr(out, "paperboy_data_loc") <- ifelse(is.null(save_dir), "memory", "disk")
-
   return(out)
 }
 
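
Note: pb_collect_rss() now returns a parsed data.frame rather than a bare character
vector of URLs, which is why the collector above keeps only its link column. A
minimal sketch of the new contract (the feed URL is taken from the documented
examples):

  # feed items arrive as a data.frame; pb_collect() consumes only the URLs
  feed <- pb_collect_rss("https://feeds.washingtonpost.com/rss/world")
  articles <- pb_collect(feed$link, collect_rss = FALSE)
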
85 changes: 60 additions & 25 deletions R/rss.r

@@ -1,43 +1,78 @@
 #' Collect RSS feed
 #'
-#' Collect the URLs of articles from RSS or Atom feed(s)
+#' Collect articles from RSS or Atom feed(s)
 #'
 #' @param x URL(s) to RSS or Atom feed(s).
+#' @param parse Whether the results should be parsed into a data.frame. Turn off for debugging.
 #' @param ... passed to pb_collect.
 #'
-#' @return a character vector of URLs to articles
+#' @return a data.frame or list
 #' @export
 #'
 #' @examples
 #' \dontrun{
 #' pb_collect_rss("https://feeds.washingtonpost.com/rss/world")
+#' pb_collect_rss("https://www.washingtonpost.com/arcio/rss/")
 #' # works with atom feeds too
 #' pb_collect_rss("https://www.nu.nl/rss")
 #' }
-pb_collect_rss <- function(x, ...) {
-  if (!methods::is(x, "html_content")) {
-    df <- pb_collect(x, collect_rss = FALSE, ...)
-    x <- unlist(df[df$status < 400L, "content_raw"])
+pb_collect_rss <- function(x, parse = TRUE, ...) {
+  if (!methods::is(x, "html_content") && !purrr::pluck_exists(x, "content_raw")) {
+    x <- pb_collect(x, collect_rss = FALSE, ...)
+  }
+  if (purrr::pluck_exists(x, "content_raw")) {
+    x <- unlist(x[x$status < 400L, "content_raw"])
   }
 
-  lapply(x, function(x) {
-    # for rss
-    out <- x %>%
-      xml2::read_xml() %>%
-      xml2::xml_find_all("//*[name()='item']") %>%
-      xml2::as_list() %>%
-      purrr::map("link")
-    # for atom
-    if (length(out) < 1L) {
-      out <- x %>%
-        xml2::read_xml() %>%
-        xml2::xml_find_all("//*[name()='entry']") %>%
-        xml2::as_list() %>%
-        purrr::map(function(e) attr(e[["link"]], "href"))
-    }
-  }) %>%
-    unlist() %>%
-    unname()
+  out <- purrr::map(x, parse_rss)
+  out <- unlist(out, recursive = FALSE)
+  if (parse) {
+    return(dplyr::bind_rows(out))
+  } else {
+    return(out)
+  }
 }
 
 
+parse_rss <- function(xml) {
+  # to make RSS and atom feed output equal
+  lookup <- c(
+    "guid" = "id",
+    "pubDate" = "published",
+    "description" = "summary",
+    "author" = "author"
+  )
+
+  items <- xml %>%
+    xml2::read_xml() %>%
+    xml2::xml_find_all("//*[name()='item']") %>%
+    xml2::as_list() %>%
+    purrr::map(function(i) {
+      out <- lapply(i, unlist)
+      out <- out[!duplicated(names(i))]
+      out <- replace_names(out, lookup)
+      return(out)
+    })
+
+  if (length(items) < 1L) {
+    items <- parse_atom(xml)
+  }
+
+  return(items)
+}
+
+parse_atom <- function(xml) {
+  xml %>%
+    xml2::read_xml() %>%
+    xml2::xml_find_all("//*[name()='entry']") %>%
+    xml2::as_list() %>%
+    purrr::map(function(i) {
+      link <- attr(i[["link"]], "href")
+      out <- lapply(i, unlist)
+      # input contains multiple "link" items, delete them
+      out[["link"]] <- NULL
+      out[["url"]] <- link
+      return(out)
+    })
+}
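
The new parse_rss()/parse_atom() helpers normalise RSS and Atom fields to a common
schema via the lookup table (guid -> id, pubDate -> published, description -> summary).
A small self-contained sketch, assuming the internal helpers above are in scope; the
XML string and URL are made up for illustration:

  # a minimal RSS document; pubDate is renamed to published by the lookup
  doc <- '<rss version="2.0"><channel><item>
    <title>Hello</title>
    <link>https://example.com/a</link>
    <pubDate>Sat, 13 Jul 2024 00:00:00 GMT</pubDate>
  </item></channel></rss>'
  items <- parse_rss(doc)
  names(items[[1]])        # "title" "link" "published"
  dplyr::bind_rows(items)  # one row per item, as pb_collect_rss() returns when parse = TRUE
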
11 changes: 10 additions & 1 deletion R/utils.R

@@ -118,7 +118,7 @@ normalise_df <- function(l) {
 
 
 #' base R version of stringi::stri_replace_all() to limit dependencies
-#' @noRd
+#' @keywords internal
 replace_all <- function(str, pattern, replacement, fixed = TRUE) {
   for (i in seq_along(pattern)) str <- gsub(pattern[i], replacement[i], str, fixed = fixed)
   return(str)
@@ -132,6 +132,15 @@ extract <- function(str, pattern) {
 }
 
 
+#' replace names of an object given a lookup table
+#' @keywords internal
+replace_names <- function(x, lookup) {
+  replacement <- lookup[names(x)]
+  names(x) <- ifelse(is.na(replacement), names(x), replacement)
+  return(x)
+}
+
+
 #' construct progress bar
 #' @noRd
 make_pb <- function(df) {
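
replace_names() renames only the entries found in the lookup and leaves everything
else untouched, which is what maps RSS field names onto their Atom equivalents. A
quick illustration (the values are made up):

  x <- list(pubDate = "Sat, 13 Jul 2024", title = "Hello")
  replace_names(x, c("pubDate" = "published"))
  # names are now "published" and "title"; names missing from the lookup are kept
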
12 changes: 8 additions & 4 deletions man/pb_collect_rss.Rd

Some generated files are not rendered by default.

13 changes: 3 additions & 10 deletions tests/testthat/test-collect.R

@@ -20,27 +20,20 @@ test_that("expandurls", {
   )
   expect_warning(
     pb_collect(urls = "https://httpbin.org/delay/10", timeout = 1, ignore_fails = TRUE),
-    "download did not finish before timeout."
+    "download.did.not.finish.before.timeout."
   )
 })
 
 test_that("send cookies", {
   jar <- options(cookie_dir = tempdir())
   withr::defer(options(jar))
   withr::defer(unlink(file.path(tempdir(), paste0("cookies.rds"))))
-  expect_equal({
+  expect_equivalent({
     cookiemonster::add_cookies(cookiestring = "test=true; success=yes", domain = "https://hb.cran.dev", confirm = TRUE)
-    pb_collect("https://hb.cran.dev/cookies", use_cookies = TRUE, verbose = FALSE)$content_raw
+    unclass(pb_collect("https://hb.cran.dev/cookies", use_cookies = TRUE, verbose = FALSE)$content_raw)
   }, "{\n \"cookies\": {\n \"success\": \"yes\", \n \"test\": \"true\"\n }\n}\n")
 })
 
-test_that("rss", {
-  expect_equal({
-    res <- pb_collect(urls = "https://rss.nytimes.com/services/xml/rss/nyt/World.xml")
-    c(nrow(res) > 1, ncol(res))
-  }, c(1, 5))
-})
-
 test_that("store local", {
   tmp <- tempdir()
   expect_true({
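
In the expected warning above, the spaces were swapped for ".", which in a regular
expression matches any single character; expect_warning() interprets its second
argument as a regex, so the test keeps passing if the message's exact spacing or
punctuation shifts. For example:

  # "." matches any character, so the pattern still matches the literal message
  grepl("download.did.not.finish.before.timeout.", "download did not finish before timeout.")  # TRUE
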
21 changes: 21 additions & 0 deletions tests/testthat/test-rss.R

@@ -0,0 +1,21 @@
+test_that("rss is collected", {
+  nyt <- pb_collect_rss("https://rss.nytimes.com/services/xml/rss/nyt/HomePage.xml")
+  expect_s3_class(
+    nyt,
+    "data.frame"
+  )
+  expect_gt(
+    nrow(nyt),
+    0
+  )
+  expect_equal({
+    c(nrow(nyt) > 1, c("title", "link", "published") %in% colnames(nyt))
+  }, rep(TRUE, 4))
+})
+
+test_that("rss is expanded", {
+  expect_equal({
+    res <- pb_collect(urls = "https://rss.nytimes.com/services/xml/rss/nyt/World.xml")
+    c(nrow(res) > 1, ncol(res))
+  }, c(1, 5))
+})
