exports pb_collect_rss

JBGruber · Oct 18, 2023 · afa2cd4 · afa2cd4
1 parent d98627b
commit afa2cd4
Show file tree

Hide file tree

Showing 7 changed files with 83 additions and 28 deletions.
diff --git a/NAMESPACE b/NAMESPACE
@@ -6,6 +6,7 @@ S3method(pb_deliver,default)
 export("%>%")
 export(pb_available)
 export(pb_collect)
+export(pb_collect_rss)
 export(pb_deliver)
 export(pb_find_rss)
 export(pb_inspect)

diff --git a/R/collect.R b/R/collect.R
@@ -120,8 +120,11 @@ pb_collect <- function(urls,
       rss <- grepl("<rss.+>|<\\?xml.+>", cont, useBytes = TRUE)
       if (any(rss)) {
         if (verbose) cli::cli_progress_step("Parsing RSS feeds")
-        rss_out <- collect_rss(
-          cont,
+        cont <- cont[rss]
+        class(cont) <- "html_content"
+        rss_links <- pb_collect_rss(cont)
+        rss_out <- pb_collect(
+          rss_links,
           collect_rss = FALSE,
           timeout = timeout,
           ignore_fails = ignore_fails,
@@ -237,27 +240,3 @@ parse_fail <- function(url) {
   }
 }
 
-
-collect_rss <- function(cont, ...) {
-
-  links <- lapply(cont, function(x) {
-    # for rss
-    out <- x %>%
-      xml2::read_xml() %>%
-      xml2::xml_find_all("//*[name()='item']") %>%
-      xml2::as_list() %>%
-      purrr::map("link")
-    # for atom
-    if (length(out) < 1L) {
-      out <- x %>%
-        xml2::read_xml() %>%
-        xml2::xml_find_all("//*[name()='entry']") %>%
-        xml2::as_list() %>%
-        purrr::map(function(e) attr(e[["link"]], "href"))
-    }
-    return(out)
-  }) %>%
-    unlist()
-
-  pb_collect(links, ...)
-}
diff --git a/R/find_rss.r → R/rss.r b/R/find_rss.r → R/rss.r
@@ -1,3 +1,44 @@
+#' Collect RSS feed
+#'
+#' Collect the URLs of articles from RSS or Atom feed(s)
+#'
+#' @param x URL(s) to RSS or Atom feed(s), or already downloaded feed
+#'   content carrying the class `html_content`.
+#' @param ... passed to pb_collect (only used when `x` holds URLs).
+#'
+#' @return a character vector of URLs to articles
+#' @export
+#'
+#' @examples
+#' pb_collect_rss("https://rss.nytimes.com/services/xml/rss/nyt/HomePage.xml")
+pb_collect_rss <- function(x, ...) {
+  if (!inherits(x, "html_content")) {
+    # Download the feeds themselves. `collect_rss` is the argument name that
+    # pb_collect is called with elsewhere in this package; presumably it keeps
+    # pb_collect from resolving the feed items on its own -- verify against
+    # the pb_collect signature.
+    df <- pb_collect(x, collect_rss = FALSE, ...)
+    x <- unlist(df[df$status < 400L, "content_raw"])
+  }
+
+  lapply(x, function(feed) {
+    # parse each feed once and reuse the document for both lookups
+    doc <- xml2::read_xml(feed)
+    # for rss
+    out <- doc %>%
+      xml2::xml_find_all("//*[name()='item']") %>%
+      xml2::as_list() %>%
+      purrr::map("link")
+    # for atom (only tried when no rss items were found)
+    if (length(out) < 1L) {
+      out <- doc %>%
+        xml2::xml_find_all("//*[name()='entry']") %>%
+        xml2::as_list() %>%
+        purrr::map(function(e) attr(e[["link"]], "href"))
+    }
+    out
+  }) %>%
+    unlist() %>%
+    unname()
+}
+
+
 #' Find RSS feed on a newspapers website
 #'
 #' @param x main domain of the newspaper site to check for RSS feeds.
@@ -133,3 +174,5 @@ is_feed_fns <- function(url) {
     paperboy.env$pages[[url]] <- is_feed(req)
   }
 }
+
+
diff --git a/inst/WORDLIST b/inst/WORDLIST
@@ -6,6 +6,7 @@ Lifecycle
 Nutzungsbedingungen
 POSIXct
 anotherangryvoice
+bbc
 blogspot
 boston
 bostonglobe
@@ -25,10 +26,12 @@ doctype
 eu
 evolvepolitics
 faz
+feedly
 forbes
 foxbusiness
 foxnews
 ftw
+geenstijl
 huffingtonpost
 huffpost
 idnes
@@ -37,6 +40,8 @@ latimes
 lnk
 marketwatch
 mediacloud
+mediacourant
+metronieuws
 msnbc
 newsweek
 nl

diff --git a/man/pb_collect_rss.Rd b/man/pb_collect_rss.Rd
diff --git a/man/pb_find_rss.Rd b/man/pb_find_rss.Rd
diff --git a/tests/testthat/test-deliver.R b/tests/testthat/test-deliver.R
@@ -12,6 +12,10 @@ test_that("Test infrascture", {
     pb_deliver("duckduckgo.com/", verbose = TRUE),
     "No parser for domain"
   )
+  expect_equal(
+    nrow(pb_deliver("duckduckgo.com/", try_default = FALSE)),
+    0L
+  )
   expect_error(
     pb_deliver(list("google.com"), verbose = FALSE),
     "No method for class list."
@@ -43,7 +47,7 @@ test_that("Test huffpost scraper", {
 })
 
 test_scraper <- function(rss) {
-  test_that(rss, {
+  test_that(desc = paste("test:", rss), {
     skip_if_offline()
     skip_on_ci()
     expect_equal({
@@ -55,6 +59,7 @@ test_scraper <- function(rss) {
 }
 
 lapply(c(
+  "https://rss.nytimes.com/services/xml/rss/nyt/HomePage.xml",
   "https://www.cbsnews.com/latest/rss/evening-news",
   "https://www.cnet.com/rss/news/",
   "http://rss.cnn.com/rss/edition.rss",