diff --git a/DESCRIPTION b/DESCRIPTION
index cef6b07..f3e845e 100755
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,7 +1,7 @@
 Package: paperboy
 Title: Comprehensive Collection of News Media Scrapers
 Version: 0.0.6.9000
-Date: 2024-05-31
+Date: 2024-07-13
 Authors@R:
     person(given = "Johannes B.",
            family = "Gruber",
diff --git a/R/collect.R b/R/collect.R
index 62cbac1..6448006 100644
--- a/R/collect.R
+++ b/R/collect.R
@@ -102,6 +102,8 @@ pb_collect <- function(urls,
 
   if (nrow(out) > 0) {
 
+    class(out$content_raw) <- "html_content"
+
     out <- tibble::add_column(
       out,
       domain = adaR::ada_get_domain(out$expanded_url),
@@ -122,7 +124,7 @@ pb_collect <- function(urls,
     if (verbose) cli::cli_progress_step("Parsing RSS feeds")
     cont <- cont[rss]
     class(cont) <- "html_content"
-    rss_links <- pb_collect_rss(cont)
+    rss_links <- pb_collect_rss(cont)$link
     rss_out <- pb_collect(
       rss_links,
       collect_rss = FALSE,
@@ -148,7 +150,7 @@ pb_collect <- function(urls,
 
   if (verbose) cli::cli_progress_done()
   attr(out, "paperboy_collected_at") <- Sys.time()
   attr(out, "paperboy_data_loc") <- ifelse(is.null(save_dir), "memory", "disk")
-  return(out)
+  out
 }
diff --git a/R/rss.r b/R/rss.r
index b97f3b9..1531c69 100644
--- a/R/rss.r
+++ b/R/rss.r
@@ -1,43 +1,78 @@
 #' Collect RSS feed
 #'
-#' Collect the URLs of articles from RSS or Atom feed(s)
+#' Collect articles from RSS or Atom feed(s)
 #'
 #' @param x URL(s) to RSS or Atom feed(s).
+#' @param parse Whether the results should be parsed into a data.frame. Turn off for debugging.
 #' @param ... passed to pb_collect.
 #'
-#' @return a character vector of URLs to articles
+#' @return a data.frame or list
 #' @export
 #'
 #' @examples
 #' \dontrun{
-#' pb_collect_rss("https://feeds.washingtonpost.com/rss/world")
+#' pb_collect_rss("https://www.washingtonpost.com/arcio/rss/")
+#' # works with atom feeds too
+#' pb_collect_rss("https://www.nu.nl/rss")
 #' }
-pb_collect_rss <- function(x, ...) {
-  if (!methods::is(x, "html_content")) {
-    df <- pb_collect(x, collect_rss = FALSE, ...)
-    x <- unlist(df[df$status < 400L, "content_raw"])
+pb_collect_rss <- function(x, parse = TRUE, ...) {
+  if (!methods::is(x, "html_content") && !purrr::pluck_exists(x, "content_raw")) {
+    x <- pb_collect(x, collect_rss = FALSE, ...)
+  }
+  if (purrr::pluck_exists(x, "content_raw")) {
+    x <- unlist(x[x$status < 400L, "content_raw"])
   }
-  lapply(x, function(x) {
-    # for rss
-    out <- x %>%
-      xml2::read_xml() %>%
-      xml2::xml_find_all("//*[name()='item']") %>%
-      xml2::as_list() %>%
-      purrr::map("link")
-    # for atom
-    if (length(out) < 1L) {
-      out <- x %>%
-        xml2::read_xml() %>%
-        xml2::xml_find_all("//*[name()='entry']") %>%
-        xml2::as_list() %>%
-        purrr::map(function(e) attr(e[["link"]], "href"))
-    }
+  out <- purrr::map(x, parse_rss)
+  out <- unlist(out, recursive = FALSE)
+  if (parse) {
+    return(dplyr::bind_rows(out))
+  } else {
     return(out)
-  }) %>%
-    unlist() %>%
-    unname()
+  }
+}
+
+
+parse_rss <- function(xml) {
+  # to make RSS and atom feed output equal
+  lookup <- c(
+    "guid" = "id",
+    "pubDate" = "published",
+    "description" = "summary",
+    "author" = "author"
+  )
+
+  items <- xml %>%
+    xml2::read_xml() %>%
+    xml2::xml_find_all("//*[name()='item']") %>%
+    xml2::as_list() %>%
+    purrr::map(function(i) {
+      out <- lapply(i, unlist)
+      out <- out[!duplicated(names(i))]
+      out <- replace_names(out, lookup)
+      return(out)
+    })
+
+  if (length(items) < 1L) {
+    items <- parse_atom(xml)
+  }
+
+  return(items)
+}
+
+
+parse_atom <- function(xml) {
+  xml %>%
+    xml2::read_xml() %>%
+    xml2::xml_find_all("//*[name()='entry']") %>%
+    xml2::as_list() %>%
+    purrr::map(function(i) {
+      link <- attr(i[["link"]], "href")
+      out <- lapply(i, unlist)
+      # input contains multiple "link" items; keep only the href as "link"
+      out[["link"]] <- NULL
+      out[["link"]] <- link
+      return(out)
+    })
 }
diff --git a/R/utils.R b/R/utils.R
index 9ecc0ca..b28213a 100644
--- a/R/utils.R
+++ b/R/utils.R
@@ -118,7 +118,7 @@ normalise_df <- function(l) {
 
 
 #' base R version of stringi::stri_replace_all() to limit dependencies
-#' @noRd
+#' @keywords internal
 replace_all <- function(str, pattern, replacement, fixed = TRUE) {
   for (i in seq_along(pattern)) str <- gsub(pattern[i], replacement[i], str, fixed = fixed)
   return(str)
@@ -132,6 +132,15 @@ extract <- function(str, pattern) {
 }
 
 
+#' replace names of an object given a lookup table
+#' @keywords internal
+replace_names <- function(x, lookup) {
+  replacement <- lookup[names(x)]
+  names(x) <- ifelse(is.na(replacement), names(x), replacement)
+  return(x)
+}
+
+
 #' construct progress bar
 #' @noRd
 make_pb <- function(df) {
diff --git a/man/pb_collect_rss.Rd b/man/pb_collect_rss.Rd
index 4367378..370a306 100644
--- a/man/pb_collect_rss.Rd
+++ b/man/pb_collect_rss.Rd
@@ -4,21 +4,25 @@
 \alias{pb_collect_rss}
 \title{Collect RSS feed}
 \usage{
-pb_collect_rss(x, ...)
+pb_collect_rss(x, parse = TRUE, ...)
 }
 \arguments{
 \item{x}{URL(s) to RSS or Atom feed(s).}
 
+\item{parse}{Whether the results should be parsed into a data.frame. Turn off for debugging.}
+
 \item{...}{passed to pb_collect.}
 }
 \value{
-a character vector of URLs to articles
+a data.frame or list
 }
 \description{
-Collect the URLs of articles from RSS or Atom feed(s)
+Collect articles from RSS or Atom feed(s)
 }
 \examples{
 \dontrun{
-pb_collect_rss("https://feeds.washingtonpost.com/rss/world")
+pb_collect_rss("https://www.washingtonpost.com/arcio/rss/")
+# works with atom feeds too
+pb_collect_rss("https://www.nu.nl/rss")
 }
 }
diff --git a/tests/testthat/test-collect.R b/tests/testthat/test-collect.R
index b3a2da6..ec907b6 100644
--- a/tests/testthat/test-collect.R
+++ b/tests/testthat/test-collect.R
@@ -20,7 +20,7 @@ test_that("expandurls", {
   )
   expect_warning(
     pb_collect(urls = "https://httpbin.org/delay/10", timeout = 1, ignore_fails = TRUE),
-    "download did not finish before timeout."
+ "download.did.not.finish.before.timeout." ) }) @@ -28,19 +28,12 @@ test_that("send cookies", { jar <- options(cookie_dir = tempdir()) withr::defer(options(jar)) withr::defer(unlink(file.path(tempdir(), paste0("cookies.rds")))) - expect_equal({ + expect_equivalent({ cookiemonster::add_cookies(cookiestring = "test=true; success=yes", domain = "https://hb.cran.dev", confirm = TRUE) - pb_collect("https://hb.cran.dev/cookies", use_cookies = TRUE, verbose = FALSE)$content_raw + unclass(pb_collect("https://hb.cran.dev/cookies", use_cookies = TRUE, verbose = FALSE)$content_raw) }, "{\n \"cookies\": {\n \"success\": \"yes\", \n \"test\": \"true\"\n }\n}\n") }) -test_that("rss", { - expect_equal({ - res <- pb_collect(urls = "https://rss.nytimes.com/services/xml/rss/nyt/World.xml") - c(nrow(res) > 1, ncol(res)) - }, c(1, 5)) -}) - test_that("store local", { tmp <- tempdir() expect_true({ diff --git a/tests/testthat/test-rss.R b/tests/testthat/test-rss.R new file mode 100644 index 0000000..ef37768 --- /dev/null +++ b/tests/testthat/test-rss.R @@ -0,0 +1,21 @@ +test_that("rss is collected", { + nyt <- pb_collect_rss("https://rss.nytimes.com/services/xml/rss/nyt/HomePage.xml") + expect_s3_class( + nyt, + "data.frame" + ) + expect_more_than( + nrow(nyt), + 0 + ) + expect_equal({ + c(nrow(nyt) > 1, c("title", "link", "published") %in% colnames(nyt)) + }, rep(TRUE, 4)) +}) + +test_that("rss is expanded", { + expect_equal({ + res <- pb_collect(urls = "https://rss.nytimes.com/services/xml/rss/nyt/World.xml") + c(nrow(res) > 1, ncol(res)) + }, c(1, 5)) +})