Merge pull request #20 from JBGruber/improved-rss

improve rss reader (close #19)

JBGruber authored Jul 13, 2024
2 parents a0d6f44 + fa5507b commit 700c074

Showing 7 changed files with 106 additions and 43 deletions.
2 changes: 1 addition & 1 deletion DESCRIPTION

@@ -1,7 +1,7 @@
 Package: paperboy
 Title: Comprehensive Collection of News Media Scrapers
 Version: 0.0.6.9000
-Date: 2024-05-31
+Date: 2024-07-13
 Authors@R:
     person(given = "Johannes B.",
            family = "Gruber",
5 changes: 3 additions & 2 deletions R/collect.R

@@ -102,6 +102,8 @@ pb_collect <- function(urls,
 
   if (nrow(out) > 0) {
 
+    class(out$content_raw) <- "html_content"
+
     out <- tibble::add_column(
       out,
       domain = adaR::ada_get_domain(out$expanded_url),
@@ -122,7 +124,7 @@ pb_collect <- function(urls,
     if (verbose) cli::cli_progress_step("Parsing RSS feeds")
     cont <- cont[rss]
     class(cont) <- "html_content"
-    rss_links <- pb_collect_rss(cont)
+    rss_links <- pb_collect_rss(cont)$link
     rss_out <- pb_collect(
       rss_links,
       collect_rss = FALSE,
@@ -148,7 +150,6 @@ pb_collect <- function(urls,
   if (verbose) cli::cli_progress_done()
   attr(out, "paperboy_collected_at") <- Sys.time()
   attr(out, "paperboy_data_loc") <- ifelse(is.null(save_dir), "memory", "disk")
-
   return(out)
 }
 
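
Note: pb_collect_rss() now returns a parsed data.frame rather than a bare character
vector of URLs, which is why the collector above keeps only its link column. A
minimal sketch of the new contract (the feed URL is taken from the documented
examples):

  # feed items arrive as a data.frame; pb_collect() consumes only the URLs
  feed <- pb_collect_rss("https://feeds.washingtonpost.com/rss/world")
  articles <- pb_collect(feed$link, collect_rss = FALSE)
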
85 changes: 60 additions & 25 deletions R/rss.r

@@ -1,43 +1,78 @@
 #' Collect RSS feed
 #'
-#' Collect the URLs of articles from RSS or Atom feed(s)
+#' Collect articles from RSS or Atom feed(s)
 #'
 #' @param x URL(s) to RSS or Atom feed(s).
+#' @param parse Whether the results should be parsed into a data.frame. Turn off for debugging.
 #' @param ... passed to pb_collect.
 #'
-#' @return a character vector of URLs to articles
+#' @return a data.frame or list
 #' @export
 #'
 #' @examples
 #' \dontrun{
 #' pb_collect_rss("https://feeds.washingtonpost.com/rss/world")
+#' pb_collect_rss("https://www.washingtonpost.com/arcio/rss/")
 #' # works with atom feeds too
 #' pb_collect_rss("https://www.nu.nl/rss")
 #' }
-pb_collect_rss <- function(x, ...) {
-  if (!methods::is(x, "html_content")) {
-    df <- pb_collect(x, collect_rss = FALSE, ...)
-    x <- unlist(df[df$status < 400L, "content_raw"])
+pb_collect_rss <- function(x, parse = TRUE, ...) {
+  if (!methods::is(x, "html_content") && !purrr::pluck_exists(x, "content_raw")) {
+    x <- pb_collect(x, collect_rss = FALSE, ...)
+  }
+  if (purrr::pluck_exists(x, "content_raw")) {
+    x <- unlist(x[x$status < 400L, "content_raw"])
   }
 
-  lapply(x, function(x) {
-    # for rss
-    out <- x %>%
-      xml2::read_xml() %>%
-      xml2::xml_find_all("//*[name()='item']") %>%
-      xml2::as_list() %>%
-      purrr::map("link")
-    # for atom
-    if (length(out) < 1L) {
-      out <- x %>%
-        xml2::read_xml() %>%
-        xml2::xml_find_all("//*[name()='entry']") %>%
-        xml2::as_list() %>%
-        purrr::map(function(e) attr(e[["link"]], "href"))
-    }
-  }) %>%
-    unlist() %>%
-    unname()
+  out <- purrr::map(x, parse_rss)
+  out <- unlist(out, recursive = FALSE)
+  if (parse) {
+    return(dplyr::bind_rows(out))
+  } else {
+    return(out)
+  }
 }
 
 
+parse_rss <- function(xml) {
+  # to make RSS and atom feed output equal
+  lookup <- c(
+    "guid" = "id",
+    "pubDate" = "published",
+    "description" = "summary",
+    "author" = "author"
+  )
+
+  items <- xml %>%
+    xml2::read_xml() %>%
+    xml2::xml_find_all("//*[name()='item']") %>%
+    xml2::as_list() %>%
+    purrr::map(function(i) {
+      out <- lapply(i, unlist)
+      out <- out[!duplicated(names(i))]
+      out <- replace_names(out, lookup)
+      return(out)
+    })
+
+  if (length(items) < 1L) {
+    items <- parse_atom(xml)
+  }
+
+  return(items)
+}
+
+parse_atom <- function(xml) {
+  xml %>%
+    xml2::read_xml() %>%
+    xml2::xml_find_all("//*[name()='entry']") %>%
+    xml2::as_list() %>%
+    purrr::map(function(i) {
+      link <- attr(i[["link"]], "href")
+      out <- lapply(i, unlist)
+      # input contains multiple "link" items, delete them
+      out[["link"]] <- NULL
+      out[["url"]] <- link
+      return(out)
+    })
+}
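
The new parse_rss()/parse_atom() helpers normalise RSS and Atom fields to a common
schema via the lookup table (guid -> id, pubDate -> published, description -> summary).
A small self-contained sketch, assuming the internal helpers above are in scope; the
XML string and URL are made up for illustration:

  # a minimal RSS document; pubDate is renamed to published by the lookup
  doc <- '<rss version="2.0"><channel><item>
    <title>Hello</title>
    <link>https://example.com/a</link>
    <pubDate>Sat, 13 Jul 2024 00:00:00 GMT</pubDate>
  </item></channel></rss>'
  items <- parse_rss(doc)
  names(items[[1]])        # "title" "link" "published"
  dplyr::bind_rows(items)  # one row per item, as pb_collect_rss() returns when parse = TRUE
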
11 changes: 10 additions & 1 deletion R/utils.R

@@ -118,7 +118,7 @@ normalise_df <- function(l) {
 
 
 #' base R version of stringi::stri_replace_all() to limit dependencies
-#' @noRd
+#' @keywords internal
 replace_all <- function(str, pattern, replacement, fixed = TRUE) {
   for (i in seq_along(pattern)) str <- gsub(pattern[i], replacement[i], str, fixed = fixed)
   return(str)
@@ -132,6 +132,15 @@ extract <- function(str, pattern) {
 }
 
 
+#' replace names of an object given a lookup table
+#' @keywords internal
+replace_names <- function(x, lookup) {
+  replacement <- lookup[names(x)]
+  names(x) <- ifelse(is.na(replacement), names(x), replacement)
+  return(x)
+}
+
+
 #' construct progress bar
 #' @noRd
 make_pb <- function(df) {
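
replace_names() renames only the entries found in the lookup and leaves everything
else untouched, which is what maps RSS field names onto their Atom equivalents. A
quick illustration (the values are made up):

  x <- list(pubDate = "Sat, 13 Jul 2024", title = "Hello")
  replace_names(x, c("pubDate" = "published"))
  # names are now "published" and "title"; names missing from the lookup are kept
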
12 changes: 8 additions & 4 deletions man/pb_collect_rss.Rd

Some generated files are not rendered by default.

13 changes: 3 additions & 10 deletions tests/testthat/test-collect.R

@@ -20,27 +20,20 @@ test_that("expandurls", {
   )
   expect_warning(
     pb_collect(urls = "https://httpbin.org/delay/10", timeout = 1, ignore_fails = TRUE),
-    "download did not finish before timeout."
+    "download.did.not.finish.before.timeout."
   )
 })
 
 test_that("send cookies", {
   jar <- options(cookie_dir = tempdir())
   withr::defer(options(jar))
   withr::defer(unlink(file.path(tempdir(), paste0("cookies.rds"))))
-  expect_equal({
+  expect_equivalent({
     cookiemonster::add_cookies(cookiestring = "test=true; success=yes", domain = "https://hb.cran.dev", confirm = TRUE)
-    pb_collect("https://hb.cran.dev/cookies", use_cookies = TRUE, verbose = FALSE)$content_raw
+    unclass(pb_collect("https://hb.cran.dev/cookies", use_cookies = TRUE, verbose = FALSE)$content_raw)
   }, "{\n \"cookies\": {\n \"success\": \"yes\", \n \"test\": \"true\"\n }\n}\n")
 })
 
-test_that("rss", {
-  expect_equal({
-    res <- pb_collect(urls = "https://rss.nytimes.com/services/xml/rss/nyt/World.xml")
-    c(nrow(res) > 1, ncol(res))
-  }, c(1, 5))
-})
-
 test_that("store local", {
   tmp <- tempdir()
   expect_true({
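
In the expected warning above, the spaces were swapped for ".", which in a regular
expression matches any single character; expect_warning() interprets its second
argument as a regex, so the test keeps passing if the message's exact spacing or
punctuation shifts. For example:

  # "." matches any character, so the pattern still matches the literal message
  grepl("download.did.not.finish.before.timeout.", "download did not finish before timeout.")  # TRUE
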
21 changes: 21 additions & 0 deletions tests/testthat/test-rss.R

@@ -0,0 +1,21 @@
+test_that("rss is collected", {
+  nyt <- pb_collect_rss("https://rss.nytimes.com/services/xml/rss/nyt/HomePage.xml")
+  expect_s3_class(
+    nyt,
+    "data.frame"
+  )
+  expect_gt(
+    nrow(nyt),
+    0
+  )
+  expect_equal({
+    c(nrow(nyt) > 1, c("title", "link", "published") %in% colnames(nyt))
+  }, rep(TRUE, 4))
+})
+
+test_that("rss is expanded", {
+  expect_equal({
+    res <- pb_collect(urls = "https://rss.nytimes.com/services/xml/rss/nyt/World.xml")
+    c(nrow(res) > 1, ncol(res))
+  }, c(1, 5))
+})
