Skip to content

Commit

Permalink
exports pb_collect_rss
Browse files Browse the repository at this point in the history
  • Loading branch information
JBGruber committed Oct 18, 2023
1 parent d98627b commit afa2cd4
Show file tree
Hide file tree
Showing 7 changed files with 83 additions and 28 deletions.
1 change: 1 addition & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ S3method(pb_deliver,default)
export("%>%")
export(pb_available)
export(pb_collect)
export(pb_collect_rss)
export(pb_deliver)
export(pb_find_rss)
export(pb_inspect)
Expand Down
31 changes: 5 additions & 26 deletions R/collect.R
Original file line number Diff line number Diff line change
Expand Up @@ -120,8 +120,11 @@ pb_collect <- function(urls,
rss <- grepl("<rss.+>|<\\?xml.+>", cont, useBytes = TRUE)
if (any(rss)) {
if (verbose) cli::cli_progress_step("Parsing RSS feeds")
rss_out <- collect_rss(
cont,
cont <- cont[rss]
class(cont) <- "html_content"
rss_links <- pb_collect_rss(cont)
rss_out <- pb_collect(
rss_links,
collect_rss = FALSE,
timeout = timeout,
ignore_fails = ignore_fails,
Expand Down Expand Up @@ -237,27 +240,3 @@ parse_fail <- function(url) {
}
}


# Pull the article links out of downloaded RSS/Atom feed content and pass
# them on to pb_collect().
#
# cont: feed documents, one element per feed, in a form xml2::read_xml()
#       accepts.
# ...:  forwarded unchanged to pb_collect().
collect_rss <- function(cont, ...) {

  # Extract the article links of a single feed document.
  feed_links <- function(feed) {
    # RSS: every <item> carries its URL in a <link> child
    items <- xml2::as_list(
      xml2::xml_find_all(xml2::read_xml(feed), "//*[name()='item']")
    )
    found <- purrr::map(items, "link")
    if (length(found) >= 1L) {
      return(found)
    }

    # Atom: no <item> nodes were found, so read the href attribute of the
    # <link> inside every <entry>
    entries <- xml2::as_list(
      xml2::xml_find_all(xml2::read_xml(feed), "//*[name()='entry']")
    )
    purrr::map(entries, function(e) attr(e[["link"]], "href"))
  }

  links <- unlist(lapply(cont, feed_links))

  pb_collect(links, ...)
}
43 changes: 43 additions & 0 deletions R/find_rss.r → R/rss.r
Original file line number Diff line number Diff line change
@@ -1,3 +1,44 @@
#' Collect RSS feed
#'
#' Collect the URLs of articles from RSS or Atom feed(s)
#'
#' @param x URL(s) to RSS or Atom feed(s). Feed content that was already
#'   downloaded and carries the class `html_content` is parsed directly
#'   without downloading anything.
#' @param ... passed to pb_collect.
#'
#' @return a character vector of URLs to articles
#' @export
#'
#' @examples
#' pb_collect_rss("https://rss.nytimes.com/services/xml/rss/nyt/HomePage.xml")
pb_collect_rss <- function(x, ...) {
  if (!inherits(x, "html_content")) {
    # x holds URLs: download the raw feeds first. The flag must be spelled
    # `collect_rss`, which is the argument name pb_collect() uses itself;
    # presumably it stops pb_collect() from following the feed so that the
    # feed XML comes back in `content_raw` (confirm against pb_collect).
    df <- pb_collect(x, collect_rss = FALSE, ...)
    # keep only responses with a status below 400
    x <- unlist(df[df$status < 400L, "content_raw"])
  }

  links <- lapply(x, function(feed) {
    # parse each feed once and reuse the document for both lookups
    doc <- xml2::read_xml(feed)

    # for rss
    out <- doc %>%
      xml2::xml_find_all("//*[name()='item']") %>%
      xml2::as_list() %>%
      purrr::map("link")

    # for atom
    if (length(out) < 1L) {
      out <- doc %>%
        xml2::xml_find_all("//*[name()='entry']") %>%
        xml2::as_list() %>%
        purrr::map(function(e) attr(e[["link"]], "href"))
    }
    out
  })

  # flatten to a plain vector and drop the names unlist() generates
  unname(unlist(links))
}


#' Find RSS feed on a newspaper's website
#'
#' @param x main domain of the newspaper site to check for RSS feeds.
Expand Down Expand Up @@ -133,3 +174,5 @@ is_feed_fns <- function(url) {
paperboy.env$pages[[url]] <- is_feed(req)
}
}


5 changes: 5 additions & 0 deletions inst/WORDLIST
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ Lifecycle
Nutzungsbedingungen
POSIXct
anotherangryvoice
bbc
blogspot
boston
bostonglobe
Expand All @@ -25,10 +26,12 @@ doctype
eu
evolvepolitics
faz
feedly
forbes
foxbusiness
foxnews
ftw
geenstijl
huffingtonpost
huffpost
idnes
Expand All @@ -37,6 +40,8 @@ latimes
lnk
marketwatch
mediacloud
mediacourant
metronieuws
msnbc
newsweek
nl
Expand Down
22 changes: 22 additions & 0 deletions man/pb_collect_rss.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion man/pb_find_rss.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

7 changes: 6 additions & 1 deletion tests/testthat/test-deliver.R
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,10 @@ test_that("Test infrascture", {
pb_deliver("duckduckgo.com/", verbose = TRUE),
"No parser for domain"
)
expect_equal(
nrow(pb_deliver("duckduckgo.com/", try_default = FALSE)),
0L
)
expect_error(
pb_deliver(list("google.com"), verbose = FALSE),
"No method for class list."
Expand Down Expand Up @@ -43,7 +47,7 @@ test_that("Test huffpost scraper", {
})

test_scraper <- function(rss) {
test_that(rss, {
test_that(desc = paste("test:", rss), {
skip_if_offline()
skip_on_ci()
expect_equal({
Expand All @@ -55,6 +59,7 @@ test_scraper <- function(rss) {
}

lapply(c(
"https://rss.nytimes.com/services/xml/rss/nyt/HomePage.xml",
"https://www.cbsnews.com/latest/rss/evening-news",
"https://www.cnet.com/rss/news/",
"http://rss.cnn.com/rss/edition.rss",
Expand Down

0 comments on commit afa2cd4

Please sign in to comment.