diff --git a/DESCRIPTION b/DESCRIPTION index e1a796b..45748c3 100755 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -12,10 +12,13 @@ Imports: adaR, callr, cli, + cookiemonster, curl, dplyr, + jsonlite, lubridate, magrittr, + methods, praise, purrr, rlang, @@ -32,7 +35,8 @@ Suggests: rmarkdown, rstudioapi, spelling, - testthat + testthat, + withr URL: https://github.com/JBGruber/paperboy Encoding: UTF-8 BugReports: https://github.com/JBGruber/paperboy/issues diff --git a/R/collect.R b/R/collect.R index 6b9ca48..346d84c 100644 --- a/R/collect.R +++ b/R/collect.R @@ -61,7 +61,7 @@ pb_collect <- function(urls, res <- purrr::map(url_batches, function(b) { domain <- adaR::ada_get_domain(b[1]) - cookies_str <- cookiemonster::get_cookies(paste0("\\b", domain, "\\b"), as = "string") + cookies_str <- cookiemonster::get_cookies(paste0(domain, "\\b"), as = "string") rp <- callr::r_bg(async_requests, args = list( urls = b, diff --git a/R/utils_dev.R b/R/utils_dev.R index 77cdc43..086d80a 100644 --- a/R/utils_dev.R +++ b/R/utils_dev.R @@ -42,7 +42,7 @@ use_new_parser <- function(x, rss = NULL, test_data = NULL) { - x <- head(adaR::ada_get_domain(x), 1) + x <- utils::head(adaR::ada_get_domain(x), 1) cli::cli_progress_step( "Creating R file", @@ -73,7 +73,7 @@ use_new_parser <- function(x, ) if (file.exists("inst/status.csv")) { - status <- read.csv("inst/status.csv") + status <- utils::read.csv("inst/status.csv") if (!gsub("^www.", "", x) %in% status$domain) { status <- status %>% rbind(list(domain = sub("^www.", "", x), @@ -82,7 +82,7 @@ use_new_parser <- function(x, issues = issue, rss = rss)) %>% dplyr::arrange(domain) - write.csv(status, "inst/status.csv", row.names = FALSE) + utils::write.csv(status, "inst/status.csv", row.names = FALSE) } else if (rss == "") { # if entry already present, get rss value rss <- status[grepl(gsub("^www.", "", x), status$domain), "rss"] @@ -154,13 +154,13 @@ use_new_parser <- function(x, msg_done = "status.csv updated." ) x <- utils::head(adaR::ada_get_domain(x), 1) - status <- read.csv("inst/status.csv") + status <- utils::read.csv("inst/status.csv") status[status$domain == gsub("^www.", "", x), "status"] <- "![](https://img.shields.io/badge/status-gold-%23ffd700.svg)" cli::cli_alert_info("Check the entry manually. Press quit when you're happy.") status[status$domain == gsub("^www.", "", x), ] <- - edit(status[status$domain == gsub("^www.", "", x), ]) - write.csv(status, "inst/status.csv", row.names = FALSE) + utils::edit(status[status$domain == gsub("^www.", "", x), ]) + utils::write.csv(status, "inst/status.csv", row.names = FALSE) } diff --git a/README.md b/README.md index 7e5e24c..d4c45eb 100644 --- a/README.md +++ b/README.md @@ -44,9 +44,9 @@ df <- pb_deliver("https://tinyurl.com/386e98k5") df ``` -| url | expanded_url | domain | status | datetime | author | headline | text | misc | -|:-------------------------------|:----------------------------------------------------------------------------------|:--------------------|-------:|:--------------------|:------------------------------------------------------|:------------------------|:-------------------------------|:-----| -| | | www.theguardian.com | 200 | 2021-07-12 12:00:13 | | ’A woman trapped in an… | In the Guide’s weekly Solved!… | NULL | +| url | expanded_url | domain | status | datetime | author | headline | text | misc | +|:-------------------------------|:----------------------------------------------------------------------------------|:----------------|-------:|:--------------------|:------------------------------------------------------|:------------------------|:-------------------------------|:-----| +| | | theguardian.com | 200 | 2021-07-12 12:00:13 | | ’A woman trapped in an… | In the Guide’s weekly Solved!… | NULL | The returned `data.frame` contains important meta information about the news items and their full text. Notice, that the function had no problem @@ -56,13 +56,12 @@ therefore often encounter this warning: ``` r pb_deliver("google.com") -#> Warning: ℹ No parser for domain www.google.com yet, attempting generic -#> approach. +#> Warning: ℹ No parser for domain google.com yet, attempting generic approach. ``` -| url | expanded_url | domain | status | datetime | author | headline | text | misc | -|:-----------|:-------------------------|:---------------|-------:|:---------|:-------|:---------|:---------------------------------------------|:-----| -| google.com | | www.google.com | 200 | NA | NA | Google | © 2023 - Ochrana soukromí - Smluvní podmínky | NULL | +| url | expanded_url | domain | status | datetime | author | headline | text | misc | +|:-----------|:-------------------------|:-----------|-------:|:---------|:-------|:---------|:----------------------------------------------------|:-----| +| google.com | | google.com | 200 | NA | NA | Google | © 2023 - Datenschutzerklärung - Nutzungsbedingungen | NULL | The function still returns a data.frame, but important information is missing — in this case because it isn’t there. The other URLs will be @@ -77,9 +76,9 @@ later parse it yourself: pb_collect("google.com") ``` -| url | expanded_url | domain | status | content_raw | -|:-----------|:-------------------------|:---------------|-------:|:-----------------------------------| -| google.com | | www.google.com | 200 | \\ | google.com | 200 | \\