Skip to content

Commit

Permalink
better telegraaf.nl parser
Browse files (browse the repository at this point in the history)
  • Loading branch information
JBGruber committed Nov 10, 2023
1 parent 5142cc7 commit cd7a20f
Show file tree
Hide file tree
Showing 2 changed files with 26 additions and 17 deletions.
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
Package: paperboy
Title: Comprehensive Collection of News Media Scrapers
Version: 0.0.5.9000
-Date: 2023-11-08
+Date: 2023-11-10
Authors@R:
person(given = "Johannes B.",
family = "Gruber",
Expand Down
41 changes: 25 additions & 16 deletions R/deliver_telegraaf_nl.R
Original file line number Diff line number Diff line change
Expand Up @@ -4,44 +4,53 @@ pb_deliver_paper.telegraaf_nl <- function(x, verbose = NULL, pb, ...) {
# raw html is stored in column content_raw
html <- rvest::read_html(x$content_raw)

data <- html %>%
rvest::html_elements("[data-name=\"PageTracking\"]") %>%
rvest::html_text2() %>%
jsonlite::fromJSON()

type <- purrr::pluck(data, "article", "type")
paywall <- purrr::pluck(data, "article", "premium")

# datetime
datetime <- html %>%
rvest::html_element("[property=\"article:published_time\"]") %>%
rvest::html_attr("content") %>%
datetime <- purrr::pluck(data, "article", "publishDate") %>%
lubridate::as_datetime()

# headline
headline <- html %>%
rvest::html_element("[name=\"title\"]") %>%
rvest::html_attr("content")
headline <- purrr::pluck(data, "article", "title")

# author
author <- html %>%
rvest::html_element(".DetailBylineBlock__author") %>%
rvest::html_text2() %>%
toString()
author <- purrr::pluck(data, "article", "author", .default = NA_character_)

# text
text <- html %>%
rvest::html_elements(".Article__intro,.DetailBodyBlocks p") %>%
rvest::html_text2() %>%
paste(collapse = "\n")
if (type == "normal") {
text <- html %>%
rvest::html_elements(".Article__intro,.DetailBodyBlocks p") %>%
rvest::html_text2() %>%
paste(collapse = "\n")
} else {
text <- paste0("[", type, "]")
}

cover_image_html <- html %>%
rvest::html_element(".DetailArticleImage img") %>%
as.character()

cover_image_url <- html %>%
rvest::html_element(".DetailArticleImage img") %>%
rvest::html_attr("src") %>%
paste0("https://www.telegraaf.nl", .)
rvest::html_attr("src")

if (!is.na(cover_image_url))
cover_image_url <- paste0("https://www.telegraaf.nl", cover_image_url)

# the helper function safely creates a named list from objects
s_n_list(
datetime,
author,
headline,
text,
type,
paywall,
cover_image_url,
cover_image_html
)
Expand Down

0 comments on commit cd7a20f

Please sign in to comment.