Skip to content

Commit

Permalink
Irish scrapers part 1
Browse files Browse the repository at this point in the history
  • Loading branch information
JBGruber committed Nov 30, 2023
1 parent b2fca0b commit 0d6ffb0
Show file tree
Hide file tree
Showing 10 changed files with 336 additions and 21 deletions.
15 changes: 12 additions & 3 deletions R/deliver_bbc_co_uk.R
Original file line number Diff line number Diff line change
Expand Up @@ -24,16 +24,25 @@ pb_deliver_paper.bbc_co_uk <- function(x, verbose = NULL, pb, ...) {

# text
text <- html %>%
rvest::html_elements("article [class*=\"RichText\"] p") %>%
rvest::html_elements("article [class*=\"RichText\"] p,article .story-body p") %>%
rvest::html_text2() %>%
paste(collapse = "\n")

# the helper function safely creates a named list from objects
cover_image_html <- html %>%
rvest::html_element("picture img") %>%
as.character()

cover_image_url <- html %>%
rvest::html_element("picture img") %>%
rvest::html_attr("src")

s_n_list(
datetime,
author,
headline,
text
text,
cover_image_url,
cover_image_html
)

}
18 changes: 0 additions & 18 deletions R/deliver_blesk_cz.R
Original file line number Diff line number Diff line change
Expand Up @@ -26,24 +26,6 @@ pb_deliver_paper.blesk_cz <- function(x, verbose = NULL, pb, ...) {
author <- data$author$name %>%
toString()

# # datetime
# datetime <- html %>%
# rvest::html_element("[property=\"article:published_time\"]") %>%
# rvest::html_attr("content") %>%
# lubridate::as_datetime()
#
# # headline
# headline <- html %>%
# rvest::html_element("title") %>%
# rvest::html_text2()
#
# # author
# author <- html %>%
# rvest::html_elements(".author-container") %>%
# rvest::html_text2() %>%
# toString() %>%
# sub("Autor: ", "", ., fixed = TRUE)

# text
text <- html %>%
rvest::html_elements("#article p,#article h2") %>%
Expand Down
51 changes: 51 additions & 0 deletions R/deliver_breakingnews_ie.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
# Parser for breakingnews.ie pages.
#
# Reads the text of the first <script> element of the page and parses it as
# JSON to obtain publication date, headline, author, type and image; the body
# text is collected from the <p> elements inside <article>.
#
# x:       row holding the raw page in `content_raw`
# verbose: forwarded to pb_tick()
# pb:      progress bar object, forwarded to pb_tick()
# ...:     not used here
#
# Returns the result of s_n_list() on the extracted fields, or of an empty
# s_n_list() call when the page has no <script> element.
pb_deliver_paper.breakingnews_ie <- function(x, verbose = NULL, pb, ...) {

  # advance the progress bar
  pb_tick(x, verbose, pb)

  # parse the page kept in content_raw
  html <- rvest::read_html(x$content_raw)

  # text of the first <script> element (NA when the page has none)
  script_node <- rvest::html_element(html, "script")
  script_text <- rvest::html_text2(script_node)

  # nothing to parse: hand back an empty result
  if (is.na(script_text)) {
    return(s_n_list())
  }

  meta <- jsonlite::fromJSON(script_text)

  # publication time
  datetime <- lubridate::as_datetime(meta$datePublished)

  # title of the piece
  headline <- meta$headline

  # one or more author names collapsed into a single string
  author <- toString(meta$author$name)

  # article body, one paragraph per line
  paragraphs <- rvest::html_elements(html, "article p")
  text <- paste(rvest::html_text2(paragraphs), collapse = "\n")

  # NOTE(review): despite its name this holds the parsed `image` entry of the
  # JSON metadata, not an HTML snippet -- confirm that is what callers expect
  cover_image_html <- meta$image

  cover_image_url <- cover_image_html$url

  type <- meta$`@type`

  # names passed to s_n_list() are kept as in the other parsers
  s_n_list(
    datetime,
    author,
    headline,
    text,
    type,
    cover_image_url,
    cover_image_html
  )

}
49 changes: 49 additions & 0 deletions R/deliver_independent_ie.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
# Parser for independent.ie pages.
#
# Pulls date, headline and author from meta tags, the body text from the
# paragraphs of the "articleDetail" fragment and the cover image from the
# article image wrapper.
#
# x:       row holding the raw page in `content_raw`
# verbose: forwarded to pb_tick()
# pb:      progress bar object, forwarded to pb_tick()
# ...:     not used here
#
# Returns the result of s_n_list() on the extracted fields.
pb_deliver_paper.independent_ie <- function(x, verbose = NULL, pb, ...) {

  # advance the progress bar
  pb_tick(x, verbose, pb)

  # parse the page kept in content_raw
  html <- rvest::read_html(x$content_raw)

  # NOTE(review): this reads the *modified* time, whereas the sibling parsers
  # read a published/post date -- confirm this is intended
  datetime_node <- rvest::html_element(
    html, '[property="article:modified_time"]'
  )
  datetime <- lubridate::as_datetime(rvest::html_attr(datetime_node, "content"))

  # headline taken from the Open Graph title
  headline_node <- rvest::html_element(html, '[property="og:title"]')
  headline <- rvest::html_attr(headline_node, "content")

  # all author meta tags collapsed into one comma separated string
  author_nodes <- rvest::html_elements(
    html, '[name="cXenseParse:mhu-article_author"]'
  )
  author <- toString(rvest::html_attr(author_nodes, "content"))

  # article body, one paragraph per line
  paragraph_nodes <- rvest::html_elements(
    html, '[data-fragment-name="articleDetail"] p'
  )
  text <- paste(rvest::html_text2(paragraph_nodes), collapse = "\n")

  # the cover image node is looked up once and used for both fields
  image_node <- rvest::html_element(
    html, '[data-testid="article-image-wrapper"] img'
  )
  cover_image_html <- as.character(image_node)
  cover_image_url <- rvest::html_attr(image_node, "src")

  s_n_list(
    datetime,
    author,
    headline,
    text,
    cover_image_url,
    cover_image_html
  )

}
57 changes: 57 additions & 0 deletions R/deliver_irishtimes_com.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
# Parser for irishtimes.com pages.
#
# Metadata (date, headline, author, type, cover image) is read from the
# JSON-LD blocks embedded in the page; the body text is collected from the
# <p> elements inside <article>.
#
# x:       row holding the raw page in `content_raw`
# verbose: forwarded to pb_tick()
# pb:      progress bar object, forwarded to pb_tick()
# ...:     not used here
#
# Returns the result of s_n_list() on the extracted fields, or of an empty
# s_n_list() call when no usable JSON-LD block is found.
pb_deliver_paper.irishtimes_com <- function(x, verbose = NULL, pb, ...) {

  # updates progress bar
  pb_tick(x, verbose, pb)

  # raw html is stored in column content_raw
  html <- rvest::read_html(x$content_raw)

  # every JSON-LD block of the page, parsed into R objects
  json_ld <- html %>%
    rvest::html_elements("[type=\"application/ld+json\"]") %>%
    rvest::html_text2() %>%
    lapply(jsonlite::fromJSON)

  # flag blocks describing a NewsArticle; %in% always yields a single
  # TRUE/FALSE, so a missing `@type` (NA) or a block declaring several types
  # cannot break the lookup
  is_article <- vapply(
    json_ld,
    function(block) {
      "NewsArticle" %in% purrr::pluck(block, "@type", .default = NA_character_)
    },
    FUN.VALUE = logical(1)
  )

  data <- NULL
  if (any(is_article)) {
    # usually there is more than one block; keep the first NewsArticle only,
    # so several matches do not produce an invalid index
    data <- json_ld[[which(is_article)[1L]]]
  } else if (length(json_ld) == 1L) {
    # a lone block is unwrapped and used as is, whatever its type (previously
    # it stayed wrapped in the outer list and every field came back empty)
    data <- json_ld[[1L]]
  }

  # no block at all, no matching block, or a block that is not a JSON object
  if (!is.list(data)) {
    return(s_n_list())
  }

  # datetime
  datetime <- data$datePublished %>%
    lubridate::as_datetime()

  # headline
  headline <- data$headline

  # author
  author <- data$author$name %>%
    toString()

  # text
  text <- html %>%
    rvest::html_elements("article p") %>%
    rvest::html_text2() %>%
    paste(collapse = "\n")

  # first entry of the `image` field, NA when there is none
  cover_image_url <- purrr::pluck(data$image, 1, .default = NA)

  type <- data$`@type`

  s_n_list(
    datetime,
    author,
    headline,
    text,
    type,
    cover_image_url
  )

}
54 changes: 54 additions & 0 deletions R/deliver_rte_ie.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
# Parser for rte.ie pages.
#
# Date and type come from meta tags, the headline from <title>, the author
# from itemprop markup, the body text from the paragraphs in `.article-body`
# and the cover image from `#main-article-image`.
#
# x:       row holding the raw page in `content_raw`
# verbose: forwarded to pb_tick()
# pb:      progress bar object, forwarded to pb_tick()
# ...:     not used here
#
# Returns the result of s_n_list() on the extracted fields.
pb_deliver_paper.rte_ie <- function(x, verbose = NULL, pb, ...) {

  # advance the progress bar
  pb_tick(x, verbose, pb)

  # parse the page kept in content_raw
  html <- rvest::read_html(x$content_raw)

  # publication time from the article meta tag
  published_node <- rvest::html_element(
    html, '[property="article:published_time"]'
  )
  datetime <- lubridate::as_datetime(
    rvest::html_attr(published_node, "content")
  )

  # headline is the page title
  headline <- rvest::html_text2(rvest::html_element(html, "title"))

  # author names sit in the `content` attribute of the itemprop markup
  author_nodes <- rvest::html_elements(
    html, '[itemprop="author"]>[itemprop="name"]'
  )
  author <- toString(rvest::html_attr(author_nodes, "content"))

  # article body, one paragraph per line
  body_nodes <- rvest::html_elements(html, ".article-body p")
  text <- paste(rvest::html_text2(body_nodes), collapse = "\n")

  # article type as declared in the meta tag
  type_node <- rvest::html_element(html, '[name="article-type"]')
  type <- rvest::html_attr(type_node, "content")

  # the cover image node is looked up once and used for both fields
  image_node <- rvest::html_element(html, "#main-article-image img")
  cover_image_html <- as.character(image_node)
  cover_image_url <- rvest::html_attr(image_node, "src")

  s_n_list(
    datetime,
    author,
    headline,
    text,
    type,
    cover_image_url,
    cover_image_html
  )

}
57 changes: 57 additions & 0 deletions R/deliver_sky_com.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
# Parser for sky.com pages.
#
# Metadata (date, headline, author, type, cover image) is read from the
# JSON-LD blocks embedded in the page; the body text is collected from the
# <p> elements inside `.sdc-article-body`.
#
# x:       row holding the raw page in `content_raw`
# verbose: forwarded to pb_tick()
# pb:      progress bar object, forwarded to pb_tick()
# ...:     not used here
#
# Returns the result of s_n_list() on the extracted fields, or of an empty
# s_n_list() call when no usable JSON-LD block is found.
pb_deliver_paper.sky_com <- function(x, verbose = NULL, pb, ...) {

  # updates progress bar
  pb_tick(x, verbose, pb)

  # raw html is stored in column content_raw
  html <- rvest::read_html(x$content_raw)

  # every JSON-LD block of the page, parsed into R objects
  json_ld <- html %>%
    rvest::html_elements("[type=\"application/ld+json\"]") %>%
    rvest::html_text2() %>%
    lapply(jsonlite::fromJSON)

  # flag blocks describing a NewsArticle; %in% always yields a single
  # TRUE/FALSE, so a missing `@type` (NA) or a block declaring several types
  # cannot break the lookup
  is_article <- vapply(
    json_ld,
    function(block) {
      "NewsArticle" %in% purrr::pluck(block, "@type", .default = NA_character_)
    },
    FUN.VALUE = logical(1)
  )

  data <- NULL
  if (any(is_article)) {
    # usually there is more than one block; keep the first NewsArticle only,
    # so several matches do not produce an invalid index
    data <- json_ld[[which(is_article)[1L]]]
  } else if (length(json_ld) == 1L) {
    # a lone block is unwrapped and used as is, whatever its type (previously
    # it stayed wrapped in the outer list and every field came back empty)
    data <- json_ld[[1L]]
  }

  # no block at all, no matching block, or a block that is not a JSON object
  if (!is.list(data)) {
    return(s_n_list())
  }

  # datetime
  datetime <- data$datePublished %>%
    lubridate::as_datetime()

  # headline
  headline <- data$headline

  # author
  author <- data$author$name %>%
    toString()

  # text
  text <- html %>%
    rvest::html_elements(".sdc-article-body p") %>%
    rvest::html_text2() %>%
    paste(collapse = "\n")

  # `url` entry of the `image` field, NA when there is none
  cover_image_url <- purrr::pluck(data$image, "url", .default = NA)

  type <- data$`@type`

  s_n_list(
    datetime,
    author,
    headline,
    text,
    type,
    cover_image_url
  )

}
49 changes: 49 additions & 0 deletions R/deliver_thejournal_ie.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
# Parser for thejournal.ie pages.
#
# Date and author come from meta tags, the headline from <title>, the body
# text from the paragraphs of the `articleBody` element (skipping those with
# class `article-updated-redesign`) and the cover image from the element
# with class `article-primary-img-redesign`.
#
# x:       row holding the raw page in `content_raw`
# verbose: forwarded to pb_tick()
# pb:      progress bar object, forwarded to pb_tick()
# ...:     not used here
#
# Returns the result of s_n_list() on the extracted fields.
pb_deliver_paper.thejournal_ie <- function(x, verbose = NULL, pb, ...) {

  # advance the progress bar
  pb_tick(x, verbose, pb)

  # parse the page kept in content_raw
  html <- rvest::read_html(x$content_raw)

  # posting time from the article meta tag
  posted_node <- rvest::html_element(html, '[property="article:post_date"]')
  datetime <- lubridate::as_datetime(rvest::html_attr(posted_node, "content"))

  # headline is the page title
  headline <- rvest::html_text2(rvest::html_element(html, "title"))

  # all author meta tags collapsed into one comma separated string
  author_nodes <- rvest::html_elements(html, '[property="article:author"]')
  author <- toString(rvest::html_attr(author_nodes, "content"))

  # article body, one paragraph per line
  body_nodes <- rvest::html_elements(
    html, '[itemprop="articleBody"] p:not(.article-updated-redesign)'
  )
  text <- paste(rvest::html_text2(body_nodes), collapse = "\n")

  # the cover image node is looked up once and used for both fields
  image_node <- rvest::html_element(html, ".article-primary-img-redesign")
  cover_image_html <- as.character(image_node)

  # NOTE(review): this is the raw `srcset` attribute, which may list several
  # candidate URLs with descriptors rather than a single URL -- confirm
  cover_image_url <- rvest::html_attr(image_node, "srcset")

  s_n_list(
    datetime,
    author,
    headline,
    text,
    cover_image_url,
    cover_image_html
  )

}
1 change: 1 addition & 0 deletions inst/WORDLIST
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,7 @@ skwawkbox
stri
stringi
techrepublic
telegraaf
thecanary
theguardian
thelily
Expand Down
Loading

0 comments on commit 0d6ffb0

Please sign in to comment.