Skip to content

Commit

Permalink
046 cran fix
Browse files Browse the repository at this point in the history
  • Loading branch information
michalovadek committed Sep 6, 2023
1 parent 3a23f7f commit 225684d
Show file tree
Hide file tree
Showing 33 changed files with 983 additions and 419 deletions.
1 change: 1 addition & 0 deletions .Rbuildignore
Original file line number Diff line number Diff line change
Expand Up @@ -12,3 +12,4 @@
^Meta$
^CRAN-SUBMISSION$
^cran-comments\.md$
^vignettes/articles$
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
Package: eurlex
Type: Package
Title: Retrieve Data on European Union Law
Version: 0.4.5
Version: 0.4.6
Authors@R: c(person(given = "Michal",
family = "Ovadek",
role = c("aut", "cre", "cph"),
Expand Down
4 changes: 4 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,10 @@
## Minor changes

- minor changes to documentation
- cleaned up the code that makes HTTP calls
- calls to `elx_council_votes()` and `elx_curia_list()` now fail gracefully
- `.data` pronoun replaced by quoted column names in tidyselect functions
- Internet-using vignettes moved to site-only articles

# eurlex 0.4.5

Expand Down
25 changes: 21 additions & 4 deletions R/elx_council_votes.R
Original file line number Diff line number Diff line change
Expand Up @@ -89,13 +89,30 @@ elx_council_votes <- function(){
}
ORDER BY DESC(?decisionDate), ?votingInstCode
"

votes <- httr::POST(url = "https://data.consilium.europa.eu/sparql",
body = list(query = query),
httr::add_headers('Accept' = 'text/csv')) %>%

# run query
votes_resp <- graceful_http(
remote_file = "https://data.consilium.europa.eu/sparql",
body = list(query = query),
httr::content_type("multipart"),
headers = httr::add_headers('Accept' = 'text/csv'),
encode = "multipart",
verb = "POST"
)

# if the request returned no response, exit quietly with NULL
if (is.null(votes_resp)){

return(invisible(NULL))

}

# process response
votes <- votes_resp %>%
httr::content("text") %>%
readr::read_csv(col_types = readr::cols(.default = "c"))

# return
return(votes)

}
20 changes: 13 additions & 7 deletions R/elx_curia_list.R
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,16 @@ elx_curia_list <- function(data = c("all","ecj_old","ecj_new","gc_all","cst_all"

elx_curia_scraper <- function(url, ...){

page <- xml2::read_html(url(url, open = "rb"))
response <- graceful_http(url, verb = "GET")

# if the request returned no response, exit quietly with NULL
if (is.null(response)){

return(invisible(NULL))

}

page <- xml2::read_html(response)

tab <- page %>%
rvest::html_node("table") %>%
Expand Down Expand Up @@ -131,7 +140,7 @@ elx_curia_scraper <- function(url, ...){
dplyr::ungroup()

out <- dplyr::left_join(tab, linked, by = c("case_id"="linked_id","n_id"="n_id")) %>%
dplyr::select(.data$case_id, .data$linked_celex, .data$case_info) %>%
dplyr::select("case_id", "linked_celex", "case_info") %>%
dplyr::rename(case_id_celex = linked_celex)

return(out)
Expand All @@ -152,16 +161,13 @@ elx_curia_parse <- function(x, ...){
see_case = stringr::str_extract(.data$case_info, "see Case .+") %>%
stringr::str_remove("see Case ") %>%
stringr::str_remove("APPEAL.*") %>%
stringr::str_squish() %>%
stringr::str_trim(),
stringr::str_squish(),
appeal = stringr::str_extract(.data$case_info, "APPEAL.*") %>%
stringr::str_remove("APPEAL.? :") %>%
stringr::str_remove_all("\\;|\\,|\\.") %>%
stringr::str_squish() %>%
stringr::str_trim()
stringr::str_squish()
)

return(out)

}

101 changes: 68 additions & 33 deletions R/elx_fetch_data.R
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#' Retrieve additional data on EU documents
#'
#' Wraps httr::GET with pre-specified headers and parses retrieved data.
#' Get titles, texts, identifiers and XML notices for EU resources.
#'
#' @param url A valid url as character vector of length one based on a resource identifier such as CELEX or Cellar URI.
#' @param type The type of data to be retrieved. When type = "text", the returned list contains named elements reflecting the source of each text. When type = "notice", the results return an XML notice associated with the url.
Expand Down Expand Up @@ -38,8 +38,10 @@ elx_fetch_data <- function(url, type = c("title","text","ids","notice"),

if (type == "notice" & missing(notice)){stop("notice type must be given")}

# format language query
language <- paste(language_1,", ",language_2,";q=0.8, ",language_3,";q=0.7", sep = "")

# process URL
if (stringr::str_detect(url,"celex.*[\\(|\\)|\\/]")){

clx <- stringr::str_extract(url, "(?<=celex\\/).*") %>%
Expand All @@ -53,12 +55,20 @@ elx_fetch_data <- function(url, type = c("title","text","ids","notice"),

}

# titles
if (type == "title"){

response <- graceful_http(url,
headers = httr::add_headers('Accept-Language' = language,
'Accept' = 'application/xml; notice=object'),
verb = "GET")

# if the request returned no response, exit quietly with NULL
if (is.null(response)){

return(invisible(NULL))

}

if (httr::status_code(response)==200){

Expand All @@ -71,6 +81,7 @@ elx_fetch_data <- function(url, type = c("title","text","ids","notice"),

}

# full text
if (type == "text"){

response <- graceful_http(url,
Expand All @@ -79,6 +90,13 @@ elx_fetch_data <- function(url, type = c("title","text","ids","notice"),
'Accept' = 'text/html, text/html;type=simplified, text/plain, application/xhtml+xml, application/xhtml+xml;type=simplified, application/pdf, application/pdf;type=pdf1x, application/pdf;type=pdfa1a, application/pdf;type=pdfx, application/pdf;type=pdfa1b, application/msword'),
verb = "GET")

# if the request returned no response, exit quietly with NULL
if (is.null(response)){

return(invisible(NULL))

}

if (httr::status_code(response)==200){

out <- elx_read_text(response, html_text = html_text)
Expand Down Expand Up @@ -142,12 +160,20 @@ elx_fetch_data <- function(url, type = c("title","text","ids","notice"),

}

# identifiers
if (type == "ids"){

response <- graceful_http(url,
headers = httr::add_headers('Accept-Language' = language,
'Accept' = 'application/xml; notice=identifiers'),
verb = "GET")

# if the request returned no response, exit quietly with NULL
if (is.null(response)){

return(invisible(NULL))

}

if (httr::status_code(response)==200){

Expand All @@ -160,6 +186,7 @@ elx_fetch_data <- function(url, type = c("title","text","ids","notice"),

}

# notices
if (type == "notice"){

accept_header <- paste('application/xml; notice=',
Expand All @@ -184,6 +211,13 @@ elx_fetch_data <- function(url, type = c("title","text","ids","notice"),

}

# if the request returned no response, exit quietly with NULL
if (is.null(response)){

return(invisible(NULL))

}

if (httr::status_code(response)==200){

out <- httr::content(response)
Expand All @@ -192,6 +226,7 @@ elx_fetch_data <- function(url, type = c("title","text","ids","notice"),

}

# end
return(out)

}
Expand Down Expand Up @@ -225,40 +260,40 @@ elx_read_text <- function(http_response, html_text = "text2"){

}

if (stringr::str_detect(http_response$headers$`content-type`,"html")){

out <- http_response %>%
xml2::read_html() %>%
rvest::html_node("body") %>%
html_text_engine() %>%
paste0(collapse = " ---pagebreak--- ")

names(out) <- "html"

}

else if (stringr::str_detect(http_response$headers$`content-type`,"pdf")){

out <- http_response$url %>%
pdftools::pdf_text() %>%
paste0(collapse = " ---pagebreak--- ")

names(out) <- "pdf"

}

else if (stringr::str_detect(http_response$headers$`content-type`,"msword")){

out <- http_response$url %>%
antiword::antiword() %>%
paste0(collapse = " ---pagebreak--- ")
if (stringr::str_detect(http_response$headers$`content-type`,"html")){

out <- http_response %>%
xml2::read_html() %>%
rvest::html_node("body") %>%
html_text_engine() %>%
paste0(collapse = " ---pagebreak--- ")

names(out) <- "html"

}

names(out) <- "word"
else if (stringr::str_detect(http_response$headers$`content-type`,"pdf")){

out <- http_response$url %>%
pdftools::pdf_text() %>%
paste0(collapse = " ---pagebreak--- ")

names(out) <- "pdf"

}

} else {
out <- "unsupported format"
names(out) <- "unsupported"
}
else if (stringr::str_detect(http_response$headers$`content-type`,"msword")){

out <- http_response$url %>%
antiword::antiword() %>%
paste0(collapse = " ---pagebreak--- ")

names(out) <- "word"

} else {
out <- "unsupported format"
names(out) <- "unsupported"
}

return(out)

Expand Down
Loading

0 comments on commit 225684d

Please sign in to comment.