Skip to content

Commit

Permalink
Scrape features and add Administrative boundaries
Browse files Browse the repository at this point in the history
  • Loading branch information
ccamara committed Jun 28, 2024
1 parent 1ccbbb9 commit d43c1d8
Show file tree
Hide file tree
Showing 5 changed files with 119 additions and 51 deletions.
4 changes: 3 additions & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,9 @@ URL: https://github.com/WarwickCIM/ukgeographies, https://warwickcim.github.io/u
BugReports: https://github.com/WarwickCIM/ukgeographies/issues
Suggests:
knitr,
rmarkdown
rmarkdown,
rvest,
stringr
VignetteBuilder: knitr
Imports:
sf
Expand Down
38 changes: 29 additions & 9 deletions R/get_boundaries.R
Original file line number Diff line number Diff line change
Expand Up @@ -3,25 +3,45 @@
#' Queries [ONS' Geoportal](https://geoportal.statistics.gov.uk/) endpoints and
#' retrieves the requested geographical boundaries in the form of a sf object.
#'
#' @param boundary a string containing...
#' @param boundary a string containing ... Accepted values are:
#' `r levels(data_urls$boundary)`
#' @param year a number containing...
#' @param resolution a string containing...
#' @param detail_level a string defining the level of detail in the geometry.
#' Accepted values are: `r levels(data_urls$resolution)`. Each value
#' corresponds to:
#'
#' - Full Extent (BFE) – Full resolution boundaries go to the Extent of the Realm (Low Water Mark) and are the most detailed of the boundaries.
#' - Full Clipped (BFC) – Full resolution boundaries that are clipped to the coastline (Mean High Water mark).
#' - Generalised Clipped (BGC) - Generalised to 20m and clipped to the coastline (Mean High Water mark) and more generalised than the BFE boundaries.
#' - Super Generalised Clipped (BSC) (200m) – Generalised to 200m and clipped to the coastline (Mean High Water mark).
#' - Ultra Generalised Clipped (BUC) (500m) – Generalised to 500m and clipped to the coastline (Mean High Water mark).
#' - Grid, Extent (BGE) - Grid formed of equally sized cells which extend beyond the coastline.
#' - Generalised, Grid (BGG) - Generalised 50m grid squares.
#'
#' For a detailed description of the methodology refer to [Digital boundaries](https://www.ons.gov.uk/methodology/geography/geographicalproducts/digitalboundaries)
#'
#'
#'
#' @return a sf object with the selected boundaries
#'
#' @return a sf object
#' @export
#'
get_boundaries <- function(boundary, year, resolution="BUC"){
# Check that boundary is not empty and is a string
# Check that year is not empty and is a number
#' @examples
#' CA_2023_BGC <- get_boundaries("Combined Authorities", 2023, "BGC")
#'
#' class(CA_2023_BGC)
#'
get_boundaries <- function(boundary, year, detail_level="BUC"){
# TODO Check that boundary is not empty and is a string
# TODO Check that year is not empty and is a number
# TODO if there's no combination of boundary and year, assign the closer
# number (floor) and print a message stating the assumption.

lookup <- paste(boundary, year, resolution, sep = "_")


url <- data_urls$url_download[data_urls$id == lookup]

spdf <- sf::read_sf(url)

return(spdf)
}

# test <- get_boundaries()
Binary file modified R/sysdata.rda
Binary file not shown.
100 changes: 63 additions & 37 deletions data-raw/data-urls.R
Original file line number Diff line number Diff line change
Expand Up @@ -2,51 +2,77 @@
## retrieve the data.

library(dplyr)
library(rvest)
library(stringr)

# Variables ---------------------------------------------------------------

geoportal_base_url <- "https://services1.arcgis.com/ESMARspQHYMw9BZ9/arcgis/rest/services/"
# Scrape URLS -------------------------------------------------------------

geoportal_base_url <- "https://services1.arcgis.com"

# Data frame --------------------------------------------------------------
ons_geoportal <- read_html(paste0(geoportal_base_url,
"/ESMARspQHYMw9BZ9/arcgis/rest/services/"))

data_urls_raw <- tibble::tribble(
# Column Names ---
# The category of the boundary, according to ONS. Administrative, Census...
~boundary_category,
# The type of boundary. Can be Countries, Counties...
~boundary,
~year,
# Resolution or clipping. Can be either: BFC, BFE, BGC, BUC.
~resolution,
# string with url to download endpoint. If not present, will be generated
# programatically.
~url_download,
~licence,
# URL to the dataset's metadata.
~url_metadata,
services <- ons_geoportal |>
html_elements("li") |>
html_elements("a") |>
html_attr("href")

# Boundaries: Add one per line.
# Countries
"administrative", "countries", 2022, "BUC", "https://services1.arcgis.com/ESMARspQHYMw9BZ9/arcgis/rest/services/Countries_December_2022_GB_BUC/FeatureServer/0/query?outFields=*&where=1%3D1&f=geojson", NA, NA,
"administrative", "countries", 2019, "BUC", "https://services1.arcgis.com/ESMARspQHYMw9BZ9/arcgis/rest/services/Countries_Dec_2019_UGCB_GB_2022/FeatureServer/0/query?outFields=*&where=1%3D1&f=geojson", NA, NA
)

# Build the dataframe -----------------------------------------------------


data_urls <- data_urls_raw |>
data_urls <- as.data.frame(services) |>
# Remove Mapserver
filter(str_detect(services, "/FeatureServer")) |>
# Convert absolute URLS
mutate(services = paste0(geoportal_base_url, services)) |>
mutate(type = case_when(
str_detect(services, "Lookup") ~ "Lookup",
)) |>
# Infer categories from titles
mutate(boundary = case_when(
str_detect(services, "Combined_Authorities") ~ "CAUTH",
str_detect(services, "/Counties_and_Unitary_Authorities") ~ "CTYUA",
str_detect(services, "/Counties_") ~ "CTY",
str_detect(services, "/Countries_") ~ "CTRY",
str_detect(services, "/County_Electoral_Division") ~ "CED",
str_detect(services, "/Local_Authority_Districts") ~ "LAD",
str_detect(services, "/Local_Planning_Authorities") ~ "LPA",
str_detect(services, "/Metropolitan_Counties") ~ "MCTY",
str_detect(services, "/Parishes_and_Non_Civil_Parished_Areas") ~ "PARNCP",
str_detect(services, "/Parishes") ~ "PAR",
str_detect(services, "/Regions") ~ "RGN",
str_detect(services, "/Upper_Tier") ~ "UTLA",
str_detect(services, "/Wards") ~ "WD",
str_detect(services, "/Lower_Layer") ~ "LSOA",
str_detect(services, "/Middle_Layer") ~ "MSOA",
str_detect(services, "/Output_Areas") ~ "OA",
),
boundary = as.factor(boundary)) |>
mutate(boundary_type = case_when(
boundary %in% c("CAUTH", "CTYUA", "CTY", "CTRY", "CED", "LAD",
"LPA", "MCTY", "PARNCP", "PAR", "RGN", "UTLA", "WD") ~
"Administrative",
boundary %in% c("LSOA", "MSOA", "OA") ~ "Census Boundaries"
),
boundary_type = as.factor(boundary_type)) |>
relocate(boundary_type, .before = boundary) |>
mutate(resolution = case_when(
str_detect(services, "_BFC") ~ "BFC",
str_detect(services, "_BFE") ~ "BFE",
str_detect(services, "_BGC") ~ "BGC",
str_detect(services, "_BUC") ~ "BUC"
),
detail_level = as.factor(resolution)) |>
mutate(year = str_extract(services, "_(19|20)(\\d){2}"),
year = as.numeric(str_remove(year, "_"))) |>
# Create URL to query featureserver and return a geojson file.
mutate(url_download = paste0(services, "/0/query?where=1%3D1&outFields=*&outSR=4326&f=json")) |>
#
# data_boundaries <- data_urls |>
# filter(!is.na(boundary)) |>
# Create unique id
mutate(id = paste(boundary, year, resolution, sep = "_")) |>
relocate(id) |>
# Create url_download if empty
mutate(url_download = case_when(
is.na(url_download) ~ paste0(geoportal_base_url, "hello" ),
.default = url_download)) |>
# Convert to factors
mutate(boundary_category = as.factor(boundary_category),
boundary = as.factor(boundary),
resolution = as.factor(resolution))


mutate(id = paste(boundary, year, detail_level, sep = "_")) |>
relocate(id)

usethis::use_data(data_urls, overwrite = TRUE, internal = TRUE)
28 changes: 24 additions & 4 deletions man/get_boundaries.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

0 comments on commit d43c1d8

Please sign in to comment.