From 7f52582e8737da5146157161fcd15ae27155b1ae Mon Sep 17 00:00:00 2001 From: kthompson25 Date: Tue, 1 Oct 2024 12:39:15 +0200 Subject: [PATCH] Initial commit --- Code/00_BBS Initial Cleaning.R | 387 ++++++++++++++++++++++++++ Code/01_Creating spatial BBS routes.R | 290 +++++++++++++++++++ Code/02_BBS Final Cleaning.R | 117 ++++++++ 3 files changed, 794 insertions(+) create mode 100644 Code/00_BBS Initial Cleaning.R create mode 100644 Code/01_Creating spatial BBS routes.R create mode 100644 Code/02_BBS Final Cleaning.R diff --git a/Code/00_BBS Initial Cleaning.R b/Code/00_BBS Initial Cleaning.R new file mode 100644 index 0000000..444f2ab --- /dev/null +++ b/Code/00_BBS Initial Cleaning.R @@ -0,0 +1,387 @@ +############################################################################### +######## ######## +######## Downloading and Cleaning BBS Data ######## +######## ######## +############################################################################### + +# Author: Kimberly Thompson + +# This code does initial +# cleaning and preprocessing of the North American Breeding Bird +# Survey Dataset including: stop level data, migrants (stop-level format), +# routes, and weather. 
+ +# Main preprocessing step is to add a unique route ID to each dataset, which +# simply combines the state ID with the route ID, so that each dataset can +# talk to one another + +# Website for data download +# https://www.sciencebase.gov/catalog/item/5ea04e9a82cefae35a129d65 + +# state level: 1966-2023, 50 stops along each route aggregated into 10-stop +# bins +# stop level: 1997-2023, data for each of 50 stops recorded + + +# Download includes: +# States - 62 individual files, 1 for each state and province - 1966-2023 +# Stop_Data - 10 files, with 5-9 states represented in each file, data for +# all 50 stops for each route, officially 1997-2023 but some other +# older years sneak in +# Migrants - counts of non-breeding birds, stop data format, data for +# all 50 stops for each route, officially 1997-2023 but some other +# older years sneak in +# Migrant Summary - counts for non-breeding birds, state data format, +# 1966-2023 +# Routes - Route information plus the lat long coordinate of the route's +# starting point +# Species List - Links AOU (4-digit species ID code) with order, family, genus +# and species +# Vehicle Data - vehicle counts and excessive noise ??? no info in metadata +# Weather - route quality information and weather conditions - more info in +# metadata + +# Why do we focus only on stop level data here? +# Because we will match with MODIS data from 2001-2023; therefore the stop level +# data fits our timeline AND in case we decide to incorporate some method for +# detection probability, the stop level data provides the best option for that. 
# Produces datasets (written at the end of this script):
# - BBS_StopLev_01_23_15plus.csv
# - BBS_MigStop_01_23_15plus.csv
# - BBS_Routes.csv
# - BBS_Weather_Quality.csv


########## clean workspace and load required packages ####################

# clean workspace to improve efficiency: #
# rm(list = ls() )
# gc() #releases memory

library(tidyverse) # Data cleaning


###############################################
###                                         ###
###          Stop-Level Data                ###
###            Processing                   ###
###############################################

# Combine data into 1 CSV file, add a unique identifier for each route

# File path - set to Bioviewpoint folder
path <- "I:/mas/04_personal/Kim_T/BioViewPoint"

# Full paths to the per-state stop-level files.
# full.names = TRUE makes every entry an absolute path, so the files can be
# read from any working directory. (The original listed bare file names and
# then setwd(path), so read.csv looked in the project root instead of the
# Stop_Data subfolder and the reads failed.)
stop.list <- list.files(file.path(path, "00_Data/Raw/BBS/Stop_Data"),
                        full.names = TRUE)

# Read every file and combine them in one pass. A preallocated list bound
# once at the end avoids the O(n^2) copying of rbind-ing inside the loop.
stop.contents <- vector("list", length(stop.list))

for (i in seq_along(stop.list)) {

  tmp.file <- read.csv(stop.list[i], header = TRUE)
  stop.contents[[i]] <- tmp.file

  # progress indicator
  print(i)

}

master <- dplyr::bind_rows(stop.contents)

# Column RPID represents 3-digit run protocol ID
# 101: Standard BBS Protocol: 3-minute counts, 1 observer, single run per year
# 102: Standard BBS Protocol, Replicate 1: 3-minute counts, 1 observer
#      (same or different person), second run in year
# 103: Standard BBS Protocol, Replicate 2: 3-minute counts, 1 observer
#      (same or different person), third run in year
# 104: Standard BBS Protocol, Replicate 3: 3-minute counts, 1 observer
#      (same or different person), fourth run in year.
# 203: Experimental Protocol: Double-Observer Independent: 3-minute counts,
#      2 observer method, single run per year, Secondary Observer
# 501: Experimental Protocol: Distance and time-of-detection methods: Three
#      1-minute sample periods; two distance bands (0-50 m, >50-400m)
# 502: Experimental Protocol: Distance and time-of-detection methods, Rep1:
#      Three 1-minute sample periods; two distance bands (0-50 m, >50-400m),
#      second run in year.
# Frequency of each protocol type
master %>%
  count(RPID) %>%
  mutate(prop = prop.table(n))

# We will keep only protocol 101 which represents 99.4% of the data
master <- master %>%
  filter(RPID == 101)

# Define both StateNum and Route as character columns so that trailing zeros
# are retained
master$StateNum <- as.character(master$StateNum)
master$Route <- as.character(master$Route)

# Create a unique route identifier (state ID + route ID) so that the BBS
# datasets can be linked to one another
master$unique_route <- paste(master$StateNum, master$Route, sep = "_")

# Filter data to be only for 2001 - 2023
master <- master %>%
  filter(Year >= 2001)

# How many routes do we have given these initial filters?
length(unique(master$unique_route))
# 4944


###############################################
###                                         ###
###          Stop-Level Data                ###
###      Processing Time Series             ###
###############################################

# To ensure the time series analysis can proceed, we need to see which routes
# do not have sufficient observations within the 2001-2023 period

# Make a vector of unique route numbers
route.num <- unique(master$unique_route)

# Number of distinct survey years per route. A preallocated list that is
# bound once avoids the O(n^2) copying of rbind-ing a data frame inside
# the loop.
year.counts <- vector("list", length(route.num))

for (i in seq_along(route.num)) {

  # find the number of unique years for each unique route
  years <- length(unique(master$Year[master$unique_route == route.num[i]]))

  # One-row data frame for this route
  tmp.df <- data.frame(Route = route.num[i], Total = years)
  year.counts[[i]] <- tmp.df

  print(i)

}

total.years <- dplyr::bind_rows(year.counts)

# Frequency of total years surveyed between 2001-2023
total.years %>%
  count(Total) %>%
  mutate(prop = prop.table(n))

# Retain routes that have been surveyed at least 15 years out of the 23 possible
# *** This threshold could be adjusted later depending on the causal methodology
# we use

# Create a vector of routes to retain
routes_to_retain <- total.years$Route[total.years$Total >= 15]

# Filter the master df according to routes to retain
master <- master %>%
  filter(unique_route %in% routes_to_retain)

# Create a column for the total abundance (summed across each stop)
master$Total.Abund <- rowSums(master[, grep("^Stop", names(master))],
                              na.rm = TRUE)

# Reorder columns: move unique_route (column 58) to sit after the first 4 ID
# columns; Total.Abund (column 59) stays last.
# NOTE(review): positional reorder assumes the raw stop files always yield
# 57 columns before the two added ones - confirm if the BBS release changes.
master <- master[ , c(1:4, 58, 5:57, 59)]

# clean up workspace
# (path is deliberately NOT removed: it is still needed below to read the
# Migrants, Routes, and Weather files - removing it here was a bug)
rm(tmp.df, tmp.file, total.years, i, route.num, stop.list,
   years)


###############################################
###                                         ###
###              Migrants                   ###
###             Processing                  ###
###############################################

# Same steps as for the stop-level data:
# RPID, unique_route, year range, routes-to-retain, Total abundance

migrants <- read.csv(paste(path, "/00_Data/Raw/BBS/Migrants.csv",
                           sep = ""),
                     header = TRUE)

# keep only protocol 101
migrants <- migrants %>%
  filter(RPID == 101)

# Define both StateNum and Route as character columns so that trailing zeros
# are retained
migrants$StateNum <- as.character(migrants$StateNum)
migrants$Route <- as.character(migrants$Route)

# Create a unique route identifier
migrants$unique_route <- paste(migrants$StateNum, migrants$Route, sep = "_")

# Filter data to be only for 2001 - 2023
migrants <- migrants %>%
  filter(Year >= 2001)

# Filter the migrants df according to routes to retain
migrants <- migrants %>%
  filter(unique_route %in% routes_to_retain)

length(unique(migrants$unique_route))
# 1418 routes of the 2605 in the master data have info on migrants

# Create a column for the total abundance (summed across each stop)
migrants$Total.Abund <- rowSums(migrants[, grep("^Stop", names(migrants))],
                                na.rm = TRUE)


###############################################
###                                         ###
###               Routes                    ###
###              Processing                 ###
###############################################

routes <- read.csv(paste(path, "/00_Data/Raw/BBS/Routes.csv",
                         sep = ""),
                   header = TRUE)

# Active: active (1) or discontinued (0)
# RouteTypeID: indicates if a route was established along a roadside or
# on a body of water (1 = Roadside, 2 = water)

# RouteTypeDetailID: Indicates route length and selection criteria
# 1: Location of route randomly established, 50 point counts
# 2: Location of route non-randomly established, 50 point counts
# 3: Location of route non-randomly established, < 50 point counts

# Frequency of DetailID types
routes %>%
  count(RouteTypeDetailID) %>%
  mutate(prop = prop.table(n))

# Remove type 3 which accounts for 0.003% of routes
routes <- routes %>%
  filter(RouteTypeDetailID != 3)

# Store the state and route IDs as character so trailing zeros are retained,
# then build the unique route identifier from them
routes <- routes %>%
  mutate(StateNum = as.character(StateNum),
         Route = as.character(Route),
         unique_route = paste(StateNum, Route, sep = "_"))


###############################################
###                                         ###
###           Weather/Quality               ###
###             Processing                  ###
###############################################

# Same steps as before: RPID, unique_route, year range, routes-to-retain

weather <- read.csv(paste(path, "/00_Data/Raw/BBS/Weather.csv",
                          sep = ""),
                    header = TRUE)

# Build the unique route identifier (IDs as character to keep trailing
# zeros), then keep only protocol 101, the 2001 - 2023 window, and the
# routes retained above
weather <- weather %>%
  mutate(StateNum = as.character(StateNum),
         Route = as.character(Route),
         unique_route = paste(StateNum, Route, sep = "_")) %>%
  filter(RPID == 101,
         Year >= 2001,
         unique_route %in% routes_to_retain)

length(unique(weather$unique_route))
# 2605

# Month: Month surveyed; values range from April - July. We may want to
# filter this further depending on what we acquire for MODIS, or just
# include this in the analysis

# Day: Day surveyed; same as above

# Quality Current ID: indicates if the sampling took place under suitable
# weather conditions and met suitable time, data, and route completion
# criteria
# 1 - Data meet quality control criteria
# 0 - Data do not meet one or more quality control criteria

# Remove samplings with a 0 for quality control
weather <- weather %>%
  filter(QualityCurrentID == 1)

# Runtype: identifies which data were collected consistent with all standard
# BBS criteria
# 1: consistent
# 0: not consistent

# Remove samplings with a 0 for RunType
weather <- weather %>%
  filter(RunType == 1)


###############################################
###                                         ###
###     Last step in Initial Cleaning       ###
###                                         ###
###############################################

# Each dataset has been pared to retain only routes with high quality and
# adherence to protocols

length(unique(master$unique_route))
# 2605

length(unique(routes$unique_route))
# 5805

length(unique(weather$unique_route))
# 2455

# The routes in weather should be used to further pare the master and routes
# dataframes, as well as the migrant dataframe.
final.routes <- unique(weather$unique_route)

# Filter the master df according to final.routes
master <- master %>%
  filter(unique_route %in% final.routes)

# Filter the routes df according to final.routes
routes <- routes %>%
  filter(unique_route %in% final.routes)

# Filter the migrants df according to final.routes
migrants <- migrants %>%
  filter(unique_route %in% final.routes)


# Write the resulting files
write.csv(master, paste(path,
                        "/00_Data/Processed/BBS/1st Cleaning/BBS_StopLev_01_23_15plus.csv",
                        sep = ""), row.names = FALSE)

write.csv(migrants, paste(path,
                          "/00_Data/Processed/BBS/1st Cleaning/BBS_MigStop_01_23_15plus.csv",
                          sep = ""), row.names = FALSE)

write.csv(routes, paste(path,
                        "/00_Data/Processed/BBS/1st Cleaning/BBS_Routes.csv",
                        sep = ""), row.names = FALSE)

write.csv(weather, paste(path,
                         "/00_Data/Processed/BBS/1st Cleaning/BBS_Weather_Quality.csv",
                         sep = ""), row.names = FALSE)
\ No newline at end of file
diff --git a/Code/01_Creating spatial BBS routes.R b/Code/01_Creating spatial BBS routes.R new file mode 100644 index 0000000..1c7a62f --- /dev/null +++ b/Code/01_Creating spatial BBS routes.R @@ -0,0 +1,290 @@
############################################################################################
########                                                                            ########
########              Determining spatial coordinates of BBS routes                 ########
########                                                                            ########
############################################################################################

# Author: Kimberly Thompson

# This code creates a simple features (package sf) dataframe of spatial
# points for each route in the North American Breeding Bird Survey Dataset
# (subsetted to SW as a pilot). The name of the route (as an identifier that
# can be matched with the biodiversity data) is also included.

# Important websites:
# Download shapefile of BBS routes from
# https://www.mbr-pwrc.usgs.gov/bbs/geographic_information/Instructions_trend_route.htm

# This shapefile of routes is from the 2004 release, the USGS no longer
# maintains data on the route paths so we have to work with what we have.

########## clean workspace and load required packages ####################

# clean workspace to improve efficiency: #
# (left commented out for consistency with scripts 00 and 02 - wiping the
# global environment is a destructive side effect for anyone sourcing this
# script)
# rm(list = ls() )
# gc() #releases memory

library(sp)
library(sf)
library(units)

library(ggplot2) # Plotting

library(tidyverse) # Data organization

# Set path to Bioviewpoint folder
path <- "I:/mas/04_personal/Kim_T/BioViewPoint"


###############################################
###                                         ###
###  Some Notes about Coordinate Systems    ###
###                                         ###
###############################################

# Deciding which projection to use
# https://gis.stackexchange.com/questions/104005/choosing-projected-coordinate-system-for-mapping-all-us-states
# Albers Equal Area Conic (Heinrich Albers, 1805): Like Lambert Conformal
# Conic, this is a very popular map projection for the US, Canada and other
# continental/large countries with a primarily E-W extent. Used by the USGS
# for maps showing the conterminous United States (48 states) or large areas
# of the United States. Used for many thematic maps, especially choropleth
# and dot density maps.
# From searchable list of codes at http://www.spatialreference.org
# Could not actually find the epsg code for this reference though
# EPSG code found at this site
# https://guides.library.duke.edu/r-geospatial/CRS
# epsg 5070: USA_Contiguous_Albers_Equal_Area_Conic


###############################################
###                                         ###
###       Load Shapefiles and CSVs          ###
###                                         ###
###############################################


# Define the coordinate system.
# sf::st_crs(5070) replaces the deprecated sp::CRS("+init=epsg:5070") form
# (the "+init=epsg:xxxx" syntax is obsolete under PROJ >= 6); st_set_crs()
# and st_transform() both accept a crs object directly.
albers <- sf::st_crs(5070)

# Load USA and Canada shapefile
usa.shape <- sf::st_read(paste(path, "/00_Data/Raw/USACANAB.shp",
                               sep = ""))

# Define coordinate system as albers:
usa.shape <- sf::st_set_crs(usa.shape, albers)


# Load BBS routes shapefile
route.shapefile <- sf::st_read(paste(path,
                                     "/00_Data/Raw/BBS/GIS/nabbs02_mis_alb.shp",
                                     sep = ""))

# Define coordinate system as albers:
route.shapefile <- sf::st_set_crs(route.shapefile, albers)


# Load the BBS route data (output of 00_BBS Initial Cleaning.R; that script
# writes into the "1st Cleaning" subfolder, which the original path omitted)
bbs_route.data <- read.csv(paste(path,
                                 "/00_Data/Processed/BBS/1st Cleaning/BBS_Routes.csv",
                                 sep = ""),
                           header = TRUE)



###############################################
###                                         ###
###   Convert bbs.routes into sf object     ###
###                                         ###
###                                         ###
###############################################

# Make bbs.route.data an sf object (lat, long in degrees)
bbs_route.data_sf <- sf::st_as_sf(x = bbs_route.data,
                                  coords = c("Longitude", "Latitude"),
                                  crs = "+proj=longlat +datum=WGS84 +ellps=WGS84 +towgs84=0,0,0")

# Convert crs to albers:
bbs_route.data_sf <- sf::st_transform(bbs_route.data_sf, albers)



###############################################
###                                         ###
###   Find which route shapes match the     ###
###          bbs observations               ###
###                                         ###
###############################################

# The spatial data provided in the BBS download gives the starting point of
# each route. We need to match the starting point to the vector of route
# shapes.
# Find the route shape feature that is closest to the bbs observations (point)
closest.route <- st_nearest_feature(bbs_route.data_sf, route.shapefile)

# Check that the order of the vector of indices for the closest linestring
# lines up with the corresponding point
plot(route.shapefile$geometry[closest.route[1]])
plot(bbs_route.data_sf$geometry[1], add = TRUE)


###############################################
###                                         ###
###     Calculate the distance btw the      ###
###  starting point and the closest route   ###
###                                         ###
###############################################

# Find the nearest points on the linestrings
nearest.points <- st_nearest_points(bbs_route.data_sf,
                                    route.shapefile[closest.route, ],
                                    pairwise = TRUE)
# Calculate distances
distances <- st_length(nearest.points)

# Make dataframe from distances and convert to integer
df <- data.frame(distance = distances)
df$distance <- as.integer(df$distance)

# Examine frequencies of distances (in meters)
ggplot(df, aes(x = distance)) +
  geom_histogram(binwidth = 100, fill = "steelblue", color = "black") +
  scale_x_continuous(lim = c(0, 2500)) +
  scale_y_continuous(lim = c(0, 300)) +
  labs(title = "Histogram of Distances",
       x = "Distance",
       y = "Frequency") +
  theme_minimal()

# We need to set a threshold for the distance which we find to be too far to
# confidently say that the starting point of the route matches the linestring
# of the route
# 1 km (1000 meters) seems appropriate

# Add the distances to bbs_route.data_sf
bbs_route.data_sf$dist_to_route <- distances


###############################################
###                                         ###
###       Merging point geometry and        ###
###         linestring geometry             ###
###                                         ###
###############################################

# Reduce the Route shapefile based on the vector of indices - this already
# has it in the right order
updated.route.shapefile <- route.shapefile[closest.route, ]

# Create a dataframe with all the information in both the bbs routes df and
# the updated.route df but with the point geometry only
bbs_route.data_sf.point <- cbind(bbs_route.data_sf, updated.route.shapefile)
bbs_route.data_sf.point$geometry.1 <- NULL

# Create a dataframe with all the information in both the bbs routes df and
# the updated.route df but with the route shape (linestring) geometry only
bbs_route.data_sf.line <- cbind(bbs_route.data_sf, updated.route.shapefile)
bbs_route.data_sf.line$geometry <- NULL

# Rename the remaining (linestring) geometry column by name rather than by
# numeric position - the original hard-coded column 21, which silently
# breaks if the number of attribute columns ever changes
names(bbs_route.data_sf.line)[names(bbs_route.data_sf.line) == "geometry.1"] <-
  "geometry"


###############################################
###                                         ###
###   Verifying the merge by comparing      ###
###      route names and examining          ###
###            distances                    ###
###############################################

# Make a separate dataframe (to aid in visual inspection) with the route
# names

name.comparison <- data.frame(Point.Name = bbs_route.data_sf.point$RouteName,
                              Line.Name = bbs_route.data_sf.point$SRTENAME,
                              Logical = bbs_route.data_sf.point$RouteName ==
                                bbs_route.data_sf.point$SRTENAME,
                              Distance =
                                as.integer(bbs_route.data_sf.point$dist_to_route))

# Examine the frequency of true/falses
name.comparison %>%
  count(Logical) %>%
  mutate(Prop = prop.table(n))

# 152 falses, 2 NAs

# Visual inspection of these instances to see if it is really a mismatch or
# simply a slight difference in how the names were recorded.
# It seems like the distance is a better way to determine the routes to
# retain than matching the names because for some routes it seems the naming
# convention changed either slightly or to a total renaming.
# check if after removing the high distances does this reduce the number of
# falses
name.comparison_reduced <- name.comparison %>%
  filter(Distance <= 1000)

# Examine the frequency of true/falses
name.comparison_reduced %>%
  count(Logical) %>%
  mutate(Prop = prop.table(n))

###############################################
###                                         ###
###    Pare dataframes to observations      ###
###      that fit distance criteria         ###
###                                         ###
###############################################

# Distance threshold: 1 km

bbs_route.data_sf.line <- bbs_route.data_sf.line %>%
  filter(dist_to_route <= set_units(1000, "m"))

bbs_route.data_sf.point <- bbs_route.data_sf.point %>%
  filter(dist_to_route <= set_units(1000, "m"))


###############################################
###                                         ###
###   Final Cleaning: Exclude Canada        ###
###           and Alaska                    ###
###                                         ###
###############################################

# 840 is the country number of the US
# 3 is the state number of Alaska

# (filter() evaluates column names inside the data frame, so the df$ prefix
# on each column is unnecessary)
bbs_route.data_sf.line <- bbs_route.data_sf.line %>%
  filter(CountryNum == 840,
         StateNum != 3)

bbs_route.data_sf.point <- bbs_route.data_sf.point %>%
  filter(CountryNum == 840,
         StateNum != 3)


# examine results
plot(bbs_route.data_sf.line$geometry,
     col = "black")
plot(usa.shape$geometry, add = TRUE)
plot(bbs_route.data_sf.point$geometry,
     col = "red", add = TRUE)


###############################################
###                                         ###
###            Write sf objects             ###
###                                         ###
###                                         ###
###############################################

# Write each sf dataframe as a geojson file.
# NOTE: written into the BBS subfolder of Processed - the original wrote to
# /00_Data/Processed/ directly, but 02_BBS Final Cleaning.R reads these
# files from /00_Data/Processed/BBS/.
st_write(bbs_route.data_sf.point,
         paste(path, "/00_Data/Processed/BBS/BBS_Rtes_Point.geojson",
               sep = ""))
st_write(bbs_route.data_sf.line,
         paste(path, "/00_Data/Processed/BBS/BBS_Rtes_Linestring.geojson",
               sep = ""))


diff --git a/Code/02_BBS Final Cleaning.R b/Code/02_BBS Final Cleaning.R new file mode 100644 index
0000000..fd5ca7c --- /dev/null +++ b/Code/02_BBS Final Cleaning.R @@ -0,0 +1,117 @@
###############################################################################
########                                                               ########
########                 Final Cleaning of BBS Data                    ########
########                                                               ########
###############################################################################

# Author: Kimberly Thompson

# This code harmonizes BBS data including stop-level breeding species counts,
# route data, weather data, and migrant data with the spatial data for the
# routes, which has already undergone final cleaning to represent only
# quality routes (adhering to quality and protocols) in the contiguous USA.


########## clean workspace and load required packages ####################

# clean workspace to improve efficiency: #
# rm(list = ls() )
# gc() #releases memory

library(tidyverse) # Data cleaning
library(sf)


###############################################
###                                         ###
###             Data Loading                ###
###                                         ###
###############################################

# Set path to Bioviewpoint folder (where data resides)
# NOTE(review): this is a machine-specific Google Drive path, unlike the I:/
# path used in scripts 00 and 01 - confirm before running.
path <- "~/Library/CloudStorage/GoogleDrive-mainelobster28@gmail.com/.shortcut-targets-by-id/1HhR4gs3fKXyAXBhQom6t69gAqR6SeKhx/BioViewPoint"

# Load Stop level breeding data
# (file name fixed: 00_BBS Initial Cleaning.R writes
# "BBS_StopLev_01_23_15plus.csv" with a single underscore before "15plus",
# not a double one)
bbs.data <- read.csv(paste(path,
                           "/00_Data/Processed/BBS/1st Cleaning/BBS_StopLev_01_23_15plus.csv",
                           sep = ""),
                     header = TRUE)

# Load Migrant data (same single-underscore fix as above)
migrant.data <- read.csv(paste(path,
                               "/00_Data/Processed/BBS/1st Cleaning/BBS_MigStop_01_23_15plus.csv",
                               sep = ""),
                         header = TRUE)

# Load route data
route.data <- read.csv(paste(path,
                             "/00_Data/Processed/BBS/1st Cleaning/BBS_Routes.csv",
                             sep = ""),
                       header = TRUE)

# Load Weather data
weather.data <- read.csv(paste(path,
                               "/00_Data/Processed/BBS/1st Cleaning/BBS_Weather_Quality.csv",
                               sep = ""),
                         header = TRUE)

# Load the spatial point file (written by 01_Creating spatial BBS routes.R)
spatial.data <- sf::st_read(paste(path,
                                  "/00_Data/Processed/BBS/BBS_Rtes_Point.geojson",
                                  sep = ""))


###############################################
###                                         ###
###           Harmonize the Data            ###
###                                         ###
###############################################

# Dataframes need to match the routes present in the spatial data

# Make a list of routes to retain
routes.to.retain <- unique(spatial.data$unique_route)

# Filter each dataframe to be only these routes
bbs.data <- bbs.data %>%
  filter(unique_route %in% routes.to.retain)

migrant.data <- migrant.data %>%
  filter(unique_route %in% routes.to.retain)

route.data <- route.data %>%
  filter(unique_route %in% routes.to.retain)

weather.data <- weather.data %>%
  filter(unique_route %in% routes.to.retain)

# All dfs now have 1984 unique routes (with the exception of migrants where
# not all routes had migrants)

###############################################
###                                         ###
###       Write the resulting dfs           ###
###                                         ###
###############################################

write.csv(bbs.data, paste(path,
                          "/00_Data/Processed/BBS/Final_Cleaning/BBS_StopLev_01_23_15plus.csv",
                          sep = ""), row.names = FALSE)

write.csv(migrant.data, paste(path,
                              "/00_Data/Processed/BBS/Final_Cleaning/BBS_MigStop_01_23_15plus.csv",
                              sep = ""), row.names = FALSE)

write.csv(route.data, paste(path,
                            "/00_Data/Processed/BBS/Final_Cleaning/BBS_Routes.csv",
                            sep = ""), row.names = FALSE)

write.csv(weather.data, paste(path,
                              "/00_Data/Processed/BBS/Final_Cleaning/BBS_Weather_Quality.csv",
                              sep = ""), row.names = FALSE)