Blocking function added

8Ginette8 · Jan 7, 2025 · 421652b · 421652b
1 parent 412d6c4
commit 421652b
Show file tree

Hide file tree

Showing 22 changed files with 332 additions and 78 deletions.
diff --git a/.Rproj.user/DBBE3EC9/pcs/source-pane.pper b/.Rproj.user/DBBE3EC9/pcs/source-pane.pper
@@ -1,3 +1,3 @@
 {
-    "activeTab": 0
+    "activeTab": -1
 }
diff --git a/.Rproj.user/DBBE3EC9/pcs/windowlayoutstate.pper b/.Rproj.user/DBBE3EC9/pcs/windowlayoutstate.pper
@@ -1,7 +1,7 @@
 {
     "left": {
         "splitterpos": 383,
-        "topwindowstate": "NORMAL",
+        "topwindowstate": "HIDE",
         "panelheight": 958,
         "windowheight": 996
     },

diff --git a/.Rproj.user/shared/notebooks/paths b/.Rproj.user/shared/notebooks/paths
@@ -1,12 +1 @@
-C:/Users/am92guke/Documents/iDiv/code/gbif_range/gbif.range/.github/workflows/R-CMD-check-all.yml="1A84B01C"
-C:/Users/am92guke/Documents/iDiv/code/gbif_range/gbif.range/.github/workflows/R-CMD-check-month-test.yml="D4BBA76F"
-C:/Users/am92guke/Documents/iDiv/code/gbif_range/gbif.range/.gitignore="ECF04EF5"
-C:/Users/am92guke/Documents/iDiv/code/gbif_range/gbif.range/NAMESPACE="79356337"
-C:/Users/am92guke/Documents/iDiv/code/gbif_range/gbif.range/R/conv_function.R="99CB140E"
-C:/Users/am92guke/Documents/iDiv/code/gbif_range/gbif.range/README.md="F2158441"
-C:/Users/am92guke/Documents/iDiv/code/gbif_range/gbif.range/inst/examples/get_range_help.R="C765AE8E"
-C:/Users/am92guke/Documents/iDiv/code/gbif_range/gbif.range/inst/examples/make_ecoregion_help.R="DBD0841D"
-C:/Users/am92guke/Documents/iDiv/code/gbif_range/gbif.range/man/get_gbif.Rd="5FF8208F"
-C:/Users/am92guke/Documents/iDiv/code/gbif_range/gbif.range/man/get_range.Rd="D136AD48"
-C:/Users/am92guke/Documents/iDiv/code/gbif_range/gbif.range/man/make_ecoregion.Rd="D4D98882"
 D:/OneDrive - Eidg. Forschungsanstalt WSL/gitlab/gbif.range/DESCRIPTION="D306FE16"
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -2,7 +2,7 @@ Package: gbif.range
 Version: 1.0
 Title: A toolbox to generate species range maps based on ecoregions and a user-friendly GBIF wrapper
 Depends: R (>= 4.0.0)
-Imports: terra, rgbif, CoordinateCleaner, sf, ClusterR, FNN, geometry, cluster, rnaturalearth, mclust, methods, utils, zip
+Imports: terra, rgbif, CoordinateCleaner, sf, ClusterR, FNN, geometry, cluster, rnaturalearth, mclust, methods, utils, zip, class, NMOF
 Description: This package provides a workflow to generate species range maps from scratch based on ecoregions and a user-friendly GBIF wrapper. It includes functions to create taxa range maps using ecoregions and to interact with the Global Biodiversity Information Facility (GBIF) database (retrieving species records, backbone taxonomy and IUCN conservation status). The package facilitates the process of accessing GBIF data and integrating it with ecoregion information for species distribution analysis.
 License: GPL (>= 3) | file LICENCE
 BugReports: https://github.com/8Ginette8/gbif.range/issues
@@ -32,3 +32,4 @@ Collate:
     'make_ecoregion.R'
     'get_bioreg.R'
     'evaluate_range.R'
+    'make_blocks.R'
diff --git a/NAMESPACE b/NAMESPACE
@@ -9,6 +9,7 @@ export(get_doi)
 export(get_gbif)
 export(get_range)
 export(get_status)
+export(make_blocks)
 export(make_ecoregion)
 export(make_tiles)
 export(obs_filter)
@@ -17,6 +18,8 @@ importFrom(ClusterR,KMeans_rcpp)
 importFrom(CoordinateCleaner,cd_ddmm)
 importFrom(CoordinateCleaner,cd_round)
 importFrom(FNN,knn.dist)
+importFrom(NMOF,gridSearch)
+importFrom(class,knn)
 importFrom(cluster,clara)
 importFrom(grDevices,dev.off)
 importFrom(grDevices,pdf)

diff --git a/R/make_blocks.R b/R/make_blocks.R
@@ -0,0 +1,180 @@
+### =========================================================================
+### cv blocks
+### =========================================================================
+#' Block-wise split data into training and testing
+#'
+#' Creates a stratum vector based on a data.frame with n columns. If the data.frame
+#' has one column, strata vector is created based on clusters separated by quantiles. If
+#' the data.frame has two or more columns, strata vector is created based on 'Clustering
+#' Large Applications' (function 'clara' from package cluster). Also, instead of a data.frame
+#' the argument 'npoints' can be provided, which create groups by random sampling. An
+#' opitimization algorithm (function 'gridSearch' from package NMOF) optimizes for equal stratum sizes.
+#'
+#' @param nfolds Numeric. Number of approximately equal-sized classes (folds) to separate groups in block-cross validation
+#' @param df Object of class 'data.frame' with n columns containing critera for cluster building.
+#' Not necessary if argument npoints is supplied
+#' @param nblocks Number of clusters (blocks) based on the number of folds that should be built.
+#' Minimum is the same number as 'nFolds'. Maximum is nrow(df)/10.
+#' @param pres Binary vector. Optional argument. If 'df' is supplied, this argument can be used to
+#' save processing time. '1' stands for the points on which CLARA is appplied (most likely the species
+#' observations), and '0' stands for the points on which K-nearest neighbors is applied relative to
+#' the '1' (most likely the absences, background points...). If 'df' is not supplied, for which points
+#' should random sampling be made? 
+#' @param npoints Optional argument if 'df' is not supplied. For how many points should random sampling be made?
+#' @return Object of class 'vector' of length nrow(df) or 'npoints', with integers defining
+#' different folds
+#' @references
+#' Brun, P., Thuiller, W., Chauvier, Y., Pellissier, L., Wüest, R. O., Wang, Z., & Zimmermann, N. E. (2020).
+#' Model complexity affects species distribution projections under climate change.
+#' Journal of Biogeography, 47(1), 130-142.
+#' @author Philipp Brun
+#' @example inst/examples/make_blocks_help.R
+#' @export
+#' @importFrom cluster clara
+#' @importFrom class knn
+#' @importFrom NMOF gridSearch
+make_blocks<-function(nfolds = 4,
+                      df = data.frame(),
+                      nblocks = nfolds*5,
+                      npoints = NA,
+                      pres = numeric()){
+
+  ### ------------------------
+  ### check input data
+  ### ------------------------
+
+  if (nrow(df)==0 & is.na(npoints)){
+    stop("Please supply number of points if no data.frame is supplied")
+  }
+
+  ### ------------------------
+  ### generate clusters
+  ### ------------------------
+
+  if (nrow(df)==0){
+
+    ### do ordinary sampling if no strata are supplied
+    out.strat=sample(rep(1:nfolds,ceiling(npoints/nfolds)),size=npoints)
+
+  } else {
+
+    # check for reasonable number of boxes
+    if (nrow(df)<4*nblocks){
+      stop("Too many boxes required!")
+    }
+
+    if (ncol(df)==1){
+
+      ### do quantile-based clustering if df contains only one column
+      rngi = quantile(df[,1],probs=0:(nblocks)/(nblocks))
+      rngi[1] = rngi[1]-1
+      rngi[length(rngi)] = rngi[length(rngi)]+1
+      clist = as.numeric(cut(df[,1],breaks=rngi,right=TRUE))
+
+    } else {
+
+      # Scale input data
+      scd = apply(df,2,scale)
+
+      if (length(pres)==0){
+
+        # do kmedoid clustering for 2 or more columns in df
+        kmed = clara(scd,k=nblocks,metric="euclidean")
+
+        # get clusters
+        clist = kmed$clustering
+
+      } else {
+
+        # do kmedoid clustering for 2 or more columns in df
+        kmed = clara(scd[which(pres==1),],k=nblocks,metric="manhattan")
+        knnab = knn(train=scd[which(pres==1),],test=scd[which(pres==0),],cl=kmed$clustering)
+        clist = kmed$clustering
+        cliful = rep(NA,nrow(scd))
+        cliful[which(pres==1)] = kmed$clustering
+        cliful[which(pres==0)] = as.numeric(knnab)
+      }
+    }
+
+    # sort obtained clusters
+    tbl = sort(table(clist),decreasing = TRUE)
+
+    ### ------------------------
+    ### regularly assign clusters to strata
+    ### ------------------------
+
+    if (nblocks != nfolds){
+
+      # prepare strata layers
+      grps = rep(list(numeric()),nfolds)
+
+      # for the clusters with many observations
+      # distribute them regularly among strata but keep
+      # last six clusters for estimating most
+      # regular distribution
+
+      if (length(tbl)>6){
+
+        for (i in 1:(length(tbl)-6)){
+
+          # determine to which stratum the cluster should be
+          # added
+          fl=(floor((i-1)/nfolds))
+          if (fl%%2==0){
+
+            j = round(1+nfolds*((i-1)/nfolds-(floor((i-1)/nfolds))))
+
+          } else {
+
+            j = (nfolds+1)-round(1+nfolds*((i-1)/nfolds-(floor((i-1)/nfolds))))
+          }
+          # add cluster
+          grps[[j]] = append(grps[[j]],tbl[i])
+        }
+      }
+
+      # prepare for optimal distribution of last 6 clusters
+      vlis = factor(1:nfolds,levels=1:nfolds)
+      prs = rep(list(vlis),min(length(tbl),6))
+      sstab = tbl[max(1,(length(tbl)-5)):length(tbl)]
+
+      # Run brute-forcing gridSearch obtimization
+      srch = gridSearch(levels=prs,fun=optme,nms=as.vector(sstab),grps=grps,tot=sum(tbl))
+
+      # pull out results
+      wi = as.numeric(as.character(srch$minlevels))
+
+      # combine results with predistributed clusters
+      for (i in 1:length(grps)){
+        grps[[i]] = append(grps[[i]],sstab[wi==i])
+      }
+
+      # define vector with output strata
+      out.strat = rep(NA,nrow(df))
+      for (i in 1:length(grps)){
+
+        if (length(pres)==0){
+
+          out.strat[which(as.character(clist)%in%names(grps[[i]]))] = i
+
+        } else {
+
+          out.strat[which(as.character(cliful)%in%names(grps[[i]]))] = i
+        }
+      }
+
+    } else {
+
+      # if as many strata as clusters are required, simply return clusters
+      if (length(pres)==0){
+
+        out.strat = clist
+      } else{
+
+        out.strat = cliful
+      }
+    }
+  }
+  # return result
+  return(out.strat)
+}
diff --git a/inst/examples/get_doi_help.R b/inst/examples/get_doi_help.R
@@ -4,10 +4,10 @@ obsam <- get_gbif("Ailuropoda melanoleuca")
 
 \dontrun{
 # Retrieve DOI for only one get_gbif() output
-get_doi(obspt,title="GBIF_test1",description="A small example 1",
-   source_url="https://example.com/",user="",pwd="") # Use your own GBIF credentials here
+get_doi(obspt, title = "GBIF_test1", description = "A small example 1",
+   source_url = "https://example.com/", user = "", pwd = "") # Use your own GBIF credentials here
 
 # Retrieve DOIs for several get_gbif() outputs
-get_doi(list(obspt,obsam),title="GBIF_test2",description="A small example 2",
-   source_url="https://example.com/",user="",pwd="") # Use your own GBIF credentials here
+get_doi(list(obspt,obsam),title = "GBIF_test2",description = "A small example 2",
+   source_url = "https://example.com/",user = "",pwd = "") # Use your own GBIF credentials here
 }
diff --git a/inst/examples/get_gbif_help.R b/inst/examples/get_gbif_help.R
@@ -1,19 +1,19 @@
 # Downloading worldwide the observations of Panthera tigris
 obs.pt <- get_gbif("Panthera tigris",
-                   basis=c("OBSERVATION",
+                   basis = c("OBSERVATION",
                            "HUMAN_OBSERVATION",
                            "MACHINE_OBSERVATION"))
-countries <- terra::vect(rnaturalearth::ne_countries(type="countries", returnclass = "sf"))
-terra::plot(countries,col="#bcbddc")
-points(obs.pt[,c("decimalLongitude","decimalLatitude")], pch=20, col="#238b4550", cex=4)
+countries <- terra::vect(rnaturalearth::ne_countries(type = "countries", returnclass = "sf"))
+terra::plot(countries,col = "#bcbddc")
+points(obs.pt[,c("decimalLongitude","decimalLatitude")], pch = 20, col = "#238b4550", cex = 4)
 
 \dontrun{
 # Downloading worldwide the observations of Ailuropoda melanoleuca (with a 100km grain, after 1990
 # and by keeping duplicates and by adding the name of the person who collected the panda records)
 obs.am <- get_gbif("Ailuropoda melanoleuca", grain = 100000 , duplicates = TRUE,
    time_period = c(1990,3000), add_infos = c("recordedBy","issue"))
-terra::plot(countries,col="#bcbddc")
-graphics::points(obs.am[,c("decimalLongitude","decimalLatitude")],pch=20,col="#238b4550",cex=4)
+terra::plot(countries, col = "#bcbddc")
+graphics::points(obs.am[,c("decimalLongitude","decimalLatitude")], pch = 20, col = "#238b4550", cex = 4)
 
 # Downloading worlwide the observations of Phascolarctos cinereus (with a 1km grain, after 1980,
 # and keeping raster centroids)

diff --git a/inst/examples/get_range_help.R b/inst/examples/get_range_help.R
@@ -1,10 +1,10 @@
 \dontrun{
 # Load available ecoregions
-eco_terra = read_bioreg(bioreg_name = "eco_terra", save_dir = NULL)
+eco_terra <- read_bioreg(bioreg_name = "eco_terra", save_dir = NULL)
 
 # First download the worldwide observations of Panthera tigris from GBIF
 occ <- get_gbif("Panthera tigris",
-				  basis=c("OBSERVATION", 
+				  basis = c("OBSERVATION", 
 				          "HUMAN_OBSERVATION", 
 				          "MACHINE_OBSERVATION"))
 
@@ -14,11 +14,10 @@ occ <- get_gbif("Panthera tigris",
 range <- get_range(occ, eco_terra,"ECO_NAME")
 
 # Plot
-terra::plot(range, axes = FALSE, box = FALSE, legend=FALSE, col="chartreuse4")
+terra::plot(range, axes = FALSE, box = FALSE, legend=FALSE, col = "chartreuse4")
 
 # Plot political world boundaries
-terra::plot(rnaturalearth::ne_countries(returnclass = "sf")[1], add=T, col=NA)
+terra::plot(rnaturalearth::ne_countries(returnclass = "sf")[1], add = TRUE, col = NA)
 # Plot the occurance points
-graphics::points(occ[,c("decimalLongitude","decimalLatitude")],pch=20,col="#99340470",cex=1.5)
-
+graphics::points(occ[,c("decimalLongitude","decimalLatitude")],pch = 20,col = "#99340470",cex = 1.5)
 }
diff --git a/inst/examples/get_status_help.R b/inst/examples/get_status_help.R
@@ -1,3 +1,3 @@
 # Get the taxonomy and IUCN status of Panthera tigris (with and without related names)
-get_status("Cypripedium calceolus",all=FALSE)
-get_status("Cypripedium calceolus",all=TRUE)
+get_status("Cypripedium calceolus", all = FALSE)
+get_status("Cypripedium calceolus", all = TRUE)
diff --git a/inst/examples/make_blocks_help.R b/inst/examples/make_blocks_help.R
@@ -0,0 +1,16 @@
+\dontrun{
+# Downloading worldwide the observations of Panthera tigris
+obs.pt <- get_gbif("Panthera tigris",
+                   basis = c("OBSERVATION",
+                           "HUMAN_OBSERVATION",
+                           "MACHINE_OBSERVATION"))
+
+# Create a vector of folds (n = 5) spatially blocked (n = 10)
+block.pt <- make_blocks(nfolds = 5, df = obs.pt[,c("decimalLatitude","decimalLongitude")], nblocks = 10)
+
+# Plot one colour per fold
+countries <- terra::vect(rnaturalearth::ne_countries(type = "countries", returnclass = "sf"))
+countries.focus <- crop(countries, ext(60,100,0,40))
+terra::plot(countries.focus, col = "#bcbddc")
+graphic::points(obs.pt[,c("decimalLongitude","decimalLatitude")], pch = 20, col = block.pt, cex = 1)
+}
diff --git a/inst/examples/make_ecoregion_help.R b/inst/examples/make_ecoregion_help.R
@@ -1,22 +1,21 @@
-#TODO transfer to data()
+# Open
 rst.path <- paste0(system.file(package = "gbif.range"),"/extdata/rst_enl.tif")
 rst <- terra::rast(rst.path)
 shp.path <- paste0(system.file(package = "gbif.range"),"/extdata/shp_lonlat.shp")
 shp.lonlat <- terra::vect(shp.path)
 rst <- terra::crop(rst, shp.lonlat)
-#plot(crp)
 
 # Apply the function by infering 50 classes of environments
 my.eco <- make_ecoregion(rst,50)
 terra::plot(my.eco)
 
 # Downloading in the European Alps the observations of one plant species
-obs.arcto <- get_gbif("Arctostaphylos alpinus",geo=shp.lonlat)
+obs.arcto <- get_gbif("Arctostaphylos alpinus", geo = shp.lonlat)
 
 # Create the range map based on our custom ecoregion
-range.arcto <- get_range(obs.arcto, my.eco, "EcoRegion", res=20)
+range.arcto <- get_range(obs.arcto, my.eco, "EcoRegion", res = 20)
 
 # Plot
-terra::plot(shp.lonlat, col="grey")
-terra::plot(range.arcto,add=TRUE,col=rgb(0.2,1,0.2,0.5,1))
-graphics::points(obs.arcto[,c("decimalLongitude","decimalLatitude")],pch=20,col="orange1",cex=1)
+terra::plot(shp.lonlat, col = "grey")
+terra::plot(range.arcto, add = TRUE, col = rgb(0.2,1,0.2,0.5,1))
+graphics::points(obs.arcto[,c("decimalLongitude","decimalLatitude")], pch = 20, col = "orange1", cex = 1)
diff --git a/inst/examples/make_tiles_help.R b/inst/examples/make_tiles_help.R
@@ -3,4 +3,4 @@ shp.path <- paste0(system.file(package = "gbif.range"),"/extdata/shp_lonlat.shp"
 shp.lonlat <- terra::vect(shp.path)
 
 # Apply the function to divide the extent in ~20 fragments
-mt = make_tiles(geo=shp.lonlat,Ntiles=20,sext=TRUE)
+mt <- make_tiles(geo = shp.lonlat, Ntiles = 20, sext = TRUE)