Commit 448dbbf

Merge pull request #50 from pratikunterwegs/develop

fix remove reflections
pratikunterwegs authored Feb 12, 2021
2 parents 0743516 + c10dd2b commit 448dbbf
Showing 20 changed files with 253 additions and 184 deletions.
2 changes: 1 addition & 1 deletion .gitignore

@@ -19,7 +19,7 @@
 /*.Rcheck/

 # RStudio files
-.Rproj.user/
+.Rproj.user

 # produced vignettes
 vignettes/*.html
12 changes: 7 additions & 5 deletions R/fun_check_data.R

@@ -25,10 +25,12 @@ atl_check_data <- function(data,
   # get the column names
   data_names <- colnames(data)

-  invisible(lapply(names_expected, function(nr) {
-    assertthat::assert_that(nr %in% data_names,
-      msg = glue::glue("atl_check_data: {nr} is
+  invisible(
+    vapply(names_expected, function(nr) {
+      assertthat::assert_that(nr %in% data_names,
+        msg = glue::glue("atl_check_data: {nr} is
 required but missing from data!")
-    )
-  }))
+      )
+    }, FUN.VALUE = TRUE)
+  )
 }
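A minimal usage sketch of the checker above; the data frame is hypothetical, and the call follows the new vapply version of the function:

# hypothetical usage of atl_check_data as rewritten above
d <- data.frame(x = 1:3, y = 4:6, time = 7:9)

# passes silently: all expected columns are present
atl_check_data(d, names_expected = c("x", "y", "time"))

# fails with: "atl_check_data: id is required but missing from data!"
# atl_check_data(d, names_expected = c("x", "y", "id"))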
12 changes: 6 additions & 6 deletions R/fun_filter_covariates.R

@@ -1,15 +1,15 @@
 #' Filter data by position covariates.
 #'
-#' The atlastools function \code{atl_filter_covariates} allows convenient 
+#' The atlastools function \code{atl_filter_covariates} allows convenient
 #' filtering of a dataset by any number of logical filters.
-#' This function can be used to easily filter timestamps in a range, as well as 
+#' This function can be used to easily filter timestamps in a range, as well as
 #' combine simple spatial and temporal filters.
-#' It accepts a character vector of \code{R} expressions that each return a 
+#' It accepts a character vector of \code{R} expressions that each return a
 #' logical vector (i.e., \code{TRUE} or \code{FALSE}).
-#' Each filtering condition is interpreted in the context of the dataset 
-#' supplied, and used to filter for rows that satisfy each of the filter 
+#' Each filtering condition is interpreted in the context of the dataset
+#' supplied, and used to filter for rows that satisfy each of the filter
 #' conditions.
-#' Users must make sure that the filtering variables exist in their dataset in 
+#' Users must make sure that the filtering variables exist in their dataset in
 #' order to avoid errors.
 #'
 #' @author Pratik R. Gupte
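A brief usage sketch of the interface documented above; the example data are made up, and the `filters` argument name is an assumption based on the package documentation rather than this diff:

# hedged sketch: keep only rows satisfying every filter expression
d <- data.frame(time = 1:10, speed = c(1, 25, 3, 4, 30, 6, 7, 8, 9, 10))
filtered <- atl_filter_covariates(
  data = d,
  filters = c("time > 2", "speed < 20")
)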
2 changes: 1 addition & 1 deletion R/fun_get_patch_data.R

@@ -1,6 +1,6 @@
 #' Get residence patch data.
 #'
-#' The function \code{atl_patch_summary} can be used to extract patch-specific 
+#' The function \code{atl_patch_summary} can be used to extract patch-specific
 #' summary data such as the median coordinates, the patch duration, the distance
 #' travelled within the patch, the displacement within the patch, and the patch
 #' area.
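A possible call pattern for the extraction described above; the arguments of atl_patch_summary are not shown in this diff, so the bare one-argument call is an assumption:

# hypothetical sketch: extract patch-wise summaries (median coordinates,
# duration, distance travelled, displacement, area) from a patches object
# built by atl_res_patch (see the sketch under fun_make_res_patch.R below)
patch_summary <- atl_patch_summary(patches)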
16 changes: 8 additions & 8 deletions R/fun_make_res_patch.R

@@ -1,24 +1,24 @@
 #' Construct residence patches from position data.
 #'
-#' A cleaned movement track can be classified into residence patches using the 
+#' A cleaned movement track can be classified into residence patches using the
 #' function \code{atl_res_patch}.
 #' The function expects a specific organisation of the data: there should be
 #' at least the following columns, \code{x}, \code{y}, \code{time}, and
 #' \code{id}, all named in lower case, and corresponding to the coordinates,
 #' timestamp in the UNIX format (seconds since 1970), and the identity of the
 #' tracked individual.
-#' The result contains only the data that was classified as a residence patch 
+#' The result contains only the data that was classified as a residence patch
 #' and removes transit between them.
-#' \code{atl_res_patch} requires only three parameters: (1) the distance 
-#' threshold between positions (called \code{buffer_size}), (2) the distance 
-#' threshold between clusters of positions (called \code{lim_spat_indep}), 
+#' \code{atl_res_patch} requires only three parameters: (1) the distance
+#' threshold between positions (called \code{buffer_size}), (2) the distance
+#' threshold between clusters of positions (called \code{lim_spat_indep}),
 #' and (3) the time interval between clusters (called \code{lim_time_indep}).
 #' Clusters formed of fewer than a minimum number of positions can be excluded.
 #' The exclusion of clusters with few positions can help in removing bias due to
-#' short stops, but if such short stops are also of interest, they can be 
+#' short stops, but if such short stops are also of interest, they can be
 #' included by reducing the \code{min_fixes} argument.
-#' Position covariates such as speed may also be summarised patch-wise by 
-#' passing covariate names and summary functions as character vectors to the 
+#' Position covariates such as speed may also be summarised patch-wise by
+#' passing covariate names and summary functions as character vectors to the
 #' \code{summary_variables} and \code{summary_functions} arguments,
 #' respectively.
 #'
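A usage sketch with the parameter names documented above; the data frame and the threshold values are illustrative assumptions, not recommendations:

# hedged sketch: classify a cleaned track into residence patches
patches <- atl_res_patch(
  data = track_data,             # needs x, y, time, id columns
  buffer_size = 10,              # distance threshold between positions (m)
  lim_spat_indep = 100,          # distance threshold between clusters (m)
  lim_time_indep = 30,           # time interval between clusters
  min_fixes = 3,                 # exclude clusters with fewer positions
  summary_variables = "speed",   # optional patch-wise covariate summaries
  summary_functions = c("mean", "median")
)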
134 changes: 68 additions & 66 deletions R/fun_remove_reflections.R

@@ -1,16 +1,13 @@
 #' Remove reflected positions.
 #'
-#' Remove reflections, or prolonged spikes from a movement track by identifying 
+#' Remove reflections, or prolonged spikes from a movement track by identifying
 #' the bounds and removing positions between them.
-#' The important function arguments here are \code{point_angle_cutoff} ($A$),
-#' \code{reflection_speed_cutoff} ($S$), and \code{est_ref_len}, the maximum
-#' number of positions after the anchor point that are candidates for the end of
-#' the prolonged spike.
-#' If the prolonged spike ends after less than N positions, the true end point
+#' The important function arguments here are \code{point_angle_cutoff} ($A$)
+#' and \code{reflection_speed_cutoff} ($S$).
+#' If the prolonged spike ends before the last row of data, the true end point
 #' is used as the outer bound of the spike.
-#' However, the algorithm behind this function fails when the prolonged spike
-#' ends after more than N positions. Users are advised to use a liberally large
-#' value of N in the \code{est_ref_len} argument; 1,000 may be appropriate.
+#' If the prolonged spike does not end within the last row of data, all the
+#' data are retained and a message is printed.
 #'
 #' @author Pratik R. Gupte
 #'

@@ -22,7 +19,6 @@
 #' high instantaneous speeds are considered an anomaly rather than fast transit.
 #' @param reflection_speed_cutoff The speed (in m/s) above which an anomaly is
 #' detected when combined with a high turning angle.
-#' @param est_ref_len How many positions are expected to be in a reflection.
 #'
 #' @return A dataframe with reflections removed.
 #' @examples

@@ -31,8 +27,7 @@
 #'   data = track_data,
 #'   x = "x", y = "y", time = "time",
 #'   point_angle_cutoff = A,
-#'   reflection_speed_cutoff = S,
-#'   est_ref_len = N
+#'   reflection_speed_cutoff = S
 #' )
 #' }
 #' @export

@@ -41,85 +36,92 @@ atl_remove_reflections <- function(data,
                                    y = "y",
                                    time = "time",
                                    point_angle_cutoff = 45,
-                                   reflection_speed_cutoff = 20,
-                                   est_ref_len = 1000) {
-  speed <- angle <- NULL
+                                   reflection_speed_cutoff = 20) {
+  speed_in <- speed_out <- angle <- NULL
   # check data
   atl_check_data(data, names_expected = c(x, y, time))

   # set order
   data.table::setorderv(data, time)

   # get speed and angle
-  speed <- atl_get_speed(data, x = x, y = y, time = time)
-  angle <- atl_turning_angle(data, x = x, y = y, time = time)
-  data[, `:=`(
-    speed = speed,
-    angle = angle
-  )]
+  data$speed_in <- atl_get_speed(
+    data,
+    x = x, y = y, time = time,
+    type = "in"
+  )
+  data$speed_out <- atl_get_speed(
+    data,
+    x = x, y = y, time = time,
+    type = "out"
+  )
+  data$angle <- atl_turning_angle(data, x = x, y = y, time = time)

   # remove points that cannot be assessed
   # can't determine whether the last few points are reflections hence remove
-  data <- data[!is.na(speed) & !is.na(angle), ]
+  data <- data[
+    !is.na(speed_in) & !is.na(speed_out) & !is.na(angle) & !is.nan(angle),
+  ]

   # prepare a vector of rows to discard
   vec_discard <- integer()

-  # identify the last point before an anomaly
-  anchor_point <- which(data$speed >=
-    reflection_speed_cutoff &
-    data$angle >= point_angle_cutoff)[1] - 1
+  # identify the anomaly point
+  anchor_point <- which(
+    data$speed_in > reflection_speed_cutoff &
+      data$angle > point_angle_cutoff
+  )[1]

   # message
   message(glue::glue("first anchor at {anchor_point}"))

-  while (anchor_point < nrow(data) - 1) {
+  while (anchor_point < (nrow(data) - 1)) {

-    # the next est_ref_len subsequent points are suspect
-    suspect_point <- anchor_point + 1
-    # find the max speed after the first anomaly, which is the blink away
-    # the next highest should be the blink back
-    suspect_speeds <- data[(suspect_point + 1):est_ref_len, speed]
+    # find next point with speed out > S
+    est_ref_end <- which(data[
+      seq(
+        anchor_point,
+        nrow(data)
+      ),
+      speed_out
+    ] > reflection_speed_cutoff)

-    # drop NA here
-    suspect_speeds <- suspect_speeds[!is.na(suspect_speeds)]
-
-    # get the next highest speed
-    nx_high_speed <- which.max(rank(suspect_speeds))
-    # this gets the next highest speed, which should be the end of the
-    # reflection, but may also be the beginning or end of another reflection
-    if (suspect_speeds[nx_high_speed] < reflection_speed_cutoff) {
-      reflection_end <- nrow(data)
-      message(glue::glue("remove_reflections: reflection does not end within \\
-      {est_ref_len} positions"))
+    # handle case where there is no end, conservatively keep all data
+    if (!any(est_ref_end)) {
+      message("the reflection does not appear to end: keeping all points")
       break()
-    } else {
-      reflection_end <- suspect_point + nx_high_speed + 1 # one added for safety
-      # message ref end
-      message(glue::glue("reflection ends {reflection_end}"))
     }
+    # identify end point
+    est_ref_end <- anchor_point + est_ref_end

-    # identify rows to remove
-    # may be excessive but works
-    vec_discard <- c(vec_discard, seq(anchor_point, reflection_end))
+    # print message
+    message(sprintf("reflection ends at %i", est_ref_end))

-    # set the next anchor
-    next_anchor <- which(data$speed[reflection_end:nrow(data)] >=
-      reflection_speed_cutoff &
-      data$angle[reflection_end:nrow(data)] >=
-      point_angle_cutoff)[1]
+    # update discard vector
+    vec_discard <- c(vec_discard, seq(anchor_point, est_ref_end))

-    if (is.na(next_anchor)) {
-      # break the loop if there's no further anomalies
-      break()
-    } else {
-      anchor_point <- reflection_end + next_anchor - 1
-      # check for errors in order
-      assertthat::assert_that(anchor_point > reflection_end,
-        msg = glue::glue("anchor point {anchor_point} is \\
+    # set the next anchor relative to the ref end
+    next_anchor <- which(
+      (data$speed_in[seq(est_ref_end, nrow(data))] > reflection_speed_cutoff) &
+        (data$angle[seq(est_ref_end, nrow(data))] > point_angle_cutoff)
+    )[1]

+    # break if there are no more reflections
+    # must check for NA because we access the first element of an empty vec
+    if (any(!any(next_anchor), is.na(next_anchor))) {
+      message("no more reflections; ending")
+      break()
+    } else {
+      # the absolute next anchor
+      anchor_point <- est_ref_end + next_anchor
+      # check for errors in order
+      assertthat::assert_that(anchor_point > est_ref_end,
+        msg = glue::glue("anchor point {anchor_point} is \\
        before reflection end {reflection_end}")
-      )
-      # message
-      message(glue::glue("next anchor is {anchor_point}"))
+      )
+      # message
+      message(glue::glue("next anchor is {anchor_point}"))
     }
   }
 }
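To make the new anchor test concrete, a toy sketch of the detection condition from the rewritten loop, using the function's default cutoffs; the speed and angle vectors are made up for illustration:

# toy illustration of the anchor test above
S <- 20  # reflection_speed_cutoff, in m/s
A <- 45  # point_angle_cutoff, in degrees
speed_in <- c(2, 3, 50, 48, 4, 3)   # made-up incoming speeds
angle <- c(10, 20, 90, 85, 15, 12)  # made-up turning angles

# the first position that is both fast and sharply turned starts a reflection
which(speed_in > S & angle > A)[1]  # 3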
2 changes: 1 addition & 1 deletion R/fun_simple_distance.R

@@ -1,6 +1,6 @@
 #' Calculate distances between successive points.
 #'
-#' Gets the euclidean distance between consecutive points in a coordinate 
+#' Gets the euclidean distance between consecutive points in a coordinate
 #' reference system in metres, i.e., UTM systems.
 #'
 #' @param x A column name in a data.frame object that contains the numeric X or
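The computation itself is simple enough to sketch in base R; the coordinates here are made up and assumed to be in a metric CRS such as UTM:

# straight-line (euclidean) distance between consecutive points, in metres
x <- c(0, 3, 3)
y <- c(0, 4, 10)
sqrt(diff(x)^2 + diff(y)^2)  # 5 6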
2 changes: 1 addition & 1 deletion R/fun_smooth_data.R

@@ -1,6 +1,6 @@
 #' Apply a median smooth to coordinates.
 #'
-#' Applies a median smooth defined by a rolling window to the X and Y 
+#' Applies a median smooth defined by a rolling window to the X and Y
 #' coordinates of the data.
 #' This function \emph{modifies in place}, i.e., \emph{the results
 #' need not be assigned to a new data.table}.
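A minimal sketch of the rolling-median idea with base R's stats::runmed; the atlastools function itself modifies a data.table in place and may treat the track ends differently:

# a spike at position 3 is pulled toward its neighbours by a median window
x <- c(1, 2, 50, 4, 5, 6, 7)
stats::runmed(x, k = 5)  # k is the window size and must be odd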
26 changes: 13 additions & 13 deletions R/fun_thin_data.R

@@ -1,27 +1,27 @@
 #' Thin tracking data by resampling or aggregation.
 #'
-#' Uniformly reduce data volumes with either aggregation or resampling 
-#' (specified by the \code{method} argument) over an interval specified in 
+#' Uniformly reduce data volumes with either aggregation or resampling
+#' (specified by the \code{method} argument) over an interval specified in
 #' seconds using the \code{interval} argument.
-#' Both options make two important assumptions: 
-#' (1) that timestamps are named `time', and 
+#' Both options make two important assumptions:
+#' (1) that timestamps are named `time', and
 #' (2) all columns except the identity columns can be averaged in \code{R}.
-#' While the `resample' option returns a thinned dataset with all columns from 
-#' the input data, the `aggregate' option drops the column \code{COVXY}, since 
+#' While the `resample' option returns a thinned dataset with all columns from
+#' the input data, the `aggregate' option drops the column \code{COVXY}, since
 #' this cannot be propagated to the averaged position.
-#' Both options handle the column `time' differently: while `resample' returns 
-#' the actual timestamp (in UNIX time) of each sample, `aggregate' returns the 
+#' Both options handle the column `time' differently: while `resample' returns
+#' the actual timestamp (in UNIX time) of each sample, `aggregate' returns the
 #' mean timestamp (also in UNIX time).
 #' In both cases, an extra column, \code{time_agg}, is added which has a uniform
-#' difference between each element corresponding to the user-defined thinning 
+#' difference between each element corresponding to the user-defined thinning
 #' interval.
-#' The `aggregate' option only recognises errors named \code{VARX} and 
+#' The `aggregate' option only recognises errors named \code{VARX} and
 #' \code{VARY}, and standard deviation around each position named \code{SD}.
-#' If all of these columns are not present together the function assumes there 
+#' If all of these columns are not present together the function assumes there
 #' is no measure of error, and drops those columns.
-#' If there is actually no measure of error, the function simply returns the 
+#' If there is actually no measure of error, the function simply returns the
 #' averaged position and covariates in each time interval.
-#' Grouping variables' names (such as animal identity) may be passed as a 
+#' Grouping variables' names (such as animal identity) may be passed as a
 #' character vector to the \code{id_columns} argument.
 #'
 #' @param data Cleaned data to aggregate. Must have a numeric column named time.
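A hedged usage sketch of the thinning described above; the argument names interval, method, and id_columns follow the documentation, while the function name atl_thin_data is inferred from the file name and the data are assumed:

# thin a hypothetical track to one averaged position per minute
thinned <- atl_thin_data(
  data = track_data,
  interval = 60,           # thinning interval in seconds
  method = "aggregate",    # or "resample"
  id_columns = "id"        # grouping variables such as animal identity
)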
14 changes: 8 additions & 6 deletions man/atl_filter_bounds.Rd (generated documentation; diff not rendered)