-
Notifications
You must be signed in to change notification settings - Fork 1
/
nwt_saddle_snowdepth_eda.R
154 lines (121 loc) · 4.62 KB
/
nwt_saddle_snowdepth_eda.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
# First-pass check of Niwot Saddle snow statistics
# SN 3 March 2020
library(ggplot2)
library(dplyr)
library(tidyr)
snowdep = read.csv('~/repos/poorcasting-real/00_raw_data/snowdepth_all/saddsnow.dw.data.csv')
head(snowdep)
nrow(snowdep)
# 35.5K rows
table(snowdep$local_site)
# All observations are at the saddle.
table(snowdep$point_ID)
# Yeah, good. Okay.
# NOTE: acc'd to metadata, 101, 201, etc. are same as 10A, 20A, etc.
# What is the quality of the depth column?
hist(snowdep$mean_depth)
# Good.
sum(is.na(snowdep$mean_depth))
# Okay. There are 332 bad records... what are they?
snowdep %>%
filter(is.na(snowdep$mean_depth))
# Hmm..
# READ THE README
# Add a year column.
snowdep = snowdep %>%
mutate(year = date %>% as.character() %>% strsplit(split = '\\-') %>%
sapply(function(x) x %>% unlist() %>% (function(y) as.numeric(y[1]))),
month = date %>% as.character() %>% strsplit(split = '\\-') %>%
sapply(function(x) x %>% unlist() %>% (function(y) as.numeric(y[2]))),
dayofyear = date %>% as.character() %>% strsplit(split = '\\-') %>%
sapply(function(x) x %>% unlist() %>% (function(y) as.numeric(y[3]))))
# Did it work?
table(snowdep$year)
# yes.
snowdep %>%
group_by(point_ID, year) %>%
summarise(nobs = n(),
totmeas = sum(num_meas))
# Sampling effort appears to be inconsistent...
# Look at the sampling dates
snowdep %>%
distinct(date)
# This poses a problem. How to handle differences in sampling when the timing of snow is also inconsistent??
snowdep %>%
distinct(month, year)
# Actually this looks okay. Looks like there is sampling in every month with snow.
# How many times per month does sampling happen?
snowdep %>%
group_by(point_ID, year, month) %>%
summarise(nobs = n(),
totmeas = sum(num_meas))
# Looks like mostly ones and twos
# Confirm this.
snowdep %>%
group_by(point_ID, year, month) %>%
summarise(nobs = n()) %>%
group_by(nobs) %>%
summarise(n = n())
# Wtf! 440 times there are five samples in a month?
# Maybe take the minimum/max? See below.
# Maybe pick one year and see what it looks like.
snowdep %>%
filter(year %in% 1993) %>%
distinct(point_ID, month, .keep_all = TRUE) %>%
ggplot() +
geom_line(aes())
# Okay. We have inconsistencies across year,
#
snowdep.one.month = snowdep %>%
select(point_ID, year, month, dayofyear, mean_depth) %>%
# distinct(year, month, dayofyear, .keep_all = TRUE) %>%
arrange(year, month, dayofyear) %>%
distinct(point_ID, year, month, .keep_all = TRUE) %>%
mutate(mean_depth = ifelse(is.na(mean_depth) | is.nan(mean_depth), 0, mean_depth))
table(snowdep.one.month$point_ID)
# 404 measurements here for each point. This is good!
# Aggregate by growing season... but need to define growing season
snowdep.one.month %>%
group_by(point_ID, year) %>%
mutate(season = year + as.numeric(month > 9)) %>%
View()
# Adivce from Sarah:
# consider max snowdepth (could use a spline)
# consider date of first melt-out (n.b., maybe should be first time with TWO zero-snow measuremens?)
snowdep %>%
filter(!mean_depth) %>%
mutate(date = as.Date(date)) %>%
mutate(mean_depth = ifelse(is.na(mean_depth) | is.nan(mean_depth), 0, mean_depth)) %>%
mutate(season = year + as.numeric(month > 9)) %>%
group_by(point_ID, season) %>%
summarise(first.date = min(date)) %>%
View()
# Here is an attempt to find cases with two consecutive measurements of no snowmelt:
snowmelt = snowdep %>%
select(date, point_ID, mean_depth, year, month, dayofyear) %>%
mutate(date = as.Date(date)) %>%
mutate(mean_depth = ifelse(is.na(mean_depth) | is.nan(mean_depth), 0, mean_depth)) %>%
mutate(season = year + as.numeric(month > 9)) %>%
arrange(point_ID, date) %>%
group_by(point_ID, season) %>%
mutate(snow.diff = c(0, diff(mean_depth))) %>%
filter(!mean_depth & !snow.diff) %>%
distinct(season, point_ID, .keep_all = TRUE)
# Is every point_ID-year combo represented here?
with(snowmelt, table(season, point_ID))
# No - lots are missing. (e.g., point 65!)
# Is every point measured in every year?
snowdep %>%
mutate(season = year + as.numeric(month > 9)) %>%
with(table(season, point_ID))
# Look at a representative plot: 65:
snowdep %>% filter(point_ID %in% 65) %>% View()
# There are some years where there is snow in June/July
# and "first snowmelt" days are gone.
# Likely need to impute these using a spline.
snowdep %>%
filter(point_ID %in% 65) %>%
mutate(jd = paste(month, dayofyear, sep = '-') %>% as.Date(format = '%m-%d', origin = '01-01-2020') %>% julian()) %>%
ggplot() +
geom_line(aes(x = jd, y = mean_depth, group = year)) +
geom_point(aes(x = jd, y = mean_depth)) + facet_wrap(~ year)