-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path00-prepare-data.R
293 lines (248 loc) · 16.5 KB
/
00-prepare-data.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
source("utils.R")
# Global copper mines - this dataset is not provided due to copyright restrictions
raw_data <- read_excel('./data/Copper data for analysis_20240503.xlsx') |>
dplyr::mutate(id_mine = row_number()) # add an unique id for each mine
### Add GRACE - Trends in Global Freshwater Availability from the Gravity Recovery and Climate Experiment (GRACE), v1 (2002 – 2016)
# Rodell, M., J. S. Famiglietti, D. N. Wiese, J. T. Reager, H. K. Beaudoing, F. W. Landerer, and M.-H. Lo. 2019. Trends in Global Freshwater Availability from the Gravity Recovery and Climate Experiment (GRACE). Palisades, New York: NASA Socioeconomic Data and Applications Center (SEDAC). https://doi.org/10.7927/H4TT4P2C.
# Rodell, M., J. S. Famiglietti, D. N. Wiese, J. T. Reager, H. K. Beaudoing, F. W. Landerer, and M.-H. Lo. 2018. Emerging Trends in Global Freshwater Availability. Nature 557(7707): 651-659. https://doi.org/10.1038/s41586-018-0123-1.
# The Trends in Global Freshwater Availability from the Gravity Recovery and Climate Experiment (GRACE), 2002-2016, is a global gridded data set at a spatial resolution of 0.5 degrees that presents trends (rate of change measured in centimeters per year) in freshwater availability based on data obtained from 2002 to 2016 by NASA GRACE. Terrestrial water availability storage is the sum of groundwater, soil moisture, snow and ice, surface waters, and wet biomass, expressed as an equivalent height of water. GRACE measures changes in the terrestrial water cycle by assessing small changes in Earth's gravity field. This observation-based assessment of how the world's water cycle is responding to human impacts and climate variations provides an important tool for evaluating and predicting emerging threats to water and food security.
# Dowanload requires login to Earthdata. Direct links:
# Trends: https://sedac.ciesin.columbia.edu/downloads/data/sdei/sdei-trends-freshwater-availability-grace/sdei-trends-freshwater-availability-grace-2002-2016-geotiff.zip
# Water mask: https://sedac.ciesin.columbia.edu/downloads/data/gpw-v4/gpw-v4-data-quality-indicators-rev11/gpw-v4-data-quality-indicators-rev11_watermask_30_min_tif.zip
if(file.exists("data/sdei-trends-freshwater-availability-grace-2002-2016-geotiff.zip")){
unzip("data/sdei-trends-freshwater-availability-grace-2002-2016-geotiff.zip", exdir = "data/")
}
freshwater_availability <- rast("data/freshwater_availability.tif")
if(file.exists("data/gpw-v4-data-quality-indicators-rev11_watermask_30_min_tif.zip")){
unzip("data/gpw-v4-data-quality-indicators-rev11_watermask_30_min_tif.zip", exdir = "data/")
}
water_mask <- rast("data/gpw_v4_data_quality_indicators_rev11_watermask_30_min.tif")
# Water Depletion from WaterGap3
if(!file.exists("data/WaterDepletion_WaterGap3.zip")){
print("Download Water Depletion from WaterGap3")
download.file("https://s3.us-east-2.amazonaws.com/earthstatdata/WaterDepletion_WaterGap3.zip", destfile = "data/WaterDepletion_WaterGap3.zip")
unzip("data/WaterDepletion_WaterGap3.zip", exdir = "data/")
}
water_depletion <- rast("data/WaterDepletion_WaterGap3/WaterDepletionCat_WG3.tif")
# CGMW Bedrock and Structural geology - BRGM Bureau de Recherches Géologiques et Minières https://portal.onegeology.org/OnegeologyGlobal
if(!file.exists("./data/geological_unit.gpkg")){
print("Download CGMW Bedrock and Structural geology")
geological_unit <- read_sf("http://mapsref.brgm.fr/wxs/1GG/CGMW_Bedrock_and_Structural_Geology?request=GetCapabilities&service=WFS", layer = "World_CGMW_50M_GeologicalUnitsOnshore")
st_write(geological_unit, "./data/geological_unit.gpkg")
}
geological_unit <- read_sf("./data/geological_unit.gpkg")
# Download from: https://sedac.ciesin.columbia.edu/data/set/gpw-v4-population-density-rev11/data-download
# Population Density, v4.11 (2000, 2005, 2010, 2015, 2020), 30 Min (55 km)
if(file.exists("./data/gpw-v4-population-density-rev11_totpop_30_min_nc/gpw_v4_population_density_rev11_30_min.nc")){
unzip("data/gpw-v4-population-density-rev11_totpop_30_min_nc.zip", exdir = "data/")
}
population_density <- rast("./data/gpw-v4-population-density-rev11_totpop_30_min_nc/gpw_v4_population_density_rev11_30_min.nc", lyrs = 1:5)
pop_density_slope <- app(population_density, calc_slope)
pop_density_mean <- app(population_density, "mean")
# Global Aridity Index and Potential Evapotranspiration (ET0) Database: Version 3
# Paper: https://www.nature.com/articles/s41597-022-01493-1
if(!file.exists("./data/ai_et0/Global-AI_ET0_v3_annual/ai_v3_yr.tif")){
print("Download Global Aridity Index and Potential Evapotranspiration (ET0) Database: Version 3")
options(timeout = 1000)
download.file("https://figshare.com/ndownloader/articles/7504448/versions/5", destfile = "data/ai_et0.zip")
options(timeout = 60)
unzip("data/ai_et0.zip", files = "Global-AI_ET0_annual_v3.zip", exdir = "data/ai_et0")
unzip("data/ai_et0/Global-AI_ET0_annual_v3.zip", exdir = "data/ai_et0")
unlink(c("data/ai_et0.zip", "data/ai_et0/Global-AI_ET0_annual_v3.zip"))
}
ai_annual <- rast("./data/ai_et0/Global-AI_ET0_v3_annual/ai_v3_yr.tif")
et0_annual <- rast("./data/ai_et0/Global-AI_ET0_v3_annual/et0_v3_yr.tif")
# pre-process copper data
sf_data <- select(raw_data, id_mine, Longitude, Latitude, REG_TOP_20, mine, snl_id, country, country_code, region, cumulative_production, average_production,
byproduct_group = `by-prod-group\ 2`, mine_type = mine_type_combined, ore_body_group = `Ore Body Group`, process_route = `Process route`, ore_grade = `Ore Grade_combined`) |>
dplyr::mutate(
Latitude = as.numeric(ifelse(Latitude == "Copper", 0, Latitude)),
check_0 = Longitude == 0 & Latitude == 0,
Longitude = ifelse(check_0, NA, Longitude),
Latitude = ifelse(check_0, NA, Latitude)) |>
select(-check_0) |>
dplyr::filter(!is.na(Longitude)) |>
st_as_sf(coords = c("Longitude", "Latitude"), crs = 4326, agr = "constant")
# add extended layers
sf_data$water_depletion <- extract(water_depletion, sf_data)[,2]
sf_data$freshwater_availability <- extract(freshwater_availability, sf_data)[,2]
sf_data$pop_density_mean <- extract(pop_density_mean, sf_data)[,2]
sf_data$pop_density_slope <- extract(pop_density_slope, sf_data)[,2]
sf_data$ai_annual <- extract(ai_annual, sf_data)[,2] * 0.0001 # correct scale
sf_data$et0_annual <- extract(et0_annual, sf_data)[,2] * 0.0001 # correct scale
sf_use_s2(FALSE)
sf_data <- st_join(sf_data, geological_unit)
sf_use_s2(TRUE)
ts_data <- select(raw_data, id_mine, `2015`, `2016`, `2017`, `2018`, `2019`, ToWa_2015, RaWa_2015, ToWa_2016, RaWa_2016, ToWa_2017, RaWa_2017, ToWa_2018, RaWa_2018, ToWa_2019, RaWa_2019) |>
pivot_longer(cols = -id_mine, names_to = "year", values_to = "value") |>
dplyr::mutate(row_data = ifelse(str_detect(year, "ToWa"), "total_water",
ifelse(str_detect(year, "RaWa"), "raw_water", "production")),
year = str_replace_all(year, "ToWa_", ""),
year = str_replace_all(year, "RaWa_", ""),
year = as.numeric(year)) |>
pivot_wider(id_cols = c(id_mine, year), names_from = row_data, values_from = value) |>
dplyr::mutate(production = ifelse(production == 0, NA, production), # replace 0 with NA since they are actually missing data not zero
total_water = ifelse(total_water == 0, NA, total_water),
raw_water = ifelse(raw_water == 0, NA, raw_water)) |>
left_join(st_drop_geometry(sf_data)) |>
dplyr::mutate(process_route = ifelse(process_route=="Other", "other", process_route), # harmonize to lowercase other
process_route = ifelse(process_route=="0", "other", process_route), # harmonize zero to other
process_route = ifelse(process_route=="pyro/hydro", "pyro", process_route), # merge "pyro/hydro" with "pyro" since there are not sufficient samples for "pyro/hydro"
process_route = as.character(process_route),
byproduct_group = as.character(byproduct_group),
mine_type = tolower(as.character(mine_type)),
ore_body_group = tolower(as.character(ore_body_group)),
mine_type = ifelse(mine_type=="open pit", "pit", mine_type), # simplify pit category
byproduct_group = tolower(ifelse(byproduct_group == "CuCu", "CuNN", byproduct_group)), # harmonize to lowercase
process_route = tolower(ifelse(process_route == "other", NA, process_route)), # replace other with NA since the data is actually missing
average_production = ifelse(average_production==0,NA,average_production)) |> # replace 0 with NA since they are actually missing data not zero
dplyr::mutate(id = row_number()) |> # add an unique id for each observation
relocate(id, .before = id_mine)
# Check data availability
png(filename = "./results/data_availability.png",
width = 300, height = 200, units = "mm", pointsize = 12, res = 300, bg = "white")
ts_data |>
select(total_water, raw_water, ore_body_group, process_route, mine_type, byproduct_group) |>
pivot_longer(cols = -c(total_water, raw_water)) |>
pivot_longer(cols = c(total_water, raw_water), names_to = "water_use", values_to = "volume", values_drop_na = TRUE) |>
ggplot(aes(x = value, y = after_stat(count), fill = water_use)) +
facet_wrap(~name, scales = "free") +
geom_bar(stat = 'count', position = "dodge", width = 0.7) +
geom_text(stat='count', aes(label=after_stat(count)), vjust=-0.1, position = position_dodge(width = 0.7)) +
scale_fill_grey(start = 0.8, end = 0.4) +
theme_bw()
dev.off()
png(filename = "./results/data_strata_boxplot.png",
width = 300, height = 200, units = "mm", pointsize = 12, res = 300, bg = "white")
ts_data |>
select(total_water, raw_water, ore_body_group, process_route, mine_type, byproduct_group) |>
pivot_longer(cols = -c(total_water, raw_water)) |>
pivot_longer(cols = c(total_water, raw_water), names_to = "water_use", values_to = "volume", values_drop_na = TRUE) |>
ggplot(aes(x = value, y = volume, colour = water_use)) +
facet_wrap(~name, scales = "free") +
geom_boxplot(outlier.colour="black", outlier.shape=16, outlier.size=2, notch=FALSE) +
scale_colour_grey(start = 0.8, end = 0.2) +
theme_bw()
dev.off()
png(filename = "./results/data_distribution.png",
width = 300, height = 200, units = "mm", pointsize = 12, res = 300, bg = "white")
transmute(ts_data, production, ore_grade, raw_water, total_water,
water_diff = total_water - raw_water, freshwater_availability) |>
pivot_longer(cols = everything(), names_to = "variable", values_to = "value") |>
drop_na() |>
ggplot(aes(x = value)) +
geom_histogram(bins = 30, alpha = 0.7) + # Adjust 'bins' as necessary
facet_wrap(~ variable, scales = "free") + # Free scales if variables have different ranges
theme_bw() +
labs(title = "Distribution of Variables", x = "Value", y = "Frequency")
dev.off()
# ts_data |>
# dplyr::group_by(ore_body_group) |>
# summarise(production = sum(production, na.rm = TRUE)) |>
# arrange(desc(production)) |>
# dplyr::mutate(perc = production / sum(production), cum_perc = cumsum(production) / sum(production))
# ts_data |>
# dplyr::group_by(process_route) |>
# summarise(production = sum(production, na.rm = TRUE)) |>
# arrange(desc(production)) |>
# dplyr::mutate(perc = production / sum(production), cum_perc = cumsum(production) / sum(production))
# ggplot(ts_data, aes(x = ai_annual, y = raw_water)) +
# geom_point()
write_csv2(ts_data, "./data/ts_data_raw.csv")
st_write(sf_data, "./data/sf_data_raw.gpkg", delete_dsn = TRUE)
# Prepare model data
model_data <- select(ts_data, id, id_mine, mine, country_code, year, raw_water, total_water, production, average_production, ore_grade, byproduct_group, mine_type, ore_body_group, process_route, et0_annual) |>
mutate_if(is.character, as.factor) |>
dplyr::group_by(id_mine) |>
dplyr::mutate(average_production = mean(production, na.rm = TRUE), production = ifelse(is.na(production), average_production, production)) |>
dplyr::mutate(total_water = ifelse(total_water < raw_water, raw_water, total_water)) |>
dplyr::ungroup() |>
dplyr::mutate(outlier_flag = ifelse(((raw_water / production) < 0.01 | (raw_water / production) > 40) |
((total_water / production) < 0.01 | (total_water / production) > 40), 1, 0))
# Check water outliers and set to NA variables that are probably incorrect
outliers_ids <- dplyr::filter(model_data, id_mine %in% unique(as.character(dplyr::filter(model_data, outlier_flag == 1)$id_mine)))$id
model_data |>
dplyr::filter(id %in% outliers_ids) |>
select(id, id_mine, mine, country_code, year, raw_water, total_water, production, average_production)# write to write_csv2("./data/manual_correction_outliers.csv")
# Remove outliers with strange or incorrect reported numbers
corrections <- read_csv2("./data/manual_correction_outliers.csv") |>
select(id, raw_water, total_water, production, average_production)
model_data <- model_data |>
rows_update(corrections, by = "id") |>
select(-outlier_flag)
# # Remove Antamina mine as it reports incorrect water data
# model_data <- model_data |>
# dplyr::mutate(raw_water = ifelse(mine == "Antamina", NA, raw_water),
# total_water = ifelse(mine == "Antamina", NA, total_water)) |>
# dplyr::mutate(id_mine = factor(id_mine))
model_data |>
dplyr::mutate(
raw_water_intensity = raw_water / production,
total_water_intensity = total_water / production,
ore_extracted = 100 * production / ore_grade,
ratio_raw_water_ore = raw_water / ore_extracted,
ratio_total_water_ore = total_water / ore_extracted) |>
write_csv2("./data/ts_model_data.csv")
# Fill predictors gaps using knn
names(model_data)
pred_data <- recipe(raw_water + total_water ~ ., data = model_data) |>
step_impute_knn(any_of(c("production", "ore_grade", "mine_type", "process_route")),
impute_with = imp_vars(any_of(c("id", "mine", "year", "et0_annual"))), neighbors = 1,) |>
prep() |>
bake(model_data) |>
dplyr::group_by(id_mine) |>
dplyr::mutate(average_production = mean(production, na.rm = TRUE)) |>
dplyr::ungroup() |>
dplyr::mutate(
raw_water_intensity = raw_water / production,
total_water_intensity = total_water / production,
ore_extracted = 100 * production / ore_grade,
ratio_raw_water_ore = raw_water / ore_extracted,
ratio_total_water_ore = total_water / ore_extracted)
# aux <- model_data |>
# select(id, id_mine, mine, country_code, year, production, average_production, ore_grade, mine_type, process_route) |>
# filter(if_any(everything(), is.na))
# pred_data |>
# dplyr::filter(id_mine %in% unique(aux$id_mine)) |>
# select(id, id_mine, mine, country_code, year, raw_water, total_water, production, average_production, ore_grade, mine_type, process_route) |>
# View()
write_csv2(pred_data, "./data/ts_pred_data.csv")
################################################################################
######## Check outliers
# Fit a linear model
model_data <- select(pred_data, id, id_mine, raw_water_intensity, production, ore_grade, process_route, mine_type, byproduct_group) |>
drop_na()
model <- lm(raw_water_intensity ~ production + ore_grade + process_route + mine_type + byproduct_group, data = pred_data)
# Calculate residuals
model_data$residuals <- abs(residuals(model))
# Identify large residuals
outlier_threshold <- mean(model_data$residuals) + 2 * sd(model_data$residuals)
outliers <- model_data$residuals > outlier_threshold
outlier_data <- model_data[outliers, ]
outlier_data |>
arrange(residuals)
select(dplyr::filter(pred_data, id_mine == 329), any_of(c(names(model_data), "raw_water", "total_water")))
ggplot(model_data, aes(sample = residuals)) +
stat_qq() +
stat_qq_line() +
labs(title = "Q-Q Plot of Residuals", x = "Theoretical Quantiles", y = "Sample Quantiles") +
theme_minimal()
# Cook's Distance
model_data$cooks_distance <- cooks.distance(model)
# Identify influential points
influential_threshold <- 4 / nrow(model_data)
influential_points <- model_data$cooks_distance > influential_threshold
# Calculate Z-scores
pred_data2 <- pred_data |>
dplyr::mutate(raw_water_intensity_z = scale(raw_water_intensity, center = TRUE, scale = TRUE)[,1])
# Identify outliers using Z-score
outliers_z <- pred_data2 |>
dplyr::filter(abs(raw_water_intensity_z) > 3)
# Calculate IQR and identify outliers
iqr_value <- IQR(pred_data2$raw_water_intensity, na.rm = TRUE)
q1 <- quantile(pred_data2$raw_water_intensity, 0.25, na.rm = TRUE)
q3 <- quantile(pred_data2$raw_water_intensity, 0.75, na.rm = TRUE)
# Choose a multiplier
multiplier <- 2 # Less strict; adjust based on your data and needs
outliers_iqr <- dplyr::filter(pred_data2, raw_water_intensity < (q1 - multiplier * iqr_value) | raw_water_intensity > (q3 + multiplier * iqr_value))
outliers_iqr