-
Notifications
You must be signed in to change notification settings - Fork 0
/
age_structure.Rmd
299 lines (215 loc) · 6.39 KB
/
age_structure.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
---
title: "Making age structure data by province"
output: html_document
date: "2023-02-24"
editor_options:
chunk_output_type: console
---
```{r setup, include = FALSE}
knitr::opts_chunk$set(echo = TRUE)
```
## Introduction
The code below uses raster-based modelled data from WorldPop on age structure to
generate a data frame of age structure by administrative divisions (here
provinces). The GitHub repository of this code is
[https://github.com/OUCRU-Modelling/aes](https://github.com/OUCRU-Modelling/aes).
To transform this code into an R script, use:
```{r eval = FALSE}
knitr::purl("age_structure.Rmd", "age_structure.R")
```
## Local folders
Specifying the input and output folders:
```{r folders}
root <- "~/OneDrive - Oxford University Clinical Research Unit/data/"
gadm_data_folder <- paste0(root, "GADM/")
wpop_data_folder <- paste0(root, "WorldPop/raw/age structure/Vietnam")
wpop_data_processed <- paste0(root, "WorldPop/processed/age structure/Vietnam/age_data.rds")
```
Checking whether the WorldPop processed data already exist locally:
```{r}
data_available <- file.exists(wpop_data_processed)
```
## Packages
Packages needed for the analysis:
```{r}
needed_package <- c("dplyr", "tidyr", "purrr", "terra", "stringr", "sf", "parallel", "magrittr")
```
Installing the packages that are not already installed:
```{r}
to_install <- needed_package[which(!needed_package %in% installed.packages()[, "Package"])]
if (length(to_install)) install.packages(to_install)
```
Loading the packages:
```{r message = FALSE}
dev_null <- sapply(needed_package, function(libname) library(libname, character.only = TRUE))
rm(dev_null)
```
## Downloading provinces polygons data from GADM
GADM root URL:
```{r}
gadm <- "https://geodata.ucdavis.edu/gadm/"
```
Version and format:
```{r}
vers_form <- "gadm3.6/Rsf/"
```
File:
```{r}
file_name <- "gadm36_VNM_1_sf.rds"
```
Creating the local folder structure if it does not exist:
```{r}
local_folder <- paste0(gadm_data_folder, vers_form)
if (!dir.exists(local_folder)) dir.create(local_folder, recursive = TRUE)
```
Downloading the data if not already available locally:
```{r}
local_file <- paste0(local_folder, file_name)
if (!file.exists(local_file)) download.file(paste0(gadm, vers_form, file_name), local_file)
```
Loading the polygons data:
```{r}
provinces <- local_file %>%
readRDS() %>%
st_set_crs(4326) # setting the modern code of the WGS 84 projection
```
Checking the map of the provinces polygons:
```{r}
provinces %>%
st_geometry() %>%
plot()
```
## Downloading demographic raster data from WorldPop
WorldPop URL:
```{r}
worldpop <- "https://data.worldpop.org/"
```
All the combinations of years, ages and gender:
```{r}
ages <- c(0, 1, seq(5, 80, 5))
combinations <- expand_grid(years = 2000:2020,
ages = ages,
sexes = c("f", "m"))
```
Creating the long files names:
```{r}
files_names <- with(combinations,
paste0("GIS/AgeSex_structures/Global_2000_2020/",
years, "/VNM/vnm_", sexes, "_", ages, "_", years, ".tif"))
```
The corresponding local files names:
```{r}
local_files_names <- paste0(wpop_data_folder, str_remove(files_names, "^GIS.*VNM"))
```
Checking for those that do not exist locally yet:
```{r}
sel <- which(!map_lgl(local_files_names, file.exists))
```
Downloading the files that are missing:
```{r}
if (length(sel) & !data_available) {
walk2(paste0(worldpop, files_names[sel]), local_files_names[sel], download.file)
}
```
## Generating province data
A function that converts the polygon of a given province of `provinces` into a
`SpatVector` object:
```{r}
prov2vect <- function(prov_name) {
provinces %>%
filter(VARNAME_1 == prov_name) %>%
vect()
}
```
A function that crops and masks a `SpatRaster` object `rst` with a `SpatVector`
object `pol`:
```{r}
crop_mask <- function(rst, pol) {
rst %>%
crop(pol) %>%
mask(pol)
}
```
A function that extracts the number of people of a raster `rst` inside a polygon
`pol`:
```{r}
nb_people <- function(rst, pol) {
crop_mask(rst, pol) %>%
values() %>%
sum(na.rm = TRUE)
}
```
A function that converts a TIFF file (i.e. number of people for a given year,
age, and sex) into a data frame with number of people per province:
```{r}
tif2df <- function(tif_file) {
tmp <- first(str_split(str_remove(tif_file, ".tif"), "_"))
tif_file %>%
paste0(wpop_data_folder, "/", .) %>%
rast() %>%
map_dbl(prov_vect_list, nb_people, rst = .) %>%
tibble(year = tmp[4], province = provinces_names, sex = tmp[2], age = tmp[3], n = .)
}
```
The names of the provinces:
```{r}
provinces_names <- provinces$VARNAME_1
```
A list of provinces polygons as `SpatVector`:
```{r}
prov_vect_list <- map(provinces_names, prov2vect)
```
Processing all the files to generate the age data frame (it takes about 30' on
11 cores of a 3.2 GHz 6-Core Intel i7):
```{r}
if (data_available) {
age_data <- readRDS(wpop_data_processed)
} else {
hash <- setNames(c(paste0("[", head(ages, -1), ";", tail(ages, -1), "["), "80+"), ages)
age_data <- wpop_data_folder %>%
dir() %>%
mclapply(tif2df, mc.cores = detectCores() - 1) %>%
bind_rows() %>%
mutate(age_class = hash[age],
sex = c(f = "female", m = "male")[sex],
age = as.integer(age))
saveRDS(age_data, wpop_data_processed)
}
```
The data look like this:
```{r}
age_data
```
## Testing the data
Let's look at the change in the whole country population size over the years:
```{r}
pop_in_a_year <- function(x) {
age_data %>%
filter(year == x) %>%
pull(n) %>%
sum()
}
plot2 <- function(...) {
plot(..., xlab = "", type = "o", col = 4)
}
years <- 2000:2020
pop_sizes <- map_dbl(years, pop_in_a_year)
plot2(years, pop_sizes, ylab = "population size")
```
Looks OK. Now, let's see proportions of different age classes over the years,
for the whole country again:
```{r}
tmp <- age_data %>%
group_by(year, age) %>%
summarise(n = sum(n)) %>%
mutate(p = n / sum(n),
p = cumsum(p)) %>%
ungroup()
with(tmp, plot(year, p, ylim = 0:1, type = "n", xlab = "",
ylab = "proportions (from younger to older)"))
tmp %>%
group_by(age) %>%
group_walk(~ with(.x, lines(year, p, col = 4)))
```
Looks OK too, in the sense that the proportions of the age classes are not
constant with time, and also that the population is getting older and older.