-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy path05_region-info.R
256 lines (196 loc) · 9.79 KB
/
05_region-info.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
##################################################################
### 5. Add region info
##################################################################
print("05_region-info.R")
regions_fao_exio <- readODS::read_ods(paste0(folder_path, "hybrid/fabio-exiobase.ods"), sheet = 3)
regions_fabio <- regions_fao_exio %>%
dplyr::select(name = Country, iso3 = ISO)
regions_exio <- regions_fao_exio %>%
dplyr::select(EXIOBASE, EXIOBASE_code) %>%
dplyr::distinct() %>%
dplyr::arrange(EXIOBASE_code) %>%
dplyr::filter(!is.na(EXIOBASE)) %>% # filter NA, used for RoW
dplyr::select(name = EXIOBASE)
# FABIO data for region table --------------------------------------------------
region <- RPostgres::dbReadTable(db, "region")
if(nrow(region) == 0){
insert_data <- regions_fabio
DBI::dbAppendTable(db, name = "region", value = insert_data)
rm(insert_data)
region <- RPostgres::dbReadTable(db, "region")
}
# EXIOBASE data for region table -----------------------------------------------
if(nrow(region) <= n_region_fabio){ # this means that only FABIO regions are in the table so far
insert_data <- regions_exio
DBI::dbAppendTable(db, name = "region", value = insert_data)
rm(insert_data)
region <- RPostgres::dbReadTable(db, "region")
}
# region_cluster table ---------------------------------------------------------
# we use two main clusters, FABIO and EXIOBASE
region_cluster <- RPostgres::dbReadTable(db, "region_cluster")
if(nrow(region_cluster) == 0){
insert_data <- data.frame(
name = c(name_fabio, name_exio)
)
DBI::dbAppendTable(db, name = "region_cluster", value = insert_data)
rm(insert_data)
region_cluster <- RPostgres::dbReadTable(db, "region_cluster")
}
# region_cluster_region table --------------------------------------------------
# now we populate the FABIO and EXIOBASE clusters
region_cluster_region <- RPostgres::dbReadTable(db, "region_cluster_region")
if(nrow(region_cluster_region) == 0){
region_cluster_fabio <- region %>%
dplyr::slice(1:n_region_fabio) %>%
dplyr::select(id) %>%
dplyr::rename(id_region = id) %>%
dplyr::mutate(id_region_cluster = region_cluster$id[region_cluster$name == name_fabio])
region_cluster_exio <- region %>%
dplyr::slice((n_region_fabio+1):nrow(region)) %>%
dplyr::select(id) %>%
dplyr::rename(id_region = id) %>%
dplyr::mutate(id_region_cluster = region_cluster$id[region_cluster$name == name_exio])
DBI::dbAppendTable(db, name = "region_cluster_region", value = region_cluster_fabio)
DBI::dbAppendTable(db, name = "region_cluster_region", value = region_cluster_exio)
region_cluster_region <- RPostgres::dbReadTable(db, "region_cluster_region")
}
# now add the information about region_cluster to the region so we get
# region_fabio and region_exio that are used for further processing
region_cluster_tbl <- dplyr::tbl(db, "region_cluster")
region_cluster_region_tbl <- dplyr::tbl(db, "region_cluster_region")
region_tbl <- dplyr::tbl(db, "region") %>%
dplyr::full_join(region_cluster_region_tbl, by = c("id" = "id_region")) %>%
dplyr::left_join(region_cluster_tbl, by = c("id_region_cluster" = "id"), suffix=c("",".cluster")) %>%
dplyr::filter(name.cluster %in% c(name_fabio, name_exio)) %>%
dplyr::select(-geometry, -id_region_cluster)
region_fabio <- region_tbl %>%
dplyr::filter(name.cluster == name_fabio) %>%
dplyr::select(-name.cluster) %>%
dplyr::collect()
region_exio <- region_tbl %>%
dplyr::filter(name.cluster == name_exio) %>%
dplyr::select(-name.cluster) %>%
dplyr::collect()
# function for region_aggregate ------------------------------------------------
add_region_aggregate <- function(db,
region_aggregate_id,
region_in_aggregate_ids){
## 1. check if the db is valid
if(!DBI::dbIsValid(db)){
base::stop("Database connection is invalid.")
}
## 2. check the args with the ids from the db to make sure all are region ids
ids <- dplyr::tbl(db, "region") %>% dplyr::select(id) %>% dplyr::collect()
if(!(region_aggregate_id %in% ids$id)){
base::stop("Region_aggregate_id is not in the list of possible ids from the database.")
}
if(any(!(region_in_aggregate_ids %in% ids$id))){
base::stop("At least one of the region_in_aggregate_ids is not
in the list of possible ids from the database.")
}
## 3. before we insert the data, we check for data already in the db
reg_agg <- dplyr::tbl(db, "region_aggregate") %>%
dplyr::filter(region_aggregate == region_aggregate_id) %>%
dplyr::select(region_in_aggregate) %>%
dplyr::collect()
# we only insert the ids that are not yet in the db
region_in_aggregate_ids <- region_in_aggregate_ids[!(region_in_aggregate_ids %in%
reg_agg$region_in_aggregate)]
# return if there are no new ids to insert
if(length(region_in_aggregate_ids) == 0){ return() }
## 4. create the insert_data
insert_data <- data.frame(
region_aggregate = region_aggregate_id,
region_in_aggregate = region_in_aggregate_ids
)
## 5. insert data into the db, that doesn't exist yet
DBI::dbAppendTable(db, name = "region_aggregate", value = insert_data)
}
# match FABIO and EXIOBASE regions in region_aggregate -------------------------
# e.g. the FABIO region Austria will be matched with EXIO region Austria
# but FABIO regions Albania, Andorra and others with EXIO region RoW Europe
for(reg_exio_id in region_exio$id){
reg_exio_name <- region_exio$name[region_exio$id == reg_exio_id]
# get all FAO countries in this EXIOregion
regions_in_aggregate <- regions_fao_exio$Country[regions_fao_exio$EXIOregion ==
reg_exio_name &
!is.na(regions_fao_exio$EXIOregion)]
# now get the ids from the database for all exact matches in the names
ids <- region_fabio$id[region_fabio$name %in% regions_in_aggregate]
add_region_aggregate(db, reg_exio_id, ids)
}
# continent-data for region_cluster_region -------------------------------------
## now we take the data for Continent and create a region cluster
## methodology behind choosing what cluster to use:
# 1. if FABIO country = EXIO region,
# then take the continent data from regions_fabio
# 2. if RoW EXIO region = FABIO continent, take the continent data
# 3. if RoW EXIO region covers multiple FABIO continents
# i.e. RoW Middle East, I choose the dominant one (which is Asia & Pacific)
# region_cluster_region table --------------------------------------------------
# now we populate the FABIO and EXIOBASE clusters
region_cluster_region <- RPostgres::dbReadTable(db, "region_cluster_region")
if(nrow(region_cluster_region) <= (n_region_fabio+n_region_exio)){ # 241 = 192 (FABIO reg) + 49 (EXIO reg)
# first, the Continent information is manually recoded
# to allow for displaying the new names directly
region_cluster_fabio <- read.csv("../data/regions.csv", encoding = "latin1")
region_cluster_fabio$continent <-
dplyr::recode(region_cluster_fabio$continent,
`AFR` = "Africa",
`ASI` = "Asia and Pacific",
`EU` = "European Union",
`EUR` = "Europe (non-EU)",
`LAM` = "Latin America",
`NAM` = "North America",
`OCE` = "Asia and Pacific",
`ROW` = "Rest of World"
)
region_cluster_fabio$continent <- as.character(region_cluster_fabio$continent)
# NOTE: the term "Continent" might be a bit confusing here
# as EU or RoW are not actual continents but we keep using it for now
region_cluster_exio <- regions_fao_exio %>%
dplyr::left_join(region_cluster_fabio[,c("iso3c","continent")],
by = c("ISO" = "iso3c")) %>%
dplyr::select(EXIOBASE, continent) %>%
dplyr::mutate(continent = if_else(EXIOBASE == "RoW Middle East",
"Asia and Pacific",
continent)) %>%
dplyr::distinct() %>%
dplyr::arrange(EXIOBASE) %>%
dplyr::filter(!is.na(EXIOBASE),
!is.na(continent))
## we create our continents in the region_cluster table
# region_cluster table ---------------------------------------------------------
region_cluster <- RPostgres::dbReadTable(db, "region_cluster")
if(nrow(region_cluster) <= 2){
insert_data <- data.frame(
name = unique(region_cluster_fabio$continent) %>% sort()
)
DBI::dbAppendTable(db, name = "region_cluster", value = insert_data)
rm(insert_data)
region_cluster <- RPostgres::dbReadTable(db, "region_cluster")
}
## now we join ids from the concordance tables
# insert continent-clusters for FABIO regions
insert_data <- region_cluster_fabio %>%
dplyr::left_join(region_cluster, by = c("continent" = "name")) %>%
dplyr::rename(id_region_cluster = id) %>%
dplyr::left_join(region_fabio[,c("id", "name")], by = c("name" = "name")) %>%
dplyr::rename(id_region = id) %>%
dplyr::select(id_region_cluster, id_region)
# insert into db
DBI::dbAppendTable(db, name = "region_cluster_region", value = insert_data)
# insert continent-clusters for EXIOBASE regions
insert_data <- region_cluster_exio %>%
dplyr::left_join(region_cluster, by = c("continent" = "name")) %>%
dplyr::rename(id_region_cluster = id) %>%
dplyr::left_join(region_exio[,c("id", "name")], by = c("EXIOBASE" = "name")) %>%
dplyr::rename(id_region = id) %>%
dplyr::select(id_region_cluster, id_region) %>%
dplyr::distinct()
# insert into db
DBI::dbAppendTable(db, name = "region_cluster_region", value = insert_data)
rm(insert_data)
region_cluster_region <- RPostgres::dbReadTable(db, "region_cluster_region")
}