Skip to content

Commit

Permalink
clarified new TR methods a bit
Browse files Browse the repository at this point in the history
  • Loading branch information
gclawson1 committed Sep 21, 2023
1 parent 48c257b commit 42272f2
Show file tree
Hide file tree
Showing 5 changed files with 21 additions and 46 deletions.
55 changes: 20 additions & 35 deletions globalprep/tr/v2023/R/process_UNWTO_arrivals.R
Original file line number Diff line number Diff line change
Expand Up @@ -24,10 +24,6 @@ unwto_clean <- unwto_arrivals %>%
group_by(country, year) %>% # group by county and year
mutate(
tourism_arrivals_ct = case_when(
# !is.na(overnights) & !is.na(same_day) ~ overnights + same_day, # when there are overngihts and same_day values, use the sum of those for tourism-related arrivals
# !is.na(overnights) & is.na(same_day) ~ overnights, # when there is just overnights, use that value
# !is.na(same_day) & is.na(overnights) ~ same_day, # when there is just same_day, use that value
# is.na(same_day) & is.na(overnights) & !is.na(total) ~ total, # when there are neither of those values and a total value is available, use that (last resort because may include non-tourism related arrivals; total is often the sum of overnights and same_day, however)
!is.na(overnights) ~ overnights,
is.na(overnights) & !is.na(same_day) & !is.na(total) ~ total - same_day,
TRUE ~ tourism_arrivals_ct
Expand All @@ -54,34 +50,18 @@ unwto_match_iso3c <- unwto_clean %>%
dplyr::select(rgn_id, year, arrivals_method, arrivals_gapfilled, tourism_arrivals_ct, total) %>%
filter(!is.na(rgn_id))

unwto_clean_names_bonaire <- name_2_rgn(df_in = unwto_clean %>% filter(country == "Bonaire"),
unwto_clean_names_bonaire <- name_2_rgn(df_in = unwto_clean %>% filter(country == "Bonaire"), # do this just for Bonaire since it is the only region not matching above
fld_name = 'country',
# flds_unique = c('year'),
keep_fld_name = TRUE) %>%
dplyr::select(rgn_id, year, arrivals_method, arrivals_gapfilled, tourism_arrivals_ct, total)#### losing lots of regions here for some reason... most concerndely USA
# v2023: no countries removed for not have any match in the lookup tables
# DUPLICATES found. Confirm your script consolidates these as appropriate for your data.
#
# # A tibble: 11 × 1
# country
# <chr>
# 1 China
# 2 Guadeloupe
# 3 Guam
# 4 Hong Kong, China
# 5 Macao, China
# 6 Martinique
# 7 Montenegro
# 8 Northern Mariana Islands
# 9 Puerto Rico
# 10 Serbia And Montenegro
# 11 United States Virgin Islands


unwto_clean_names <- rbind(unwto_clean_names_bonaire, unwto_match_iso3c) %>% # rbind back together. I would've used the name_2_rgns fxn for everything, but it was excluding a lot of regiosn for some reason...
left_join(rgns_eez) %>%
dplyr::select(rgn_id, rgn_name, year, arrivals_method, arrivals_gapfilled, tourism_arrivals_ct, total)

# fix duplicates
# fix duplicates if there are any
unwto_dupe_fix <- unwto_clean_names %>%
group_by(rgn_id, year, arrivals_method, arrivals_gapfilled) %>%
summarize(sum_fix = ifelse(all(is.na(tourism_arrivals_ct)), NA, sum(tourism_arrivals_ct, na.rm = TRUE)),
Expand All @@ -91,12 +71,11 @@ unwto_dupe_fix <- unwto_clean_names %>%
total = sum_fix_2)

# check out things so far
summary(unwto_dupe_fix) # v2023: 828 NAs in arrivals (before filtering the years down and gapfilling), 1708 in total
summary(unwto_dupe_fix) # v2023: 828 NAs in arrivals (before filtering the years down and gapfilling), 1708 in `total`

# gapfill arrivals
# downfill then upfill missing values
# use 2019 if available to fill 2021, then 2020 if not, to account for COVID-19
# v2023: check rgn_id 114, for example, for if choosing 2019 vs. 2020 worked
# downfill then upfill missing values using a linear model of the average increase per years across all years of data for 1995-2019
# for 2020 and 2021, use the global average proportion increase or decrease and add to the previous years value

test <- unwto_dupe_fix %>%
filter(year %in% c(2020, 2021)) %>%
Expand All @@ -105,9 +84,9 @@ test <- unwto_dupe_fix %>%
mutate(tourism_ct_diff = (tourism_arrivals_ct_2021 - tourism_arrivals_ct_2020)/tourism_arrivals_ct_2020) %>%
mutate(total_diff = (total_2021 - total_2020)/total_2020)

gf_2021_tourism <- mean(test$tourism_ct_diff, na.rm = TRUE)
gf_2021_tourism <- mean(test$tourism_ct_diff, na.rm = TRUE) # global average increase for 2021 tourist

gf_2021_total <- mean(test$total_diff, na.rm = TRUE)
gf_2021_total <- mean(test$total_diff, na.rm = TRUE) # global average increase for 2021 total

#
# plot(test$total_diff, test$tourism_ct_diff) # looks pretty linear, so thats good, we can use the same method of gapfilling
Expand All @@ -121,9 +100,9 @@ test <- unwto_dupe_fix %>%
pivot_wider(names_from = year, values_from = c(tourism_arrivals_ct, total)) %>%
mutate(tourism_ct_diff = (tourism_arrivals_ct_2020 - tourism_arrivals_ct_2019)/tourism_arrivals_ct_2019) %>%
mutate(total_diff = (total_2020 - total_2019)/total_2019)

gf_2020_tourism <- mean(test$tourism_ct_diff, na.rm = TRUE)
gf_2020_total <- mean(test$total_diff, na.rm = TRUE)
gf_2020_tourism <- mean(test$tourism_ct_diff, na.rm = TRUE) # global average decerease for 2020 tourst
gf_2020_total <- mean(test$total_diff, na.rm = TRUE) # global average decrease for 2020 total

#
# plot(test$total_diff, test$tourism_ct_diff) # looks pretty linear, so thats good, we can use the same method of gapfilling
Expand All @@ -137,16 +116,18 @@ unwto_upfill <- unwto_dupe_fix %>%
group_by(rgn_id) %>%
arrange(rgn_id, year) %>%
tidyr::fill(tourism_arrivals_ct, .direction = "up") %>%
tidyr::fill(total, .direction = "up") %>% # fill in any values that are empty from early years with values from the nearest year
tidyr::fill(total, .direction = "up") %>% # fill in any values that are empty from early years with values from the nearest year. Doing this because doesn't make sense to add earlier years based on a trend
mutate(arrivals_method = ifelse(is.na(arrivals_method) & !is.na(tourism_arrivals_ct), "nearby year", arrivals_method)) %>%
mutate(arrivals_gapfilled = ifelse(arrivals_method == "nearby year", "gapfilled", arrivals_gapfilled))

## calculate regional average increase or decrease in number of total arrivals
lm_coef_data_total <- unwto_dupe_fix %>%
filter(!(year %in% c(2020, 2021))) %>%
group_by(rgn_id) %>%
filter(!is.na(total)) %>%
summarize(lm_coef_total = if (n() > 1) lm(total ~ year)$coefficients[2] else 0, .groups = 'drop') # give it an addition of 0 if it is stagnant

## calculate regional average increase or decrease in number of tourism arrivals
lm_coef_data_tourism <- unwto_dupe_fix %>%
filter(!(year %in% c(2020, 2021))) %>%
group_by(rgn_id) %>%
Expand All @@ -157,6 +138,7 @@ lm_coef_data_tourism <- unwto_dupe_fix %>%
# Initialize a flag to check if there are still NAs
na_flag <- TRUE

# filter out any regions with all nas for each year, as these can't be gapfilled
all_nas_tourism <- unwto_upfill %>%
group_by(rgn_id) %>%
filter(all(is.na(tourism_arrivals_ct))) %>%
Expand Down Expand Up @@ -233,18 +215,20 @@ while(na_flag) {

unwto_gapfill_lm_2019_total_all <- unwto_gapfill_lm_2019_total %>%
dplyr::select(-lm_coef_total) %>%
mutate(total = ifelse(rgn_id == 67 & total < 0, 204000, total)) %>% # fix libya, as it was being given negative values with the gapfill. Just give it its latest year (downfill)
mutate(total = ifelse(rgn_id == 67 & total < 0, unwto_gapfill_lm_2019_total %>%
filter(rgn_id == 67, year == 2009 ) %>% pull(total) , total)) %>% # fix libya, as it was being given negative values with the gapfill. Just give it its latest year (downfill)
rbind(all_nas_total)


unwto_2020_2021 <- unwto_dupe_fix %>%
filter(year > 2019)
filter(year > 2019) # lets fix 2020 and 2021 now

unwto_all_gf <- unwto_gapfill_lm_2019_tourism_all %>%
left_join(unwto_gapfill_lm_2019_total_all) %>%
rbind(unwto_2020_2021) %>%
group_by(rgn_id) %>%
arrange(rgn_id, year) %>%
# apply global average proportional increase or decrease for 2020 and 2021, because of covid pandemic messing up trends...
mutate(tourism_arrivals_ct = ifelse(year == 2020 & is.na(tourism_arrivals_ct), lag(tourism_arrivals_ct, n = 1) + lag(tourism_arrivals_ct, n = 1)*gf_2020_tourism, tourism_arrivals_ct),
total = ifelse(year == 2020 & is.na(total), lag(total, n = 1) + lag(total, n = 1)*gf_2020_total, total)) %>%

Expand All @@ -258,6 +242,7 @@ unwto_all_gf <- unwto_gapfill_lm_2019_tourism_all %>%



## old way
# unwto_dupe_fix_downup_gf <- unwto_dupe_fix %>%
# fill(tourism_arrivals_ct, .direction = "downup") %>%
# fill(total, .direction = "downup") %>%
Expand Down
12 changes: 1 addition & 11 deletions globalprep/tr/v2023/tr_data_prep.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -30,15 +30,13 @@ tr = Ap * Sr
and
Xtr = tr/90th quantile across regions

* Ap = Proportion of arrivals to area of coastline to population (formerly Ep when it represented employment)
* Ap = Proportion of overnight tourist arrivals to total arrivals
* Sr = (S-1)/5; Sustainability of tourism


## The following data are used:

* Numbers of tourist arrivals, used in the calculation of the proportion of arrivals to area of coastline to population: obtained through the [UNWTO](https://www.unwto.org/tourism-statistics/key-tourism-statistics) (in the form of thousands of people). More info on tourism terms [here](https://www.unwto.org/glossary-tourism-terms). Range: 1995-2021 (only 2008-2021 is used)
* Area of coastline, used in the calculation of the proportion of arrivals to area of coastline to population: calculated in the [LSP goal](https://github.com/OHI-Science/ohiprep_v2023/tree/gh-pages/globalprep/lsp/v2023). Range: Static
* Population, used in the calculation of the proportion of arrivals to area of coastline to population: primarily uses [World Bank](https://data.worldbank.org/indicator/SP.POP.TOTL) data obtained through the WDI() function. Combines [Our World in Data](https://ourworldindata.org/grapher/population) and Statista data ([1](https://www.statista.com/statistics/706807/population-of-saba-in-the-caribbean-netherlands/), [2](https://www.statista.com/statistics/706806/population-of-sint-eustatius-in-the-caribbean-netherlands/), and [3](https://www.statista.com/statistics/706799/population-of-bonaire-in-the-caribbean-netherlands/)) (obtained on their respective websites) with this to get population data for all OHI regions with arrivals and area of coastline data. Range: World Bank - 1960-2022 (2008-2021 is selected and used); Our World in Data - 10,000 BCE-2021 (2008-2021 is used); Statista - 2011-2023 (2011-2021 is used; as of v2023, 3 OHI regions use this data).
* Tourism sustainability: World Economic Forum. The Travel & Tourism Development Index 2021 dataset (version 24 May 2022). 2022. [TTDI](https://www.weforum.org/reports/travel-and-tourism-development-index-2021/downloads-510eb47e12#report-nav)
* Per capita GDP: (World Bank with gaps filled using CIA data), used to gapfill missing values in Tourism sustainability

Expand Down Expand Up @@ -99,14 +97,6 @@ However, in light of the Covid-19 pandemic, we have adopted a distinct approach
# source in cleaned UNWTO data for current version year (make sure to download from website and put on Mazu in the UNWTO folder first)
source(here(paste0("globalprep/tr/v", version_year, "/R/process_UNWTO_arrivals.R"))) # outputs unwto_dupe_fix_downup_gf
# source in prepared area of coastline data: uses current version year for these files by default (acquire up-to-date LSP data if applicable)
# source(here(paste0("globalprep/tr/v", version_year, "/R/process_area_of_coastline.R"))) # outputs inland_offshore
# source in prepared populations data by ohi region: uses current version year for these files by default (download up-to-date data if applicable)
# source(here(paste0("globalprep/tr/v", version_year, "/R/process_populations.R"))) # outputs combined_pops_filled
# check outputs of everything!
coastal_pop_data <- read.csv(here("globalprep/mar_prs_population/v2021/output/mar_pop_25mi.csv")) ## read in coastal population data from other data layer
coastal_pop_data_fill <- coastal_pop_data %>%
Expand Down
File renamed without changes.

0 comments on commit 42272f2

Please sign in to comment.