Wave 1+2 descriptives.Rmd

---
title: "Wave 1+2 Descriptives"
author: "Oliver Twardus & Igor"
date: "7/27/2021"
output: html_document
---

```{r setup, include=FALSE}
# Show the source of every chunk in the knitted document by default.
knitr::opts_chunk$set(echo = TRUE)

# Packages are attached in the order listed; when two packages export the same
# name, the one attached later masks the earlier one for unqualified calls.
library(forecast)
library(psych)
library(tidyverse)
library(irr)
library(lme4)
library(ggplot2)
library(tidyr)
library(emmeans)
library(car)
library(jtools)
library(dplyr)
library(ggsci)
# NOTE(review): dplyr is already attached a few lines above; this repeat has no effect.
library(dplyr)
# Hmisc is attached after psych and both export describe(), so an unqualified
# describe() resolves to Hmisc::describe; psych's version needs psych::describe().
library(Hmisc)

# Raise the console print limit and strongly prefer fixed over scientific notation.
options(max.print = 20000, scipen = 1000)

```

```{r Global Variables}

# Line type used in all graphs
# (presumably the smoothing method handed to the plotting calls - confirm)
lineStyle <- "loess"

# xAxis <- scale_x_continuous(breaks=seq(2, 19, 2))
# xAxis2 <- scale_x_continuous(breaks=seq(-11, 13, 4))

# Forecast domains; the per-domain loops further down iterate in this order
domains <- c(
  "lifesat", "posaffect", "negaffect",
  "ideoldem", "ideolrep", "polar",
  "iasian", "easian",
  "iafric", "eafric",
  "igend", "egend"
)

```

```{r Functions}

# Absolute percent change of `new` relative to `previous`.
#   previous   - baseline value(s); used as the denominator
#   new        - value(s) compared against the baseline
#   as_decimal - if TRUE, return a proportion (0.5) instead of a percentage (50)
# The result is never negative because the absolute value is taken.
pct_change <- function(previous, new, as_decimal = FALSE) {
  relative_shift <- (new - previous) / previous
  pct <- abs(relative_shift * 100)
  if (as_decimal) {
    return(pct / 100)
  }
  pct
}

```

```{r setup working directory}
# Project folder that holds the CSV inputs (igor's working directory)
project_dir <- path.expand("~/GitHub/Forecasting-Tournament")
# setwd() covers running these statements directly in the console, but knitr
# restores the working directory when a chunk finishes, so on its own it does
# not reach the later chunks that read the CSV files. Setting root.dir makes
# knitr evaluate the following chunks inside project_dir.
setwd(project_dir)
knitr::opts_knit$set(root.dir = project_dir)
```

```{r historical data}


# Load the historical series (described as 46 months of values up to October 2020)
dat_hist <- read.csv("historical_data.csv", stringsAsFactors = FALSE)

# hist_trend[[i]] is the change from historical row 1 to row 40 for domains[i],
# i.e. the list follows the order of the domains vector.
# NOTE(review): row 40 is hard-coded although the file is described as holding
# 46 months - confirm that row 40 is the intended end point.
hist_trend <- vector("list", length(domains))

for (i in seq_along(domains)) {
  first_value <- dat_hist[1, domains[i]]
  last_value <- dat_hist[40, domains[i]]
  trend <- last_value - first_value
  hist_trend[[i]] <- trend
}
```


```{r Import Data}

# Main data file. Per the original note it is filtered by completion time, so that
# the prolific sample excludes predictions that took less than 50 seconds to make.
dat <- read.csv(file = "Wave1+2data.csv", stringsAsFactors = FALSE)

# alternative file that does not filter the lay sample by completion time
# dat <- read.csv("Wave1+2data_coded_2021-05-18.csv", stringsAsFactors = FALSE)

# Notable columns and what they mean
# (some columns are omitted because they will be removed / are redundant):

# phase - 1 = submission received during phase 1 (June 2020),
#         2 = submission received during phase 2 (November 2020)

# isExpert - submission is by an academic (1) or a layperson (0)
dat$isExpert.factor <- factor(
  dat$isExpert,
  levels = 0:1,
  labels = c("Prolific", "Academic")
)

# revised - indicates whether a team submitted to both phase 1 & 2 of the tournament (i.e. submitted in June and then sent a revised submission in November)

# domain - indicates which domain the forecast is for. Shorthand is used here with the following terms referring to each domain:
# lifesat = Life Satisfaction
# posaffect = Positive Affect
# negaffect = Negative Affect
# ideoldem = Political Ideology - Democrat
# ideolrep = Political Ideology - Republican
# polar = Political Polarization
# iasian = Implicit Asian-American Bias
# easian = Explicit Asian-American Bias
# iafric = Implicit African-American Bias
# eafric = Explicit African-American Bias
# igend = Implicit Gender-Career Bias
# egend = Explicit Gender-Career Bias

# Month.1 - Month.18 columns list participant predictions for a given domain. 
#   All phase 1 (June) predictions range from Month.1 - Month.12
#   All phase 2 (November) predictions range from Month.7 - Month.18

# mean_error - output of forecast package's accuracy() function - displays the mean error (ME) of Month.1 - Month.6 predictions compared to the objective data
# root_mean_sqr_error - displays the root mean square error (RMSE) of Month.1 - Month.6 predictions compared to the objective data
# mean_abs_error - displays the mean absolute error (MAE) of Month.1 - Month.6 predictions compared to the objective data
# mean_percent_error - displays the mean percent error (MPE) of Month.1 - Month.6 predictions compared to the objective data
# mean_abs_percent_error - displays the mean absolute percent error (MAPE) of Month.1 - Month.6 predictions compared to the objective data
# mean_abs_scaled_error_1 - MASE computed using custom computeMASE function
# mean_abs_scaled_error_2 - MASE computed using Metrics::mase function

# RMSE_cutoff_* - whether the prediction's RMSE is below (0) or above (1) that of a
# naive forecast (linear / rwf variant) for the same time period
cutoff_labels <- c("below cutoff", "above cutoff")
dat$RMSE_cutoff_Naive_linear.factor <- factor(
  dat$RMSE_cutoff_Naive_linear,
  levels = 0:1,
  labels = cutoff_labels
)
dat$RMSE_cutoff_Naive_rwf.factor <- factor(
  dat$RMSE_cutoff_Naive_rwf,
  levels = 0:1,
  labels = cutoff_labels
)

# confidence - indicates on scale of 1-7 how confident participants were in their predictions
# subexpert - indicates on scale of 1-7 the participant's self-reported expertise in the domain they are predicting
# pub - the number of publications the team has made on the predicted domain

# model, theory, parameters - all contain participant written responses regarding what model they used, what theory they relied on, and what conditionals they considered

# numpred - number of conditionals (beyond the domain predicted) that participants considered in their prediction
# covidcondyn - whether covid-19 was considered as a conditional in their forecast
# datatrain - whether participants used the forecast data that was provided to them
# counterfact & othercounter - written response indicating the counterfactual participants considered
# counter_imp & othercountim - how important they consider their counterfactual to be

# Method - indicates forecasting method used to generate forecast - either Intuition/Theory, Data-Driven, Mixed, Simulation, or Objective - latter category is used to indicate the objective data for each domain for Months 1-6
# Method.coded - 1 - Intuition / 2 - Theory / 3 - Data-driven / 4 - Mixed / 5 - Objective / 6 - Naive - linear / 7 - Naive - rwf

# Display label for each Method.coded value; the position in the vector is the code.
# Codes 1 and 2 deliberately share one label.
# NOTE(review): "Naive-rfw" looks like a transposition of "rwf" (compare the
# RMSE_cutoff_Naive_rwf column); kept as is because other code may match on it.
method_labels <- c(
  "Intuition/Theory", # 1
  "Intuition/Theory", # 2
  "Data-Driven",      # 3
  "Hybrid",           # 4
  "Ground Truth",     # 5
  "Naive-linear",     # 6
  "Naive-rfw"         # 7
)
for (method_id in seq_along(method_labels)) {
  dat$Method.code[dat$Method.coded == method_id] <- method_labels[method_id]
}
# Lay rows are relabelled last, so this overrides any label assigned above
dat$Method.code[dat$isExpert == 0] <- "Lay People"


# Method.complex - ONLY PHASE 1, coded on a 1-3 scale: whether the Data-driven or Mixed method used is
# simple (e.g., regression to the mean), moderate (e.g., auto-regression w time lag, univariate time series),
# or complex (e.g., ARIMA, dynamic econometric model)

dat$Method.complex.factor <- factor(
  dat$Method.complex,
  levels = 1:3,
  labels = c("simple", "moderate", "complex")
)

# team_size.coded - self-reported measure indicating number of team-members in the team
# team_expertise - written response of team's general expertise

# FOLLOWING VARIABLES ARE EXCLUSIVE TO LAY SAMPLE because it consists entirely of individuals whereas academic sample consists of teams
# Age (num)
# Sex (1 = Male, 2 = Female, 3 = Prefer not to say)
dat$Sex.factor <- factor(
  dat$Sex,
  levels = 1:3,
  labels = c("Male", "Female", "Prefer not to say")
)

# Genderident (1 = trans/woman, 2 = trans/man, 3 = genderqueer, 4 = Prefer not to say, 5 = other)
dat$Genderident.factor <- factor(
  dat$Genderident,
  levels = 1:5,
  labels = c("trans/woman", "trans/man", "genderqueer", "Prefer not to say", "other")
)

# Education, coded 1-8 in the order of the labels below
dat$education.factor <- factor(
  dat$Education,
  levels = 1:8,
  labels = c(
    "less than highschool", "high school", "some college",
    "Vocation or technical school", "Bachelor's", "Master's",
    "Doctorate", "Professional degree"
  )
)
# occupation (written response)

# Ethnicity, coded 1-9
dat$Ethnicity.factor <- factor(
  dat$Ethnicity,
  levels = 1:9,
  labels = c(
    "Aboriginal/Native", "Asian", "Black", "White", "Middle Eastern",
    "Hispanic", "East Indian", "Mixed Race", "Other/Not Listed"
  )
)

# Religion, coded 1-10
dat$Religion.factor <- factor(
  dat$Religion,
  levels = 1:10,
  labels = c(
    "Buddhist", "Christian - Catholic", "Christian - Protestant",
    "Christian - Other", "Hindu", "Jewish", "Muslim", "Sikh",
    "Other", "Non-Religious"
  )
)

# Income bracket, coded 1-8
dat$Income.factor <- factor(
  dat$Income,
  levels = 1:8,
  labels = c(
    "Under $15,000", "$15,001 - $25,000", "$25,001 - $35,000",
    "$35,001 - $50,000", "$50,001 - $75,000", "$75,001 - $100,000",
    "$100,001 - $150,000", "Over $150,000"
  )
)

# Residential Area, coded 1-3
dat$Residential.Area.factor <- factor(
  dat$Residential.Area,
  levels = 1:3,
  labels = c("Urban", "Suburban", "Rural")
)

# Text label for each team_discipline.coded value; the position in the vector is the code
discipline_labels <- c(
  "Behavioral Sciences",   # 1
  "Social Sciences",       # 2
  "Data/Computer Science", # 3
  "Multi-disciplinary",    # 4
  "Other"                  # 5
)
for (discipline_id in seq_along(discipline_labels)) {
  dat$discipline[dat$team_discipline.coded == discipline_id] <- discipline_labels[discipline_id]
}

# Whether the team is multi-disciplinary (discipline code 4) or not.
# Rows with a missing discipline code stay NA.
is_multi_disciplinary <- dat$team_discipline.coded == 4
dat$multi_dis.factor <- ifelse(
  is_multi_disciplinary,
  "Multi domain expertise",
  "Single domain expertise"
)
# IMPORTANT: one team - Spartacus - does not have any demographics, and hence NA for discipline!

# Save the recoded wide-format data for the analysis scripts
write.csv(x = dat, file = "dat_for_analyses.csv")

```

```{r Data - long format + absolute percent difference}

# Reshape to long format: one row per submission x month, with the month number
# taken from the Month.* column name; rows without a predicted value are dropped.
dat_long <- dat %>%
  pivot_longer(
    cols = starts_with("Month"),
    names_to = "Month",
    names_prefix = "Month."
  ) %>%
  mutate(Month = as.numeric(Month)) %>%
  filter(!is.na(value))


# Placeholder column for the absolute percent difference between each value and
# the objective result for the same month/domain (filled in afterwards)
dat_long$value.dif <- NA_real_

# Fill value.dif domain by domain and month by month (months 1-12)
for (dom in domains) {

  # Objective ("Ground Truth", Method.coded == 5) row(s) for this domain; only the
  # first one is used below
  truth_rows <- dat[which(dat$domain == dom & dat$Method.coded == 5), ]

  for (month_no in 1:12) {
    # objective value for this month, and the long-format rows it is compared with
    truth_value <- truth_rows[1, paste0("Month.", month_no)]
    target_rows <- which(dat_long$domain == dom & dat_long$Month == month_no)

    # absolute percent difference of each matching value from the objective value
    dat_long$value.dif[target_rows] <- pct_change(truth_value, dat_long$value[target_rows])
  }

}

# create subsetted version that only includes
# dat_long <- dat_long %>% subset(flag_lay_response == 0 | is.na(flag_lay_response))

# Apply the Method.coded -> label mapping to the long data (position in the vector = code).
# NOTE(review): dat_long is pivoted from dat after dat$Method.code was filled in,
# so this appears to repeat a mapping the column already carries - confirm.
long_method_labels <- c(
  "Intuition/Theory", # 1
  "Intuition/Theory", # 2
  "Data-Driven",      # 3
  "Hybrid",           # 4
  "Ground Truth",     # 5
  "Naive-linear",     # 6
  "Naive-rfw"         # 7
)
for (method_id in seq_along(long_method_labels)) {
  dat_long$Method.code[dat_long$Method.coded == method_id] <- long_method_labels[method_id]
}
# Lay rows are relabelled last, overriding any label assigned above
dat_long$Method.code[dat_long$isExpert == 0] <- "Lay People"

# Save the long-format data
write.csv(x = dat_long, file = "dat_long.csv")

```

```{r Import Team member Demographic info}

# Demographics of participants who responded to the survey. Team names have been
# corrected to match those in the dat_exp dataframe.

dat_demo <- read.csv("Wave1+2demographics.csv", stringsAsFactors = FALSE)

# demo_1 - participant name
# demo_2 - participant email
# education - 1-5 indicating current role: undergrad, grad, postdoc/fellow, Professor, Other (with text entry)
# education2 - 1-5 indicating how much education they have: some uni/college, bachelors, masters, PhD, Other
#   (the original notes spell this column "educaton2" - confirm the name in the CSV)
# gender - 1 = Male, 2 = Female
# org - what kind of organization they're affiliated with - 1 = college/university, 2 = government, 3 = Private Company, 4 = self-employed, 5 = other
# expertise 1 & 2 - written responses on areas of expertise
# prevtournament - Whether they participated in a previous forecasting tournament 1 = Yes, 2 = No
# prevtour_list - written response of previous tournaments

# creating factor columns for the following variables:

# Academic sample - academic position (current role, stored in `education`)
dat_demo$position.factor <- factor(
  dat_demo$education,
  levels = 1:5,
  labels = c("Undergrad", "Grad", "Postdoc/fellow", "Professor", "Other")
)

# Academic sample - education attained.
# This factor used to be built from `education` (current role), the same column as
# position.factor, which attached attainment labels to role codes. It now reads
# the second education item; both spellings found in the notes are accepted.
attained_col <- intersect(c("education2", "educaton2"), names(dat_demo))
if (length(attained_col) == 0) {
  stop(
    "Wave1+2demographics.csv has no 'education2' column to build education.factor from",
    call. = FALSE
  )
}
dat_demo$education.factor <- factor(
  dat_demo[[attained_col[1]]],
  levels = 1:5,
  labels = c("some uni/college", "bachelors", "masters", "PhD", "Other")
)

# Academic sample - sex
dat_demo$sex_acad.factor <- factor(
  dat_demo$gender,
  levels = 1:3,
  labels = c("Male", "Female", "Other")
)

# Academic sample - organization/affiliation
dat_demo$org.factor <- factor(
  dat_demo$org,
  levels = 1:5,
  labels = c("College/University", "Government", "Private Company", "Self-Employed", "Other")
)

```

```{r Academic sample descriptives}

# Subsets by project phase (original note: 1 = May, 2 = November)
phase1 <- dat %>% filter(phase == 1)
phase2 <- dat %>% filter(phase == 2)

# Academic-only versions of the phase 1 & 2 subsets. Original note: this further
# filtering won't be necessary once we have updated objective data
phase1_exp <- phase1 %>% filter(isExpert == 1)
phase2_exp <- phase2 %>% filter(isExpert == 1)


# All academic predictions, regardless of phase
academic_only <- dat %>% filter(isExpert == 1)

# Count of predictions per project phase and sample, and each count's share of
# all rows in dat
num_forecast <- dat %>%
  group_by(phase, isExpert.factor) %>%
  dplyr::summarise(
    N = dplyr::n(),
    Percent = N / nrow(dat)
  )

print(num_forecast)
#NAs are rows of objective markers and predictions of naive models.

# Distinct academic team names within each phase
team_num <- academic_only %>%
  group_by(phase) %>%
  dplyr::summarise(numberOfTeams = length(unique(team_name)))

print(team_num)
# Distinct academic team names across both phases
all_team_names <- unique(academic_only$team_name)
team_num_total <- length(all_team_names)
print(team_num_total)
# 120 teams total, 86 teams participated during phase 1 (88th team is NA because I didn't filter out lay sample and 87th team indicated their predictions were for another country), 72 during phase 2 

# Keep only the first row encountered for each team name
first_row_of_team <- !duplicated(academic_only$team_name)
unique_teams <- academic_only[first_row_of_team, ]

# Hmisc is attached after psych, so this is the describe() the unqualified call used
Hmisc::describe(unique_teams$team_size.coded)

#unique_teams$team_size.coded 
#       n  missing distinct     Info     Mean      Gmd 
#     120        0        6    0.654    1.583   0.9289 

#lowest : 1 2 3 4 5, highest: 2 3 4 5 7

#Value          1     2     3     4     5     7
#Frequency     84    17     9     7     2     1
#Proportion 0.700 0.142 0.075 0.058 0.017 0.008
# 1 was the most common team size (70%)

# Share of teams at each team size. table() leaves NAs out by default, so the
# proportions are taken over teams with a recorded size.
team_size_counts <- table(unique_teams$team_size.coded)
prop.table(team_size_counts)

#1           2           3           4           5           7 
#0.700000000 0.141666667 0.075000000 0.058333333 0.016666667 0.008333333

# Teams that appear in the phase 1 data
phase1_team <- unique_teams %>% filter(team_name %in% phase1$team_name)


# distribution of team size for phase 1

phase1_size_counts <- table(phase1_team$team_size.coded)
prop.table(phase1_size_counts)

#         1          2          3          4          5          7 
#0.70930233 0.15116279 0.04651163 0.06976744 0.01162791 0.01162791 


# Teams that appear in the phase 2 data
phase2_team <- unique_teams %>% filter(team_name %in% phase2$team_name)

# distribution of team size for phase 2 (counts rather than proportions)
phase2_size_counts <- table(phase2_team$team_size.coded)
as.data.frame(phase2_size_counts)

# Phase 1 academic predictions below/above the naive-linear RMSE cutoff

# overall
cutoff_counts <- table(phase1_exp$RMSE_cutoff_Naive_linear.factor)
as.data.frame(cutoff_counts)

#   RMSE_cutoff.factor     N 
# 
# 1 below cutoff          109 - 30%
# 2 above cutoff         250  - 70%

# 70% of predictions were above the RMSE cutoff

# Forecasting method used: count and share of academic predictions per Method.coded.
# NOTE(review): the output recorded below this chunk is keyed by Method.code
# (text labels), whereas this groups by the numeric Method.coded - confirm which
# grouping is intended.

method <- academic_only %>%
  group_by(Method.coded) %>%
  dplyr::summarise(
    N = dplyr::n(),
    Percent = N / nrow(academic_only)
  )
knitr::kable(method)

  #|Method.code      |   N|   Percent|
#|:----------------|---:|---------:|
#|Data-Driven      | 365| 0.5027548|
#|Hybrid           |  58| 0.0798898|
#|Intuition/Theory | 303| 0.4173554|
  
# Per domain: number of phase 1 academic predictions below/above the naive-linear
# RMSE cutoff, and that number as a percentage of the domain's predictions.
naive_RMSE_domain <- phase1_exp %>%
  group_by(domain, RMSE_cutoff_Naive_linear.factor) %>%
  dplyr::summarise(N = length(RMSE_cutoff_Naive_linear.factor)) %>%
  ungroup() %>%
  group_by(domain) %>%
  mutate(ptg = prop.table(N) * 100) %>%
  ungroup() %>%
  # Sort by cutoff category, then by descending percentage. The previous call
  # wrote the first key as `by_group = ...`; arrange() has no such argument (its
  # option is `.by_group`), so it was silently used as a sort key. The ordering
  # is unchanged, only the misleading name is gone.
  arrange(RMSE_cutoff_Naive_linear.factor, desc(ptg))
knitr::kable(naive_RMSE_domain)

# Implicit Asian bias, explicit African American, and positive affect were all 100% above the cutoff
# More than 50% of predictions for implicit gender, ideology-republican, and ideology-democrat were below the cutoff

# Multi-disciplinarity: count and share of phase 1 academic predictions made by
# multi- vs. single-discipline teams

multidisciplinarity <- phase1_exp %>%
  group_by(multi_dis.factor) %>%
  dplyr::summarise(
    N = dplyr::n(),
    Percent = N / nrow(phase1_exp)
  )
knitr::kable(multidisciplinarity)

#|multi_dis.factor        |   N|   Percent|
#|:-----------------------|---:|---------:|
#|Multi domain expertise  |  56| 0.1559889|
#|Single domain expertise | 302| 0.8412256|
#|NA                      |   1| 0.0027855|


```

```{r Prolific Descriptives}

# List of descriptives for prolific sample

# Keep the Prolific (lay) rows only.
# NOTE(review): the original comment says "unflagged" responses, but the only
# condition applied here is isExpert == 0 - confirm whether a flag filter is missing.
dat_lay_demo <- dat[which(dat$isExpert == 0), ]

# time spent on upload task
time_spent_desc_up <- psych::describe(dat_lay_demo$time_upload)
print(time_spent_desc_up)
#    vars    n   mean     sd median trimmed   mad min     max   range skew kurtosis   se
# X1    1 1467 180.01 227.96 109.38  130.78 62.94 50.22 3434.49 3384.27 6.03    62.59 5.95

# age of lay participants
age_stats <- psych::describe(dat_lay_demo$Age)
print(age_stats)
#    vars    n  mean    sd median trimmed mad min max range skew kurtosis   se
# X1    1 1389 30.55 10.68     28   29.03 10.38  18  78    60 1.27      1.6 0.29


# Education: count and share of lay rows per education level
prolific_edu <- dat_lay_demo %>%
  group_by(education.factor) %>%
  dplyr::summarise(
    N = dplyr::n(),
    Percent = N / nrow(dat_lay_demo)
  )

# Same breakdown via table(): counts first, then proportions
edu_counts <- table(dat_lay_demo$education.factor)
as.data.frame(edu_counts)
prop.table(edu_counts)

#        less than highschool                  high school                 some college Vocation or technical school                   Bachelor's                     Master's                    Doctorate          Professional degree 
#                 0.003599712                  0.082073434                  0.245500360                  0.042476602                  0.416846652                  0.162706983                  0.017278618                  0.029517639 

# Ethnicity: count and share of lay rows per category
prolific_eth <- dat_lay_demo %>%
  group_by(Ethnicity.factor) %>%
  dplyr::summarise(
    N = dplyr::n(),
    Percent = N / nrow(dat_lay_demo)
  )

# Same breakdown via table(): counts first, then proportions
eth_counts <- table(dat_lay_demo$Ethnicity.factor)
as.data.frame(eth_counts)
prop.table(eth_counts)

#Aboriginal/Native             Asian             Black             White    Middle Eastern          Hispanic       East Indian        Mixed Race  Other/Not Listed 
#      0.007215007       0.170995671       0.094516595       0.595238095       0.007215007       0.074314574       0.007936508       0.034632035       0.007936508 

# Religion: count and share of lay rows per category (NA kept as its own row)
prolific_rel <- dat_lay_demo %>%
  group_by(Religion.factor) %>%
  dplyr::summarise(
    N = dplyr::n(),
    Percent = N / nrow(dat_lay_demo)
  )
knitr::kable(prolific_rel)

# |Religion.factor        |   N|   Percent|
#|:----------------------|---:|---------:|
#|Buddhist               |  29| 0.0197682|
#|Christian - Catholic   | 197| 0.1342877|
#|Christian - Protestant | 214| 0.1458759|
#|Christian - Other      | 130| 0.0886162|
#|Hindu                  |  27| 0.0184049|
#|Jewish                 |  36| 0.0245399|
#|Muslim                 |  57| 0.0388548|
#|Sikh                   |   2| 0.0013633|
#|Other                  |  57| 0.0388548|
#|Non-Religious          | 638| 0.4349012|
#|NA                     |  80| 0.0545331|


# Politics: count and share of lay rows per Politics_1 response value
prolific_pol <- dat_lay_demo %>%
  group_by(Politics_1) %>%
  dplyr::summarise(
    N = dplyr::n(),
    Percent = N / nrow(dat_lay_demo)
  )
knitr::kable(prolific_pol)

#| Politics_1|   N|   Percent|
#|----------:|---:|---------:|
#|          1| 343| 0.2338105|
#|          2| 313| 0.2133606|
#|          3| 192| 0.1308793|
#|          4| 300| 0.2044990|
#|          5| 126| 0.0858896|
#|          6|  83| 0.0565781|
#|          7|  32| 0.0218132|
#|         NA|  78| 0.0531697|

# Residential area: count and share of lay rows per category
prolific_res <- dat_lay_demo %>%
  group_by(Residential.Area.factor) %>%
  dplyr::summarise(
    N = dplyr::n(),
    Percent = N / nrow(dat_lay_demo)
  )
knitr::kable(prolific_res)

#|Residential.Area.factor |   N|   Percent|
#|:-----------------------|---:|---------:|
#|Urban                   | 449| 0.3060668|
#|Suburban                | 791| 0.5391956|
#|Rural                   | 147| 0.1002045|
#|NA                      |  80| 0.0545331|

# Income: count and share of lay rows per income bracket
prolific_inc <- dat_lay_demo %>%
  group_by(Income.factor) %>%
  dplyr::summarise(
    N = dplyr::n(),
    Percent = N / nrow(dat_lay_demo)
  )
knitr::kable(prolific_inc)

# |Income.factor       |   N|   Percent|
#|:-------------------|---:|---------:|
#|Under $15,000       |  92| 0.0627130|
#|$15,001 - $25,000   | 106| 0.0722563|
#|$25,001 - $35,000   | 129| 0.0879346|
#|$35,001 - $50,000   | 179| 0.1220177|
#|$50,001 - $75,000   | 290| 0.1976823|
#|$75,001 - $100,000  | 227| 0.1547376|
#|$100,001 - $150,000 | 189| 0.1288344|
#|Over $150,000       | 164| 0.1117928|
#|NA                  |  91| 0.0620314|

```