diff --git "a/docs/1.5-pesquisa-nacional-por-amostra-de-domic\303\255lios-cont\303\255nua-pnad-continua.html" "b/docs/1.5-pesquisa-nacional-por-amostra-de-domic\303\255lios-cont\303\255nua-pnad-continua.html" index 491ec9e..f5ecdd4 100644 --- "a/docs/1.5-pesquisa-nacional-por-amostra-de-domic\303\255lios-cont\303\255nua-pnad-continua.html" +++ "b/docs/1.5-pesquisa-nacional-por-amostra-de-domic\303\255lios-cont\303\255nua-pnad-continua.html" @@ -274,102 +274,102 @@

1.5 Pesquisa Nacional por Amostra de Domicílios Contínua (PNAD Contínua)

get_pnadc(2022 , interview = 5 , design = FALSE ,
-          labels = FALSE)
-
## Deflator year was not provided, so deflator year was set to 2022.
-
-names(pnadc_df) <- tolower(names(pnadc_df))
+          labels = FALSE)
+
+names(pnadc_df) <- tolower(names(pnadc_df))

Recode a number of variables: construct a unique household identifier, deflate labor and other-source income, sum each within the household (excluding members with vd2002 codes 17 to 19), and divide by the number of household members (vd2003) to obtain deflated per capita income:

-
pnadc_df <-
-  transform(
-    pnadc_df ,
-    
-    household_id = paste0(upa , v1008 , v1014) ,
-    
-    deflated_labor_income = vd4019 * co2 ,
-    
-    deflated_other_source_income = vd4048 * co2e
-  )
-
-labor_income_sum_df <-
-  aggregate(
-    cbind(household_deflated_labor_income = deflated_labor_income) ~ household_id ,
-    data = pnadc_df[!(pnadc_df[, 'vd2002'] %in% 17:19) ,] ,
-    sum ,
-    na.rm = TRUE
-  )
-
-other_income_sum_df <-
-  aggregate(
-    cbind(household_deflated_other_source_income = deflated_other_source_income) ~ household_id ,
-    data = pnadc_df[!(pnadc_df[, 'vd2002'] %in% 17:19) ,] ,
-    sum ,
-    na.rm = TRUE
-  )
-
-before_nrow <- nrow(pnadc_df)
-pnadc_df <- merge(pnadc_df , labor_income_sum_df , all.x = TRUE)
-pnadc_df <- merge(pnadc_df , other_income_sum_df , all.x = TRUE)
-stopifnot(nrow(pnadc_df) == before_nrow)
-
-pnadc_df[is.na(pnadc_df[, 'household_deflated_labor_income']) , 'household_deflated_labor_income'] <-
-  0
-pnadc_df[is.na(pnadc_df[, 'household_deflated_other_source_income']) , 'household_deflated_other_source_income'] <-
-  0
-
-
-pnadc_df <-
-  transform(
-    pnadc_df ,
-    
-    deflated_per_capita_income =
-      (
-        household_deflated_labor_income + household_deflated_other_source_income
-      ) / vd2003
-    
-  )
+
pnadc_df <-
+  transform(
+    pnadc_df ,
+    
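+    # unique household identifier built from upa , v1008 , and v1014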
+    household_id = paste0(upa , v1008 , v1014) ,
+    
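+    # deflate nominal incomes using the deflator columns co2 and co2e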
+    deflated_labor_income = vd4019 * co2 ,
+    
+    deflated_other_source_income = vd4048 * co2e
+  )
+
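+# household-level income sums, excluding members with vd2002 codes 17 to 19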
+labor_income_sum_df <-
+  aggregate(
+    cbind(household_deflated_labor_income = deflated_labor_income) ~ household_id ,
+    data = pnadc_df[!(pnadc_df[, 'vd2002'] %in% 17:19) ,] ,
+    sum ,
+    na.rm = TRUE
+  )
+
+other_income_sum_df <-
+  aggregate(
+    cbind(household_deflated_other_source_income = deflated_other_source_income) ~ household_id ,
+    data = pnadc_df[!(pnadc_df[, 'vd2002'] %in% 17:19) ,] ,
+    sum ,
+    na.rm = TRUE
+  )
+
+before_nrow <- nrow(pnadc_df)
+pnadc_df <- merge(pnadc_df , labor_income_sum_df , all.x = TRUE)
+pnadc_df <- merge(pnadc_df , other_income_sum_df , all.x = TRUE)
+stopifnot(nrow(pnadc_df) == before_nrow)
+
+pnadc_df[is.na(pnadc_df[, 'household_deflated_labor_income']) , 'household_deflated_labor_income'] <-
+  0
+pnadc_df[is.na(pnadc_df[, 'household_deflated_other_source_income']) , 'household_deflated_other_source_income'] <-
+  0
+
+
+pnadc_df <-
+  transform(
+    pnadc_df ,
+    
+    deflated_per_capita_income =
+      (
+        household_deflated_labor_income + household_deflated_other_source_income
+      ) / vd2003
+    
+  )

Construct a complex sample survey design:

-
library(survey)
-
-pnadc_design <-
-  svrepdesign(
-    data = pnadc_df ,
-    weight = ~ v1032,
-    type = "bootstrap" ,
-    repweights = "v1032[0-9]{3}" ,
-    mse = TRUE
-  )
+
library(survey)
+
+pnadc_design <-
+  svrepdesign(
+    data = pnadc_df ,
+    weight = ~ v1032,
+    type = "bootstrap" ,
+    repweights = "v1032[0-9]{3}" ,
+    mse = TRUE
+  )

Run the convey_prep() function on the full design:

-
pnadc_design <- convey_prep(pnadc_design)
+
pnadc_design <- convey_prep(pnadc_design)

1.5.1 Per Capita Income

Calculate the gini coefficient with per capita income:

-
# https://sidra.ibge.gov.br/tabela/7435#/n1/all/v/all/p/last%201/d/v10681%203,v10682%201/l/v,p,t/resultado
-
-(
-  pnadc_per_capita_gini <-
-    svygini( ~ deflated_per_capita_income , pnadc_design , na.rm = TRUE)
-)
+
# https://sidra.ibge.gov.br/tabela/7435#/n1/all/v/all/p/last%201/d/v10681%203,v10682%201/l/v,p,t/resultado
+
+(
+  pnadc_per_capita_gini <-
+    svygini( ~ deflated_per_capita_income , pnadc_design , na.rm = TRUE)
+)
##                               gini     SE
 ## deflated_per_capita_income 0.51845 0.0032
-
# match 2022 per_capita gini coefficient
-stopifnot(round(coef(pnadc_per_capita_gini) , 3) == 0.518)
-
-# match 2022 per_capita gini coefficient of variation
-stopifnot(round(cv(pnadc_per_capita_gini), 3) == 0.006) 
+
# match 2022 per_capita gini coefficient
+stopifnot(round(coef(pnadc_per_capita_gini) , 3) == 0.518)
+
+# match 2022 per_capita gini coefficient of variation
+stopifnot(round(cv(pnadc_per_capita_gini), 3) == 0.006) 

1.5.2 Worker Earnings

Calculate the gini coefficient with total earnings:

-
# https://sidra.ibge.gov.br/tabela/7453#/n1/all/v/all/p/last%201/d/v10806%203,v10807%201/l/v,p,t/resultado
-
-(pnadc_earnings_gini <-
-   svygini( ~ deflated_labor_income , pnadc_design , na.rm = TRUE))
+
# https://sidra.ibge.gov.br/tabela/7453#/n1/all/v/all/p/last%201/d/v10806%203,v10807%201/l/v,p,t/resultado
+
+(pnadc_earnings_gini <-
+   svygini( ~ deflated_labor_income , pnadc_design , na.rm = TRUE))
##                          gini     SE
 ## deflated_labor_income 0.48606 0.0036
-
# match 2022 earnings gini coefficient
-stopifnot(round(coef(pnadc_earnings_gini) , 3) == 0.486)
-
-# match 2022 earnings gini coefficient of variation
-stopifnot(round(cv(pnadc_earnings_gini), 3) == 0.007) 
+
# match 2022 earnings gini coefficient
+stopifnot(round(coef(pnadc_earnings_gini) , 3) == 0.486)
+
+# match 2022 earnings gini coefficient of variation
+stopifnot(round(cv(pnadc_earnings_gini), 3) == 0.007) 

diff --git a/docs/1.6-survey-of-consumer-finances-scf.html b/docs/1.6-survey-of-consumer-finances-scf.html index 88cc607..55986c2 100644 --- a/docs/1.6-survey-of-consumer-finances-scf.html +++ b/docs/1.6-survey-of-consumer-finances-scf.html @@ -268,299 +268,285 @@

1.6 Survey of Consumer Finances (SCF)

The SCF studies net worth across the United States by asking respondents about both active and passive income, mortgages, pensions, credit card debt, even car leases. Administered by the Board of Governors of the Federal Reserve System triennially since 1989, this complex sample survey generalizes to the civilian non-institutional population and comprehensively assesses household wealth.

This section downloads, imports, and prepares the most current microdata for analysis, then reproduces some statistics and margin of error terms from the Federal Reserve.

This survey uses a multiply-imputed variance estimation technique described in the 2004 Codebook. Most users do not need to study this function carefully. Define a function specific to only this dataset:
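For the curious: after the modifications marked in the code below, the combined variance reduces to the first implicate's sampling variance \(V_1\) plus an inflated between-implicate variance \(B\) of the point estimates:

\[ \hat{V} = V_1 + \frac{m+1}{m} B \]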

-
scf_MIcombine <-
-  function (results,
-            variances,
-            call = sys.call(),
-            df.complete = Inf,
-            ...) {
-    m <- length(results)
-    oldcall <- attr(results, "call")
-    if (missing(variances)) {
-      variances <- suppressWarnings(lapply(results, vcov))
-      results <- lapply(results, coef)
-    }
-    vbar <- variances[[1]]
-    cbar <- results[[1]]
-    for (i in 2:m) {
-      cbar <- cbar + results[[i]]
-      # MODIFICATION:
-      # vbar <- vbar + variances[[i]]
-    }
-    cbar <- cbar / m
-    # MODIFICATION:
-    # vbar <- vbar/m
-    evar <- var(do.call("rbind", results))
-    r <- (1 + 1 / m) * evar / vbar
-    df <- (m - 1) * (1 + 1 / r) ^ 2
-    if (is.matrix(df))
-      df <- diag(df)
-    if (is.finite(df.complete)) {
-      dfobs <- ((df.complete + 1) / (df.complete + 3)) * df.complete *
-        vbar / (vbar + evar)
-      if (is.matrix(dfobs))
-        dfobs <- diag(dfobs)
-      df <- 1 / (1 / dfobs + 1 / df)
-    }
-    if (is.matrix(r))
-      r <- diag(r)
-    rval <- list(
-      coefficients = cbar,
-      variance = vbar + evar *
-        (m + 1) / m,
-      call = c(oldcall, call),
-      nimp = m,
-      df = df,
-      missinfo = (r + 2 / (df + 3)) / (r + 1)
-    )
-    class(rval) <- "MIresult"
-    rval
-  }
+
scf_MIcombine <-
+  function (results,
+            variances,
+            call = sys.call(),
+            df.complete = Inf,
+            ...) {
+    m <- length(results)
+    oldcall <- attr(results, "call")
+    if (missing(variances)) {
+      variances <- suppressWarnings(lapply(results, vcov))
+      results <- lapply(results, coef)
+    }
+    vbar <- variances[[1]]
+    cbar <- results[[1]]
+    for (i in 2:m) {
+      cbar <- cbar + results[[i]]
+      # MODIFICATION:
+      # vbar <- vbar + variances[[i]]
+    }
+    cbar <- cbar / m
+    # MODIFICATION:
+    # vbar <- vbar/m
+    evar <- var(do.call("rbind", results))
+    r <- (1 + 1 / m) * evar / vbar
+    df <- (m - 1) * (1 + 1 / r) ^ 2
+    if (is.matrix(df))
+      df <- diag(df)
+    if (is.finite(df.complete)) {
+      dfobs <- ((df.complete + 1) / (df.complete + 3)) * df.complete *
+        vbar / (vbar + evar)
+      if (is.matrix(dfobs))
+        dfobs <- diag(dfobs)
+      df <- 1 / (1 / dfobs + 1 / df)
+    }
+    if (is.matrix(r))
+      r <- diag(r)
+    rval <- list(
+      coefficients = cbar,
+      variance = vbar + evar *
+        (m + 1) / m,
+      call = c(oldcall, call),
+      nimp = m,
+      df = df,
+      missinfo = (r + 2 / (df + 3)) / (r + 1)
+    )
+    class(rval) <- "MIresult"
+    rval
+  }

Define a function to download and import each Stata file:

-
library(haven)
-
-scf_dta_import <-
-  function(this_url) {
-    this_tf <- tempfile()
-    
-    download.file(this_url , this_tf , mode = 'wb')
-    
-    this_tbl <- read_dta(this_tf)
-    
-    this_df <- data.frame(this_tbl)
-    
-    file.remove(this_tf)
-    
-    names(this_df) <- tolower(names(this_df))
-    
-    this_df
-  }
+
library(haven)
+
+scf_dta_import <-
+  function(this_url) {
+    this_tf <- tempfile()
+    
+    download.file(this_url , this_tf , mode = 'wb')
+    
+    this_tbl <- read_dta(this_tf)
+    
+    this_df <- data.frame(this_tbl)
+    
+    file.remove(this_tf)
+    
+    names(this_df) <- tolower(names(this_df))
+    
+    this_df
+  }

Download and import the full, summary extract, and replicate weights tables:

-
scf_df <-
-  scf_dta_import("https://www.federalreserve.gov/econres/files/scf2022s.zip")
-
-ext_df <-
-  scf_dta_import("https://www.federalreserve.gov/econres/files/scfp2022s.zip")
-
-scf_rw_df <-
-  scf_dta_import("https://www.federalreserve.gov/econres/files/scf2022rw1s.zip")
+
scf_df <-
+  scf_dta_import("https://www.federalreserve.gov/econres/files/scf2022s.zip")
+
+ext_df <-
+  scf_dta_import("https://www.federalreserve.gov/econres/files/scfp2022s.zip")
+
+scf_rw_df <-
+  scf_dta_import("https://www.federalreserve.gov/econres/files/scf2022rw1s.zip")

Confirm both the full public data and the summary extract contain five records per family:

-
stopifnot(nrow(scf_df) == nrow(scf_rw_df) * 5)
-stopifnot(nrow(scf_df) == nrow(ext_df))
+
stopifnot(nrow(scf_df) == nrow(scf_rw_df) * 5)
+stopifnot(nrow(scf_df) == nrow(ext_df))

Confirm only the primary economic unit and the five implicate identifiers overlap:

-
stopifnot(all(sort(intersect(
-  names(scf_df) , names(ext_df)
-)) == c('y1' , 'yy1')))
-stopifnot(all(sort(intersect(
-  names(scf_df) , names(scf_rw_df)
-)) == c('y1' , 'yy1')))
-stopifnot(all(sort(intersect(
-  names(ext_df) , names(scf_rw_df)
-)) == c('y1' , 'yy1')))
+
stopifnot(all(sort(intersect(
+  names(scf_df) , names(ext_df)
+)) == c('y1' , 'yy1')))
+stopifnot(all(sort(intersect(
+  names(scf_df) , names(scf_rw_df)
+)) == c('y1' , 'yy1')))
+stopifnot(all(sort(intersect(
+  names(ext_df) , names(scf_rw_df)
+)) == c('y1' , 'yy1')))

Remove the implicate identifier from the replicate weights table, add a column of fives for weighting:

-
scf_rw_df[, 'y1'] <- NULL
-
-scf_df[, 'five'] <- 5
+
scf_rw_df[, 'y1'] <- NULL
+
+scf_df[, 'five'] <- 5

Construct a multiply-imputed, complex sample survey design:

Break the main table into five different implicates based on the final character of the column y1:

-
library(stringr)
-
-s1_df <- scf_df[str_sub(scf_df[, 'y1'] ,-1 ,-1) == 1 ,]
-s2_df <- scf_df[str_sub(scf_df[, 'y1'] ,-1 ,-1) == 2 ,]
-s3_df <- scf_df[str_sub(scf_df[, 'y1'] ,-1 ,-1) == 3 ,]
-s4_df <- scf_df[str_sub(scf_df[, 'y1'] ,-1 ,-1) == 4 ,]
-s5_df <- scf_df[str_sub(scf_df[, 'y1'] ,-1 ,-1) == 5 ,]
+
library(stringr)
+
+s1_df <- scf_df[str_sub(scf_df[, 'y1'] ,-1 ,-1) == 1 ,]
+s2_df <- scf_df[str_sub(scf_df[, 'y1'] ,-1 ,-1) == 2 ,]
+s3_df <- scf_df[str_sub(scf_df[, 'y1'] ,-1 ,-1) == 3 ,]
+s4_df <- scf_df[str_sub(scf_df[, 'y1'] ,-1 ,-1) == 4 ,]
+s5_df <- scf_df[str_sub(scf_df[, 'y1'] ,-1 ,-1) == 5 ,]

Combine these into a single list, then merge each implicate with the summary extract:

-
scf_imp <- list(s1_df , s2_df , s3_df , s4_df , s5_df)
-
-scf_list <- lapply(scf_imp , merge , ext_df)
+
scf_imp <- list(s1_df , s2_df , s3_df , s4_df , s5_df)
+
+scf_list <- lapply(scf_imp , merge , ext_df)

Replace all missing values in the replicate weights table with zeroes, multiply the replicate weights by the multiplication factor, then only keep the unique identifier and the final (combined) replicate weights:

-
scf_rw_df[is.na(scf_rw_df)] <- 0
-
-scf_rw_df[, paste0('wgt' , 1:999)] <-
-  scf_rw_df[, paste0('wt1b' , 1:999)] * scf_rw_df[, paste0('mm' , 1:999)]
-
-scf_rw_df <- scf_rw_df[, c('yy1' , paste0('wgt' , 1:999))]
+
scf_rw_df[is.na(scf_rw_df)] <- 0
+
+scf_rw_df[, paste0('wgt' , 1:999)] <-
+  scf_rw_df[, paste0('wt1b' , 1:999)] * scf_rw_df[, paste0('mm' , 1:999)]
+
+scf_rw_df <- scf_rw_df[, c('yy1' , paste0('wgt' , 1:999))]

Sort both the five implicates and the replicate weights table by the unique identifier:

-
scf_list <-
-  lapply(scf_list , function(w)
-    w[order(w[, 'yy1']) ,])
-
-scf_rw_df <- scf_rw_df[order(scf_rw_df[, 'yy1']) ,]
+
scf_list <-
+  lapply(scf_list , function(w)
+    w[order(w[, 'yy1']) ,])
+
+scf_rw_df <- scf_rw_df[order(scf_rw_df[, 'yy1']) ,]

Define the design:

-
library(survey)
-library(mitools)
-
-scf_design <-
-  svrepdesign(
-    weights = ~ wgt ,
-    repweights = scf_rw_df[,-1] ,
-    data = imputationList(scf_list) ,
-    scale = 1 ,
-    rscales = rep(1 / 998 , 999) ,
-    mse = FALSE ,
-    type = "other" ,
-    combined.weights = TRUE
-  )
+
library(survey)
+library(mitools)
+
+scf_design <-
+  svrepdesign(
+    weights = ~ wgt ,
+    repweights = scf_rw_df[,-1] ,
+    data = imputationList(scf_list) ,
+    scale = 1 ,
+    rscales = rep(1 / 998 , 999) ,
+    mse = FALSE ,
+    type = "other" ,
+    combined.weights = TRUE
+  )

Run the convey_prep() function on the full design:

-
scf_design$designs <- lapply(scf_design$designs , convey_prep)
+
scf_design$designs <- lapply(scf_design$designs , convey_prep)

This example matches the “Table 4” tab’s cell Y6 of the Excel Based on Public Data:

-
mean_net_worth <-
-  scf_MIcombine(with(scf_design , svymean(~ networth)))
-
-stopifnot(round(coef(mean_net_worth) / 1000 , 1) == 1059.5)
+
mean_net_worth <-
+  scf_MIcombine(with(scf_design , svymean(~ networth)))
+
+stopifnot(round(coef(mean_net_worth) / 1000 , 1) == 1059.5)

This example comes within $500 of the standard error of mean net worth from Table 2 of the Federal Reserve Bulletin, displaying the minor differences between the Internal Data and Public Data:

-
stopifnot(abs(23.2 - round(SE(mean_net_worth) / 1000 , 1)) < 0.5)
+
stopifnot(abs(23.2 - round(SE(mean_net_worth) / 1000 , 1)) < 0.5)

This example matches the “Table 4” tab’s cell X6 of the Excel Based on Public Data:

-
# compute quantile with all five implicates stacked (not the recommended technique)
-fake_design <-
-  svydesign(~ 1 , data = ext_df[c('networth' , 'wgt')] , weights = ~ wgt)
-
-median_net_worth_incorrect_errors <-
-  svyquantile(~ networth , fake_design , 0.5)
-
-stopifnot(round(coef(median_net_worth_incorrect_errors) / 1000 , 2) == 192.7)
+
# compute quantile with all five implicates stacked (not the recommended technique)
+fake_design <-
+  svydesign(~ 1 , data = ext_df[c('networth' , 'wgt')] , weights = ~ wgt)
+
+median_net_worth_incorrect_errors <-
+  svyquantile(~ networth , fake_design , 0.5)
+
+stopifnot(round(coef(median_net_worth_incorrect_errors) / 1000 , 2) == 192.7)

1.6.1 Analysis Examples with the survey library

Add new columns to the data set:

-
scf_design <-
-  update(
-    scf_design ,
-    
-    hhsex = factor(
-      hhsex ,
-      levels = 1:2 ,
-      labels = c("male" , "female")
-    ) ,
-    
-    married = as.numeric(married == 1) ,
-    
-    edcl =
-      factor(
-        edcl ,
-        levels = 1:4 ,
-        labels =
-          c(
-            "less than high school" ,
-            "high school or GED" ,
-            "some college" ,
-            "college degree"
-          )
-      )
-    
-  )
+
scf_design <-
+  update(
+    scf_design ,
+    
+    hhsex = factor(
+      hhsex ,
+      levels = 1:2 ,
+      labels = c("male" , "female")
+    ) ,
+    
+    married = as.numeric(married == 1) ,
+    
+    edcl =
+      factor(
+        edcl ,
+        levels = 1:4 ,
+        labels =
+          c(
+            "less than high school" ,
+            "high school or GED" ,
+            "some college" ,
+            "college degree"
+          )
+      )
+    
+  )

Count the unweighted number of records in the survey sample, overall and by groups:

-
scf_MIcombine(with(scf_design , svyby(~ five , ~ five , unwtd.count)))
-
-scf_MIcombine(with(scf_design , svyby(~ five , ~ hhsex , unwtd.count)))
+
scf_MIcombine(with(scf_design , svyby(~ five , ~ five , unwtd.count)))
+
+scf_MIcombine(with(scf_design , svyby(~ five , ~ hhsex , unwtd.count)))

Count the weighted size of the generalizable population, overall and by groups:

-
scf_MIcombine(with(scf_design , svytotal(~ five)))
+
scf_MIcombine(with(scf_design , svytotal(~ five)))
+
+scf_MIcombine(with(scf_design ,
+                   svyby(~ five , ~ hhsex , svytotal)))
+

Calculate the mean (average) of a linear variable, overall and by groups:

+
scf_MIcombine(with(scf_design , svymean(~ networth)))
+
+scf_MIcombine(with(scf_design ,
+                   svyby(~ networth , ~ hhsex , svymean)))
+

Calculate the distribution of a categorical variable, overall and by groups:

+
scf_MIcombine(with(scf_design , svymean(~ edcl)))
 
 scf_MIcombine(with(scf_design ,
-                   svyby(~ five , ~ hhsex , svytotal)))
-

Calculate the mean (average) of a linear variable, overall and by groups:

-
scf_MIcombine(with(scf_design , svymean(~ networth)))
+                   svyby(~ edcl , ~ hhsex , svymean)))
+

Calculate the sum of a linear variable, overall and by groups:

+
scf_MIcombine(with(scf_design , svytotal(~ networth)))
 
 scf_MIcombine(with(scf_design ,
-                   svyby(~ networth , ~ hhsex , svymean)))
-

Calculate the distribution of a categorical variable, overall and by groups:

-
scf_MIcombine(with(scf_design , svymean(~ edcl)))
+                   svyby(~ networth , ~ hhsex , svytotal)))
+

Calculate the weighted sum of a categorical variable, overall and by groups:

+
scf_MIcombine(with(scf_design , svytotal(~ edcl)))
 
 scf_MIcombine(with(scf_design ,
-                   svyby(~ edcl , ~ hhsex , svymean)))
-

Calculate the sum of a linear variable, overall and by groups:

-
scf_MIcombine(with(scf_design , svytotal(~ networth)))
-
-scf_MIcombine(with(scf_design ,
-                   svyby(~ networth , ~ hhsex , svytotal)))
-

Calculate the weighted sum of a categorical variable, overall and by groups:

-
scf_MIcombine(with(scf_design , svytotal(~ edcl)))
-
-scf_MIcombine(with(scf_design ,
-                   svyby(~ edcl , ~ hhsex , svytotal)))
+ svyby(~ edcl , ~ hhsex , svytotal)))

Calculate the median (50th percentile) of a linear variable, overall and by groups:

-
scf_MIcombine(with(
-  scf_design ,
-  svyquantile(~ networth ,
-              0.5 , se = TRUE , interval.type = 'quantile')
-))
-
## Not all replicate weight designs give valid standard errors for quantiles.
-## Not all replicate weight designs give valid standard errors for quantiles.
-## Not all replicate weight designs give valid standard errors for quantiles.
-## Not all replicate weight designs give valid standard errors for quantiles.
-## Not all replicate weight designs give valid standard errors for quantiles.
-
scf_MIcombine(with(
-  scf_design ,
-  svyby(
-    ~ networth ,
-    ~ hhsex ,
-    svyquantile ,
-    0.5 ,
-    se = TRUE ,
-    interval.type = 'quantile' ,
-    ci = TRUE
-  )
-))
-
## Not all replicate weight designs give valid standard errors for quantiles.
-## Not all replicate weight designs give valid standard errors for quantiles.
-## Not all replicate weight designs give valid standard errors for quantiles.
-## Not all replicate weight designs give valid standard errors for quantiles.
-## Not all replicate weight designs give valid standard errors for quantiles.
-## Not all replicate weight designs give valid standard errors for quantiles.
-## Not all replicate weight designs give valid standard errors for quantiles.
-## Not all replicate weight designs give valid standard errors for quantiles.
-## Not all replicate weight designs give valid standard errors for quantiles.
-## Not all replicate weight designs give valid standard errors for quantiles.
+
scf_MIcombine(with(
+  scf_design ,
+  svyquantile(~ networth ,
+              0.5 , se = TRUE , interval.type = 'quantile')
+))
+
+scf_MIcombine(with(
+  scf_design ,
+  svyby(
+    ~ networth ,
+    ~ hhsex ,
+    svyquantile ,
+    0.5 ,
+    se = TRUE ,
+    interval.type = 'quantile' ,
+    ci = TRUE
+  )
+))

Estimate a ratio:

-
scf_MIcombine(with(
-  scf_design ,
-  svyratio(numerator = ~ income , denominator = ~ networth)
-))
+
scf_MIcombine(with(
+  scf_design ,
+  svyratio(numerator = ~ income , denominator = ~ networth)
+))

Restrict the survey design to labor force participants:

-
sub_scf_design <- subset(scf_design , lf == 1)
+
sub_scf_design <- subset(scf_design , lf == 1)

Calculate the mean (average) of this subset:

-
scf_MIcombine(with(sub_scf_design , svymean(~ networth)))
+
scf_MIcombine(with(sub_scf_design , svymean(~ networth)))

Extract the coefficient, standard error, confidence interval, and coefficient of variation from any descriptive statistics function result, overall and by groups:

-
this_result <-
-  scf_MIcombine(with(scf_design ,
-                     svymean(~ networth)))
-
-coef(this_result)
-SE(this_result)
-confint(this_result)
-cv(this_result)
-
-grouped_result <-
-  scf_MIcombine(with(scf_design ,
-                     svyby(~ networth , ~ hhsex , svymean)))
-
-coef(grouped_result)
-SE(grouped_result)
-confint(grouped_result)
-cv(grouped_result)
+
this_result <-
+  scf_MIcombine(with(scf_design ,
+                     svymean(~ networth)))
+
+coef(this_result)
+SE(this_result)
+confint(this_result)
+cv(this_result)
+
+grouped_result <-
+  scf_MIcombine(with(scf_design ,
+                     svyby(~ networth , ~ hhsex , svymean)))
+
+coef(grouped_result)
+SE(grouped_result)
+confint(grouped_result)
+cv(grouped_result)

Calculate the degrees of freedom of any survey design object:

-
degf(scf_design$designs[[1]])
+
degf(scf_design$designs[[1]])

Calculate the complex sample survey-adjusted variance of any statistic:

-
scf_MIcombine(with(scf_design , svyvar(~ networth)))
+
scf_MIcombine(with(scf_design , svyvar(~ networth)))

Include the complex sample design effect in the result for a specific statistic:

-
# SRS without replacement
-scf_MIcombine(with(scf_design ,
-                   svymean(~ networth , deff = TRUE)))
-
-# SRS with replacement
-scf_MIcombine(with(scf_design ,
-                   svymean(~ networth , deff = "replace")))
+
# SRS without replacement
+scf_MIcombine(with(scf_design ,
+                   svymean(~ networth , deff = TRUE)))
+
+# SRS with replacement
+scf_MIcombine(with(scf_design ,
+                   svymean(~ networth , deff = "replace")))

Perform a survey-weighted generalized linear model:

-
glm_result <-
-  scf_MIcombine(with(scf_design ,
-                     svyglm(networth ~ married + edcl)))
-
-summary(glm_result)
+
glm_result <-
+  scf_MIcombine(with(scf_design ,
+                     svyglm(networth ~ married + edcl)))
+
+summary(glm_result)

1.6.2 Family Net Worth

Calculate the gini coefficient with family net worth:

-
scf_MIcombine(with(scf_design , svygini(~ networth)))
+
scf_MIcombine(with(scf_design , svygini(~ networth)))
## Multiple imputation results:
 ##       m <- length(results)
 ##       scf_MIcombine(with(scf_design, svygini(~networth)))
@@ -570,7 +556,7 @@ 

1.6.2 Family Net Worth

1.6.3 Family Income

Calculate the gini coefficient with income:

-
scf_MIcombine(with(scf_design , svygini(~ income)))
+
scf_MIcombine(with(scf_design , svygini(~ income)))
## Multiple imputation results:
 ##       m <- length(results)
 ##       scf_MIcombine(with(scf_design, svygini(~income)))
diff --git a/docs/2.2-replication-based-variance-estimation.html b/docs/2.2-replication-based-variance-estimation.html
index af6857e..c5ab6fb 100644
--- a/docs/2.2-replication-based-variance-estimation.html
+++ b/docs/2.2-replication-based-variance-estimation.html
@@ -270,34 +270,34 @@ 

2.2 Replication-Based Variance Estimation

The function bootVar from the laeken library (Alfons and Templ 2013Alfons, Andreas, and Matthias Templ. 2013. “Estimation of Social Exclusion Indicators from Complex Surveys: The R Package laeken.” Journal of Statistical Software 54 (15): 1–25. http://www.jstatsoft.org/v54/i15/.) also uses the bootstrap method to estimate variances.
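For comparison, a minimal bootVar sketch might look like the following. This is an illustration under assumptions, not code from this book: it uses laeken's original column names (eqIncome, rb050, db040) and its naive bootstrap with 50 replicates.

library(laeken)

# load the synthetic EU-SILC data with its original column names
data(eusilc)

# point estimate of the gini index
g <- gini("eqIncome" , weights = "rb050" , data = eusilc)

# naive bootstrap standard error, stratified by db040
bootVar(
  "eqIncome" ,
  weights = "rb050" ,
  design = "db040" ,
  data = eusilc ,
  indicator = g ,
  R = 50 ,
  bootType = "naive" ,
  seed = 123
)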

2.2.1 Replication Design Example

-
# load libraries
-library( survey )
-library( convey )
-library( laeken ) # for the dataset
-
-# get laeken eusilc data
-data( eusilc ) ; names( eusilc ) <- tolower( names( eusilc ) )
-
-# survey design object for TSL/influence function variance estimation
-des_eusilc <- svydesign( ids = ~rb030 , strata = ~db040 ,  weights = ~rb050 , data = eusilc )
-des_eusilc <- convey_prep( des_eusilc )
-
-# influence function SE estimate for the gini index
-convey:::svygini.survey.design( ~eqincome , design = des_eusilc )
+
# load libraries
+library( survey )
+library( convey )
+library( laeken ) # for the dataset
+
+# get laeken eusilc data
+data( eusilc ) ; names( eusilc ) <- tolower( names( eusilc ) )
+
+# survey design object for TSL/influence function variance estimation
+des_eusilc <- svydesign( ids = ~rb030 , strata = ~db040 ,  weights = ~rb050 , data = eusilc )
+des_eusilc <- convey_prep( des_eusilc )
+
+# influence function SE estimate for the gini index
+convey:::svygini.survey.design( ~eqincome , design = des_eusilc )
##             gini     SE
 ## eqincome 0.26497 0.0019
-
# create survey design object for replicate-based variance estimation
-des_eusilc_rep <- as.svrepdesign( des_eusilc , type = "bootstrap" )
-des_eusilc_rep <- convey_prep( des_eusilc_rep )
-
-# replicate-based (bootstrap) SE estimate for the gini index
-# with option to keep replicates
-( gini.repstat <- convey:::svygini.svyrep.design( ~eqincome , design = des_eusilc_rep , return.replicates = TRUE ) )
+
# create survey design object for replicate-based variance estimation
+des_eusilc_rep <- as.svrepdesign( des_eusilc , type = "bootstrap" )
+des_eusilc_rep <- convey_prep( des_eusilc_rep )
+
+# replicate-based (bootstrap) SE estimate for the gini index
+# with option to keep replicates
+( gini.repstat <- convey:::svygini.svyrep.design( ~eqincome , design = des_eusilc_rep , return.replicates = TRUE ) )
##             gini     SE
 ## eqincome 0.26497 0.0022

To understand how that variance is estimated, we can look at the replicates:

-
# collect gini bootstrap replicates
-( gini.reps <- gini.repstat$replicates )
+
# collect gini bootstrap replicates
+( gini.reps <- gini.repstat$replicates )
##  [1] 0.2628816 0.2653672 0.2663951 0.2646311 0.2660021 0.2648396 0.2633737
 ##  [8] 0.2611656 0.2613233 0.2647174 0.2638256 0.2621616 0.2646830 0.2587950
 ## [15] 0.2642947 0.2651559 0.2663231 0.2673018 0.2687169 0.2671058 0.2654078
@@ -314,16 +314,16 @@ 

2.2.1 Replication Design Example

## attr(,"mse")
## [1] FALSE

These are resampling (bootstrap) replicates. Looking at the variance of these replicates gives an estimate of the gini estimator’s variance:

-
# variance estimate
-des.scale <- des_eusilc_rep$scale
-meantheta <- mean( gini.reps )[[1]]
-v <- sum( ( gini.reps - meantheta )^2 ) * des.scale
-
-# SE estimate
-( gini.se <- ( sqrt( v ) ) )
+
# variance estimate
+des.scale <- des_eusilc_rep$scale
+meantheta <- mean( gini.reps )[[1]]
+v <- sum( ( gini.reps - meantheta )^2 ) * des.scale
+
+# SE estimate
+( gini.se <- ( sqrt( v ) ) )
## [1] 0.002226009
-
# compare estimates
-identical( gini.se , SE( gini.repstat )[[1]] )
+
# compare estimates
+identical( gini.se , SE( gini.repstat )[[1]] )
## [1] TRUE
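In symbols, with replicates \(\hat{\theta}_r\), their mean \(\bar{\theta}\), and the design’s scale constant \(c\), the standard error computed above is:

\[ \hat{SE}(\hat{\theta}) = \sqrt{ c \sum_{r=1}^{R} \left( \hat{\theta}_r - \bar{\theta} \right)^2 } \]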

diff --git a/docs/3.1-at-risk-of-poverty-threshold-svyarpt.html b/docs/3.1-at-risk-of-poverty-threshold-svyarpt.html index 4b5ebee..44d4f4c 100644 --- a/docs/3.1-at-risk-of-poverty-threshold-svyarpt.html +++ b/docs/3.1-at-risk-of-poverty-threshold-svyarpt.html @@ -265,13 +265,13 @@

3.1 At Risk of Poverty Threshold (svyarpt)

-
✔️ commonly used by statistical agencies in the european union working group on statistics on income & living conditions (eurostat)
-✔️ not tied to the inflation rate nor to a basket of goods or consumable products
-✔️ generic calculation that can be broadly applied to different nations or regions
-✔️ easy to understand: defaults to 60% of median income
-❌ the 60% of median income used in ARPT might appear arbitrary for non-EU analyses
-❌ does not account for the intensity/severity of poverty
-❌ not really a poverty measure, but an estimated poverty threshold/poverty line
+
✔️ commonly used by statistical agencies in the european union working group on statistics on income & living conditions (eurostat)
+✔️ not tied to the inflation rate nor to a basket of goods or consumable products
+✔️ generic calculation that can be broadly applied to different nations or regions
+✔️ easy to understand: defaults to 60% of median income
+❌ the 60% of median income used in ARPT might appear arbitrary for non-EU analyses
+❌ does not account for the intensity/severity of poverty
+❌ not really a poverty measure, but an estimated poverty threshold/poverty line

The at-risk-of-poverty threshold (ARPT) is a measure used to identify people whose incomes imply a low standard of living in comparison to the general living standards. Even though people below the ARPT are not necessarily below the effective poverty line, they can be considered “almost deprived”.

This measure is defined as \(0.6\) times the median income for the entire population:

\[ arpt = 0.6 \times median\{ y_i \} \]

@@ -283,142 +283,142 @@

3.1 At Risk of Poverty Threshold

3.1.1 Replication Example

The R vardpoor package (Breidaks, Liberts, and Ivanova 2016Breidaks, Juris, Martins Liberts, and Santa Ivanova. 2016. “Vardpoor: Estimation of Indicators on Social Exclusion and Poverty and Its Linearization, Variance Estimation.” Riga, Latvia: CSB.), created by researchers at the Central Statistical Bureau of Latvia, includes an ARPT coefficient calculation using the ultimate cluster method. The example below reproduces those statistics.

Load and prepare the same data set:

-
# load the convey package
-library(convey)
-
-# load the survey library
-library(survey)
-
-# load the vardpoor library
-library(vardpoor)
-
-# load the laeken library
-library(laeken)
-
-# load the synthetic EU statistics on income & living conditions
-data(eusilc)
-
-# make all column names lowercase
-names(eusilc) <- tolower(names(eusilc))
-
-# add a column with the row number
-dati <- data.table::data.table(IDd = 1:nrow(eusilc), eusilc)
-
-# calculate the arpt coefficient
-# using the R vardpoor library
-varpoord_arpt_calculation <-
-  varpoord(
-    # analysis variable
-    Y = "eqincome",
-    
-    # weights variable
-    w_final = "rb050",
-    
-    # row number variable
-    ID_level1 = "IDd",
-    
-    # row number variable
-    ID_level2 = "IDd",
-    
-    # strata variable
-    H = "db040",
-    
-    N_h = NULL ,
-    
-    # clustering variable
-    PSU = "rb030",
-    
-    # data.table
-    dataset = dati,
-    
-    # arpt coefficient function
-    type = "linarpt",
-    
-    # get linearized variable
-    outp_lin = TRUE
-  )
-
-
-# construct a survey.design
-# using our recommended setup
-des_eusilc <-
-  svydesign(
-    ids = ~ rb030 ,
-    strata = ~ db040 ,
-    weights = ~ rb050 ,
-    data = eusilc
-  )
-
-# immediately run the convey_prep function on it
-des_eusilc <- convey_prep(des_eusilc)
-
-# coefficients do match
-varpoord_arpt_calculation$all_result$value
+
# load the convey package
+library(convey)
+
+# load the survey library
+library(survey)
+
+# load the vardpoor library
+library(vardpoor)
+
+# load the laeken library
+library(laeken)
+
+# load the synthetic EU statistics on income & living conditions
+data(eusilc)
+
+# make all column names lowercase
+names(eusilc) <- tolower(names(eusilc))
+
+# add a column with the row number
+dati <- data.table::data.table(IDd = 1:nrow(eusilc), eusilc)
+
+# calculate the arpt coefficient
+# using the R vardpoor library
+varpoord_arpt_calculation <-
+  varpoord(
+    # analysis variable
+    Y = "eqincome",
+    
+    # weights variable
+    w_final = "rb050",
+    
+    # row number variable
+    ID_level1 = "IDd",
+    
+    # row number variable
+    ID_level2 = "IDd",
+    
+    # strata variable
+    H = "db040",
+    
+    N_h = NULL ,
+    
+    # clustering variable
+    PSU = "rb030",
+    
+    # data.table
+    dataset = dati,
+    
+    # arpt coefficient function
+    type = "linarpt",
+    
+    # get linearized variable
+    outp_lin = TRUE
+  )
+
+
+# construct a survey.design
+# using our recommended setup
+des_eusilc <-
+  svydesign(
+    ids = ~ rb030 ,
+    strata = ~ db040 ,
+    weights = ~ rb050 ,
+    data = eusilc
+  )
+
+# immediately run the convey_prep function on it
+des_eusilc <- convey_prep(des_eusilc)
+
+# coefficients do match
+varpoord_arpt_calculation$all_result$value
## [1] 10859.24
-
coef(svyarpt( ~ eqincome , des_eusilc))
+
coef(svyarpt( ~ eqincome , des_eusilc))
## eqincome 
 ## 10859.24
-
# linearized variables do match
-# vardpoor
-lin_arpt_varpoord <- varpoord_arpt_calculation$lin_out$lin_arpt
-# convey
-lin_arpt_convey <- attr(svyarpt( ~ eqincome , des_eusilc), "lin")
-
-# check equality
-all.equal(lin_arpt_varpoord, lin_arpt_convey)
+
# linearized variables do match
+# vardpoor
+lin_arpt_varpoord <- varpoord_arpt_calculation$lin_out$lin_arpt
+# convey
+lin_arpt_convey <- attr(svyarpt( ~ eqincome , des_eusilc), "lin")
+
+# check equality
+all.equal(lin_arpt_varpoord, lin_arpt_convey)
## [1] TRUE
-
# variances do not match exactly
-attr(svyarpt( ~ eqincome , des_eusilc) , 'var')
+
# variances do not match exactly
+attr(svyarpt( ~ eqincome , des_eusilc) , 'var')
##          eqincome
 ## eqincome 2564.027
-
varpoord_arpt_calculation$all_result$var
+
varpoord_arpt_calculation$all_result$var
## [1] 2559.442
-
# standard errors do not match exactly
-varpoord_arpt_calculation$all_result$se
+
# standard errors do not match exactly
+varpoord_arpt_calculation$all_result$se
## [1] 50.59093
-
SE(svyarpt( ~ eqincome , des_eusilc))
+
SE(svyarpt( ~ eqincome , des_eusilc))
##          eqincome
 ## eqincome 50.63622

The variance estimate is computed by using the approximation defined in 2, while the linearized variable \(z\) is defined by 2.1. The functions convey::svyarpt and vardpoor::linarpt produce the same linearized variable \(z\).

However, the measures of uncertainty do not line up, because library(vardpoor) defaults to an ultimate cluster method that can be replicated with an alternative setup of the survey.design object.

-
# within each strata, sum up the weights
-cluster_sums <-
-  aggregate(eusilc$rb050 , list(eusilc$db040) , sum)
-
-# name the within-strata sums of weights the `cluster_sum`
-names(cluster_sums) <- c("db040" , "cluster_sum")
-
-# merge this column back onto the data.frame
-eusilc <- merge(eusilc , cluster_sums)
-
-# construct a survey.design
-# with the fpc using the cluster sum
-des_eusilc_ultimate_cluster <-
-  svydesign(
-    ids = ~ rb030 ,
-    strata = ~ db040 ,
-    weights = ~ rb050 ,
-    data = eusilc ,
-    fpc = ~ cluster_sum
-  )
-
-# again, immediately run the convey_prep function on the `survey.design`
-des_eusilc_ultimate_cluster <-
-  convey_prep(des_eusilc_ultimate_cluster)
-
-
-
-# matches
-stopifnot(all.equal(
-  attr(svyarpt( ~ eqincome , des_eusilc_ultimate_cluster) , 'var')[1] ,
-  varpoord_arpt_calculation$all_result$var
-))
-
-# matches
-stopifnot(all.equal(varpoord_arpt_calculation$all_result$se ,
-                    SE(
-                      svyarpt( ~ eqincome , des_eusilc_ultimate_cluster)
-                    )[1]))
+
# within each strata, sum up the weights
+cluster_sums <-
+  aggregate(eusilc$rb050 , list(eusilc$db040) , sum)
+
+# name the within-strata sums of weights the `cluster_sum`
+names(cluster_sums) <- c("db040" , "cluster_sum")
+
+# merge this column back onto the data.frame
+eusilc <- merge(eusilc , cluster_sums)
+
+# construct a survey.design
+# with the fpc using the cluster sum
+des_eusilc_ultimate_cluster <-
+  svydesign(
+    ids = ~ rb030 ,
+    strata = ~ db040 ,
+    weights = ~ rb050 ,
+    data = eusilc ,
+    fpc = ~ cluster_sum
+  )
+
+# again, immediately run the convey_prep function on the `survey.design`
+des_eusilc_ultimate_cluster <-
+  convey_prep(des_eusilc_ultimate_cluster)
+
+
+
+# matches
+stopifnot(all.equal(
+  attr(svyarpt( ~ eqincome , des_eusilc_ultimate_cluster) , 'var')[1] ,
+  varpoord_arpt_calculation$all_result$var
+))
+
+# matches
+stopifnot(all.equal(varpoord_arpt_calculation$all_result$se ,
+                    SE(
+                      svyarpt( ~ eqincome , des_eusilc_ultimate_cluster)
+                    )[1]))

For additional usage examples of svyarpt, type ?convey::svyarpt in the R console.

diff --git a/docs/3.2-at-risk-of-poverty-ratio-svyarpr.html b/docs/3.2-at-risk-of-poverty-ratio-svyarpr.html index 60cd017..73f6015 100644 --- a/docs/3.2-at-risk-of-poverty-ratio-svyarpr.html +++ b/docs/3.2-at-risk-of-poverty-ratio-svyarpr.html @@ -265,12 +265,12 @@

3.2 At Risk of Poverty Ratio (svyarpr)

-
✔️ EU standard like ARPT, easy to understand, interpret, and implement
-✔️ proportion of individuals below ARPT -- a "companion" function that uses svyarpt() internally
-✔️ measure is easy to understand
-❌ does not account for the intensity or inequality among the poor
-❌ not very common outside of the EU
-❌ just another name for `svyfgt( g = 0 , type_thresh = "relq" )`
+
✔️ EU standard like ARPT, easy to understand, interpret, and implement
+✔️ proportion of individuals below ARPT -- a "companion" function that uses svyarpt() internally
+✔️ measure is easy to understand
+❌ does not account for the intensity or inequality among the poor
+❌ not very common outside of the EU
+❌ just another name for `svyfgt( g = 0 , type_thresh = "relq" )`

The at-risk-of-poverty rate (ARPR) is the share of persons with an income below the at-risk-of-poverty threshold (ARPT). The logic behind this measure is that although most people below the ARPT cannot be considered “poor”, they are the ones most vulnerable to becoming poor in the event of a negative economic phenomenon like a recession.

The ARPR is a composite estimate, taking into account both the sampling error in the proportion itself and that in the ARPT estimate. The details of the linearization of the ARPR are discussed by Deville (1999Deville, Jean-Claude. 1999. “Variance Estimation for Complex Statistics and Estimators: Linearization and Residual Techniques.” Survey Methodology 25 (2): 193–203. http://www.statcan.gc.ca/pub/12-001-x/1999002/article/4882-eng.pdf.) and Osier (2009Osier, Guillaume. 2009. “Variance Estimation for Complex Indicators of Poverty and Inequality.” Journal of the European Survey Research Association 3 (3): 167–95. http://ojs.ub.uni-konstanz.de/srm/article/view/369.).
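As the checklist above notes, the ARPR is the FGT(0) headcount ratio evaluated at a relative threshold. A minimal equivalence sketch (assuming a convey_prep-ed design such as the des_eusilc object constructed below) is:

# the two point estimates should agree
coef(svyarpr( ~ eqincome , des_eusilc))
coef(svyfgt( ~ eqincome , des_eusilc , g = 0 , type_thresh = "relq"))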


@@ -278,270 +278,270 @@

3.2 At Risk of Poverty Ratio (svyarpr)

3.2.1 Replication Example

The R vardpoor package (Breidaks, Liberts, and Ivanova 2016Breidaks, Juris, Martins Liberts, and Santa Ivanova. 2016. “Vardpoor: Estimation of Indicators on Social Exclusion and Poverty and Its Linearization, Variance Estimation.” Riga, Latvia: CSB.), created by researchers at the Central Statistical Bureau of Latvia, includes an ARPR coefficient calculation using the ultimate cluster method. The example below reproduces those statistics.

Load and prepare the same data set:

-
# load the convey package
-library(convey)
-
-# load the survey library
-library(survey)
-
-# load the vardpoor library
-library(vardpoor)
-
-# load the laeken library
-library(laeken)
-
-# load the synthetic EU statistics on income & living conditions
-data(eusilc)
-
-# make all column names lowercase
-names(eusilc) <- tolower(names(eusilc))
-
-# add a column with the row number
-dati <- data.table::data.table(IDd = 1:nrow(eusilc), eusilc)
-
-# calculate the arpr coefficient
-# using the R vardpoor library
-varpoord_arpr_calculation <-
-  varpoord(
-    # analysis variable
-    Y = "eqincome",
-    
-    # weights variable
-    w_final = "rb050",
-    
-    # row number variable
-    ID_level1 = "IDd",
-    
-    # row number variable
-    ID_level2 = "IDd",
-    
-    # strata variable
-    H = "db040",
-    
-    N_h = NULL ,
-    
-    # clustering variable
-    PSU = "rb030",
-    
-    # data.table
-    dataset = dati,
-    
-    # arpr coefficient function
-    type = "linarpr",
-    
-    # get linearized variable
-    outp_lin = TRUE
-    
-  )
-
-
-# construct a survey.design
-# using our recommended setup
-des_eusilc <-
-  svydesign(
-    ids = ~ rb030 ,
-    strata = ~ db040 ,
-    weights = ~ rb050 ,
-    data = eusilc
-  )
-
-# immediately run the convey_prep function on it
-des_eusilc <- convey_prep(des_eusilc)
-
-# coefficients do match
-varpoord_arpr_calculation$all_result$value
+
# load the convey package
+library(convey)
+
+# load the survey library
+library(survey)
+
+# load the vardpoor library
+library(vardpoor)
+
+# load the laeken library
+library(laeken)
+
+# load the synthetic EU statistics on income & living conditions
+data(eusilc)
+
+# make all column names lowercase
+names(eusilc) <- tolower(names(eusilc))
+
+# add a column with the row number
+dati <- data.table::data.table(IDd = 1:nrow(eusilc), eusilc)
+
+# calculate the arpr coefficient
+# using the R vardpoor library
+varpoord_arpr_calculation <-
+  varpoord(
+    # analysis variable
+    Y = "eqincome",
+    
+    # weights variable
+    w_final = "rb050",
+    
+    # row number variable
+    ID_level1 = "IDd",
+    
+    # row number variable
+    ID_level2 = "IDd",
+    
+    # strata variable
+    H = "db040",
+    
+    N_h = NULL ,
+    
+    # clustering variable
+    PSU = "rb030",
+    
+    # data.table
+    dataset = dati,
+    
+    # arpr coefficient function
+    type = "linarpr",
+    
+    # get linearized variable
+    outp_lin = TRUE
+    
+  )
+
+
+# construct a survey.design
+# using our recommended setup
+des_eusilc <-
+  svydesign(
+    ids = ~ rb030 ,
+    strata = ~ db040 ,
+    weights = ~ rb050 ,
+    data = eusilc
+  )
+
+# immediately run the convey_prep function on it
+des_eusilc <- convey_prep(des_eusilc)
+
+# coefficients do match
+varpoord_arpr_calculation$all_result$value
## [1] 14.44422
-
coef(svyarpr( ~ eqincome , des_eusilc)) * 100
+
coef(svyarpr( ~ eqincome , des_eusilc)) * 100
## eqincome 
 ## 14.44422
-
# linearized variables do not match
-# because Fprime is the derivative wrt
-# to the estimated threshold, not the estimated quantile
-# for more details, see
-# https://github.com/ajdamico/convey/issues/372#issuecomment-1656264143
-#
-# vardpoor
-lin_arpr_varpoord <- varpoord_arpr_calculation$lin_out$lin_arpr
-# convey
-lin_arpr_convey <- attr(svyarpr( ~ eqincome , des_eusilc), "lin")
-
-# check equality
-all.equal(lin_arpr_varpoord, 100 * lin_arpr_convey)
+
# linearized variables do not match
+# because Fprime is the derivative wrt
+# to the estimated threshold, not the estimated quantile
+# for more details, see
+# https://github.com/ajdamico/convey/issues/372#issuecomment-1656264143
+#
+# vardpoor
+lin_arpr_varpoord <- varpoord_arpr_calculation$lin_out$lin_arpr
+# convey
+lin_arpr_convey <- attr(svyarpr( ~ eqincome , des_eusilc), "lin")
+
+# check equality
+all.equal(lin_arpr_varpoord, 100 * lin_arpr_convey)
## [1] "Mean relative difference: 0.2264738"
-
# variances do not match exactly
-attr(svyarpr( ~ eqincome , des_eusilc) , 'var') * 10000
+
# variances do not match exactly
+attr(svyarpr( ~ eqincome , des_eusilc) , 'var') * 10000
##            eqincome
 ## eqincome 0.07599778
-
varpoord_arpr_calculation$all_result$var
+
varpoord_arpr_calculation$all_result$var
## [1] 0.08718569
-
# standard errors do not match exactly
-varpoord_arpr_calculation$all_result$se
+
# standard errors do not match exactly
+varpoord_arpr_calculation$all_result$se
## [1] 0.2952722
-
SE(svyarpr( ~ eqincome , des_eusilc)) * 100
+
SE(svyarpr( ~ eqincome , des_eusilc)) * 100
##           eqincome
 ## eqincome 0.2756769

The variance estimate is computed by using the approximation defined in 2, while the linearized variable \(z\) is defined by 2.1. The functions convey::svyarpr and vardpoor::linarpr produce the same linearized variable \(z\).

However, the measures of uncertainty do not line up. One of the reasons is that library(vardpoor) defaults to an ultimate cluster method that can be replicated with an alternative setup of the survey.design object.

-
# within each strata, sum up the weights
-cluster_sums <-
-  aggregate(eusilc$rb050 , list(eusilc$db040) , sum)
-
-# name the within-strata sums of weights the `cluster_sum`
-names(cluster_sums) <- c("db040" , "cluster_sum")
-
-# merge this column back onto the data.frame
-eusilc <- merge(eusilc , cluster_sums)
-
-# construct a survey.design
-# with the fpc using the cluster sum
-des_eusilc_ultimate_cluster <-
-  svydesign(
-    ids = ~ rb030 ,
-    strata = ~ db040 ,
-    weights = ~ rb050 ,
-    data = eusilc ,
-    fpc = ~ cluster_sum
-  )
-
-# again, immediately run the convey_prep function on the `survey.design`
-des_eusilc_ultimate_cluster <-
-  convey_prep(des_eusilc_ultimate_cluster)
-
-# does not match
-attr(svyarpr( ~ eqincome , des_eusilc_ultimate_cluster) , 'var') * 10000
+
# within each strata, sum up the weights
+cluster_sums <-
+  aggregate(eusilc$rb050 , list(eusilc$db040) , sum)
+
+# name the within-strata sums of weights the `cluster_sum`
+names(cluster_sums) <- c("db040" , "cluster_sum")
+
+# merge this column back onto the data.frame
+eusilc <- merge(eusilc , cluster_sums)
+
+# construct a survey.design
+# with the fpc using the cluster sum
+des_eusilc_ultimate_cluster <-
+  svydesign(
+    ids = ~ rb030 ,
+    strata = ~ db040 ,
+    weights = ~ rb050 ,
+    data = eusilc ,
+    fpc = ~ cluster_sum
+  )
+
+# again, immediately run the convey_prep function on the `survey.design`
+des_eusilc_ultimate_cluster <-
+  convey_prep(des_eusilc_ultimate_cluster)
+
+# does not match
+attr(svyarpr( ~ eqincome , des_eusilc_ultimate_cluster) , 'var') * 10000
##            eqincome
 ## eqincome 0.07586194
-
varpoord_arpr_calculation$all_result$var
+
varpoord_arpr_calculation$all_result$var
## [1] 0.08718569
-
# does not match
-varpoord_arpr_calculation$all_result$se
+
# does not match
+varpoord_arpr_calculation$all_result$se
## [1] 0.2952722
-
SE(svyarpr( ~ eqincome , des_eusilc_ultimate_cluster)) * 100
+
SE(svyarpr( ~ eqincome , des_eusilc_ultimate_cluster)) * 100
##           eqincome
 ## eqincome 0.2754305

Still, there is a difference in the estimates. This is discussed in detail in this issue. In order to still provide additional examples for our code, we proceed with a Monte Carlo experiment.

Using the eusilcP data from the simPop package (Templ et al. 2017Templ, Matthias, Bernhard Meindl, Alexander Kowarik, and Olivier Dupriez. 2017. “Simulation of Synthetic Complex Data: The R Package simPop.” Journal of Statistical Software 79 (10): 1–38. https://doi.org/10.18637/jss.v079.i10.), we can compute the actual value of the at risk of poverty rate for that population:

-
# load libraries
-library(sampling)
-library(survey)
-library(convey)
-library(parallel)
-
-# load pseudo population data
-data("eusilcP" , package = "simPop")
-
-# compute population median
-q50.pop <-
-  convey:::computeQuantiles(eusilcP$eqIncome , rep(1 , length(eusilcP$eqIncome)) , .50)
-
-# compute population poverty threshold
-# as 60% of the median
-thresh.pop <- .60 * q50.pop
-
-# compute population at risk of poverty rate
-(theta.pop <-
-    mean(eusilcP$eqIncome <= thresh.pop , na.rm = TRUE))
+
# load libraries
+library(sampling)
+library(survey)
+library(convey)
+library(parallel)
+
+# load pseudo population data
+data("eusilcP" , package = "simPop")
+
+# compute population median
+q50.pop <-
+  convey:::computeQuantiles(eusilcP$eqIncome , rep(1 , length(eusilcP$eqIncome)) , .50)
+
+# compute population poverty threshold
+# as 60% of the median
+thresh.pop <- .60 * q50.pop
+
+# compute population at risk of poverty rate
+(theta.pop <-
+    mean(eusilcP$eqIncome <= thresh.pop , na.rm = TRUE))
## [1] 0.1469295

Now, to study the distribution of the estimator under a particular sampling design, we select 5000 samples under one-stage cluster sampling of 1,000 households using the cluster function from the sampling package (Tillé and Matei 2021Tillé, Yves, and Alina Matei. 2021. Sampling: Survey Sampling. https://CRAN.R-project.org/package=sampling.), and use the svyarpr function to estimate the ARPR for each of those samples:

-
# define the number of monte carlo replicates
-mc.rep <- 5000L
-
-# simulation function
-arpr_sim_fun <- function(this.iter) {
-  
-  set.seed(this.iter)
-  
-  
-  library(survey)
-  library(convey)
-  library(sampling)
-  
-  # load pseudo population data
-  data("eusilcP" , package = "simPop")
-  
-  # compute size-like variable for PPS sampling design
-  eusilcP$aux <-
-    log(ifelse(eusilcP$eqIncome >= 1000 , eusilcP$eqIncome , 1000))
-  
-  
-  # select sample
-  tt <-
-    sampling::cluster(
-      data = eusilcP[sample.int(nrow(eusilcP) , nrow(eusilcP) , replace = FALSE) , ] ,
-      clustername = "hid" ,
-      size = 1000L ,
-      method = "systematic" ,
-      pik = eusilcP$aux
-    )
-  
-  # collect data
-  this.sample <- getdata(eusilcP , tt)
-  
-  # create survey design object
-  this.desobj <-
-    svydesign(
-      ids = ~ hid ,
-      probs = ~ Prob ,
-      data = this.sample ,
-      nest = FALSE
-    )
-  
-  # prepare for convey functions
-  this.desobj <- convey_prep(this.desobj)
-  
-  # compute estimates
-  svyarpr( ~ eqIncome , this.desobj)
-  
-}
-
-# run replications
-cl <- makeCluster(detectCores() - 1)
-
-arpr.estimate.list <-
-  clusterApply(cl, seq_len(mc.rep) , arpr_sim_fun)
-
-stopCluster(cl)
+
# define the number of monte carlo replicates
+mc.rep <- 5000L
+
+# simulation function
+arpr_sim_fun <- function(this.iter) {
+  
+  set.seed(this.iter)
+  
+  
+  library(survey)
+  library(convey)
+  library(sampling)
+  
+  # load pseudo population data
+  data("eusilcP" , package = "simPop")
+  
+  # compute size-like variable for PPS sampling design
+  eusilcP$aux <-
+    log(ifelse(eusilcP$eqIncome >= 1000 , eusilcP$eqIncome , 1000))
+  
+  
+  # select sample
+  tt <-
+    sampling::cluster(
+      data = eusilcP[sample.int(nrow(eusilcP) , nrow(eusilcP) , replace = FALSE) , ] ,
+      clustername = "hid" ,
+      size = 1000L ,
+      method = "systematic" ,
+      pik = eusilcP$aux
+    )
+  
+  # collect data
+  this.sample <- getdata(eusilcP , tt)
+  
+  # create survey design object
+  this.desobj <-
+    svydesign(
+      ids = ~ hid ,
+      probs = ~ Prob ,
+      data = this.sample ,
+      nest = FALSE
+    )
+  
+  # prepare for convey functions
+  this.desobj <- convey_prep(this.desobj)
+  
+  # compute estimates
+  svyarpr( ~ eqIncome , this.desobj)
+  
+}
+
+# run replications
+cl <- makeCluster(detectCores() - 1)
+
+arpr.estimate.list <-
+  clusterApply(cl, seq_len(mc.rep) , arpr_sim_fun)
+
+stopCluster(cl)

Then, we evaluate the Percentage Relative Bias (PRB) of the ARPR estimator. Under this scenario, the PRB of the ARPR estimator is -0.19830%.
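In symbols, and matching the computation below, the PRB is the relative deviation of the estimator’s Monte Carlo expectation from the population value:

\[ PRB = 100 \times \left( \frac{E[\hat{\theta}]}{\theta} - 1 \right) \]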

-
# estimate the expected value of the ARPR estimator
-# using the average of the estimates
-(theta.exp <- mean(sapply(arpr.estimate.list , coef)))
+
# estimate the expected value of the ARPR estimator
+# using the average of the estimates
+(theta.exp <- mean(sapply(arpr.estimate.list , coef)))
## [1] 0.1466381
-
# estimate the percentage relative bias
-(percentage_relative_bias_arpr <- 100 * (theta.exp / theta.pop - 1))
+
# estimate the percentage relative bias
+(percentage_relative_bias_arpr <- 100 * (theta.exp / theta.pop - 1))
## [1] -0.1982981
-
stopifnot(round(percentage_relative_bias_arpr , 4) == -0.19830)
+
stopifnot(round(percentage_relative_bias_arpr , 4) == -0.19830)

For the variance estimator, we have:

-
# estimate the variance of the ARPR estimator
-# using the empirical variance of the estimates
-(vartheta.popest <- var(sapply(arpr.estimate.list , coef)))
+
# estimate the variance of the ARPR estimator
+# using the empirical variance of the estimates
+(vartheta.popest <- var(sapply(arpr.estimate.list , coef)))
## [1] 0.0001048728
-
# estimate the expected value of the ARPR variance estimator
-# using the (estimated) expected value of the variance estimates
-(vartheta.exp <- mean(sapply(arpr.estimate.list , vcov)))
+
# estimate the expected value of the ARPR variance estimator
+# using the (estimated) expected value of the variance estimates
+(vartheta.exp <- mean(sapply(arpr.estimate.list , vcov)))
## [1] 0.0001081994
-
# estimate the percentage relative bias of the variance estimator
-( percentage_relative_bias_variance <- 100 *  (vartheta.exp / vartheta.popest - 1) )
+
# estimate the percentage relative bias of the variance estimator
+( percentage_relative_bias_variance <- 100 *  (vartheta.exp / vartheta.popest - 1) )
## [1] 3.172023
-
stopifnot( round( percentage_relative_bias_variance , 4 ) == 3.1720 ) 
+
stopifnot( round( percentage_relative_bias_variance , 4 ) == 3.1720 ) 

Under this scenario, the PRB of the ARPR variance estimator is 3.1720%.

Our simulation shows that the Bias Ratio of this estimator is just under 3%:

-
# compute bias ratio
-100 * abs( theta.exp - theta.pop ) / sqrt( vartheta.popest )
+
# compute bias ratio
+100 * abs( theta.exp - theta.pop ) / sqrt( vartheta.popest )
## [1] 2.84509
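In symbols, this bias ratio is the absolute bias expressed as a percentage of the estimator’s Monte Carlo standard deviation:

\[ BR = 100 \times \frac{ \lvert E[\hat{\theta}] - \theta \rvert }{ \sqrt{ Var( \hat{\theta} ) } } \]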

If the normal approximation holds, a small bias ratio still allows for approximately valid confidence interval estimates.

Next, we evaluate the Percentage Coverage Rate (PCR). In theory, under repeated sampling, the estimated 95% CIs should cover the population parameter approximately 95% of the time. We can evaluate that using:

-
# estimate confidence intervals of the ARPR
-# for each of the samples
-est.coverage <-
-  sapply(arpr.estimate.list, function(this.stat)
-    confint(this.stat)[, 1] <= theta.pop &
-      confint(this.stat)[, 2] >= theta.pop)
-
-# evaluate empirical coverage
-(empirical_coverage <- mean(est.coverage))
+
# estimate confidence intervals of the ARPR
+# for each of the samples
+est.coverage <-
+  sapply(arpr.estimate.list, function(this.stat)
+    confint(this.stat)[, 1] <= theta.pop &
+      confint(this.stat)[, 2] >= theta.pop)
+
+# evaluate empirical coverage
+(empirical_coverage <- mean(est.coverage))
## [1] 0.9516
-
stopifnot(round(empirical_coverage , 2) == 0.95)
+
stopifnot(round(empirical_coverage , 2) == 0.95)

Our coverages are not too far from the nominal coverage level of 95%.

For additional usage examples of svyarpr, type ?convey::svyarpr in the R console.

diff --git a/docs/3.3-relative-median-income-ratio-svyrmir.html b/docs/3.3-relative-median-income-ratio-svyrmir.html index 2dec577..928e722 100644 --- a/docs/3.3-relative-median-income-ratio-svyrmir.html +++ b/docs/3.3-relative-median-income-ratio-svyrmir.html @@ -265,15 +265,15 @@

3.3 Relative Median Income Ratio (svyrmir)

-
✔️ mainly useful for studies of the income of the elderly following EU definitions
-✔️ a ratio of medians
-✔️ less sensitive to outliers
-❌ solely a measure of medians and does not fully account for the income distribution
-❌ not very common outside of the EU
-❌ hard to interpret
-❌ not (necessarily) a dependency measure
-❌ not an inequality measure (fails the Pigou-Dalton principle)
-not (exactly) a poverty measure (fails the poverty-focus axiom)
+
✔️ mainly useful for studies of the income of the elderly following EU definitions
+✔️ a ratio of medians
+✔️ less sensitive to outliers
+❌ solely a measure of medians and does not fully account for the income distribution
+❌ not very common outside of the EU
+❌ hard to interpret
+not (necessarily) a dependency measure
+❌ not an inequality measure (fails the Pigou-Dalton principle)
+not (exactly) a poverty measure (fails the poverty-focus axiom)

The relative median income ratio (RMIR) is the ratio of the median income of people older than a given age (here, 65) to the median income of people at or below that age. In mathematical terms,

\[ rmir = \frac{median\{y_i; age_i > 65\}}{median\{y_i; age_i \leq 65\}}. \]
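
To make this definition concrete, the sketch below forms the RMIR directly as a ratio of two design-based median estimates on the same laeken eusilc data used in the replication example that follows; the result should closely match coef(svyrmir(...)), which additionally provides a linearization-based standard error:

# a from-scratch check of the definition (illustrative sketch)
library(survey)
library(laeken)
data(eusilc)
names(eusilc) <- tolower(names(eusilc))
des <-
  svydesign(
    ids = ~ rb030 ,
    strata = ~ db040 ,
    weights = ~ rb050 ,
    data = eusilc
  )
med_older <- coef(svyquantile( ~ eqincome , subset(des , age > 65) , 0.5))
med_younger <- coef(svyquantile( ~ eqincome , subset(des , age <= 65) , 0.5))
med_older / med_younger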


3.3.1 Replication Example

The R vardpoor package (Breidaks, Liberts, and Ivanova 2016), created by researchers at the Central Statistical Bureau of Latvia, includes an RMIR coefficient calculation using the ultimate cluster method. The example below reproduces those statistics.

Load and prepare the same data set:

# load the convey package
library(convey)

# load the survey library
library(survey)

# load the vardpoor library
library(vardpoor)

# load the laeken library
library(laeken)

# load the synthetic EU statistics on income & living conditions
data(eusilc)

# make all column names lowercase
names(eusilc) <- tolower(names(eusilc))

# add a column with the row number
dati <- data.table::data.table(IDd = 1:nrow(eusilc), eusilc)

# calculate the rmir coefficient
# using the R vardpoor library
varpoord_rmir_calculation <-
  varpoord(
    # analysis variable
    Y = "eqincome",

    # weights variable
    w_final = "rb050",

    # row number variable
    ID_level1 = "IDd",

    # row number variable
    ID_level2 = "IDd",

    # strata variable
    H = "db040",

    N_h = NULL ,

    # clustering variable
    PSU = "rb030",

    # data.table
    dataset = dati,

    # age variable
    age = "age",

    # rmir coefficient function
    type = "linrmir",

    # get linearized variable
    outp_lin = TRUE

  )

# construct a survey.design
# using our recommended setup
des_eusilc <-
  svydesign(
    ids = ~ rb030 ,
    strata = ~ db040 ,
    weights = ~ rb050 ,
    data = eusilc
  )

# immediately run the convey_prep function on it
des_eusilc <- convey_prep(des_eusilc)

# coefficients do match
varpoord_rmir_calculation$all_result$value
## [1] 0.9330361
coef(svyrmir( ~ eqincome , des_eusilc, age = ~ age))
##  eqincome 
 ## 0.9330361
# linearized variables do match
# vardpoor
lin_rmir_varpoord <- varpoord_rmir_calculation$lin_out$lin_rmir
# convey
lin_rmir_convey <-
  attr(svyrmir( ~ eqincome , des_eusilc, age = ~ age), "lin")

# check equality
all.equal(lin_rmir_varpoord, lin_rmir_convey[, 1])
## [1] TRUE
# variances do not match exactly
attr(svyrmir( ~ eqincome , des_eusilc, age = ~ age) , 'var')
##             eqincome
 ## eqincome 0.000127444
varpoord_rmir_calculation$all_result$var
## [1] 0.0001272137
# standard errors do not match exactly
varpoord_rmir_calculation$all_result$se
## [1] 0.0112789
SE(svyrmir( ~ eqincome , des_eusilc , age = ~ age))
##            eqincome
 ## eqincome 0.01128911

The variance estimate is computed using the approximation defined in Section 2, while the linearized variable \(z\) is defined in Section 2.1. The functions convey::svyrmir and vardpoor::linrmir produce the same linearized variable \(z\).

However, the measures of uncertainty do not line up, because library(vardpoor) defaults to an ultimate cluster method that can be replicated with an alternative setup of the survey.design object.

# within each stratum, sum up the weights
cluster_sums <-
  aggregate(eusilc$rb050 , list(eusilc$db040) , sum)

# name the within-strata sums of weights the `cluster_sum`
names(cluster_sums) <- c("db040" , "cluster_sum")

# merge this column back onto the data.frame
eusilc <- merge(eusilc , cluster_sums)

# construct a survey.design
# with the fpc using the cluster sum
des_eusilc_ultimate_cluster <-
  svydesign(
    ids = ~ rb030 ,
    strata = ~ db040 ,
    weights = ~ rb050 ,
    data = eusilc ,
    fpc = ~ cluster_sum
  )

# again, immediately run the convey_prep function on the `survey.design`
des_eusilc_ultimate_cluster <-
  convey_prep(des_eusilc_ultimate_cluster)

# matches
stopifnot(all.equal(
  attr(
    svyrmir(~ eqincome , des_eusilc_ultimate_cluster , age = ~ age) ,
    'var'
  )[1],
  varpoord_rmir_calculation$all_result$var
))

# matches
stopifnot(all.equal(SE(
  svyrmir(~ eqincome , des_eusilc_ultimate_cluster, age = ~ age)
)[1], varpoord_rmir_calculation$all_result$se))

For additional usage examples of svyrmir, type ?convey::svyrmir in the R console.


3.4 Relative Median Poverty Gap (svyrmpg)

✔️ how poor are those below the ARPT?
✔️ median poverty gap expressed as a percentage of the threshold
✔️ useful for understanding the depth of poverty
❌ not common outside of the EU
❌ not immediately interpretable in terms of income

The relative median poverty gap (RMPG) is the relative difference between the ARPT and the median income of people whose income falls below the ARPT:

\[ rmpg = \frac{arpt - median\{y_i; y_i < arpt\}}{arpt} \]


3.4.1 Replication Example

The R vardpoor package (Breidaks, Liberts, and Ivanova 2016), created by researchers at the Central Statistical Bureau of Latvia, includes an RMPG coefficient calculation using the ultimate cluster method. The example below reproduces those statistics.

Load and prepare the same data set:

# load the convey package
library(convey)

# load the survey library
library(survey)

# load the vardpoor library
library(vardpoor)

# load the laeken library
library(laeken)

# load the synthetic EU statistics on income & living conditions
data(eusilc)

# make all column names lowercase
names(eusilc) <- tolower(names(eusilc))

# add a column with the row number
dati <- data.table::data.table(IDd = 1:nrow(eusilc), eusilc)

# calculate the rmpg coefficient
# using the R vardpoor library
varpoord_rmpg_calculation <-
  varpoord(
    # analysis variable
    Y = "eqincome",

    # weights variable
    w_final = "rb050",

    # row number variable
    ID_level1 = "IDd",

    # row number variable
    ID_level2 = "IDd",

    # strata variable
    H = "db040",

    N_h = NULL ,

    # clustering variable
    PSU = "rb030",

    # data.table
    dataset = dati,

    # rmpg coefficient function
    type = "linrmpg",

    # get linearized variable
    outp_lin = TRUE

  )

# construct a survey.design
# using our recommended setup
des_eusilc <-
  svydesign(
    ids = ~ rb030 ,
    strata = ~ db040 ,
    weights = ~ rb050 ,
    data = eusilc
  )

# immediately run the convey_prep function on it
des_eusilc <- convey_prep(des_eusilc)

# coefficients do match
varpoord_rmpg_calculation$all_result$value
## [1] 18.9286
coef(svyrmpg( ~ eqincome , des_eusilc)) * 100
## eqincome 
 ##  18.9286
# linearized variables do match
# vardpoor
lin_rmpg_varpoord <- varpoord_rmpg_calculation$lin_out$lin_rmpg
# convey
lin_rmpg_convey <- attr(svyrmpg( ~ eqincome , des_eusilc), "lin")

# check equality
all.equal(lin_rmpg_varpoord, 100 * lin_rmpg_convey[, 1])
## [1] TRUE
# variances do not match exactly
attr(svyrmpg( ~ eqincome , des_eusilc) , 'var') * 10000
##          eqincome
 ## eqincome 0.332234
varpoord_rmpg_calculation$all_result$var
## [1] 0.3316454
# standard errors do not match exactly
varpoord_rmpg_calculation$all_result$se
## [1] 0.5758866
SE(svyrmpg( ~ eqincome , des_eusilc)) * 100
##           eqincome
 ## eqincome 0.5763974
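
Since the RMPG is built from two quantities this package also estimates, here is a quick consistency sketch (an added illustration using the des_eusilc object defined above; the factor of 100 matches the percentage scale used by vardpoor):

# rmpg from its building blocks: the at-risk-of-poverty
# threshold (ARPT) and the median income of the poor (POORMED)
arpt <- coef(svyarpt( ~ eqincome , des_eusilc))
medpoor <- coef(svypoormed( ~ eqincome , des_eusilc))
100 * (arpt - medpoor) / arpt
# should match 100 * coef(svyrmpg( ~ eqincome , des_eusilc)) above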

The variance estimate is computed using the approximation defined in Section 2, while the linearized variable \(z\) is defined in Section 2.1. The functions convey::svyrmpg and vardpoor::linrmpg produce the same linearized variable \(z\).

However, the measures of uncertainty do not line up, because library(vardpoor) defaults to an ultimate cluster method that can be replicated with an alternative setup of the survey.design object.

# within each stratum, sum up the weights
cluster_sums <-
  aggregate(eusilc$rb050 , list(eusilc$db040) , sum)

# name the within-strata sums of weights the `cluster_sum`
names(cluster_sums) <- c("db040" , "cluster_sum")

# merge this column back onto the data.frame
eusilc <- merge(eusilc , cluster_sums)

# construct a survey.design
# with the fpc using the cluster sum
des_eusilc_ultimate_cluster <-
  svydesign(
    ids = ~ rb030 ,
    strata = ~ db040 ,
    weights = ~ rb050 ,
    data = eusilc ,
    fpc = ~ cluster_sum
  )

# again, immediately run the convey_prep function on the `survey.design`
des_eusilc_ultimate_cluster <-
  convey_prep(des_eusilc_ultimate_cluster)

# matches
stopifnot(all.equal(
  attr(svyrmpg( ~ eqincome , des_eusilc_ultimate_cluster) , 'var')[1] * 10000 ,
  varpoord_rmpg_calculation$all_result$var
))

# matches
stopifnot(all.equal(SE(
  svyrmpg(~ eqincome , des_eusilc_ultimate_cluster)
)[1] * 100 ,
varpoord_rmpg_calculation$all_result$se))

For additional usage examples of svyrmpg, type ?convey::svyrmpg in the R console.


3.5 Median Income Below the At Risk of Poverty Threshold (svypoormed)

✔️ median income among those below the threshold
✔️ useful for understanding the depth of poverty
✔️ related to the RMPG
❌ not very common outside the EU
❌ not immediately interpretable in terms of the poverty gap

Median income below the at-risk-of-poverty threshold (POORMED) is the median income of people whose income falls below the ARPT:

\[ poormed = median\{y_i; y_i < arpt\} \]


3.5.1 Replication Example

The R vardpoor package (Breidaks, Liberts, and Ivanova 2016), created by researchers at the Central Statistical Bureau of Latvia, includes a POORMED coefficient calculation using the ultimate cluster method. The example below reproduces those statistics.

Load and prepare the same data set:

# load the convey package
library(convey)

# load the survey library
library(survey)

# load the vardpoor library
library(vardpoor)

# load the laeken library
library(laeken)

# load the synthetic EU statistics on income & living conditions
data(eusilc)

# make all column names lowercase
names(eusilc) <- tolower(names(eusilc))

# add a column with the row number
dati <- data.table::data.table(IDd = 1:nrow(eusilc), eusilc)

# calculate the poormed coefficient
# using the R vardpoor library
varpoord_poormed_calculation <-
  varpoord(
    # analysis variable
    Y = "eqincome",

    # weights variable
    w_final = "rb050",

    # row number variable
    ID_level1 = "IDd",

    # row number variable
    ID_level2 = "IDd",

    # strata variable
    H = "db040",

    N_h = NULL ,

    # clustering variable
    PSU = "rb030",

    # data.table
    dataset = dati,

    # poormed coefficient function
    type = "linpoormed",

    # get linearized variable
    outp_lin = TRUE

  )

# construct a survey.design
# using our recommended setup
des_eusilc <-
  svydesign(
    ids = ~ rb030 ,
    strata = ~ db040 ,
    weights = ~ rb050 ,
    data = eusilc
  )

# immediately run the convey_prep function on it
des_eusilc <- convey_prep(des_eusilc)

# coefficients do match
varpoord_poormed_calculation$all_result$value
## [1] 8803.735
coef(svypoormed( ~ eqincome , des_eusilc))
## eqincome 
 ## 8803.735
# linearized variables do match (only the names differ)
# vardpoor
lin_poormed_varpoord <-
  varpoord_poormed_calculation$lin_out$lin_poormed
# convey
lin_poormed_convey <-
  attr(svypoormed( ~ eqincome , des_eusilc), "lin")

# check equality
all.equal(lin_poormed_varpoord, lin_poormed_convey)
## [1] "names for current but not for target"
# variances do not match exactly
attr(svypoormed( ~ eqincome , des_eusilc) , 'var')
##          eqincome
 ## eqincome  5311.47
varpoord_poormed_calculation$all_result$var
## [1] 5302.086
# standard errors do not match exactly
varpoord_poormed_calculation$all_result$se
## [1] 72.81542
SE(svypoormed( ~ eqincome , des_eusilc))
##          eqincome
 ## eqincome 72.87983
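
As a sanity check on the definition itself, the sketch below recomputes POORMED as a design-based median among people below the ARPT (an added illustration; small discrepancies may arise from the quantile definition used):

# median income among people below the at-risk-of-poverty threshold
arpt <- coef(svyarpt( ~ eqincome , des_eusilc))
coef(svyquantile( ~ eqincome , subset(des_eusilc , eqincome < arpt) , 0.5))
# should be close to coef(svypoormed( ~ eqincome , des_eusilc)) above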

The variance estimate is computed using the approximation defined in Section 2, while the linearized variable \(z\) is defined in Section 2.1. The functions convey::svypoormed and vardpoor::linpoormed produce the same linearized variable \(z\).

However, the measures of uncertainty do not line up, because library(vardpoor) defaults to an ultimate cluster method that can be replicated with an alternative setup of the survey.design object.

# within each stratum, sum up the weights
cluster_sums <-
  aggregate(eusilc$rb050 , list(eusilc$db040) , sum)

# name the within-strata sums of weights the `cluster_sum`
names(cluster_sums) <- c("db040" , "cluster_sum")

# merge this column back onto the data.frame
eusilc <- merge(eusilc , cluster_sums)

# construct a survey.design
# with the fpc using the cluster sum
des_eusilc_ultimate_cluster <-
  svydesign(
    ids = ~ rb030 ,
    strata = ~ db040 ,
    weights = ~ rb050 ,
    data = eusilc ,
    fpc = ~ cluster_sum
  )

# again, immediately run the convey_prep function on the `survey.design`
des_eusilc_ultimate_cluster <-
  convey_prep(des_eusilc_ultimate_cluster)

# matches
stopifnot(all.equal(
  attr(svypoormed(~ eqincome , des_eusilc_ultimate_cluster) , 'var')[1],
  varpoord_poormed_calculation$all_result$var
))

# matches
stopifnot(all.equal(
  SE(svypoormed(~ eqincome , des_eusilc_ultimate_cluster))[1],
  varpoord_poormed_calculation$all_result$se
))

For additional usage examples of svypoormed, type ?convey::svypoormed in the R console.


3.6 Foster-Greer-Thorbecke class (svyfgt, svyfgtdec)

✔️ widely used because it encompasses interpretable measures
✔️ allows an arbitrary poverty threshold
✔️ can incorporate intensity and inequality among the poor
✔️ when `g >= 2`, the measure can be decomposed into interpretable measures of the extension, intensity and inequality of poverty
❌ scales are dependent on `g`
❌ becomes increasingly difficult to interpret as the `g` parameter grows
❌ no component that allows for time to exit poverty, unlike the Watts index

Foster, Greer, and Thorbecke (1984) proposed a family of indicators to measure poverty. This class of \(FGT\) measures can be defined as

\[ p = \frac{1}{N} \sum_{k \in U} h(y_{k}, \theta), \]


The quantile and the mean involved in the definition of the threshold are estimated for the whole population. When \(\gamma=0\) and \(\theta= .6*MED\), this measure is equal to the indicator ARPR computed by the function svyarpr. The linearization of the FGT(0) is presented in Berger and Skinner (2003).

Next, we give some examples of the function svyfgt to estimate the values of the FGT poverty index.

Consider first a poverty threshold fixed at the value \(10000\). The headcount ratio FGT(0) (\(\gamma=0\)) is

svyfgt( ~ eqincome, des_eusilc, g = 0, abs_thresh = 10000)
            fgt0     SE
 eqincome 0.11444 0.0027

The poverty gap ratio FGT(1) (\(\gamma=1\)) for the poverty threshold fixed at the same value is

svyfgt( ~ eqincome, des_eusilc, g = 1, abs_thresh = 10000)
             fgt1     SE
 eqincome 0.032085 0.0011

To estimate the FGT(0) with the poverty threshold fixed at \(0.6* MED\), we first fix the argument type_thresh="relq" and then use the default values for percent and order:

svyfgt( ~ eqincome, des_eusilc, g = 0, type_thresh = "relq")
            fgt0     SE
 eqincome 0.14444 0.0028

This matches the estimate obtained by:

svyarpr( ~ eqincome, design = des_eusilc, .5, .6)
            arpr     SE
 eqincome 0.14444 0.0028

To estimate the poverty gap ratio with the poverty threshold equal to \(0.6*MEAN\), we use:

svyfgt( ~ eqincome, des_eusilc, g = 1, type_thresh = "relm")
             fgt1     SE
 eqincome 0.051187 0.0011


3.6.1 Replication Example

In July 2006, Jenkins (2008) presented at the North American Stata Users’ Group Meetings on estimating measures of inequality, poverty, and social welfare with Stata. The example below reproduces those statistics.

In order to match the presentation’s results using the svyfgt function from the convey library, the poverty threshold was considered absolute despite being directly estimated from the survey sample. This effectively treats the variance of the estimated poverty threshold as zero; svyfgt does not account for the uncertainty of the poverty threshold when the level has been stated as absolute with the abs_thresh= parameter. In general, we would instead recommend using either relq or relm in the type_thresh= parameter in order to account for the added uncertainty of the poverty threshold calculation. This example serves only to show that svyfgt behaves properly as compared to other software.

Load and prepare the same data set:

# load the convey package
library(convey)

# load the survey library
library(survey)

# load the foreign library
library(foreign)

# create a temporary file on the local disk
tf <- tempfile()

# store the location of the presentation file
presentation_zip <-
  "https://web.archive.org/web/20150928053959/http://repec.org/nasug2006/nasug2006_jenkins.zip"

# download jenkins' presentation to the temporary file
download.file(presentation_zip , tf , mode = 'wb')

# unzip the contents of the archive
presentation_files <- unzip(tf , exdir = tempdir())

# load the institute for fiscal studies' 1981, 1985, and 1991 data.frame objects
x81 <-
  read.dta(grep("ifs81" , presentation_files , value = TRUE))
x85 <-
  read.dta(grep("ifs85" , presentation_files , value = TRUE))
x91 <-
  read.dta(grep("ifs91" , presentation_files , value = TRUE))

# NOTE: we recommend using ?convey::svyarpt rather than this unweighted calculation #

# calculate 60% of the unweighted median income in 1981
unwtd_arpt81 <- quantile(x81$eybhc0 , 0.5) * .6

# calculate 60% of the unweighted median income in 1985
unwtd_arpt85 <- quantile(x85$eybhc0 , 0.5) * .6

# calculate 60% of the unweighted median income in 1991
unwtd_arpt91 <- quantile(x91$eybhc0 , 0.5) * .6

# stack each of these three years of data into a single data.frame
x <- rbind(x81 , x85 , x91)

Replicate the author’s survey design statement from Stata code..

. ge poor = (year==1981)*(x < $z_81)+(year==1985)*(x < $z_85)+(year==1991)*(x < $z_91)
 . * account for clustering within HHs 
 . svyset hrn [pweight = wgt]

.. into R code:

# initiate a linearized survey design object
y <- svydesign( ~ hrn , data = x , weights = ~ wgt)

# immediately run the `convey_prep` function on the survey design
z <- convey_prep(y)

Replicate the author’s headcount ratio results with Stata..

. svy: mean poor if year == 1981
 (running mean on estimation sample)


poor | .2021312 .0062077 .1899615 .2143009

..using R code:

headcount_81 <-
  svyfgt( ~ eybhc0 ,
          subset(z , year == 1981) ,
          g = 0 ,
          abs_thresh = unwtd_arpt81)

stopifnot(round(coef(headcount_81) , 5) == .14101)
stopifnot(round(SE(headcount_81) , 5) == .00449)

headcount_81_ci <-
  confint(headcount_81 , df = degf(subset(z , year == 1981)))

stopifnot(round(headcount_81_ci[1] , 5) == .13222)
stopifnot(round(headcount_81_ci[2] , 5) == .14981)

headcount_85 <-
  svyfgt(~ eybhc0 ,
         subset(z , year == 1985) ,
         g = 0 ,
         abs_thresh = unwtd_arpt85)

stopifnot(round(coef(headcount_85) , 5) == .13764)
stopifnot(round(SE(headcount_85) , 5) == .00465)

headcount_85_ci <-
  confint(headcount_85 , df = degf(subset(z , year == 1985)))

stopifnot(round(headcount_85_ci[1] , 5) == .12852)
stopifnot(round(headcount_85_ci[2] , 5) == .14677)

headcount_91 <-
  svyfgt(~ eybhc0 ,
         subset(z , year == 1991) ,
         g = 0 ,
         abs_thresh = unwtd_arpt91)

stopifnot(round(coef(headcount_91) , 5) == .20213)
stopifnot(round(SE(headcount_91) , 5) == .00621)

headcount_91_ci <-
  confint(headcount_91 , df = degf(subset(z , year == 1991)))

stopifnot(round(headcount_91_ci[1] , 5) == .18996)
stopifnot(round(headcount_91_ci[2] , 5) == .21430)

Confirm this replication applies for the normalized poverty gap as well, comparing Stata code..

. ge ngap = poor*($z_81- x)/$z_81 if year == 1981
 


ngap | .0271577 .0013502 .0245109 .0298044

..to R code:

norm_pov_81 <-
  svyfgt( ~ eybhc0 ,
          subset(z , year == 1981) ,
          g = 1 ,
          abs_thresh = unwtd_arpt81)

stopifnot(round(coef(norm_pov_81) , 5) == .02716)
stopifnot(round(SE(norm_pov_81) , 5) == .00135)

norm_pov_81_ci <-
  confint(norm_pov_81 , df = degf(subset(z , year == 1981)))

stopifnot(round(norm_pov_81_ci[1] , 5) == .02451)
stopifnot(round(norm_pov_81_ci[2] , 5) == .02980)

3.6.2 Monte Carlo Simulation

To provide an example for our code, we proceed with a Monte Carlo experiment. Using the eusilcP data from the simPop package (Templ et al. 2017), we can compute the actual values of the FGT components for that population:

# load libraries
library(sampling)
library(survey)
library(convey)
library(parallel)

# load pseudo population data
data("eusilcP" , package = "simPop")

# compute population value of the FGT(2) index decomposition
inc.all <- eusilcP$eqIncome
gfun <- function(y , thresh)
  (thresh - y) / thresh
hfun <-
  function(y , thresh , g)
    (((thresh - y) / thresh) ^ g) * (y <= thresh)
fgt2 <- mean(hfun(inc.all , 10000 , 2) , na.rm = TRUE)
fgt1 <- mean(hfun(inc.all , 10000 , 1) , na.rm = TRUE)
fgt0 <- mean(hfun(inc.all , 10000 , 0) , na.rm = TRUE)
igr.fgt <-
  mean(hfun(inc.all , 10000 , 1)[inc.all <= 10000] , na.rm = TRUE)
gei.poor <-
  convey:::CalcGEI(gfun(inc.all , 10000) , ifelse(inc.all < 10000 , 1 , 0) , 2)
theta.pop <-
  c(
    "fgt2" = fgt2 ,
    "fgt0" =  fgt0 ,
    "fgt1" =  fgt1 ,
    "igr" = igr.fgt ,
    "gei(poor;epsilon=2)" =  gei.poor
  )
theta.pop
##                fgt2                fgt0                fgt1                 igr 
 ##          0.01571192          0.11412691          0.03259544          0.28560699 
 ## gei(poor;epsilon=2) 
 ##          0.34386588
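
These components fit back together: since the GE(2) entropy index of the normalized poverty gaps among the poor equals half their squared coefficient of variation, the components satisfy fgt2 = fgt0 * igr^2 * (1 + 2 * gei). The check below is an added illustration, not part of the original simulation code:

# numerical check of the FGT(2) decomposition identity:
# FGT(2) = FGT(0) x IGR^2 x (1 + 2 x GE(2; poor))
stopifnot(all.equal(fgt2 ,
                    fgt0 * igr.fgt ^ 2 * (1 + 2 * gei.poor) ,
                    tolerance = 1e-6))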

Now, to study the distribution of the estimator under a particular sampling design, we select 5000 samples under one-stage cluster sampling of 1,000 households using the cluster function from the sampling package (Tillé and Matei 2021), and use the svyfgtdec function to estimate the FGT components for each of those samples:

# define the number of monte carlo replicates
mc.rep <- 5000L

# simulation function
fgtdec_sim_fun <- function(this.iter) {
  set.seed(this.iter)

  library(sampling)
  library(survey)
  library(convey)

  # load pseudo population data
  data("eusilcP" , package = "simPop")

  # compute size-like variable for PPS sampling design
  eusilcP$aux <-
    log(ifelse(eusilcP$eqIncome >= 1000 , eusilcP$eqIncome , 1000))

  # select sample
  tt <-
    sampling::cluster(
      data = eusilcP[sample.int(nrow(eusilcP) , nrow(eusilcP) , replace = FALSE) , ] ,
      clustername = "hid" ,
      size = 1000L ,
      method = "systematic" ,
      pik = eusilcP$aux
    )

  # collect data
  this.sample <- getdata(eusilcP , tt)

  # create survey design object
  this.desobj <-
    svydesign(
      ids = ~ hid ,
      probs = ~ Prob ,
      data = this.sample ,
      nest = FALSE
    )

  # prepare for convey functions
  this.desobj <- convey_prep(this.desobj)

  # compute estimates
  svyfgtdec( ~ eqIncome , this.desobj , abs_thresh = 10000 , g = 2)

}

# run replications
cl <- makeCluster(detectCores() - 1)

fgtdec.estimate.list <-
  clusterApply(cl, seq_len(mc.rep) , fgtdec_sim_fun)

stopCluster(cl)

The PRB of each component is estimated using the code below. Notice that the PRBs are relatively small, with absolute values below 1%; the largest absolute bias occurs in the GEI component.

# repeat true population values
(theta.pop)
##                fgt2                fgt0                fgt1                 igr 
 ##          0.01571192          0.11412691          0.03259544          0.28560699 
 ## gei(poor;epsilon=2) 
 ##          0.34386588
# estimate the expected values of the components estimators
# using the average of the estimates
(theta.exp <- rowMeans(sapply(fgtdec.estimate.list , coef)))
##                fgt2                fgt0                fgt1                 igr 
 ##          0.01577844          0.11432776          0.03269125          0.28591312 
 ## gei(poor;epsilon=2) 
 ##          0.34234370
# estimate the percentage relative bias
(percentage_relative_bias <- 100 * (theta.exp / theta.pop - 1))
##                fgt2                fgt0                fgt1                 igr 
 ##           0.4233956           0.1759893           0.2939132           0.1071867 
 ## gei(poor;epsilon=2) 
 ##          -0.4426679
stopifnot(abs(percentage_relative_bias) < 1)

For the variance estimators, we estimate the PRB using the code below. Note that the bias is still relatively small, with absolute values of the PRB below 5%.

# estimate the variance of the components estimators
# using the empirical variance of the estimates
(vartheta.popest <-
   diag(var(t(
     sapply(fgtdec.estimate.list , coef)
   ))))
##                fgt2                fgt0                fgt1                 igr 
 ##        6.447720e-06        1.015546e-04        1.468049e-05        4.753320e-04 
 ## gei(poor;epsilon=2) 
 ##        1.841360e-03
# estimate the expected value of the components variance estimators
# using the (estimated) expected value of the variance estimates
(vartheta.exp <-
    rowMeans(sapply(fgtdec.estimate.list , function(z)
      diag(vcov(
        z
      )))))
##                fgt2                fgt0                fgt1                 igr 
 ##        6.462904e-06        1.014859e-04        1.473047e-05        4.923412e-04 
 ## gei(poor;epsilon=2) 
 ##        1.911158e-03
# estimate the percentage relative bias of the variance estimators
(percentage_relative_bias_variance <- 100 * (vartheta.exp / vartheta.popest - 1))
##                fgt2                fgt0                fgt1                 igr 
 ##          0.23548684         -0.06759586          0.34047392          3.57838870 
 ## gei(poor;epsilon=2) 
 ##          3.79052600
stopifnot(abs(percentage_relative_bias_variance) < 5)

Regarding the MSE of the decomposition estimator, the squared bias accounts for well under 1% of the MSE for every component. This means that, with a good estimate of the variance, we should be able to obtain a good approximation of the MSE.

# estimate MSE
theta.bias2 <- (theta.exp - theta.pop) ^ 2
(theta.mse <- theta.bias2 + vartheta.popest)
##                fgt2                fgt0                fgt1                 igr 
 ##        6.452146e-06        1.015949e-04        1.468967e-05        4.754257e-04 
 ## gei(poor;epsilon=2) 
 ##        1.843678e-03
# squared bias component of the MSE
( squared_bias_over_mse <- 100 * (theta.bias2 / theta.mse) )
##                fgt2                fgt0                fgt1                 igr 
 ##          0.06858780          0.03970787          0.06247986          0.01971227 
 ## gei(poor;epsilon=2) 
 ##          0.12567511
stopifnot( squared_bias_over_mse < 1 )

The CIs based on the normal approximation work reasonably well for all components. The code below shows that the coverage rates are close to the 95% nominal coverage rate.

# estimate confidence intervals of the FGT components
# for each of the samples
est.coverage <-
  sapply(fgtdec.estimate.list, function(this.stat)
    confint(this.stat)[, 1] <= theta.pop &
      confint(this.stat)[, 2] >= theta.pop)

# evaluate empirical coverage
stopifnot( abs( rowMeans(est.coverage) - 0.95 ) < 0.015 )

For additional usage examples of svyfgt and svyfgtdec, type ?convey::svyfgt or ?convey::svyfgtdec in the R console.


3.7 Watts poverty measure (svywatts, svywattsdec)

✔️ time to exit poverty interpretation: Watts score divided by a growth rate
✔️ sensitive to intensity and inequality among the poor
✔️ can be decomposed into interpretable measures
❌ not defined for individuals with zero or negative incomes
❌ interpretation is not very straightforward
❌ less common than the FGT measures nowadays

The measure proposed in Watts (1968) satisfies a number of desirable poverty measurement axioms and is known to be one of the first distribution-sensitive poverty measures, as noted by Haughton and Khandker (2009). It is defined as:

\[ Watts = \frac{1}{N} \sum_{i \in U} \log{ \bigg( \frac{\theta}{y_i} \bigg) } \delta ( y_i \leqslant \theta). \]
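
To illustrate the time-to-exit reading from the list above: dividing the Watts index by an assumed per-capita income growth rate among the poor gives the approximate average time needed to exit poverty. A minimal sketch, where the 2% growth rate is an assumption for illustration and the index value is the population figure computed below:

# average time to exit poverty, in years, under a sustained
# income growth rate `g` among the poor (illustrative values)
watts <- 0.05025374   # the population Watts index computed below
g <- 0.02             # assumed 2% annual growth rate
watts / g             # roughly 2.5 years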


3.7.1 Monte Carlo Simulation

To provide an example for our code, we proceed with a Monte Carlo experiment. Using the eusilcP data from the simPop package (Templ et al. 2017), we can compute the actual value of the Watts index for that population:

# load libraries
library(sampling)
library(survey)
library(convey)
library(parallel)

# load pseudo population data
data("eusilcP" , package = "simPop")

# compute population value of the Watts index
inc.pos <- eusilcP$eqIncome[eusilcP$eqIncome > 0]
(theta.pop <-
    mean(ifelse(inc.pos <= 10000 , log(10000 /  inc.pos) , 0) , na.rm = TRUE))
## [1] 0.05025374

Now, to study the distribution of the estimator under a particular sampling design, we select 5000 samples under one-stage cluster sampling of 1,000 households using the cluster function from the sampling package (Tillé and Matei 2021), and use the svywatts function to estimate the Watts index for each of those samples:

# define the number of monte carlo replicates
mc.rep <- 5000L

# simulation function
watts_sim_fun <- function(this.iter) {
  set.seed(this.iter)

  library(survey)
  library(convey)
  library(sampling)

  # load pseudo population data
  data("eusilcP" , package = "simPop")

  # compute size-like variable for PPS sampling design
  eusilcP$aux <-
    log(ifelse(eusilcP$eqIncome >= 1000 , eusilcP$eqIncome , 1000))

  # select sample
  tt <-
    sampling::cluster(
      data = eusilcP[sample.int(nrow(eusilcP) , nrow(eusilcP) , replace = FALSE) ,] ,
      clustername = "hid" ,
      size = 1000L ,
      method = "systematic" ,
      pik = eusilcP$aux
    )

  # collect data
  this.sample <- getdata(eusilcP , tt)

  # create survey design object
  this.desobj <-
    svydesign(
      ids = ~ hid ,
      probs = ~ Prob ,
      data = this.sample ,
      nest = FALSE
    )

  # prepare for convey functions
  this.desobj <- convey_prep(this.desobj)

  # filter positive incomes
  this.desobj <- subset(this.desobj , eqIncome > 0)

  # compute estimates
  svywatts( ~ eqIncome , this.desobj , abs_thresh = 10000)

}

# run replications
cl <- makeCluster(detectCores() - 1)

watts.estimate.list <-
  clusterApply(cl, seq_len(mc.rep) , watts_sim_fun)

stopCluster(cl)

Then, we evaluate the Percentage Relative Bias (PRB) of the Watts index estimator. Under this scenario, the PRB of the Watts index estimator is 0.3772%.

# estimate the expected value of the Watts index estimator
# using the average of the estimates
(theta.exp <- mean(sapply(watts.estimate.list , coef)))
## [1] 0.05044329
# estimate the percentage relative bias
(percentage_relative_bias <- 100 * (theta.exp / theta.pop - 1))
## [1] 0.377195
stopifnot( round( percentage_relative_bias , 4 ) == 0.3772)

For the variance estimator, we have:

# estimate the variance of the Watts index estimator
# using the empirical variance of the estimates
(vartheta.popest <- var(sapply(watts.estimate.list , coef)))
## [1] 6.141434e-05
# estimate the expected value of the Watts index variance estimator
# using the (estimated) expected value of the variance estimates
(vartheta.exp <- mean(sapply(watts.estimate.list , vcov)))
## [1] 6.100902e-05
# estimate the percentage relative bias of the variance estimator
(percentage_relative_bias_variance <- 100 * (vartheta.exp / vartheta.popest - 1))
## [1] -0.6599717
-
stopifnot(round(percentage_relative_bias_variance, 4) == -0.6600)
+
stopifnot(round(percentage_relative_bias_variance, 4) == -0.6600)

Under this scenario, the PRB of the Watts index variance estimator is -0.6600%.
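The next check relies on the standard decomposition of the Mean Squared Error into a squared bias term plus a variance term:

\[ MSE ( \hat{\theta} ) = [ Bias ( \hat{\theta} ) ]^2 + Var ( \hat{\theta} ) \]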

Our simulations show that the Squared Bias of this estimator accounts for less than 0.1% of its Mean Squared Error:

theta.bias2 <- (theta.exp - theta.pop) ^ 2
theta.mse <- theta.bias2 + vartheta.popest
(squared_bias_over_mse <- 100 * (theta.bias2 / theta.mse))
## [1] 0.05847158

stopifnot(squared_bias_over_mse < 0.1)

Next, we evaluate the Percentage Coverage Rate (PCR). In theory, under repeated sampling, the estimated 95% CIs should cover the population parameter approximately 95% of the time. We can evaluate that using:

# estimate confidence intervals of the Watts index
# for each of the samples
est.coverage <-
  sapply(watts.estimate.list, function(this.stat)
    confint(this.stat)[, 1] <= theta.pop &
      confint(this.stat)[, 2] >= theta.pop)

# evaluate empirical coverage
(empirical_coverage <- mean(est.coverage))
## [1] 0.9268

stopifnot(abs(empirical_coverage - 0.95) < 0.025)

Our coverages are not too far from the nominal coverage level of 95%.
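Formally, the empirical coverage computed above is the share of replicates whose confidence interval \([ \hat{\theta}_{r,L} , \hat{\theta}_{r,U} ]\) contains the population value:

\[ PCR = \frac{1}{R} \sum_{r=1}^{R} \mathbb{1} \{ \hat{\theta}_{r,L} \leq \theta \leq \hat{\theta}_{r,U} \} \]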

For the Watts index decomposition, we start by computing the (true) population values of the components:

# compute population value of the Watts index decomposition
inc.pos <- eusilcP$eqIncome[eusilcP$eqIncome > 0]
wdec1 <-
  mean(ifelse(inc.pos <= 10000 , log(10000 / inc.pos) , 0) , na.rm = TRUE)
wdec2 <- mean(inc.pos <= 10000 , na.rm = TRUE)
mu.poor <- mean(inc.pos[inc.pos <= 10000])
wdec3 <- log(10000 / mu.poor)
wdec4 <-
  -mean(log(inc.pos[inc.pos <= 10000] / mu.poor) , na.rm = TRUE)
theta.pop <-
  c(
    "watts" = wdec1 ,
    "fgt0" = wdec2 ,
    "watts pov. gap ratio" = wdec3 ,
    "theil(poor)" = wdec4
  )
theta.pop
##                watts                 fgt0 watts pov. gap ratio 
##           0.05025374           0.11399096           0.33497664 
##          theil(poor) 
##           0.10588056

Then, using the same sampling strategy as for svywatts, we compute svywattsdec for each sample:

# simulation function
wattsdec_sim_fun <- function(this.iter) {
  
  set.seed(this.iter)
  
  library(survey)
  library(convey)
  library(sampling)
  
  
  # load pseudo population data
  data("eusilcP" , package = "simPop")
  
  # compute size-like variable for PPS sampling design
  eusilcP$aux <-
    log(ifelse(eusilcP$eqIncome >= 1000 , eusilcP$eqIncome , 1000))
  
  # select sample
  tt <-
    sampling::cluster(
      data = eusilcP[sample.int(nrow(eusilcP) , nrow(eusilcP) , replace = FALSE) , ] ,
      clustername = "hid" ,
      size = 1000L ,
      method = "systematic" ,
      pik = eusilcP$aux
    )
  
  # collect data
  this.sample <- getdata(eusilcP , tt)
  
  # create survey design object
  this.desobj <-
    svydesign(
      ids = ~ hid ,
      probs = ~ Prob ,
      data = this.sample ,
      nest = FALSE
    )
  
  # prepare for convey functions
  this.desobj <- convey_prep(this.desobj)
  
  # filter positive incomes
  this.desobj <- subset(this.desobj , eqIncome > 0)
  
  # compute estimates
  svywattsdec(~ eqIncome , this.desobj , abs_thresh = 10000)
  
}

# run replications across multiple cores
library(parallel)

cl <- makeCluster(detectCores() - 1)

wattsdec.estimate.list <-
  clusterApply(cl, seq_len(mc.rep) , wattsdec_sim_fun)

stopCluster(cl)

The PRB of each component is estimated using the code below. Notice that the PRBs are relatively small, with absolute values below 1%; the largest absolute bias appears in the Watts index component.

# repeat true population values
(theta.pop)
##                watts                 fgt0 watts pov. gap ratio 
##           0.05025374           0.11399096           0.33497664 
##          theil(poor) 
##           0.10588056

# estimate the expected values of the components estimators
# using the average of the estimates
(theta.exp <- rowMeans(sapply(wattsdec.estimate.list , coef)))
##                watts                 fgt0 watts pov. gap ratio 
##           0.05044329           0.11418992           0.33584759 
##          theil(poor) 
##           0.10584416

# estimate the percentage relative bias
(percentage_relative_bias <- 100 * (theta.exp / theta.pop - 1))
##                watts                 fgt0 watts pov. gap ratio 
##           0.37719501           0.17453750           0.26000223 
##          theil(poor) 
##          -0.03437576

stopifnot(abs(percentage_relative_bias) < 1)

For the variance estimators, we estimate the PRB using the code below. Note that the bias of the variance estimators is still relatively small: the absolute PRB of the Watts variance estimator is below 1%, and those of all four component variance estimators are below 5%.

# estimate the variance of the components estimators
# using the empirical variance of the estimates
(vartheta.popest <-
   diag(var(t(
     sapply(wattsdec.estimate.list , coef)
   ))))
##                watts                 fgt0 watts pov. gap ratio 
##         6.141434e-05         1.015260e-04         9.241259e-04 
##          theil(poor) 
##         1.108525e-03

# estimate the expected value of each component's variance estimator
# using the (estimated) expected value of the variance estimates
(vartheta.exp <-
    rowMeans(sapply(wattsdec.estimate.list , function(z)
      diag(vcov(
        z
      )))))
##                watts                 fgt0 watts pov. gap ratio 
##         6.100902e-05         1.013968e-04         9.613831e-04 
##          theil(poor) 
##         1.070018e-03

# estimate the percentage relative bias of the variance estimators
(percentage_relative_bias <-
    100 * (vartheta.exp / vartheta.popest - 1))
##                watts                 fgt0 watts pov. gap ratio 
##           -0.6599717           -0.1273107            4.0316155 
##          theil(poor) 
##           -3.4736907

stopifnot(abs(percentage_relative_bias[1]) < 1 & all(abs(percentage_relative_bias) < 5))

Regarding the MSE, the squared bias accounts for less than 0.1% of the MSE. This means that, with a good variance estimate, we should be able to approximate the MSE well.

# estimate MSE
theta.bias2 <- (theta.exp - theta.pop) ^ 2
(theta.mse <- theta.bias2 + vartheta.popest)
##                watts                 fgt0 watts pov. gap ratio 
##         6.145027e-05         1.015656e-04         9.248844e-04 
##          theil(poor) 
##         1.108526e-03

# squared bias component of the MSE
(squared_bias_over_mse <- 100 * (theta.bias2 / theta.mse))
##                watts                 fgt0 watts pov. gap ratio 
##         0.0584715789         0.0389737000         0.0820154610 
##          theil(poor) 
##         0.0001195063

stopifnot(squared_bias_over_mse < 0.1)

However, CIs based on the normal approximation might not work very well for some components. The code below shows that the coverage rate for the Theil index component differs significantly from the 95% nominal coverage rate.

# estimate confidence intervals of each component
# for each of the samples
est.coverage <-
  sapply(wattsdec.estimate.list, function(this.stat)
    confint(this.stat)[, 1] <= theta.pop &
      confint(this.stat)[, 2] >= theta.pop)

# evaluate empirical coverage
(empirical_coverage <- rowMeans(est.coverage))
##                watts                 fgt0 watts pov. gap ratio 
##               0.9268               0.9494               0.9476 
##          theil(poor) 
##               0.8452

stopifnot(abs(empirical_coverage - 0.95) < 0.15)

One of the reasons for this is that the sample might not be large enough for the CLT to hold. The distribution of the estimator shows substantial asymmetry, which would be a problem for the normal approximation.
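As a quick numerical check (a minimal sketch, assuming the `wattsdec.estimate.list` object from the simulation above is still available), the moment-based sample skewness of the replicated Theil component estimates quantifies this asymmetry:

# sketch: moment-based sample skewness of the Theil component estimates,
# assuming `wattsdec.estimate.list` from the simulation above
theil.est <- sapply(wattsdec.estimate.list , coef)[4 , ]
mean((theil.est - mean(theil.est)) ^ 3) / sd(theil.est) ^ 3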

hist(
  sapply(wattsdec.estimate.list , coef)[4, ] ,
  main = "Histogram of Theil component estimator" ,
  xlim = c(0, .30) ,
  ylim = c(0 , 1500) ,
  xlab = "Estimate"
)

For additional usage examples of svywatts and svywattsdec, type ?convey::svywatts or ?convey::svywattsdec in the R console.


4.11 Generalized Entropy and Decomposition (svygei, svygeidec)

✔️ flexible inequality-aversion parameter -- varying its epsilon parameter can highlight the effect of inequality in different parts of the income distribution
✔️ can be group-decomposed into within-inequality and between-inequality
✔️ this parameter can also be (somewhat) tuned to be less affected by outliers
❌ does not handle zero or negative incomes
❌ hard to interpret
❌ can be very sensitive to outliers

Using a generalization of the information function, now defined as:

\[ g(f) = \frac{1}{\alpha-1} [ 1 - f^{\alpha - 1} ] \]


4.11.1 Replication Example

In July 2006, Jenkins (2008Jenkins, Stephen. 2008. “Estimation and Interpretation of Measures of Inequality, Poverty, and Social Welfare Using Stata.” North American Stata Users' Group Meetings 2006. Stata Users Group. http://EconPapers.repec.org/RePEc:boc:asug06:16.) presented at the North American Stata Users’ Group Meetings on the stata Generalized Entropy Index command. The example below reproduces those statistics.

Load and prepare the same data set:

# load the convey package
library(convey)

# load the survey library
library(survey)

# load the foreign library
library(foreign)

# create a temporary file on the local disk
tf <- tempfile()

# store the location of the presentation file
presentation_zip <-
  "https://web.archive.org/web/20150928053959/http://repec.org/nasug2006/nasug2006_jenkins.zip"

# download jenkins' presentation to the temporary file
download.file(presentation_zip , tf , mode = 'wb')

# unzip the contents of the archive
presentation_files <- unzip(tf , exdir = tempdir())

# load the institute for fiscal studies' 1981, 1985, and 1991 data.frame objects
x81 <-
  read.dta(grep("ifs81" , presentation_files , value = TRUE))
x85 <-
  read.dta(grep("ifs85" , presentation_files , value = TRUE))
x91 <-
  read.dta(grep("ifs91" , presentation_files , value = TRUE))

# stack each of these three years of data into a single data.frame
x <- rbind(x81 , x85 , x91)

Replicate the author’s survey design statement from stata code..

. * account for clustering within HHs 
 . version 8: svyset [pweight = wgt], psu(hrn)
psu is hrn

.. into R code:

# initiate a linearized survey design object
y <- svydesign( ~ hrn , data = x , weights = ~ wgt)

# immediately run the `convey_prep` function on the survey design
z <- convey_prep(y)

Replicate the author’s subset statement and each of his svygei results..

. svygei x if year == 1981
  


GE(3) |   .1739994   .00662015    26.28   0.000    .1610242   .1869747
---------------------------------------------------------------------------

..using R code:

z81 <- subset(z , year == 1981)

svygei( ~ eybhc0 , subset(z81 , eybhc0 > 0) , epsilon = -1)
##            gei     SE
## eybhc0 0.19021 0.0247

svygei( ~ eybhc0 , subset(z81 , eybhc0 > 0) , epsilon = 0)
##            gei     SE
## eybhc0 0.11429 0.0028

svygei( ~ eybhc0 , subset(z81 , eybhc0 > 0))
##            gei     SE
## eybhc0 0.11169 0.0023

svygei( ~ eybhc0 , subset(z81 , eybhc0 > 0) , epsilon = 2)
##            gei     SE
## eybhc0 0.12879 0.0033

svygei( ~ eybhc0 , subset(z81 , eybhc0 > 0) , epsilon = 3)
##          gei     SE
## eybhc0 0.174 0.0066

Confirm this replication applies for subsetted objects as well. Compare stata output..



GE(3) |   .2609507   .01850689    14.10   0.000    .2246779   .2972235
---------------------------------------------------------------------------

..to R code:

z85 <- subset(z , year == 1985)

svygei( ~ eybhc0 , subset(z85 , eybhc0 > 1) , epsilon = -1)
##            gei     SE
## eybhc0 0.16024 0.0094

svygei( ~ eybhc0 , subset(z85 , eybhc0 > 1) , epsilon = 0)
##            gei     SE
## eybhc0 0.12762 0.0033

svygei( ~ eybhc0 , subset(z85 , eybhc0 > 1))
##            gei     SE
## eybhc0 0.13372 0.0041

svygei( ~ eybhc0 , subset(z85 , eybhc0 > 1) , epsilon = 2)
##            gei     SE
## eybhc0 0.16764 0.0073

svygei( ~ eybhc0 , subset(z85 , eybhc0 > 1) , epsilon = 3)
##            gei     SE
## eybhc0 0.26095 0.0185

Replicate the author’s decomposition by population subgroup (work status) shown on PDF page 57..

# define work status (PDF page 22)
z <-
  update(z , wkstatus = c(1 , 1 , 1 , 1 , 2 , 3 , 2 , 2)[as.numeric(esbu)])
z <-
  update(z , wkstatus = factor(wkstatus , labels = c("1+ ft working" , "no ft working" , "elderly")))

# subset to 1991 and remove records with zero income
z91 <- subset(z , year == 1991 & eybhc0 > 0)

# population share
svymean( ~ wkstatus, z91)
##                          mean     SE
## wkstatus1+ ft working 0.61724 0.0067
## wkstatusno ft working 0.20607 0.0059
## wkstatuselderly       0.17669 0.0046

# mean
svyby( ~ eybhc0, ~ wkstatus, z91, svymean)
##                    wkstatus   eybhc0       se
## 1+ ft working 1+ ft working 278.8040 3.703790
## no ft working no ft working 151.6317 3.153968
## elderly             elderly 176.6045 4.661740

# subgroup indices: ge_k
svyby( ~ eybhc0 , ~ wkstatus , z91 , svygei , epsilon = -1)
##                    wkstatus     eybhc0          se
## 1+ ft working 1+ ft working  0.2300708  0.02853959
## no ft working no ft working 10.9231761 10.65482557
## elderly             elderly  0.1932164  0.02571991

svyby( ~ eybhc0 , ~ wkstatus , z91 , svygei , epsilon = 0)
##                    wkstatus    eybhc0          se
## 1+ ft working 1+ ft working 0.1536921 0.006955506
## no ft working no ft working 0.1836835 0.014740510
## elderly             elderly 0.1653658 0.016409770

svyby( ~ eybhc0 , ~ wkstatus , z91 , svygei , epsilon = 1)
##                    wkstatus    eybhc0          se
## 1+ ft working 1+ ft working 0.1598558 0.008327994
## no ft working no ft working 0.1889909 0.016766120
## elderly             elderly 0.2023862 0.027787224

svyby( ~ eybhc0 , ~ wkstatus , z91 , svygei , epsilon = 2)
##                    wkstatus    eybhc0         se
## 1+ ft working 1+ ft working 0.2130664 0.01546521
## no ft working no ft working 0.2846345 0.06016394
## elderly             elderly 0.3465088 0.07362898

# GE decomposition
svygeidec( ~ eybhc0, ~ wkstatus, z91, epsilon = -1)
##         gei decomposition     SE
## total            3.682893 3.3999
## within           3.646572 3.3998
## between          0.036321 0.0028

svygeidec( ~ eybhc0, ~ wkstatus, z91, epsilon = 0)
##         gei decomposition     SE
## total            0.195236 0.0065
## within           0.161935 0.0061
## between          0.033301 0.0025

svygeidec( ~ eybhc0, ~ wkstatus, z91, epsilon = 1)
##         gei decomposition     SE
## total            0.200390 0.0079
## within           0.169396 0.0076
## between          0.030994 0.0022

svygeidec( ~ eybhc0, ~ wkstatus, z91, epsilon = 2)
##         gei decomposition     SE
## total            0.274325 0.0167
## within           0.245067 0.0164
 

4.12 J-Divergence and Decomposition (svyjdiv, svyjdivdec)

✔️ can be interpreted in terms of GEI indices
✔️ can be group-decomposed into within-inequality and between-inequality
✔️ does not need to (explicitly) choose inequality aversion parameters
❌ does not handle zero or negative incomes
❌ hard to interpret
❌ the decomposition interpretation is not exactly the same as the GEI, but very similar

The J-divergence measure (Rohde 2016Rohde, Nicholas. 2016. “J-Divergence Measurements of Economic Inequality.” Journal of the Royal Statistical Society: Series A (Statistics in Society) 179 (3): 847–70. https://doi.org/10.1111/rssa.12153.) can be seen as the sum of \(GE_0\) and \(GE_1\), satisfying axioms that, individually, those two indices do not. The J-divergence measure is defined as:

\[ J = \frac{1}{N} \sum_{i \in U} \bigg( \frac{ y_i }{ \mu } - 1 \bigg) \ln \bigg( \frac{y_i}{\mu} \bigg) \]


4.12.1 Monte Carlo Simulation

First, we should check that the finite-population values make sense. The J-divergence can be seen as the sum of \(GE^{(0)}\) and \(GE^{(1)}\). So, taking the starting population from the svyzenga section of this text, we have:

# compute finite population J-divergence
(jdivt.pop <-
   (convey:::CalcJDiv(pop.df$x , ifelse(pop.df$x > 0 , 1 , 0))))
## [1] 0.4332649

# compute finite population GE indices
(gei0.pop <-
    convey:::CalcGEI(pop.df$x , ifelse(pop.df$x > 0 , 1 , 0) , 0))
## [1] 0.2215037

(gei1.pop <-
    convey:::CalcGEI(pop.df$x , ifelse(pop.df$x > 0 , 1 , 0) , 1))
## [1] 0.2117612

# check equality; should be true
all.equal(jdivt.pop , gei0.pop + gei1.pop)
## [1] TRUE

As expected, the J-divergence matches the sum of the GE indices in the finite population. And since we have already checked the GE measures, the J-divergence computation function seems safe.
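In symbols, the identity verified above is:

\[ J = GE^{(0)} + GE^{(1)} \]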

In order to assess the estimators implemented in svyjdiv and svyjdivdec, we can run a Monte Carlo experiment. Using the same samples we used in the svyzenga replication example, we have:

# estimate J-divergence with each sample
jdiv.estimate.list <-
  lapply(survey.list ,
         function(this.sample) {
           svyjdiv( ~ x ,
                    subset(this.sample , x > 0) ,
                    na.rm = TRUE)
         })

# compute the (finite population overall) J-divergence
(theta.pop <-
    convey:::CalcJDiv(pop.df$x , ifelse(pop.df$x > 0 , 1 , 0)))
## [1] 0.4332649

# estimate the expected value of the J-divergence estimator
# using the average of the estimates
(theta.exp <- mean(sapply(jdiv.estimate.list , coef)))
## [1] 0.4327823

# estimate the percentage relative bias
100 * (theta.exp / theta.pop - 1)
## [1] -0.1113765

# estimate the variance of the J-divergence estimator
# using the variance of the estimates
(vartheta.popest <- var(sapply(jdiv.estimate.list , coef)))
## [1] 0.0005434848

# estimate the expected value of the J-divergence variance estimator
# using the expected value of the variance estimates
(vartheta.exp <- mean(sapply(jdiv.estimate.list , vcov)))
## [1] 0.0005342947

# estimate the percentage relative bias of the variance estimator
100 * (vartheta.exp / vartheta.popest - 1)
## [1] -1.690964

For the decomposition, we repeat the same procedure:

# estimate J-divergence decomposition with each sample
jdivdec.estimate.list <-
  lapply(survey.list ,
         function(this.sample) {
           svyjdivdec( ~ x ,
                       ~ SEX ,
                       subset(this.sample , x > 0) ,
                       na.rm = TRUE)
         })

# compute the (finite population) J-divergence decomposition per sex
jdivt.pop <-
  convey:::CalcJDiv(pop.df$x , ifelse(pop.df$x > 0 , 1 , 0))

overall.mean <- mean(pop.df$x[pop.df$x > 0])

group.mean <-
  c(by(pop.df$x[pop.df$x > 0] , list("SEX" = factor(pop.df$SEX[pop.df$x > 0])) , FUN = mean))

group.pshare <-
  c(prop.table(by(rep(1 , sum(
    pop.df$x > 0
  )) , list(
    "SEX" = factor(pop.df$SEX[pop.df$x > 0])
  ) , FUN = sum)))

jdivb.pop <-
  sum(group.pshare * (group.mean / overall.mean - 1) * log(group.mean / overall.mean))

jdivw.pop <- jdivt.pop - jdivb.pop

(theta.pop <- c(jdivt.pop , jdivw.pop , jdivb.pop))
## [1] 0.433264877 0.428398928 0.004865949

# estimate the expected value of the J-divergence decomposition estimator
# using the average of the estimates
(theta.exp <- rowMeans(sapply(jdivdec.estimate.list , coef)))
##       total      within     between 
## 0.432782322 0.427480257 0.005302065

# estimate the percentage relative bias
100 * (theta.exp / theta.pop - 1)
##      total     within    between 
## -0.1113765 -0.2144430  8.9626164

The estimated PRB for the total is the same as before, so we will focus on the within and between components. While the within component has a small relative bias (-0.21%), the between component PRB is significant, amounting to 8.96%.

For the variance estimator, we do:

# estimate the variance of the J-divergence decomposition estimator
# using the variance of the estimates
(vartheta.popest <-
   diag(var(t(
     sapply(jdivdec.estimate.list , coef)
   ))))
##        total       within      between 
## 5.434848e-04 5.391901e-04 8.750879e-06

# estimate the expected value of each component's variance estimator
# using the expected value of the variance estimates
(vartheta.exp <-
    rowMeans(sapply(jdivdec.estimate.list , function(z)
      diag(vcov(
        z
      )))))
##        total       within      between 
## 5.342947e-04 5.286750e-04 8.891772e-06

# estimate the percentage relative bias of the variance estimator
100 * (vartheta.exp / vartheta.popest - 1)
##     total    within   between 
## -1.690964 -1.950177  1.610034

The PRBs of the variance estimators for both components are small: -1.95% for the within component and 1.61% for the between component.

Now, how much should we care about the between component bias? Our simulations show that the Squared Bias of this estimator accounts for less than 2% of its Mean Squared Error:

theta.bias2 <- (theta.exp - theta.pop) ^ 2
theta.mse <- theta.bias2 + vartheta.popest
100 * (theta.bias2 / theta.mse)
##      total     within    between 
## 0.04282728 0.15627854 2.12723212

Next, we evaluate the Percentage Coverage Rate (PCR). In theory, under repeated sampling, the estimated 95% CIs should cover the population parameter approximately 95% of the time. We can evaluate that using:

# estimate confidence intervals of the J-divergence decomposition
# for each of the samples
est.coverage <-
  t(sapply(jdivdec.estimate.list, function(this.stat)
    confint(this.stat)[, 1] <= theta.pop &
      confint(this.stat)[, 2] >= theta.pop))

# evaluate empirical coverage
colMeans(est.coverage)
##   total  within between 
##  0.9390  0.9376  0.9108

Our coverages are not too far from the nominal coverage level of 95%; however, the bias of the between component estimator can affect its coverage rate.


4.2 The Gender Pay Gap (svygpg)

✔️ easy to understand
✔️ the difference between men's and women's average wages, expressed as a fraction of the men's average wage
✔️ alternatively: the average wage of women is `( 1 - GPG ) x average men wage`
❌ not an inequality measure in the Pigou-Dalton Principle sense
❌ ignores within-inequality among men and among women
❌ binary gender only

Although the Gender Pay Gap (GPG) is not an inequality measure in the usual sense, it can still be a useful instrument to evaluate the effects of gender discrimination. Put simply, it expresses the relative difference between the average hourly earnings of men and women, presenting this difference as a percentage of the average of hourly earnings of men. Like some other functions described in this text, the GPG can also be calculated using wealth or assets in place of earnings.

In mathematical terms, this index can be described as,

\[ GPG = \frac{ \bar{y}_{male} - \bar{y}_{female} }{ \bar{y}_{male} } \]
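For example, if men earn an average of 20 per hour and women 16, then \( GPG = (20 - 16) / 20 = 0.20 \): women earn, on average, \( (1 - 0.20) = 80\% \) of the male average wage.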


4.2.1 Replication Example

The R vardpoor package (Breidaks, Liberts, and Ivanova 2016Breidaks, Juris, Martins Liberts, and Santa Ivanova. 2016. “Vardpoor: Estimation of Indicators on Social Exclusion and Poverty and Its Linearization, Variance Estimation.” Riga, Latvia: CSB.), created by researchers at the Central Statistical Bureau of Latvia, includes a GPG coefficient calculation using the ultimate cluster method. The example below reproduces those statistics.

Load and prepare the same data set:

# load the convey package
library(convey)

# load the survey library
library(survey)

# load the vardpoor library
library(vardpoor)

# load the laeken library
library(laeken)

# load the synthetic EU statistics on income & living conditions
data(eusilc)

# make all column names lowercase
names(eusilc) <- tolower(names(eusilc))

# coerce the gender variable to numeric 1 or 2
eusilc$one_two <- as.numeric(eusilc$rb090 == "female") + 1

# add a column with the row number
dati <- data.table::data.table(IDd = 1:nrow(eusilc), eusilc)

# calculate the gpg coefficient
# using the R vardpoor library
varpoord_gpg_calculation <-
  varpoord(
    # analysis variable
    Y = "eqincome",
    
    # weights variable
    w_final = "rb050",
    
    # row number variable
    ID_level1 = "IDd",
    
    # row number variable
    ID_level2 = "IDd",
    
    # strata variable
    H = "db040",
    
    N_h = NULL ,
    
    # clustering variable
    PSU = "rb030",
    
    # data.table
    dataset = dati,
    
    # gpg coefficient function
    type = "lingpg" ,
    
    # gender variable
    gender = "one_two",
    
    # get linearized variable
    outp_lin = TRUE
  )



# construct a survey.design
# using our recommended setup
des_eusilc <-
  svydesign(
    ids = ~ rb030 ,
    strata = ~ db040 ,
    weights = ~ rb050 ,
    data = eusilc
  )

# immediately run the convey_prep function on it
des_eusilc <- convey_prep(des_eusilc)

# compare the coefficients
varpoord_gpg_calculation$all_result$value
## [1] 7.645389

coef(svygpg( ~ eqincome , des_eusilc , sex = ~ rb090)) * 100
##  eqincome 
## -8.278297

# compare the linearized variables
# vardpoor
lin_gpg_varpoord <- varpoord_gpg_calculation$lin_out$lin_gpg
# convey
lin_gpg_convey <-
  attr(svygpg( ~ eqincome , des_eusilc, sex = ~ rb090), "lin")

# check equality; they do not match exactly
all.equal(lin_gpg_varpoord, 100 * lin_gpg_convey[, 1])
## [1] "Mean relative difference: 2.172419"

# variances do not match exactly
attr(svygpg( ~ eqincome , des_eusilc , sex = ~ rb090) , 'var') * 10000
##           eqincome
## eqincome 0.8926311

varpoord_gpg_calculation$all_result$var
## [1] 0.6482346

# standard errors do not match exactly
varpoord_gpg_calculation$all_result$se
## [1] 0.8051301

SE(svygpg( ~ eqincome , des_eusilc , sex = ~ rb090)) * 100
##           eqincome
## eqincome 0.9447916

The variance estimate is computed by using the approximation defined in 2, while the linearized variable \(z\) is defined by 2.1. The functions convey::svygpg and vardpoor::lingpg are designed to produce the same linearized variable \(z\), although the outputs above do not match exactly.

However, the measures of uncertainty do not line up, because library(vardpoor) defaults to an ultimate cluster method that can be replicated with an alternative setup of the survey.design object.

-
# within each strata, sum up the weights
-cluster_sums <-
-  aggregate(eusilc$rb050 , list(eusilc$db040) , sum)
-
-# name the within-strata sums of weights the `cluster_sum`
-names(cluster_sums) <- c("db040" , "cluster_sum")
-
-# merge this column back onto the data.frame
-eusilc <- merge(eusilc , cluster_sums)
-
-# construct a survey.design
-# with the fpc using the cluster sum
-des_eusilc_ultimate_cluster <-
-  svydesign(
-    ids = ~ rb030 ,
-    strata = ~ db040 ,
-    weights = ~ rb050 ,
-    data = eusilc ,
-    fpc = ~ cluster_sum
-  )
-
-# again, immediately run the convey_prep function on the `survey.design`
-des_eusilc_ultimate_cluster <-
-  convey_prep(des_eusilc_ultimate_cluster)
-
-# matches
-attr(svygpg( ~ eqincome , des_eusilc_ultimate_cluster , sex = ~ rb090) ,
-     'var') * 10000
# within each stratum, sum up the weights
cluster_sums <-
  aggregate(eusilc$rb050 , list(eusilc$db040) , sum)

# name the within-strata sums of weights the `cluster_sum`
names(cluster_sums) <- c("db040" , "cluster_sum")

# merge this column back onto the data.frame
eusilc <- merge(eusilc , cluster_sums)

# construct a survey.design
# with the fpc using the cluster sum
des_eusilc_ultimate_cluster <-
  svydesign(
    ids = ~ rb030 ,
    strata = ~ db040 ,
    weights = ~ rb050 ,
    data = eusilc ,
    fpc = ~ cluster_sum
  )

# again, immediately run the convey_prep function on the `survey.design`
des_eusilc_ultimate_cluster <-
  convey_prep(des_eusilc_ultimate_cluster)

# compare the variance estimates
attr(svygpg( ~ eqincome , des_eusilc_ultimate_cluster , sex = ~ rb090) ,
     'var') * 10000
##           eqincome
## eqincome 0.8910413

varpoord_gpg_calculation$all_result$var
## [1] 0.6482346

# compare the standard errors
varpoord_gpg_calculation$all_result$se
## [1] 0.8051301

SE(svygpg( ~ eqincome , des_eusilc_ultimate_cluster , sex = ~ rb090)) * 100
##           eqincome
## eqincome 0.9439499

For additional usage examples of svygpg, type ?convey::svygpg in the R console.


4.3 Quintile Share Ratio (svyqsr)

✔️ Easy to understand
✔️ Can be adapted into other more commonly used variants, like the Palma ratio
✔️ Can be interpreted using a Lorenz curve
❌ Fails the Pigou-Dalton Principle (depending on the donor and recipient position in the income distribution)
❌ Focuses on specific parts of the income distribution, not the entire distribution

Unlike the previous measure, the Quintile Share Ratio (QSR) is an inequality measure in itself, depending only on the income distribution to evaluate the degree of inequality. By definition, it can be described as the ratio between the income share held by the richest 20% and the poorest 20% of the population.

In plain terms, it expresses how many times richer the wealthier part of the population is than the poorest part. For instance, a \(QSR = 4\) implies that the upper class takes home 4 times as much of the total income as the poor.

The QSR can be modified to a more general function of percentile share ratios. For instance, Cobham, Schlogl, and Sumner (2015Cobham, Alex, Luke Schlogl, and Andy Sumner. 2015. Inequality and the Tails: The Palma Proposition and Ratio Revisited.” Working Papers 143. United Nations, Department of Economics; Social Affairs. http://www.un.org/esa/desa/papers/2015/wp143_2015.pdf.) argues for using the Palma index, defined as the ratio between the share of the 10% richest over the share held by the poorest 40%. There are actually two ways to compute the Palma ratio with the convey package. One is using convey::svyqsr and the other is using convey::svylorenz - they won’t match exactly because of different estimators, but are asymptotically equal. Since the Palma ratio is the top 10% divided by the bottom 40%, in convey::svyqsr this could be achieved using alpha1 = .40 and alpha2 = .90. Note that the vardpoor::linqsr function only accepts a single alpha parameter (defaulting to 0.2 and 1 - 0.2), meaning the Palma index cannot presently be computed using that function.
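As a concrete illustration (a minimal sketch, assuming a convey_prep-ed design object like the `des_eusilc` constructed in the replication example below), the Palma ratio could be estimated as:

# sketch: the Palma ratio as a percentile share ratio,
# assuming `des_eusilc` is a convey_prep()ed survey design (see below)
# share of the richest 10% divided by the share of the poorest 40%
svyqsr( ~ eqincome , des_eusilc , alpha1 = 0.40 , alpha2 = 0.90)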



4.3.1 Replication Example

The R vardpoor package (Breidaks, Liberts, and Ivanova 2016Breidaks, Juris, Martins Liberts, and Santa Ivanova. 2016. “Vardpoor: Estimation of Indicators on Social Exclusion and Poverty and Its Linearization, Variance Estimation.” Riga, Latvia: CSB.), created by researchers at the Central Statistical Bureau of Latvia, includes a QSR coefficient calculation using the ultimate cluster method. The example below reproduces those statistics.

Load and prepare the same data set:

# load the convey package
library(convey)

# load the survey library
library(survey)

# load the vardpoor library
library(vardpoor)

# load the laeken library
library(laeken)

# load the synthetic EU statistics on income & living conditions
data(eusilc)

# make all column names lowercase
names(eusilc) <- tolower(names(eusilc))

# add a column with the row number
dati <- data.table::data.table(IDd = 1:nrow(eusilc), eusilc)

# calculate the qsr coefficient
# using the R vardpoor library
varpoord_qsr_calculation <-
  varpoord(
    # analysis variable
    Y = "eqincome",
    
    # weights variable
    w_final = "rb050",
    
    # row number variable
    ID_level1 = "IDd",
    
    # row number variable
    ID_level2 = "IDd",
    
    # strata variable
    H = "db040",
    
    N_h = NULL ,
    
    # clustering variable
    PSU = "rb030",
    
    # data.table
    dataset = dati,
    
    # qsr coefficient function
    type = "linqsr",
    
    # percentage for the bottom and top income groups
    alpha = 20 ,
    
    # get linearized variable
    outp_lin = TRUE
    
  )



# construct a survey.design
# using our recommended setup
des_eusilc <-
  svydesign(
    ids = ~ rb030 ,
    strata = ~ db040 ,
    weights = ~ rb050 ,
    data = eusilc
  )

# immediately run the convey_prep function on it
des_eusilc <- convey_prep(des_eusilc)

# coefficients do match
varpoord_qsr_calculation$all_result$value
## [1] 3.970004

coef(svyqsr( ~ eqincome , des_eusilc))
## eqincome 
## 3.970004

# linearized variables do match
# vardpoor
lin_qsr_varpoord <- varpoord_qsr_calculation$lin_out$lin_qsr
# convey
lin_qsr_convey <-
  as.numeric(attr(svyqsr( ~ eqincome ,
                          des_eusilc ,
                          linearized = TRUE) ,
                  "linearized"))

# check equality
all.equal(lin_qsr_varpoord, lin_qsr_convey)
## [1] TRUE

# variances do not match exactly
attr(svyqsr( ~ eqincome , des_eusilc) , 'var')
##             eqincome
## eqincome 0.001810537

varpoord_qsr_calculation$all_result$var
## [1] 0.001807323

# standard errors do not match exactly
varpoord_qsr_calculation$all_result$se
## [1] 0.04251263

SE(svyqsr( ~ eqincome , des_eusilc))
##            eqincome
## eqincome 0.04255041

The variance estimate is computed by using the approximation defined in 2, while the linearized variable \(z\) is defined by 2.1. The functions convey::svyqsr and vardpoor::linqsr produce the same linearized variable \(z\).

However, the measures of uncertainty do not line up, because library(vardpoor) defaults to an ultimate cluster method that can be replicated with an alternative setup of the survey.design object.

# within each stratum, sum up the weights
cluster_sums <-
  aggregate(eusilc$rb050 , list(eusilc$db040) , sum)

# name the within-strata sums of weights the `cluster_sum`
names(cluster_sums) <- c("db040" , "cluster_sum")

# merge this column back onto the data.frame
eusilc <- merge(eusilc , cluster_sums)

# construct a survey.design
# with the fpc using the cluster sum
des_eusilc_ultimate_cluster <-
  svydesign(
    ids = ~ rb030 ,
    strata = ~ db040 ,
    weights = ~ rb050 ,
    data = eusilc ,
    fpc = ~ cluster_sum
  )

# again, immediately run the convey_prep function on the `survey.design`
des_eusilc_ultimate_cluster <-
  convey_prep(des_eusilc_ultimate_cluster)

# matches
attr(svyqsr( ~ eqincome , des_eusilc_ultimate_cluster) , 'var')
##             eqincome
## eqincome 0.001807323

varpoord_qsr_calculation$all_result$var
## [1] 0.001807323

# matches
varpoord_qsr_calculation$all_result$se
## [1] 0.04251263

SE(svyqsr( ~ eqincome , des_eusilc_ultimate_cluster))
##            eqincome
## eqincome 0.04251263

For additional usage examples of svyqsr, type ?convey::svyqsr in the R console.


4.5 Lorenz Curve (svylorenz)

✔️ graphical device for understanding inequality measurement
✔️ simple to understand
✔️ conveys information about inequality across the entire income distribution
✔️ Lorenz dominance and inequality measure ordering
❌ testing Lorenz dominance is difficult
❌ can hide inequality in the top or bottom
❌ cannot be summarised into a single number

Though not an inequality measure in itself, the Lorenz curve is a classic instrument of distribution analysis. This function associates a cumulative share of the population with the share of the total income earned. In mathematical terms,

\[ L(p) = \frac{\int_{-\infty}^{Q_p}yf(y)dy}{\int_{-\infty}^{+\infty}yf(y)dy} \]
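For instance, \( L(0.5) \) is the share of total income earned by the poorest half of the population; under perfect equality, \( L(p) = p \).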


4.5.1 Replication Example

In October 2016, (Jann 2016Jann, Ben. 2016. Estimating Lorenz and concentration curves in Stata.” University of Bern Social Sciences Working Papers 15. University of Bern, Department of Social Sciences. https://ideas.repec.org/p/bss/wpaper/15.html.) released a pre-publication working paper to estimate Lorenz and concentration curves using stata. The example below reproduces the statistics presented in his section 4.1.

# load the convey package
library(convey)

# load the survey library
library(survey)

# load the stata-style webuse library
library(webuse)

# load the NLSW 1988 data
webuse("nlsw88")

# coerce that `tbl_df` to a standard R `data.frame`
nlsw88 <- data.frame(nlsw88)

# initiate a linearized survey design object
des_nlsw88 <- svydesign(ids = ~ 1 , data = nlsw88)
## Warning in svydesign.default(ids = ~1, data = nlsw88): No weights or
## probabilities supplied, assuming equal probability

# immediately run the `convey_prep` function on the survey design
des_nlsw88 <- convey_prep(des_nlsw88)

# estimate the lorenz curve
result.lin <-
  svylorenz( ~ wage,
             des_nlsw88,
             quantiles = seq(0, 1, .05),
             na.rm = TRUE)

# note: most survey commands in R use Inf degrees of freedom by default
# stata generally uses the degrees of freedom of the survey design.
# therefore, while the degf() parameters passed to qt()
# serve to prove a precise replication of stata,
# it is generally not necessary
section_four_one <-
  data.frame(
    estimate = coef(result.lin) ,
    standard_error = SE(result.lin) ,
    ci_lower_bound =
      coef(result.lin) +
      SE(result.lin) *
      qt(0.025 , degf(subset(
        des_nlsw88 , !is.na(wage)
      ))) ,
    ci_upper_bound =
      coef(result.lin) +
      SE(result.lin) *
      qt(0.975 , degf(subset(
        des_nlsw88 , !is.na(wage)
      )))
  )
diff --git a/docs/4.6-gini-index-svygini.html b/docs/4.6-gini-index-svygini.html index b0cd86b..78cac7e 100644 --- a/docs/4.6-gini-index-svygini.html +++ b/docs/4.6-gini-index-svygini.html @@ -265,13 +265,13 @@

4.6 Gini index (svygini)

✔️ direct relationship with the Lorenz curve
✔️ most used inequality measure
✔️ [0,1]-bounded, easy to compare
✔️ allows for zero incomes
❌ hard to interpret without a graphical device
❌ more sensitive to outliers (when compared to `svyzenga`)
❌ criticized in the literature; see https://pubmed.ncbi.nlm.nih.gov/26292521/ and https://www.jstor.org/stable/2142862

The Gini index (or Gini coefficient) is an attempt to express the inequality presented in the Lorenz curve as a single number. In essence, it is twice the area between the equality curve and the real Lorenz curve. Put simply:

\[ \begin{aligned} G &= 2 \int_{0}^{1} \left[ p - L(p) \right] dp \\ &= 1 - 2 \int_{0}^{1} L(p) \, dp \end{aligned} \]

@@ -290,142 +290,142 @@
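As a quick check of this identity, a design-unaware point estimate of the Gini can be computed as half of the weighted mean absolute difference divided by the mean (the helper name gini_point is illustrative only; svygini below adds the design-based variance):

# gini as half the mean absolute difference between all
# pairs of incomes, divided by the mean income
gini_point <- function(y , w = rep(1 , length(y))) {
  mu <- weighted.mean(y , w)
  sum(outer(w , w) * abs(outer(y , y , "-"))) / (2 * sum(w) ^ 2 * mu)
}

# example: equals 1 - 2 * integral of L(p) for this small vector
gini_point(c(1 , 2 , 3 , 10))
## [1] 0.4375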

4.6 Gini index (svygini)

4.6.1 Replication Example

The R vardpoor package (Breidaks, Liberts, and Ivanova 2016Breidaks, Juris, Martins Liberts, and Santa Ivanova. 2016. “Vardpoor: Estimation of Indicators on Social Exclusion and Poverty and Its Linearization, Variance Estimation.” Riga, Latvia: CSB.), created by researchers at the Central Statistical Bureau of Latvia, includes a Gini coefficient calculation using the ultimate cluster method. The example below reproduces those statistics.

Load and prepare the same data set:

# load the convey package
library(convey)

# load the survey library
library(survey)

# load the vardpoor library
library(vardpoor)

# load the laeken library
library(laeken)

# load the synthetic EU statistics on income & living conditions
data(eusilc)

# make all column names lowercase
names(eusilc) <- tolower(names(eusilc))

# add a column with the row number
dati <- data.table::data.table(IDd = 1:nrow(eusilc), eusilc)

# calculate the gini coefficient
# using the R vardpoor library
varpoord_gini_calculation <-
  varpoord(
    # analysis variable
    Y = "eqincome",

    # weights variable
    w_final = "rb050",

    # row number variable
    ID_level1 = "IDd",

    # row number variable
    ID_level2 = "IDd",

    # strata variable
    H = "db040",

    N_h = NULL ,

    # clustering variable
    PSU = "rb030",

    # data.table
    dataset = dati,

    # gini coefficient function
    type = "lingini",

    # get linearized variable
    outp_lin = TRUE

  )



# construct a survey.design
# using our recommended setup
des_eusilc <-
  svydesign(
    ids = ~ rb030 ,
    strata = ~ db040 ,
    weights = ~ rb050 ,
    data = eusilc
  )

# immediately run the convey_prep function on it
des_eusilc <- convey_prep(des_eusilc)

# coefficients do match
varpoord_gini_calculation$all_result$value
## [1] 26.49652
coef(svygini( ~ eqincome , des_eusilc)) * 100
## eqincome 
 ## 26.49652
# linearized variables do match
# varpoord
lin_gini_varpoord <- varpoord_gini_calculation$lin_out$lin_gini
# convey
lin_gini_convey <-
  attr(svygini( ~ eqincome , des_eusilc , linearized = TRUE) , "linearized")

# check equality
all.equal(lin_gini_varpoord , (100 * as.numeric(lin_gini_convey)))
## [1] TRUE
# variances do not match exactly
attr(svygini( ~ eqincome , des_eusilc) , 'var') * 10000
##            eqincome
 ## eqincome 0.03790739
varpoord_gini_calculation$all_result$var
## [1] 0.03783931
# standard errors do not match exactly
varpoord_gini_calculation$all_result$se
## [1] 0.1945233
SE(svygini( ~ eqincome , des_eusilc)) * 100
##           eqincome
 ## eqincome 0.1946982

The variance estimate is computed using the approximation defined in equation (2), while the linearized variable \(z\) is defined by equation (2.1). The functions convey::svygini and vardpoor::lingini produce the same linearized variable \(z\).

However, the measures of uncertainty do not line up, because library(vardpoor) defaults to an ultimate cluster method that can be replicated with an alternative setup of the survey.design object.

# within each stratum, sum up the weights
cluster_sums <-
  aggregate(eusilc$rb050 , list(eusilc$db040) , sum)

# name the within-strata sums of weights the `cluster_sum`
names(cluster_sums) <- c("db040" , "cluster_sum")

# merge this column back onto the data.frame
eusilc <- merge(eusilc , cluster_sums)

# construct a survey.design
# with the fpc using the cluster sum
des_eusilc_ultimate_cluster <-
  svydesign(
    ids = ~ rb030 ,
    strata = ~ db040 ,
    weights = ~ rb050 ,
    data = eusilc ,
    fpc = ~ cluster_sum
  )

# again, immediately run the convey_prep function on the `survey.design`
des_eusilc_ultimate_cluster <-
  convey_prep(des_eusilc_ultimate_cluster)

# matches
attr(svygini( ~ eqincome , des_eusilc_ultimate_cluster) , 'var') * 10000
##            eqincome
 ## eqincome 0.03783931
varpoord_gini_calculation$all_result$var
## [1] 0.03783931
# matches
varpoord_gini_calculation$all_result$se
## [1] 0.1945233
SE(svygini( ~ eqincome , des_eusilc_ultimate_cluster)) * 100
##           eqincome
 ## eqincome 0.1945233
@@ -433,55 +433,55 @@

4.6.1 Replication Example

4.6.2 Analysis Examples

This section presents current Gini index estimates for the United States and Brazil to help users compare their own results to populations (and sub-populations) of two large, diverse countries. Using the two nationally-representative labor force surveys of the United States (CPS-ASEC) and Brazil (PNAD-Continua) described in the introduction, this section displays straightforward syntax to calculate a Gini index with those survey design objects.

Nationwide Estimates:

# Gini index of the United States calculated with Household Income
svygini(~ htotval , cps_household_design)
##            gini    SE
 ## htotval 0.48846 0.002
# Gini index of the United States calculated with Family Income
svygini(~ ftotval , cps_family_design)
##            gini     SE
 ## ftotval 0.45816 0.0023
# Gini index of Full-Time Full Year Workers in the United States calculated with Personal Income
svygini(~ ptotval , cps_ftfy_worker_design)
##            gini     SE
 ## ptotval 0.41809 0.0025
# Gini index of Full-Time Full Year Workers in the United States calculated with Personal Earnings
svygini(~ pearnval , cps_ftfy_worker_design)
##           gini     SE
 ## pearnval 0.412 0.0026
# Gini index of Brazil calculated with Per Capita Income
svygini(~ deflated_per_capita_income , pnadc_design , na.rm = TRUE)
##                               gini     SE
 ## deflated_per_capita_income 0.51845 0.0032
# Gini index of Brazil calculated with Personal Earnings
svygini(~ deflated_labor_income , pnadc_design , na.rm = TRUE)
##                          gini     SE
 ## deflated_labor_income 0.48606 0.0036

Regional Estimates

# Gini index of California calculated with Household Income
svygini(~ htotval , subset(cps_household_design , gestfips == 6))
##            gini     SE
 ## htotval 0.50132 0.0073
# Gini index of California calculated with Family Income
svygini(~ ftotval , subset(cps_family_design , gestfips == 6))
##            gini    SE
 ## ftotval 0.47395 0.008
# Gini index of Full-Time Full Year Workers in California calculated with Personal Income
svygini(~ ptotval , subset(cps_ftfy_worker_design , gestfips == 6))
##            gini     SE
 ## ptotval 0.44372 0.0082
# Gini index of Full-Time Full Year Workers in California calculated with Personal Earnings
svygini(~ pearnval , subset(cps_ftfy_worker_design , gestfips == 6))
##             gini     SE
 ## pearnval 0.43741 0.0084
# Gini index of Sao Paulo calculated with Per Capita Income
svygini(~ deflated_per_capita_income ,
        subset(pnadc_design , uf == 35) ,
        na.rm = TRUE)
##                               gini     SE
 ## deflated_per_capita_income 0.50009 0.0077
# Gini index of Sao Paulo calculated with Personal Earnings
svygini(~ deflated_labor_income , subset(pnadc_design , uf == 35) , na.rm = TRUE)
##                          gini     SE
 ## deflated_labor_income 0.47583 0.0089
diff --git a/docs/4.7-zenga-index-svyzenga.html b/docs/4.7-zenga-index-svyzenga.html index a171b15..ef67c05 100644 --- a/docs/4.7-zenga-index-svyzenga.html +++ b/docs/4.7-zenga-index-svyzenga.html @@ -265,13 +265,13 @@

4.7 Zenga index (svyzenga)

✔️ [0,1]-bounded
✔️ can account for zero incomes
✔️ not extremely sensitive to top incomes
✔️ can be interpreted using (a transformation of) the Lorenz curve
❌ very uncommon
❌ difficult to find applications to compare
❌ difficult to explain

Proposed by Zenga (2007Zenga, Michele. 2007. “Inequality Curve and Inequality Index Based on the Ratios Between Lower and Upper Arithmetic Means.” Statistica e Applicazioni 1 (4): 3–27.), the Zenga index is another inequality measure related to the Lorenz curve with interesting properties. For continuous populations, it is defined as:

\[ Z = 1 - \int_{0}^{1} \frac{L(p)}{p} \cdot \frac{ 1 - p }{ 1 - L(p) } dp \]

@@ -290,159 +290,159 @@
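A rough, design-unaware numeric sketch of this integral (the helper name zenga_point and the evaluation grid are illustrative assumptions, not part of convey) might look like:

# approximate Z by averaging the integrand over a fine grid of p,
# using a piecewise-linear empirical lorenz curve
zenga_point <- function(y , p.grid = seq(0.005 , 0.995 , 0.005)) {
  y <- sort(y)
  cum_pop <- seq_along(y) / length(y)
  cum_inc <- cumsum(y) / sum(y)
  L <- approx(c(0 , cum_pop) , c(0 , cum_inc) , xout = p.grid , rule = 2)$y
  mean(1 - (L / p.grid) * ((1 - p.grid) / (1 - L)))
}

zenga_point(c(1 , 2 , 3 , 10))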

4.7 Zenga index (svyzenga)

4.7.1 Replication Example

While it is not possible to reproduce the exact results because their simulation includes random sampling, we can replicate a Monte Carlo experiment similar to the one presented by Barabesi, Diana, and Perri (2016Barabesi, Lucio, Giancarlo Diana, and Pier Francesco Perri. 2016. “Linearization of Inequality Indices in the Design-Based Framework.” Statistics 50 (5): 1161–72. https://doi.org/10.1080/02331888.2015.1135924.) in Table 2.

Load and prepare the data set:

library(convey)
library(survey)
library(parallel)

# create a temporary file on the local disk
tf <- tempfile()

# store the location of the zip file
data_zip <-
  "https://www.bancaditalia.it/statistiche/tematiche/indagini-famiglie-imprese/bilanci-famiglie/distribuzione-microdati/documenti/ind12_ascii.zip"

# download to the temporary file
download.file(data_zip , tf , mode = 'wb')

# unzip the contents of the archive to a temporary directory
td <- file.path(tempdir() , "shiw2012")
data_files <- unzip(tf , exdir = td)

# load the household, consumption and income from SHIW (2012) survey data
hhr.df <- read.csv(grep("carcom12" , data_files , value = TRUE))
cns.df <- read.csv(grep("risfam12" , data_files , value = TRUE))
inc.df <- read.csv(grep("rper12" , data_files , value = TRUE))

# fixes names
colnames(cns.df)[1] <- tolower(colnames(cns.df))[1]

# household count should be 8151
stopifnot(sum(!duplicated(hhr.df$nquest)) == 8151)

# person count should be 20022
stopifnot(sum(!duplicated(paste0(
  hhr.df$nquest , "-" , hhr.df$nord
))) == 20022)

# income recipients: should be 12986
stopifnot(sum(hhr.df$PERC) == 12986)

# combine household roster and income datasets
pop.df <-
  merge(
    hhr.df ,
    inc.df ,
    by = c("nquest" , "nord") ,
    all.x = TRUE ,
    sort = FALSE
  )

# compute household-level income
hinc.df <-
  pop.df[pop.df$PERC == 1 ,
         c("nquest" , "nord" , "PERC" , "Y" , "YL" , "YM" , "YT" , "YC") ,
         drop = FALSE]

hinc.df <-
  aggregate(rowSums(hinc.df[, c("YL"  , "YM" , "YT") , drop = FALSE]) ,
            list("nquest" = hinc.df$nquest) ,
            sum ,
            na.rm = TRUE)

pop.df <-
  merge(pop.df ,
        hinc.df ,
        by = "nquest" ,
        all.x = TRUE ,
        sort = FALSE)

# combine with consumption data
pop.df <-
  merge(pop.df ,
        cns.df ,
        by = "nquest" ,
        all.x = TRUE ,
        sort = FALSE)

# treat household-level sample as the finite population
pop.df <-
  pop.df[!duplicated(pop.df$nquest) , , drop = FALSE]

For the Monte Carlo experiment, we take 5000 samples of (expected) size 1000 using Poisson sampling (but not Conditional Poisson Sampling):

# set up monte carlo attributes
mc.rep <- 5000L
n.size <- 1000L
N.size <- nrow(pop.df)

# calculate first order probabilities
pop.df$pik <-
  n.size * (abs(pop.df$C) / sum(abs(pop.df$C)))

# calculate second order probabilities
pikl <- pop.df$pik %*% t(pop.df$pik)
diag(pikl) <- pop.df$pik

# set random number seed
set.seed(1997)

# create list of survey design objects
# using poisson sampling
survey.list <-
  lapply(seq_len(mc.rep) ,
         function(this.rep) {
           smp.ind <- which(runif(N.size) <= pop.df$pik)
           smp.df <- pop.df[smp.ind , , drop = FALSE]
           svydesign(
             ids = ~ nquest ,
             data = smp.df ,
             fpc = ~ pik ,
             nest = FALSE ,
             pps = ppsmat(pikl[smp.ind , smp.ind]) ,
             variance = "HT"
           )
         })

… and estimate the Zenga index using each sample:

# estimate zenga index for each sample
zenga.estimate.list <-
  lapply(survey.list ,
         function(this.sample) {
           svyzenga( ~ x ,
                     subset(this.sample , x > 0) ,
                     na.rm = TRUE)
         })

Then, we evaluate the Percentage Relative Bias (PRB) of the Zenga index estimator. Under this scenario, the PRB of the Zenga index estimator is 0.3397%, a result similar to the 0.321 shown in Table 2.
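In symbols, with \(R\) Monte Carlo replicates, the code below estimates

\[ \widehat{PRB}(\hat{\theta}) = 100 \times \frac{ \frac{1}{R} \sum_{r=1}^{R} \hat{\theta}_{(r)} - \theta }{ \theta } \]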

# compute the (finite population) Zenga index parameter
theta.pop <-
  convey:::CalcZenga(pop.df$x , ifelse(pop.df$x > 0 , 1 , 0))

# estimate the expected value of the Zenga index estimator
# using the average of the estimates
theta.exp <- mean(sapply(zenga.estimate.list , coef))

# estimate the percentage relative bias
100 * (theta.exp / theta.pop - 1)
## [1] 0.3396837

Then, we evaluate the PRB of the variance estimator of the Zenga index estimator. Under this scenario, the PRB of the Zenga index variance estimator is -0.4954%, another result similar to the -0.600 shown in Table 2.

# estimate the variance of the Zenga index estimator
# using the variance of the estimates
(vartheta.popest <- var(sapply(zenga.estimate.list , coef)))
## [1] 9.47244e-05
# estimate the expected value of the Zenga index variance estimator
# using the expected value of the variance estimates
(vartheta.exp <- mean(sapply(zenga.estimate.list , vcov)))
## [1] 9.425515e-05
# estimate the percentage relative bias
100 * (vartheta.exp / vartheta.popest - 1)
## [1] -0.4953863

Next, we evaluate the Percentage Coverage Rate (PCR). In theory, under repeated sampling, the estimated 95% CIs should cover the population parameter 95% of the time. We can evaluate that using:

# estimate confidence intervals of the Zenga index
# for each of the samples
est.cis <-
  t(sapply(zenga.estimate.list, function(this.stat)
    c(confint(this.stat))))

# evaluate how often the intervals cover the population parameter
prop.table(table((theta.pop > est.cis[, 1]) &
                   (theta.pop < est.cis[, 2])))
## 
 ##  FALSE   TRUE 
 ## 0.0548 0.9452
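In symbols, the proportion computed above is the estimated coverage rate

\[ \widehat{PCR} = \frac{1}{R} \sum_{r=1}^{R} \mathbb{1} \{ \hat{L}_{(r)} < \theta < \hat{U}_{(r)} \} \ , \]

where \( [ \hat{L}_{(r)} , \hat{U}_{(r)} ] \) is the confidence interval estimated from the \(r\)-th sample.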
diff --git a/docs/4.9-atkinson-index-svyatk.html b/docs/4.9-atkinson-index-svyatk.html index 22b4d2a..b577465 100644 --- a/docs/4.9-atkinson-index-svyatk.html +++ b/docs/4.9-atkinson-index-svyatk.html @@ -265,12 +265,12 @@

4.9 Atkinson index (svyatk)

✔️ is defined in terms of an interpretable utility function
✔️ has a direct (but non-linear) relationship to `svygei`
✔️ has an interpretable inequality aversion parameter
❌ does not handle zeroes or negative incomes
❌ the decomposition interpretation is not exactly the same as the GEI, but very similar (not implemented, though)
❌ not very common

Dalton (1920Dalton, Hugh. 1920. “The Measurement of the Inequality of Incomes.” The Economic Journal 30 (September). https://doi.org/10.2307/2223525.) pointed out one of the most important facts about the economic theory of income inequality measurement: every income inequality measure has an implicit social welfare function (SWF). This is the subject of an interesting discussion between the British and the Italian schools of inequality measurement.

@@ -338,38 +338,38 @@
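For readers who want the estimator's algebra in front of them, here is a minimal, unweighted sketch of the standard Atkinson formula (the helper name atkinson_point and the toy data are illustrative only; svyatk below is the design-based estimator):

# atkinson index for positive incomes; epsilon is the
# inequality aversion parameter
atkinson_point <- function(y , epsilon = 1) {
  mu <- mean(y)
  if (epsilon == 1)
    1 - exp(mean(log(y))) / mu   # limiting case: geometric over arithmetic mean
  else
    1 - mean((y / mu) ^ (1 - epsilon)) ^ (1 / (1 - epsilon))
}

atkinson_point(c(1 , 2 , 3 , 10) , epsilon = 0.5)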

4.9 Atkinson index (svyatk)

4.9.1 Replication Example

In July 2006, Jenkins (2008Jenkins, Stephen. 2008. “Estimation and Interpretation of Measures of Inequality, Poverty, and Social Welfare Using Stata.” North American Stata Users' Group Meetings 2006. Stata Users Group. http://EconPapers.repec.org/RePEc:boc:asug06:16.) presented at the North American Stata Users’ Group Meetings on the Stata Atkinson index command. The example below reproduces those statistics.

Load and prepare the same data set:

# load the convey package
library(convey)

# load the survey library
library(survey)

# load the foreign library
library(foreign)

# create a temporary file on the local disk
tf <- tempfile()

# store the location of the presentation file
presentation_zip <-
  "https://web.archive.org/web/20150928053959/http://repec.org/nasug2006/nasug2006_jenkins.zip"

# download jenkins' presentation to the temporary file
download.file(presentation_zip , tf , mode = 'wb')

# unzip the contents of the archive
presentation_files <- unzip(tf , exdir = tempdir())

# load the institute for fiscal studies' 1981, 1985, and 1991 data.frame objects
x81 <-
  read.dta(grep("ifs81" , presentation_files , value = TRUE))
x85 <-
  read.dta(grep("ifs85" , presentation_files , value = TRUE))
x91 <-
  read.dta(grep("ifs91" , presentation_files , value = TRUE))

# stack each of these three years of data into a single data.frame
x <- rbind(x81 , x85 , x91)

Replicate the author’s survey design statement from Stata code..

. * account for clustering within HHs 
 . version 8: svyset [pweight = wgt], psu(hrn)
@@ -377,11 +377,11 @@ 

4.9.1 Replication Example

psu is hrn

.. into R code:

# initiate a linearized survey design object
y <- svydesign( ~ hrn , data = x , weights = ~ wgt)

# immediately run the `convey_prep` function on the survey design
z <- convey_prep(y)

Replicate the author’s subset statement and each of his svyatk results with Stata..

. svyatk x if year == 1981
  
@@ -403,21 +403,21 @@ 

4.9.1 Replication Example

A(2.5) | .4992701 .06754311 7.39 0.000 .366888 .6316522 ---------------------------------------------------------------------------

..using R code:

z81 <- subset(z , year == 1981)

svyatk( ~ eybhc0 , subset(z81 , eybhc0 > 0) , epsilon = 0.5)
##        atkinson     SE
 ## eybhc0 0.054324 0.0011
svyatk( ~ eybhc0 , subset(z81 , eybhc0 > 0))
##        atkinson     SE
 ## eybhc0    0.108 0.0025
svyatk( ~ eybhc0 , subset(z81 , eybhc0 > 0) , epsilon = 1.5)
##        atkinson     SE
 ## eybhc0  0.17018 0.0067
svyatk( ~ eybhc0 , subset(z81 , eybhc0 > 0) , epsilon = 2)
##        atkinson    SE
 ## eybhc0  0.27558 0.026
svyatk( ~ eybhc0 , subset(z81 , eybhc0 > 0) , epsilon = 2.5)
##        atkinson     SE
 ## eybhc0  0.49927 0.0675

Confirm this replication applies for subsetted objects as well, comparing Stata code..

@@ -439,21 +439,21 @@

4.9.1 Replication Example

A(2.5) | .394787 .04155221 9.50 0.000 .3133461 .4762278 ---------------------------------------------------------------------------

..to R code:

z81_two <- subset(z , year == 1981 & eybhc0 > 1)

svyatk( ~ eybhc0 , z81_two , epsilon = 0.5)
##        atkinson     SE
 ## eybhc0 0.054006 0.0011
svyatk( ~ eybhc0 , z81_two)
##        atkinson     SE
 ## eybhc0  0.10661 0.0022
svyatk( ~ eybhc0 , z81_two , epsilon = 1.5)
##        atkinson     SE
 ## eybhc0  0.16383 0.0048
svyatk( ~ eybhc0 , z81_two , epsilon = 2)
##        atkinson     SE
 ## eybhc0  0.24432 0.0143
svyatk( ~ eybhc0 , z81_two , epsilon = 2.5)
##        atkinson     SE
 ## eybhc0  0.39479 0.0416

For additional usage examples of svyatk, type ?convey::svyatk in the R console.

diff --git a/docs/5.1-richness-measures-svyrich.html b/docs/5.1-richness-measures-svyrich.html index 4a1244b..5d08e3d 100644 --- a/docs/5.1-richness-measures-svyrich.html +++ b/docs/5.1-richness-measures-svyrich.html @@ -265,13 +265,13 @@

5.1 Richness Measures (svyrich)

✔️ focuses on the top -- i.e., the "rich"
✔️ can have an inequality or polarization interpretation
✔️ interesting complementary approach to income inequality or polarization
❌ hard to interpret as parameters change
❌ convex richness measures are severely affected by outliers, unreliable for inference
❌ requires a richness line definition
❌ hardly ever used

Peichl, Schaefer, and Scheicher (2010Peichl, Andreas, Thilo Schaefer, and Christoph Scheicher. 2010. “Measuring Richness and Poverty: A micro data application to Europe and Germany.” Review of Income and Wealth 56 (3): 597–619. https://doi.org/https://doi.org/10.1111/j.1475-4991.2010.00404.x.) also presented a general class of richness measures, proposing three particular sub-classes: the (concave) Chakravarty class, the concave-FGT class, and the convex-FGT class, defined at the population level as:

\[ R_{Cha}(\beta) = \frac{1}{N} \sum_{i=1}^{N} \left[ 1 - \left( \frac{\rho}{y_i} \right)^{\beta} \right] \mathbb{1}\{ y_i > \rho \} \ , \quad R_{FGT}^{con}(\alpha) = \frac{1}{N} \sum_{i=1}^{N} \left( 1 - \frac{\rho}{y_i} \right)^{\alpha} \mathbb{1}\{ y_i > \rho \} \ , \quad R_{FGT}^{cvx}(\alpha) = \frac{1}{N} \sum_{i=1}^{N} \left( \frac{y_i}{\rho} - 1 \right)^{\alpha} \mathbb{1}\{ y_i > \rho \} \]

@@ -308,167 +308,167 @@
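As a toy illustration of these scores (the values of y and rho here are ad hoc; the same score functions reappear below as cha.scores and fgtt2.scores):

# four hypothetical incomes and a richness line of 3
y <- c(1 , 2 , 4 , 12) ; rho <- 3

mean(y > rho)                                    # headcount richness ratio (fgt with alpha = 0)
mean(ifelse(y > rho , 1 - (rho / y) ^ 2 , 0))    # chakravarty score , beta = 2
mean(ifelse(y > rho , (y / rho - 1) ^ 1.5 , 0))  # convex fgt score , alpha = 1.5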

5.1.1 Monte Carlo Simulation

parameter!).

For the sake of similarity, we start by simulating a large finite population (\(N = 10^5\)) using the same distribution from Brzezinski (2014Brzezinski, Michal. 2014. “Statistical Inference for Richness Measures.” Applied Economics 46 (14): 1599–1608. https://doi.org/10.1080/00036846.2014.880106.):

# load libraries
library(survey)
library(convey)
library(sampling)
library(VGAM)

# set random seed
set.seed(2023)

# superpopulation parameters
scale.x0 <- 1
shape.theta <- 2
cha.beta <- 2
fgt.alpha <- 1.5
n.pop <- as.integer(10 ^ 5)

# generate finite population
pop.df <-
  data.frame(y1 = rparetoI(n.pop  ,  scale.x0 , shape.theta))

Then, we compute the finite population parameters using the simulated population:

# richness measures: finite population parameters
cha.scores <-
  function(y , b , rho)
    ifelse(y > rho , (1 - (rho / y) ^ b) , 0)
fgtt2.scores <-
  function(y , g , rho)
    ifelse(y > rho , (y / rho - 1) ^ g , 0)
median.fp <- quantile(pop.df$y1 , .50)
rho.fp <- 3 * median.fp
rHC.fp <- mean(pop.df$y1 > rho.fp)
rCha.fp <- mean(cha.scores(pop.df$y1 , cha.beta , rho.fp))
rFGTT2.fp <- mean(fgtt2.scores(pop.df$y1 , fgt.alpha , rho.fp))

For our sampling design, we select \(n = 1000\) units using multinomial sampling, with the variable x.aux as the size variable for the inclusion probabilities:

# define sample size
n.sample <- 1000L

# inclusion probability
pop.df$x.aux <- plogis( pop.df$y1 ) / 1.1
pop.df$pi1 <- sampling::inclusionprobabilities( pop.df$x.aux , n.sample )

We run the procedure 5000 times and store the estimate objects using the code below:

# define the number of simulation runs
mc.reps <- 5000L

# simulation runs
rep.list <- lapply(seq_len(mc.reps) , function(this.iter) {
  # multinomial sampling
  this.sample <- sampling::UPmultinomial(pop.df$pi1)
  this.sample <- rep(1:n.pop , this.sample)
  sample.df <- pop.df[this.sample ,]
  sample.df$weights <- 1 / sample.df$pi1
  des.obj <-
    svydesign(
      ids = ~ 1 ,
      weights = ~ weights ,
      data = sample.df ,
      nest = FALSE
    )

  # run estimation
  des.obj <- convey_prep(des.obj)
  rCha.hat <-
    svyrich(
      ~ y1 ,
      des.obj ,
      type_measure = "Cha" ,
      g = cha.beta ,
      type_thresh = "relq" ,
      percent = 3
    )
  suppressWarnings(
    rHC.hat <-
      svyrich(
        ~ y1 ,
        des.obj ,
        type_measure = "FGTT1" ,
        g = 0 ,
        type_thresh = "relq"  ,
        percent = 3
      )
  )
  suppressWarnings(
    rFGTT2.hat <-
      svyrich(
        ~ y1 ,
        des.obj ,
        type_measure = "FGTT2" ,
        g = fgt.alpha ,
        type_thresh = "relq"  ,
        percent = 3
      )
  )
  est.list <- list(rHC.hat , rCha.hat , rFGTT2.hat)
  est.list

})

To study the behavior of the estimators, we estimate their expected values, empirical variance (for the main parameter) and mean squared error. To study the validity of the normal approximation, we also estimate the percent coverage rate of the nominal 95% confidence interval. This is done using the function below:

sim.compile <- function(ll ,
                        pv ,
                        level = .95 ,
                        na.rm = FALSE) {
  # collect estimates
  mhat.vec <- sapply(ll , coef)
  vhat.vec <- sapply(ll , vcov)

  # estimate expected value
  mhat.exp <- mean(mhat.vec , na.rm = na.rm)
  vhat.exp <- mean(vhat.vec , na.rm = na.rm)

  # calculate empirical variance
  mhat.empvar <- var(mhat.vec , na.rm = na.rm)

  # estimate squared bias
  mhat.bias2 <- (mhat.exp - pv) ^ 2
  vhat.bias2 <- (vhat.exp - mhat.empvar) ^ 2

  # estimate mse
  mhat.mse <- mhat.bias2 + mhat.empvar

  # estimate coverage rate , passing `level` through to confint
  ci.hats <- t(sapply(ll , confint , level = level))
  ci.check <-
    matrix(as.logical(NA) , nrow = nrow(ci.hats) , ncol = 3)
  ci.check[, 1] <- ci.hats[, 1] <= pv
  ci.check[, 2] <- ci.hats[, 2] >= pv
  ci.check[, 3] <- apply(ci.check[, 1:2] , 1 , all)
  pcr.emp <- mean(ci.check[, 3] , na.rm = na.rm)

  # setup final table
  data.frame(
    "mc.reps" = length(ll) ,
    "theta" = pv ,
    "theta.hat" = mhat.exp ,
    "theta.bias2" = mhat.bias2 ,
    "theta.empvar" = mhat.empvar ,
    "theta.hat.mse" = mhat.mse ,
    "theta.varhat" = vhat.exp ,
    "pcr" = pcr.emp
  )
}

For the Headcount Richness Ratio (computed using the concave FGT measure), we have:

rhc.list <- lapply(rep.list , `[[` , 1)
sim.compile(rhc.list, rHC.fp)
##   mc.reps   theta  theta.hat  theta.bias2 theta.empvar theta.hat.mse
 ## 1    5000 0.05469 0.05452618 2.683797e-08 4.112616e-05    4.1153e-05
 ##   theta.varhat    pcr
 ## 1 4.336252e-05 0.9524
-
stopifnot(round(
-  sim.compile(rhc.list, rHC.fp)["theta.bias2"] / sim.compile(rhc.list, rHC.fp)["theta.hat.mse"] ,
-  4
-) == 0.0007)
-
-stopifnot(round(
-  sim.compile(rhc.list, rHC.fp)["theta.varhat"] / sim.compile(rhc.list, rHC.fp)["theta.empvar"] ,
-  2
-) == 1.05)
-
-stopifnot(round(sim.compile(rhc.list, rHC.fp)["pcr"] , 4) == 0.9524)
+
stopifnot(round(
+  sim.compile(rhc.list, rHC.fp)["theta.bias2"] / sim.compile(rhc.list, rHC.fp)["theta.hat.mse"] ,
+  4
+) == 0.0007)
+
+stopifnot(round(
+  sim.compile(rhc.list, rHC.fp)["theta.varhat"] / sim.compile(rhc.list, rHC.fp)["theta.empvar"] ,
+  2
+) == 1.05)
+
+stopifnot(round(sim.compile(rhc.list, rHC.fp)["pcr"] , 4) == 0.9524)

Under this approach, the squared bias accounts for approx. 0.07% of the MSE, indicating that the MSE of the estimator is reasonably approximated by its variance. Additionally, the ratio between the expected value of the variance estimator and the empirical

@@ -477,23 +477,23 @@

5.1.1 Monte Carlo Simulation

Finally, the estimated percent coverage rate of 95.24% is close to the nominal level of 95%, indicating that the confidence intervals are approximately valid.

For the Chakravarty measure, we have:

rcha.list <- lapply(rep.list , `[[` , 2)
sim.compile(rcha.list, rCha.fp)
##   mc.reps      theta  theta.hat  theta.bias2 theta.empvar theta.hat.mse
 ## 1    5000 0.02743915 0.02737177 4.538966e-09 1.428639e-05  1.429093e-05
 ##   theta.varhat    pcr
 ## 1 1.402416e-05 0.9428
stopifnot(round(
  sim.compile(rcha.list, rCha.fp)["theta.bias2"] / sim.compile(rcha.list, rCha.fp)["theta.hat.mse"] ,
  4
) == 0.0003)

stopifnot(round(
  sim.compile(rcha.list, rCha.fp)["theta.varhat"] / sim.compile(rcha.list, rCha.fp)["theta.empvar"] ,
  2
) == 0.98)

stopifnot(round(sim.compile(rcha.list, rCha.fp)["pcr"] , 4) == 0.9428)

Under this approach, the squared bias is approx. 0.03% of the MSE, indicating that the MSE of the estimator is reasonably approximated by its variance. Additionally, the ratio between the expected value of the variance estimator and the empirical

@@ -502,23 +502,23 @@

5.1.1 Monte Carlo Simulation

coverage rate of 94.28% is close to the nominal level of 95%, indicating that the confidence intervals are approximately valid.

For the convex FGT measure, we have:

rfgtt2.list <- lapply(rep.list , `[[` , 3)
sim.compile(rfgtt2.list, rFGTT2.fp)
##   mc.reps      theta theta.hat theta.bias2 theta.empvar theta.hat.mse
 ## 1    5000 0.02349189 0.1045973  0.00657808   0.01865219    0.02523027
 ##   theta.varhat    pcr
 ## 1   0.01857144 0.5212
stopifnot(round(
  sim.compile(rfgtt2.list, rFGTT2.fp)["theta.bias2"] / sim.compile(rfgtt2.list, rFGTT2.fp)["theta.hat.mse"] ,
  2
) == 0.26)

stopifnot(round(
  sim.compile(rfgtt2.list, rFGTT2.fp)["theta.varhat"] / sim.compile(rfgtt2.list, rFGTT2.fp)["theta.empvar"] ,
  2
) == 1)

stopifnot(round(sim.compile(rfgtt2.list, rFGTT2.fp)["pcr"] , 4) == 0.5212)

Under this approach, the squared bias is approx. 26% of the MSE, indicating that the bias is substantial. The ratio between the expected value of the variance estimator and the empirical variance is approx. 1.00, indicating that the variance

diff --git a/docs/7-covariance-matrix.html index f45d1db..b4c2060 100644 --- a/docs/7-covariance-matrix.html +++ b/docs/7-covariance-matrix.html

@@ -281,36 +281,36 @@

Chapter 7 Covariance Matrix

Then, how can we estimate the covariance? The survey package already provides an approach for that using the svyby function. Based on the ?survey::svyby examples, we have:

# load the survey library
library(survey)

# load data set
data( api )

# declare sampling design
dclus1 <- svydesign( id=~dnum, weights=~pw, data=apiclus1, fpc=~fpc )

# estimate means
mns <- svyby( ~api99, ~stype, dclus1, svymean, covmat=TRUE )

# collect variance-covariance matrix of estimates
( m <- vcov( mns ) )
##          E         H         M
 ## E 520.5973  573.0404  684.6562
 ## H 573.0404 1744.2317  747.1989
 ## M 684.6562  747.1989 1060.1954
# compute variance terms
var.naive <- sum( diag( m[c(1,3),c(1,3)] ) )
cov.term <- sum( diag( m[ c(1,3),c(3,1)] ) )

# "naive" SE of the difference
sqrt( var.naive )
## [1] 39.75918
# SE of the difference
sqrt( var.naive - cov.term )
## [1] 14.54236
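The corrected term above is just the usual identity for the variance of a difference of two (possibly correlated) estimators:

\[ \widehat{V}( \hat{\theta}_E - \hat{\theta}_M ) = \widehat{V}( \hat{\theta}_E ) + \widehat{V}( \hat{\theta}_M ) - 2 \widehat{Cov}( \hat{\theta}_E , \hat{\theta}_M ) \]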
#... or using svycontrast
svycontrast( mns , c(E = 1, M = -1) )
##          contrast     SE
 ## contrast -0.80833 14.542

Notice that, because the covariance terms are positive, the (actual) variance of

diff --git a/docs/7.1-example-calculation-1.html index 2bb54f7..fde0b5f 100644 --- a/docs/7.1-example-calculation-1.html +++ b/docs/7.1-example-calculation-1.html

@@ -266,76 +266,76 @@

7.1 Example Calculation

We start by showing what has already been implemented, the resampling-based approach:

# load the convey package
library(convey)

# load the survey library
library(survey)

# load the laeken library
library(laeken)

# load the synthetic EU statistics on income & living conditions
data(eusilc)

# make all column names lowercase
names(eusilc) <- tolower(names(eusilc))

# construct a survey.design
# using our recommended setup
des_eusilc <-
  svydesign(
    ids = ~ rb030 ,
    strata = ~ db040 ,
    weights = ~ rb050 ,
    data = eusilc
  )

# create bootstrap-based design object
des_eusilc_rep <- as.svrepdesign( des_eusilc , type = "bootstrap" )

# immediately run the convey_prep function on it
des_eusilc <- convey_prep( des_eusilc )
des_eusilc_rep <- convey_prep(des_eusilc_rep)

# estimate gini by gender, with and without covariance matrices
ginis.rep <- svyby( ~ py010n , ~rb090 , des_eusilc_rep , svygini , na.rm = TRUE , covmat = TRUE )
ginis.rep.nocov <- svyby( ~ py010n , ~rb090 , des_eusilc_rep , svygini , na.rm = TRUE , covmat = FALSE )

# variance of the gini difference: naive
svycontrast( ginis.rep.nocov , quote( `male` - `female` ) )
## Warning in vcov.svyby(stat): Only diagonal elements of vcov() available
##             nlcon     SE
 ## contrast -0.15006 0.0065
# variance of the gini difference: accounting for covariance
svycontrast( ginis.rep , quote( `male` - `female` ) )
##             nlcon     SE
 ## contrast -0.15006 0.0068

While not implemented yet, a linearization-based approach can be applied using:

# estimate gini by gender, without covariance matrices
ginis.lin.nocov <- svyby( ~ py010n , ~rb090 , des_eusilc , svygini , na.rm = TRUE , covmat = FALSE )
gini.female <- svygini( ~ py010n , subset( des_eusilc , rb090 == "female" ), na.rm = TRUE , influence = TRUE )
gini.male   <- svygini( ~ py010n , subset( des_eusilc , rb090 == "male" ), na.rm = TRUE , influence = TRUE )

# stack the influence (linearized) values into a single full-sample vector
linmat <- rep( 0 , nrow( des_eusilc$variables ) )
linmat[ attr( gini.female , "index" ) ] <- attr( gini.female , "influence" )
linmat[ attr( gini.male , "index" ) ] <- attr( gini.male , "influence" )
linmat <- linmat * des_eusilc$prob
lintot <- svyby( linmat , ~rb090 , des_eusilc , svytotal , covmat = TRUE )

# collect covariance matrix
( m <- vcov( lintot ) )
##                male       female
 ## male   2.954413e-05 2.764584e-10
 ## female 2.764584e-10 2.292063e-05
# SE of the gini difference: naive
SE( svycontrast( ginis.lin.nocov , quote( `male` - `female` ) ) )
## Warning in vcov.svyby(stat): Only diagonal elements of vcov() available
##    contrast 
 ## 0.007243256
# SE of the gini difference: accounting for covariance
SE( svycontrast( lintot , quote( `male` - `female` ) ) )
##    contrast 
 ## 0.007243218
diff --git a/index.Rmd b/index.Rmd index 246ca8f..99e1854 100644 --- a/index.Rmd +++ b/index.Rmd @@ -183,7 +183,7 @@ This section downloads, imports, and prepares the most current microdata for ana Download and unzip the 2023 file: -```{r, results = 'hide'} +```{r results='hide', message=FALSE, warning=FALSE} library(httr) tf <- tempfile() @@ -198,7 +198,7 @@ unzipped_files <- unzip(tf , exdir = tempdir()) Import all four files: -```{r, results = 'hide'} +```{r results='hide', message=FALSE, warning=FALSE} library(haven) four_tbl <- lapply(unzipped_files , read_sas) @@ -223,7 +223,7 @@ repwgts_df <- Divide weights: -```{r, results = 'hide'} +```{r results='hide', message=FALSE, warning=FALSE} household_df[, 'hsup_wgt'] <- household_df[, 'hsup_wgt'] / 100 family_df[, 'fsup_wgt'] <- family_df[, 'fsup_wgt'] / 100 for (j in c('marsupwt' , 'a_ernlwt' , 'a_fnlwgt')) @@ -232,7 +232,7 @@ for (j in c('marsupwt' , 'a_ernlwt' , 'a_fnlwgt')) Merge these four files: -```{r, results = 'hide'} +```{r results='hide', message=FALSE, warning=FALSE} names(family_df)[names(family_df) == 'fh_seq'] <- 'h_seq' names(person_df)[names(person_df) == 'ph_seq'] <- 'h_seq' names(person_df)[names(person_df) == 'phf_seq'] <- 'ffpos' @@ -246,7 +246,7 @@ stopifnot(nrow(cps_df) == nrow(person_df)) Construct a complex sample survey design: -```{r, results = 'hide'} +```{r results='hide', message=FALSE, warning=FALSE} library(survey) cps_design <- @@ -263,7 +263,7 @@ cps_design <- Run the `convey_prep()` function on the full design: -```{r, results = 'hide'} +```{r results='hide', message=FALSE, warning=FALSE} cps_design <- convey_prep(cps_design) ``` @@ -326,7 +326,7 @@ This section downloads, imports, and prepares the most current microdata for ana Download and import the 2022 file: -```{r, results = 'hide'} +```{r results='hide', message=FALSE, warning=FALSE} library(PNADcIBGE) pnadc_df <- @@ -339,7 +339,7 @@ names(pnadc_df) <- tolower(names(pnadc_df)) ``` Recode a number of variables: -```{r, results = 'hide'} +```{r results='hide', message=FALSE, warning=FALSE} pnadc_df <- transform( pnadc_df , @@ -392,7 +392,7 @@ pnadc_df <- Construct a complex sample survey design: -```{r, results = 'hide'} +```{r results='hide', message=FALSE, warning=FALSE} library(survey) pnadc_design <- @@ -407,7 +407,7 @@ pnadc_design <- Run the `convey_prep()` function on the full design: -```{r, results = 'hide'} +```{r results='hide', message=FALSE, warning=FALSE} pnadc_design <- convey_prep(pnadc_design) ``` @@ -460,7 +460,7 @@ This section downloads, imports, and prepares the most current microdata for ana This survey uses a multiply-imputed variance estimation technique described in the [2004 Codebook](https://www.federalreserve.gov/econres/files/2004_codebk2004.txt). Most users do not need to study this function carefully. 
Define a function specific to only this dataset: -```{r, results = 'hide'} +```{r results='hide', message=FALSE, warning=FALSE} scf_MIcombine <- function (results, variances, @@ -515,7 +515,7 @@ scf_MIcombine <- Define a function to download and import each stata file: -```{r, results = 'hide'} +```{r results='hide', message=FALSE, warning=FALSE} library(haven) scf_dta_import <- @@ -538,7 +538,7 @@ scf_dta_import <- Download and import the full, summary extract, and replicate weights tables: -```{r, results = 'hide'} +```{r results='hide', message=FALSE, warning=FALSE} scf_df <- scf_dta_import("https://www.federalreserve.gov/econres/files/scf2022s.zip") @@ -550,13 +550,13 @@ scf_rw_df <- ``` Confirm both the full public data and the summary extract contain five records per family: -```{r, results = 'hide'} +```{r results='hide', message=FALSE, warning=FALSE} stopifnot(nrow(scf_df) == nrow(scf_rw_df) * 5) stopifnot(nrow(scf_df) == nrow(ext_df)) ``` Confirm only the primary economic unit and the five implicate identifiers overlap: -```{r, results = 'hide'} +```{r results='hide', message=FALSE, warning=FALSE} stopifnot(all(sort(intersect( names(scf_df) , names(ext_df) )) == c('y1' , 'yy1'))) @@ -569,7 +569,7 @@ stopifnot(all(sort(intersect( ``` Remove the implicate identifier from the replicate weights table, add a column of fives for weighting: -```{r, results = 'hide'} +```{r results='hide', message=FALSE, warning=FALSE} scf_rw_df[, 'y1'] <- NULL scf_df[, 'five'] <- 5 @@ -578,7 +578,7 @@ scf_df[, 'five'] <- 5 Construct a multiply-imputed, complex sample survey design: Break the main table into five different implicates based on the final character of the column `y1`: -```{r, results = 'hide'} +```{r results='hide', message=FALSE, warning=FALSE} library(stringr) s1_df <- scf_df[str_sub(scf_df[, 'y1'] ,-1 ,-1) == 1 ,] @@ -589,7 +589,7 @@ s5_df <- scf_df[str_sub(scf_df[, 'y1'] ,-1 ,-1) == 5 ,] ``` Combine these into a single `list`, then merge each implicate with the summary extract: -```{r, results = 'hide'} +```{r results='hide', message=FALSE, warning=FALSE} scf_imp <- list(s1_df , s2_df , s3_df , s4_df , s5_df) scf_list <- lapply(scf_imp , merge , ext_df) @@ -597,7 +597,7 @@ scf_list <- lapply(scf_imp , merge , ext_df) ``` Replace all missing values in the replicate weights table with zeroes, multiply the replicate weights by the multiplication factor, then only keep the unique identifier and the final (combined) replicate weights: -```{r, results = 'hide'} +```{r results='hide', message=FALSE, warning=FALSE} scf_rw_df[is.na(scf_rw_df)] <- 0 scf_rw_df[, paste0('wgt' , 1:999)] <- @@ -608,7 +608,7 @@ scf_rw_df <- scf_rw_df[, c('yy1' , paste0('wgt' , 1:999))] Sort both the five implicates and also the replicate weights table by the unique identifier: -```{r, results = 'hide'} +```{r results='hide', message=FALSE, warning=FALSE} scf_list <- lapply(scf_list , function(w) w[order(w[, 'yy1']) ,]) @@ -617,7 +617,7 @@ scf_rw_df <- scf_rw_df[order(scf_rw_df[, 'yy1']) ,] ``` Define the design: -```{r, results = 'hide'} +```{r results='hide', message=FALSE, warning=FALSE} library(survey) library(mitools) @@ -639,13 +639,13 @@ scf_design <- Run the `convey_prep()` function on the full design: -```{r, results = 'hide'} +```{r results='hide', message=FALSE, warning=FALSE} scf_design$designs <- lapply(scf_design$designs , convey_prep) ``` This example matches the "Table 4" tab's cell Y6 of the [Excel Based on Public 
@@ -639,13 +639,13 @@ scf_design <-
 
 Run the `convey_prep()` function on the full design:
 
-```{r, results = 'hide'}
+```{r results='hide', message=FALSE, warning=FALSE}
 scf_design$designs <- lapply(scf_design$designs , convey_prep)
 ```
 
 This example matches the "Table 4" tab's cell Y6 of the [Excel Based on Public
 Data](https://www.federalreserve.gov/econres/files/scf2022_tables_public_nominal_historical.xlsx):
-```{r, results = 'hide'}
+```{r results='hide', message=FALSE, warning=FALSE}
 mean_net_worth <-
   scf_MIcombine(with(scf_design , svymean(~ networth)))
 
@@ -653,13 +653,13 @@ stopifnot(round(coef(mean_net_worth) / 1000 , 1) == 1059.5)
 ```
 
 This example comes within $500 of the standard error of mean net worth from Table 2 of the [Federal Reserve Bulletin](https://www.federalreserve.gov/publications/files/scf23.pdf#page=18), reflecting the minor differences between the [Internal Data](https://www.federalreserve.gov/econres/files/scf2022_tables_internal_nominal_historical.xlsx) and [Public Data](https://www.federalreserve.gov/econres/files/scf2022_tables_public_nominal_historical.xlsx):
-```{r, results = 'hide'}
+```{r results='hide', message=FALSE, warning=FALSE}
 stopifnot(abs(23.2 - round(SE(mean_net_worth) / 1000 , 1)) < 0.5)
 ```
 
 This example matches the "Table 4" tab's cell X6 of the [Excel Based on Public
 Data](https://www.federalreserve.gov/econres/files/scf2022_tables_public_nominal_historical.xlsx):
-```{r, results = 'hide'}
+```{r results='hide', message=FALSE, warning=FALSE}
 # compute quantile with all five implicates stacked (not the recommended technique)
 fake_design <-
   svydesign(~ 1 , data = ext_df[c('networth' , 'wgt')] , weights = ~ wgt)
@@ -677,7 +677,7 @@ stopifnot(round(coef(median_net_worth_incorrect_errors) / 1000 , 2) == 192.7)
 ### Analysis Examples with the `survey` library
 
 Add new columns to the data set:
-```{r, results = 'hide'}
+```{r results='hide', message=FALSE, warning=FALSE}
 scf_design <-
   update(
     scf_design ,
@@ -707,14 +707,14 @@ scf_design <-
 ```
 
 Count the unweighted number of records in the survey sample, overall and by groups:
-```{r, results = 'hide'}
+```{r results='hide', message=FALSE, warning=FALSE}
 scf_MIcombine(with(scf_design , svyby(~ five , ~ five , unwtd.count)))
 
 scf_MIcombine(with(scf_design , svyby(~ five , ~ hhsex , unwtd.count)))
 ```
 
 Count the weighted size of the generalizable population, overall and by groups:
-```{r, results = 'hide'}
+```{r results='hide', message=FALSE, warning=FALSE}
 scf_MIcombine(with(scf_design , svytotal(~ five)))
 
 scf_MIcombine(with(scf_design ,
@@ -722,7 +722,7 @@ scf_MIcombine(with(scf_design ,
 ```
 
 Calculate the mean (average) of a linear variable, overall and by groups:
-```{r, results = 'hide'}
+```{r results='hide', message=FALSE, warning=FALSE}
 scf_MIcombine(with(scf_design , svymean(~ networth)))
 
 scf_MIcombine(with(scf_design ,
@@ -730,7 +730,7 @@ scf_MIcombine(with(scf_design ,
 ```
 
 Calculate the distribution of a categorical variable, overall and by groups:
-```{r, results = 'hide'}
+```{r results='hide', message=FALSE, warning=FALSE}
 scf_MIcombine(with(scf_design , svymean(~ edcl)))
 
 scf_MIcombine(with(scf_design ,
@@ -738,7 +738,7 @@ scf_MIcombine(with(scf_design ,
 ```
 
 Calculate the sum of a linear variable, overall and by groups:
-```{r, results = 'hide'}
+```{r results='hide', message=FALSE, warning=FALSE}
 scf_MIcombine(with(scf_design , svytotal(~ networth)))
 
 scf_MIcombine(with(scf_design ,
@@ -746,7 +746,7 @@ scf_MIcombine(with(scf_design ,
 ```
 
 Calculate the weighted sum of a categorical variable, overall and by groups:
-```{r, results = 'hide'}
+```{r results='hide', message=FALSE, warning=FALSE}
 scf_MIcombine(with(scf_design , svytotal(~ edcl)))
 
 scf_MIcombine(with(scf_design ,
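The diff's three-line context window cuts several of the grouped calls above off mid-expression. As a hedged sketch of the full shape those calls take, reusing the `hhsex` grouping variable that appears earlier in this file:

```r
# hedged sketch: each grouped estimate runs svyby() inside every implicate,
# then pools the five sets of group-level results with scf_MIcombine()
scf_MIcombine(with(scf_design ,
  svyby(~ networth , ~ hhsex , svymean)))
```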
@@ -754,7 +754,7 @@ scf_MIcombine(with(scf_design ,
 ```
 Calculate the median (50th percentile) of a linear variable, overall and by
 groups:
-```{r, results = 'hide'}
+```{r results='hide', message=FALSE, warning=FALSE}
 scf_MIcombine(with(
   scf_design ,
   svyquantile(~ networth ,
@@ -776,7 +776,7 @@ scf_MIcombine(with(
 ```
 
 Estimate a ratio:
-```{r, results = 'hide'}
+```{r results='hide', message=FALSE, warning=FALSE}
 scf_MIcombine(with(
   scf_design ,
   svyratio(numerator = ~ income , denominator = ~ networth)
@@ -784,16 +784,16 @@ scf_MIcombine(with(
 ```
 
 Restrict the survey design to labor force participants:
-```{r, results = 'hide'}
+```{r results='hide', message=FALSE, warning=FALSE}
 sub_scf_design <- subset(scf_design , lf == 1)
 ```
 
 Calculate the mean (average) of this subset:
-```{r, results = 'hide'}
+```{r results='hide', message=FALSE, warning=FALSE}
 scf_MIcombine(with(sub_scf_design , svymean(~ networth)))
 ```
 
 Extract the coefficient, standard error, confidence interval, and coefficient of variation from any descriptive statistics function result, overall and by groups:
-```{r, results = 'hide'}
+```{r results='hide', message=FALSE, warning=FALSE}
 this_result <-
   scf_MIcombine(with(scf_design , svymean(~ networth)))
@@ -814,17 +814,17 @@ cv(grouped_result)
 ```
 
 Calculate the degrees of freedom of any survey design object:
-```{r, results = 'hide'}
+```{r results='hide', message=FALSE, warning=FALSE}
 degf(scf_design$designs[[1]])
 ```
 
 Calculate the complex sample survey-adjusted variance of any statistic:
-```{r, results = 'hide'}
+```{r results='hide', message=FALSE, warning=FALSE}
 scf_MIcombine(with(scf_design , svyvar(~ networth)))
 ```
 
 Include the complex sample design effect in the result for a specific statistic:
-```{r, results = 'hide'}
+```{r results='hide', message=FALSE, warning=FALSE}
 # SRS without replacement
 scf_MIcombine(with(scf_design , svymean(~ networth , deff = TRUE)))
 
@@ -836,7 +836,7 @@ scf_MIcombine(with(scf_design ,
 
 Perform a survey-weighted generalized linear model:
 
-```{r, results = 'hide'}
+```{r results='hide', message=FALSE, warning=FALSE}
 glm_result <-
   scf_MIcombine(with(scf_design ,
     svyglm(networth ~ married + edcl)))
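A hedged usage note on the regression hunk above, assuming the objects defined in this file: `scf_MIcombine()` returns a multiply-imputed result object, so the pooled fit is inspected with its `coef()` and `vcov()` methods rather than `summary()` on a single fit:

```r
# hedged sketch: inspect the pooled survey-weighted regression from above
coef(glm_result)               # pooled coefficients across the five implicates
sqrt(diag(vcov(glm_result)))   # pooled standard errors from the combined variance
```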