diff --git "a/docs/1.5-pesquisa-nacional-por-amostra-de-domic\303\255lios-cont\303\255nua-pnad-continua.html" "b/docs/1.5-pesquisa-nacional-por-amostra-de-domic\303\255lios-cont\303\255nua-pnad-continua.html" index 491ec9e..f5ecdd4 100644 --- "a/docs/1.5-pesquisa-nacional-por-amostra-de-domic\303\255lios-cont\303\255nua-pnad-continua.html" +++ "b/docs/1.5-pesquisa-nacional-por-amostra-de-domic\303\255lios-cont\303\255nua-pnad-continua.html" @@ -274,102 +274,102 @@
## Deflator year was not provided, so deflator year was set to 2022.
-
+ labels = FALSE)
+
+names(pnadc_df) <- tolower(names(pnadc_df))
Recode a number of variables:
-pnadc_df <-
- transform(
- pnadc_df ,
-
- household_id = paste0(upa , v1008 , v1014) ,
-
- deflated_labor_income = vd4019 * co2 ,
-
- deflated_other_source_income = vd4048 * co2e
- )
-
-labor_income_sum_df <-
- aggregate(
- cbind(household_deflated_labor_income = deflated_labor_income) ~ household_id ,
- data = pnadc_df[!(pnadc_df[, 'vd2002'] %in% 17:19) ,] ,
- sum ,
- na.rm = TRUE
- )
-
-other_income_sum_df <-
- aggregate(
- cbind(household_deflated_other_source_income = deflated_other_source_income) ~ household_id ,
- data = pnadc_df[!(pnadc_df[, 'vd2002'] %in% 17:19) ,] ,
- sum ,
- na.rm = TRUE
- )
-
-before_nrow <- nrow(pnadc_df)
-pnadc_df <- merge(pnadc_df , labor_income_sum_df , all.x = TRUE)
-pnadc_df <- merge(pnadc_df , other_income_sum_df , all.x = TRUE)
-stopifnot(nrow(pnadc_df) == before_nrow)
-
-pnadc_df[is.na(pnadc_df[, 'household_deflated_labor_income']) , 'household_deflated_labor_income'] <-
- 0
-pnadc_df[is.na(pnadc_df[, 'household_deflated_other_source_income']) , 'household_deflated_other_source_income'] <-
- 0
-
-
-pnadc_df <-
- transform(
- pnadc_df ,
-
- deflated_per_capita_income =
- (
- household_deflated_labor_income + household_deflated_other_source_income
- ) / vd2003
-
- )
pnadc_df <-
+ transform(
+ pnadc_df ,
+
+ household_id = paste0(upa , v1008 , v1014) ,
+
+ deflated_labor_income = vd4019 * co2 ,
+
+ deflated_other_source_income = vd4048 * co2e
+ )
+
+labor_income_sum_df <-
+ aggregate(
+ cbind(household_deflated_labor_income = deflated_labor_income) ~ household_id ,
+ data = pnadc_df[!(pnadc_df[, 'vd2002'] %in% 17:19) ,] ,
+ sum ,
+ na.rm = TRUE
+ )
+
+other_income_sum_df <-
+ aggregate(
+ cbind(household_deflated_other_source_income = deflated_other_source_income) ~ household_id ,
+ data = pnadc_df[!(pnadc_df[, 'vd2002'] %in% 17:19) ,] ,
+ sum ,
+ na.rm = TRUE
+ )
+
+before_nrow <- nrow(pnadc_df)
+pnadc_df <- merge(pnadc_df , labor_income_sum_df , all.x = TRUE)
+pnadc_df <- merge(pnadc_df , other_income_sum_df , all.x = TRUE)
+stopifnot(nrow(pnadc_df) == before_nrow)
+
+pnadc_df[is.na(pnadc_df[, 'household_deflated_labor_income']) , 'household_deflated_labor_income'] <-
+ 0
+pnadc_df[is.na(pnadc_df[, 'household_deflated_other_source_income']) , 'household_deflated_other_source_income'] <-
+ 0
+
+
+pnadc_df <-
+ transform(
+ pnadc_df ,
+
+ deflated_per_capita_income =
+ (
+ household_deflated_labor_income + household_deflated_other_source_income
+ ) / vd2003
+
+ )
Construct a complex sample survey design:
-library(survey)
-
-pnadc_design <-
- svrepdesign(
- data = pnadc_df ,
- weight = ~ v1032,
- type = "bootstrap" ,
- repweights = "v1032[0-9]{3}" ,
- mse = TRUE
- )
library(survey)
+
+pnadc_design <-
+ svrepdesign(
+ data = pnadc_df ,
+ weight = ~ v1032,
+ type = "bootstrap" ,
+ repweights = "v1032[0-9]{3}" ,
+ mse = TRUE
+ )
Run the convey_prep()
function on the full design:
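A minimal sketch of that step, assuming the pnadc_design object created above:
library(convey)

pnadc_design <- convey_prep(pnadc_design)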
Calculate the gini coefficient with per capita income:
-# https://sidra.ibge.gov.br/tabela/7435#/n1/all/v/all/p/last%201/d/v10681%203,v10682%201/l/v,p,t/resultado
-
-(
- pnadc_per_capita_gini <-
- svygini( ~ deflated_per_capita_income , pnadc_design , na.rm = TRUE)
-)
# https://sidra.ibge.gov.br/tabela/7435#/n1/all/v/all/p/last%201/d/v10681%203,v10682%201/l/v,p,t/resultado
+
+(
+ pnadc_per_capita_gini <-
+ svygini( ~ deflated_per_capita_income , pnadc_design , na.rm = TRUE)
+)
## gini SE
## deflated_per_capita_income 0.51845 0.0032
-
+
Calculate the gini coefficient with total earnings:
-# https://sidra.ibge.gov.br/tabela/7453#/n1/all/v/all/p/last%201/d/v10806%203,v10807%201/l/v,p,t/resultado
-
-(pnadc_earnings_gini <-
- svygini( ~ deflated_labor_income , pnadc_design , na.rm = TRUE))
# https://sidra.ibge.gov.br/tabela/7453#/n1/all/v/all/p/last%201/d/v10806%203,v10807%201/l/v,p,t/resultado
+
+(pnadc_earnings_gini <-
+ svygini( ~ deflated_labor_income , pnadc_design , na.rm = TRUE))
## gini SE
## deflated_labor_income 0.48606 0.0036
-
+
diff --git a/docs/1.6-survey-of-consumer-finances-scf.html b/docs/1.6-survey-of-consumer-finances-scf.html
index 88cc607..55986c2
--- a/docs/1.6-survey-of-consumer-finances-scf.html
+++ b/docs/1.6-survey-of-consumer-finances-scf.html
@@ -268,299 +268,285 @@
The SCF studies net worth across the United States by asking respondents about both active and passive income, mortgages, pensions, credit card debt, even car leases. Administered by the Board of Governors of the Federal Reserve System triennially since 1989, this complex sample survey generalizes to the civilian non-institutional population and comprehensively assesses household wealth.
This section downloads, imports, and prepares the most current microdata for analysis, then reproduces some statistics and margin of error terms from the Federal Reserve.
This survey uses a multiply-imputed variance estimation technique described in the 2004 Codebook. Most users do not need to study this function carefully. Define a function specific to only this dataset:
-scf_MIcombine <-
- function (results,
- variances,
- call = sys.call(),
- df.complete = Inf,
- ...) {
- m <- length(results)
- oldcall <- attr(results, "call")
- if (missing(variances)) {
- variances <- suppressWarnings(lapply(results, vcov))
- results <- lapply(results, coef)
- }
- vbar <- variances[[1]]
- cbar <- results[[1]]
- for (i in 2:m) {
- cbar <- cbar + results[[i]]
- # MODIFICATION:
- # vbar <- vbar + variances[[i]]
- }
- cbar <- cbar / m
- # MODIFICATION:
- # vbar <- vbar/m
- evar <- var(do.call("rbind", results))
- r <- (1 + 1 / m) * evar / vbar
- df <- (m - 1) * (1 + 1 / r) ^ 2
- if (is.matrix(df))
- df <- diag(df)
- if (is.finite(df.complete)) {
- dfobs <- ((df.complete + 1) / (df.complete + 3)) * df.complete *
- vbar / (vbar + evar)
- if (is.matrix(dfobs))
- dfobs <- diag(dfobs)
- df <- 1 / (1 / dfobs + 1 / df)
- }
- if (is.matrix(r))
- r <- diag(r)
- rval <- list(
- coefficients = cbar,
- variance = vbar + evar *
- (m + 1) / m,
- call = c(oldcall, call),
- nimp = m,
- df = df,
- missinfo = (r + 2 / (df + 3)) / (r + 1)
- )
- class(rval) <- "MIresult"
- rval
- }
scf_MIcombine <-
+ function (results,
+ variances,
+ call = sys.call(),
+ df.complete = Inf,
+ ...) {
+ m <- length(results)
+ oldcall <- attr(results, "call")
+ if (missing(variances)) {
+ variances <- suppressWarnings(lapply(results, vcov))
+ results <- lapply(results, coef)
+ }
+ vbar <- variances[[1]]
+ cbar <- results[[1]]
+ for (i in 2:m) {
+ cbar <- cbar + results[[i]]
+ # MODIFICATION:
+ # vbar <- vbar + variances[[i]]
+ }
+ cbar <- cbar / m
+ # MODIFICATION:
+ # vbar <- vbar/m
+ evar <- var(do.call("rbind", results))
+ r <- (1 + 1 / m) * evar / vbar
+ df <- (m - 1) * (1 + 1 / r) ^ 2
+ if (is.matrix(df))
+ df <- diag(df)
+ if (is.finite(df.complete)) {
+ dfobs <- ((df.complete + 1) / (df.complete + 3)) * df.complete *
+ vbar / (vbar + evar)
+ if (is.matrix(dfobs))
+ dfobs <- diag(dfobs)
+ df <- 1 / (1 / dfobs + 1 / df)
+ }
+ if (is.matrix(r))
+ r <- diag(r)
+ rval <- list(
+ coefficients = cbar,
+ variance = vbar + evar *
+ (m + 1) / m,
+ call = c(oldcall, call),
+ nimp = m,
+ df = df,
+ missinfo = (r + 2 / (df + 3)) / (r + 1)
+ )
+ class(rval) <- "MIresult"
+ rval
+ }
Define a function to download and import each stata file:
-library(haven)
-
-scf_dta_import <-
- function(this_url) {
- this_tf <- tempfile()
-
- download.file(this_url , this_tf , mode = 'wb')
-
- this_tbl <- read_dta(this_tf)
-
- this_df <- data.frame(this_tbl)
-
- file.remove(this_tf)
-
- names(this_df) <- tolower(names(this_df))
-
- this_df
- }
library(haven)
+
+scf_dta_import <-
+ function(this_url) {
+ this_tf <- tempfile()
+
+ download.file(this_url , this_tf , mode = 'wb')
+
+ this_tbl <- read_dta(this_tf)
+
+ this_df <- data.frame(this_tbl)
+
+ file.remove(this_tf)
+
+ names(this_df) <- tolower(names(this_df))
+
+ this_df
+ }
Download and import the full, summary extract, and replicate weights tables:
-scf_df <-
- scf_dta_import("https://www.federalreserve.gov/econres/files/scf2022s.zip")
-
-ext_df <-
- scf_dta_import("https://www.federalreserve.gov/econres/files/scfp2022s.zip")
-
-scf_rw_df <-
- scf_dta_import("https://www.federalreserve.gov/econres/files/scf2022rw1s.zip")
scf_df <-
+ scf_dta_import("https://www.federalreserve.gov/econres/files/scf2022s.zip")
+
+ext_df <-
+ scf_dta_import("https://www.federalreserve.gov/econres/files/scfp2022s.zip")
+
+scf_rw_df <-
+ scf_dta_import("https://www.federalreserve.gov/econres/files/scf2022rw1s.zip")
Confirm both the full public data and the summary extract contain five records per family:
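One way to run this check, assuming yy1 identifies each family:
# each family identifier should appear five times, once per implicate
stopifnot(all(table(scf_df[ , 'yy1']) == 5))
stopifnot(all(table(ext_df[ , 'yy1']) == 5))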
-
+Confirm only the primary economic unit and the five implicate identifiers overlap:
-stopifnot(all(sort(intersect(
- names(scf_df) , names(ext_df)
-)) == c('y1' , 'yy1')))
-stopifnot(all(sort(intersect(
- names(scf_df) , names(scf_rw_df)
-)) == c('y1' , 'yy1')))
-stopifnot(all(sort(intersect(
- names(ext_df) , names(scf_rw_df)
-)) == c('y1' , 'yy1')))
stopifnot(all(sort(intersect(
+ names(scf_df) , names(ext_df)
+)) == c('y1' , 'yy1')))
+stopifnot(all(sort(intersect(
+ names(scf_df) , names(scf_rw_df)
+)) == c('y1' , 'yy1')))
+stopifnot(all(sort(intersect(
+ names(ext_df) , names(scf_rw_df)
+)) == c('y1' , 'yy1')))
Remove the implicate identifier from the replicate weights table, add a column of fives for weighting:
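A sketch of those two steps; the column of fives feeds the unweighted counts and weighted totals below:
# drop the implicate identifier from the replicate weights table
scf_rw_df[ , 'y1'] <- NULL

# each interview appears across five implicates
scf_df[ , 'five'] <- 5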
-
+Construct a multiply-imputed, complex sample survey design:
Break the main table into five different implicates based on the final character of the column y1
:
library(stringr)
-
-s1_df <- scf_df[str_sub(scf_df[, 'y1'] ,-1 ,-1) == 1 ,]
-s2_df <- scf_df[str_sub(scf_df[, 'y1'] ,-1 ,-1) == 2 ,]
-s3_df <- scf_df[str_sub(scf_df[, 'y1'] ,-1 ,-1) == 3 ,]
-s4_df <- scf_df[str_sub(scf_df[, 'y1'] ,-1 ,-1) == 4 ,]
-s5_df <- scf_df[str_sub(scf_df[, 'y1'] ,-1 ,-1) == 5 ,]
library(stringr)
+
+s1_df <- scf_df[str_sub(scf_df[, 'y1'] ,-1 ,-1) == 1 ,]
+s2_df <- scf_df[str_sub(scf_df[, 'y1'] ,-1 ,-1) == 2 ,]
+s3_df <- scf_df[str_sub(scf_df[, 'y1'] ,-1 ,-1) == 3 ,]
+s4_df <- scf_df[str_sub(scf_df[, 'y1'] ,-1 ,-1) == 4 ,]
+s5_df <- scf_df[str_sub(scf_df[, 'y1'] ,-1 ,-1) == 5 ,]
Combine these into a single list
, then merge each implicate with the summary extract:
scf_imp <- list(s1_df , s2_df , s3_df , s4_df , s5_df)
-
-scf_list <- lapply(scf_imp , merge , ext_df)
scf_imp <- list(s1_df , s2_df , s3_df , s4_df , s5_df)
+
+scf_list <- lapply(scf_imp , merge , ext_df)
Replace all missing values in the replicate weights table with zeroes, multiply the replicate weights by the multiplication factor, then only keep the unique identifier and the final (combined) replicate weights:
-scf_rw_df[is.na(scf_rw_df)] <- 0
-
-scf_rw_df[, paste0('wgt' , 1:999)] <-
- scf_rw_df[, paste0('wt1b' , 1:999)] * scf_rw_df[, paste0('mm' , 1:999)]
-
-scf_rw_df <- scf_rw_df[, c('yy1' , paste0('wgt' , 1:999))]
scf_rw_df[is.na(scf_rw_df)] <- 0
+
+scf_rw_df[, paste0('wgt' , 1:999)] <-
+ scf_rw_df[, paste0('wt1b' , 1:999)] * scf_rw_df[, paste0('mm' , 1:999)]
+
+scf_rw_df <- scf_rw_df[, c('yy1' , paste0('wgt' , 1:999))]
Sort both the five implicates and also the replicate weights table by the unique identifier:
-scf_list <-
- lapply(scf_list , function(w)
- w[order(w[, 'yy1']) ,])
-
-scf_rw_df <- scf_rw_df[order(scf_rw_df[, 'yy1']) ,]
scf_list <-
+ lapply(scf_list , function(w)
+ w[order(w[, 'yy1']) ,])
+
+scf_rw_df <- scf_rw_df[order(scf_rw_df[, 'yy1']) ,]
Define the design:
-library(survey)
-library(mitools)
-
-scf_design <-
- svrepdesign(
- weights = ~ wgt ,
- repweights = scf_rw_df[,-1] ,
- data = imputationList(scf_list) ,
- scale = 1 ,
- rscales = rep(1 / 998 , 999) ,
- mse = FALSE ,
- type = "other" ,
- combined.weights = TRUE
- )
library(survey)
+library(mitools)
+
+scf_design <-
+ svrepdesign(
+ weights = ~ wgt ,
+ repweights = scf_rw_df[,-1] ,
+ data = imputationList(scf_list) ,
+ scale = 1 ,
+ rscales = rep(1 / 998 , 999) ,
+ mse = FALSE ,
+ type = "other" ,
+ combined.weights = TRUE
+ )
Run the convey_prep()
function on the full design:
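A minimal sketch, assuming the per-implicate designs live in the designs element of the svyimputationList:
library(convey)

scf_design$designs <- lapply(scf_design$designs , convey_prep)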
This example matches the “Table 4” tab’s cell Y6 of the Excel Based on Public Data:
-mean_net_worth <-
- scf_MIcombine(with(scf_design , svymean(~ networth)))
-
-stopifnot(round(coef(mean_net_worth) / 1000 , 1) == 1059.5)
mean_net_worth <-
+ scf_MIcombine(with(scf_design , svymean(~ networth)))
+
+stopifnot(round(coef(mean_net_worth) / 1000 , 1) == 1059.5)
This example comes within $500 of the standard error of mean net worth from Table 2 of the Federal Reserve Bulletin, displaying the minor differences between the Internal Data and Public Data:
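A sketch of that comparison, pulling the standard error from the multiply-imputed result above:
# standard error of mean net worth, in thousands of dollars
SE(mean_net_worth) / 1000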
-
+This example matches the “Table 4” tab’s cell X6 of the Excel Based on Public Data:
-# compute quantile with all five implicates stacked (not the recommended technique)
-fake_design <-
- svydesign(~ 1 , data = ext_df[c('networth' , 'wgt')] , weights = ~ wgt)
-
-median_net_worth_incorrect_errors <-
- svyquantile(~ networth , fake_design , 0.5)
-
-stopifnot(round(coef(median_net_worth_incorrect_errors) / 1000 , 2) == 192.7)
# compute quantile with all five implicates stacked (not the recommended technique)
+fake_design <-
+ svydesign(~ 1 , data = ext_df[c('networth' , 'wgt')] , weights = ~ wgt)
+
+median_net_worth_incorrect_errors <-
+ svyquantile(~ networth , fake_design , 0.5)
+
+stopifnot(round(coef(median_net_worth_incorrect_errors) / 1000 , 2) == 192.7)
survey
library
Add new columns to the data set:
-scf_design <-
- update(
- scf_design ,
-
- hhsex = factor(
- hhsex ,
- levels = 1:2 ,
- labels = c("male" , "female")
- ) ,
-
- married = as.numeric(married == 1) ,
-
- edcl =
- factor(
- edcl ,
- levels = 1:4 ,
- labels =
- c(
- "less than high school" ,
- "high school or GED" ,
- "some college" ,
- "college degree"
- )
- )
-
- )
scf_design <-
+ update(
+ scf_design ,
+
+ hhsex = factor(
+ hhsex ,
+ levels = 1:2 ,
+ labels = c("male" , "female")
+ ) ,
+
+ married = as.numeric(married == 1) ,
+
+ edcl =
+ factor(
+ edcl ,
+ levels = 1:4 ,
+ labels =
+ c(
+ "less than high school" ,
+ "high school or GED" ,
+ "some college" ,
+ "college degree"
+ )
+ )
+
+ )
Count the unweighted number of records in the survey sample, overall and by groups:
-scf_MIcombine(with(scf_design , svyby(~ five , ~ five , unwtd.count)))
-
-scf_MIcombine(with(scf_design , svyby(~ five , ~ hhsex , unwtd.count)))
scf_MIcombine(with(scf_design , svyby(~ five , ~ five , unwtd.count)))
+
+scf_MIcombine(with(scf_design , svyby(~ five , ~ hhsex , unwtd.count)))
Count the weighted size of the generalizable population, overall and by groups:
-scf_MIcombine(with(scf_design , svytotal(~ five)))
+scf_MIcombine(with(scf_design , svytotal(~ five)))
+
+scf_MIcombine(with(scf_design ,
+ svyby(~ five , ~ hhsex , svytotal)))
+Calculate the mean (average) of a linear variable, overall and by groups:
+scf_MIcombine(with(scf_design , svymean(~ networth)))
+
+scf_MIcombine(with(scf_design ,
+ svyby(~ networth , ~ hhsex , svymean)))
+Calculate the distribution of a categorical variable, overall and by groups:
+scf_MIcombine(with(scf_design , svymean(~ edcl)))
scf_MIcombine(with(scf_design ,
- svyby(~ five , ~ hhsex , svytotal)))
-Calculate the mean (average) of a linear variable, overall and by groups:
-
+Calculate the sum of a linear variable, overall and by groups:
+scf_MIcombine(with(scf_design , svytotal(~ networth)))
scf_MIcombine(with(scf_design ,
- svyby(~ networth , ~ hhsex , svymean)))
-Calculate the distribution of a categorical variable, overall and by groups:
-
+Calculate the weighted sum of a categorical variable, overall and by groups:
+scf_MIcombine(with(scf_design , svytotal(~ edcl)))
scf_MIcombine(with(scf_design ,
- svyby(~ edcl , ~ hhsex , svymean)))
-Calculate the sum of a linear variable, overall and by groups:
-scf_MIcombine(with(scf_design , svytotal(~ networth)))
-
-scf_MIcombine(with(scf_design ,
- svyby(~ networth , ~ hhsex , svytotal)))
-Calculate the weighted sum of a categorical variable, overall and by groups:
-scf_MIcombine(with(scf_design , svytotal(~ edcl)))
-
-scf_MIcombine(with(scf_design ,
- svyby(~ edcl , ~ hhsex , svytotal)))
+ svyby(~ edcl , ~ hhsex , svytotal)))
Calculate the median (50th percentile) of a linear variable, overall and by groups:
-scf_MIcombine(with(
- scf_design ,
- svyquantile(~ networth ,
- 0.5 , se = TRUE , interval.type = 'quantile')
-))
## Not all replicate weight designs give valid standard errors for quantiles.
-## Not all replicate weight designs give valid standard errors for quantiles.
-## Not all replicate weight designs give valid standard errors for quantiles.
-## Not all replicate weight designs give valid standard errors for quantiles.
-## Not all replicate weight designs give valid standard errors for quantiles.
-scf_MIcombine(with(
- scf_design ,
- svyby(
- ~ networth ,
- ~ hhsex ,
- svyquantile ,
- 0.5 ,
- se = TRUE ,
- interval.type = 'quantile' ,
- ci = TRUE
- )
-))
## Not all replicate weight designs give valid standard errors for quantiles.
-## Not all replicate weight designs give valid standard errors for quantiles.
-## Not all replicate weight designs give valid standard errors for quantiles.
-## Not all replicate weight designs give valid standard errors for quantiles.
-## Not all replicate weight designs give valid standard errors for quantiles.
-## Not all replicate weight designs give valid standard errors for quantiles.
-## Not all replicate weight designs give valid standard errors for quantiles.
-## Not all replicate weight designs give valid standard errors for quantiles.
-## Not all replicate weight designs give valid standard errors for quantiles.
-## Not all replicate weight designs give valid standard errors for quantiles.
+scf_MIcombine(with(
+ scf_design ,
+ svyquantile(~ networth ,
+ 0.5 , se = TRUE , interval.type = 'quantile')
+))
+
+scf_MIcombine(with(
+ scf_design ,
+ svyby(
+ ~ networth ,
+ ~ hhsex ,
+ svyquantile ,
+ 0.5 ,
+ se = TRUE ,
+ interval.type = 'quantile' ,
+ ci = TRUE
+ )
+))
Estimate a ratio:
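A sketch of a ratio estimate; the numerator and denominator variables here are only illustrative:
scf_MIcombine(with(scf_design ,
    svyratio(numerator = ~ income , denominator = ~ networth)))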
-
+Restrict the survey design to labor force participants:
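A sketch of the subset step, assuming the summary extract's lf flag marks labor force participants:
sub_scf_design <- subset(scf_design , lf == 1)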
-
+Calculate the mean (average) of this subset:
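Continuing the sketch with the subsetted design from above:
scf_MIcombine(with(sub_scf_design , svymean(~ networth)))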
-
+Extract the coefficient, standard error, confidence interval, and coefficient of variation from any descriptive statistics function result, overall and by groups:
-this_result <-
- scf_MIcombine(with(scf_design ,
- svymean(~ networth)))
-
-coef(this_result)
-SE(this_result)
-confint(this_result)
-cv(this_result)
-
-grouped_result <-
- scf_MIcombine(with(scf_design ,
- svyby(~ networth , ~ hhsex , svymean)))
-
-coef(grouped_result)
-SE(grouped_result)
-confint(grouped_result)
-cv(grouped_result)
this_result <-
+ scf_MIcombine(with(scf_design ,
+ svymean(~ networth)))
+
+coef(this_result)
+SE(this_result)
+confint(this_result)
+cv(this_result)
+
+grouped_result <-
+ scf_MIcombine(with(scf_design ,
+ svyby(~ networth , ~ hhsex , svymean)))
+
+coef(grouped_result)
+SE(grouped_result)
+confint(grouped_result)
+cv(grouped_result)
Calculate the degrees of freedom of any survey design object:
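A sketch, taking the first implicate's design from the imputation list:
degf(scf_design$designs[[1]])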
-
+Calculate the complex sample survey-adjusted variance of any statistic:
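A sketch of the variance calculation:
scf_MIcombine(with(scf_design , svyvar(~ networth)))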
-
+Include the complex sample design effect in the result for a specific statistic:
-# SRS without replacement
-scf_MIcombine(with(scf_design ,
- svymean(~ networth , deff = TRUE)))
-
-# SRS with replacement
-scf_MIcombine(with(scf_design ,
- svymean(~ networth , deff = "replace")))
# SRS without replacement
+scf_MIcombine(with(scf_design ,
+ svymean(~ networth , deff = TRUE)))
+
+# SRS with replacement
+scf_MIcombine(with(scf_design ,
+ svymean(~ networth , deff = "replace")))
Perform a survey-weighted generalized linear model:
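A sketch of a regression; the model formula is only illustrative:
glm_result <-
    scf_MIcombine(with(scf_design ,
        svyglm(networth ~ married + hhsex + edcl)))

summary(glm_result)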
- +Calculate the gini coefficient with family net worth:
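The call echoed in the printed result below:
scf_MIcombine(with(scf_design , svygini(~ networth)))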
-
+## Multiple imputation results:
## m <- length(results)
## scf_MIcombine(with(scf_design, svygini(~networth)))
@@ -570,7 +556,7 @@ 1.6.2 Family Net Worth
1.6.3 Family Income
Calculate the gini coefficient with income:
-
+
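The call echoed in the printed result below:
scf_MIcombine(with(scf_design , svygini(~ income)))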
## Multiple imputation results:
## m <- length(results)
## scf_MIcombine(with(scf_design, svygini(~income)))
diff --git a/docs/2.2-replication-based-variance-estimation.html b/docs/2.2-replication-based-variance-estimation.html
index af6857e..c5ab6fb 100644
--- a/docs/2.2-replication-based-variance-estimation.html
+++ b/docs/2.2-replication-based-variance-estimation.html
@@ -270,34 +270,34 @@ 2.2 Replication-Based Variance Es
The function bootVar
from the laeken
library (Alfons and Templ 2013), also uses the bootstrap method to estimate variances.
2.2.1 Replication Design Example
-# load libraries
-library( survey )
-library( convey )
-library( laeken ) # for the dataset
-
-# get laeken eusilc data
-data( eusilc ) ; names( eusilc ) <- tolower( names( eusilc ) )
-
-# survey design object for TSL/ifluence function variance estimation
-des_eusilc <- svydesign( ids = ~rb030 , strata = ~db040 , weights = ~rb050 , data = eusilc )
-des_eusilc <- convey_prep( des_eusilc )
-
-# influence function SE estimate for the gini index
-convey:::svygini.survey.design( ~eqincome , design = des_eusilc )
+# load libraries
+library( survey )
+library( convey )
+library( laeken ) # for the dataset
+
+# get laeken eusilc data
+data( eusilc ) ; names( eusilc ) <- tolower( names( eusilc ) )
+
+# survey design object for TSL/influence function variance estimation
+des_eusilc <- svydesign( ids = ~rb030 , strata = ~db040 , weights = ~rb050 , data = eusilc )
+des_eusilc <- convey_prep( des_eusilc )
+
+# influence function SE estimate for the gini index
+convey:::svygini.survey.design( ~eqincome , design = des_eusilc )
## gini SE
## eqincome 0.26497 0.0019
-# create survey design object for replicate-based variance estimation
-des_eusilc_rep <- as.svrepdesign( des_eusilc , type = "bootstrap" )
-des_eusilc_rep <- convey_prep( des_eusilc_rep )
-
-# replicate-based (bootstrao) SE estimate for the gini index
-# with option to keep replicates
-( gini.repstat <- convey:::svygini.svyrep.design( ~eqincome , design = des_eusilc_rep , return.replicates = TRUE ) )
+# create survey design object for replicate-based variance estimation
+des_eusilc_rep <- as.svrepdesign( des_eusilc , type = "bootstrap" )
+des_eusilc_rep <- convey_prep( des_eusilc_rep )
+
+# replicate-based (bootstrap) SE estimate for the gini index
+# with option to keep replicates
+( gini.repstat <- convey:::svygini.svyrep.design( ~eqincome , design = des_eusilc_rep , return.replicates = TRUE ) )
## gini SE
## eqincome 0.26497 0.0022
To understand how that variance is estimated, we can look at the replicates:
-
+
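A sketch of the extraction step; where the replicates are stored may depend on the convey version:
# pull the replicates kept by return.replicates = TRUE
gini.reps <- attr(gini.repstat , "replicates")

# some versions return them as a list element instead
if (is.null(gini.reps)) gini.reps <- gini.repstat$replicates

gini.reps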
## [1] 0.2628816 0.2653672 0.2663951 0.2646311 0.2660021 0.2648396 0.2633737
## [8] 0.2611656 0.2613233 0.2647174 0.2638256 0.2621616 0.2646830 0.2587950
## [15] 0.2642947 0.2651559 0.2663231 0.2673018 0.2687169 0.2671058 0.2654078
@@ -314,16 +314,16 @@ 2.2.1 Replication Design Example<
## attr(,"mse")
## [1] FALSE
These are resampling (bootstrap) replicates. The variance of these replicates gives an estimate of the gini estimator’s variance:
-# variance estimate
-des.scale <- des_eusilc_rep$scale
-meantheta <- mean( gini.reps )[[1]]
-v <- sum( ( gini.reps - meantheta )^2 ) * des.scale
-
-# SE estimate
-( gini.se <- ( sqrt( v ) ) )
+# variance estimate
+des.scale <- des_eusilc_rep$scale
+meantheta <- mean( gini.reps )[[1]]
+v <- sum( ( gini.reps - meantheta )^2 ) * des.scale
+
+# SE estimate
+( gini.se <- ( sqrt( v ) ) )
## [1] 0.002226009
-
+
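The TRUE below presumably comes from a comparison along these lines:
# confirm the manual SE matches the packaged estimate
all.equal(gini.se , as.numeric(SE(gini.repstat)))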
## [1] TRUE
diff --git a/docs/3.1-at-risk-of-poverty-threshold-svyarpt.html b/docs/3.1-at-risk-of-poverty-threshold-svyarpt.html
index 4b5ebee..44d4f4c 100644
--- a/docs/3.1-at-risk-of-poverty-threshold-svyarpt.html
+++ b/docs/3.1-at-risk-of-poverty-threshold-svyarpt.html
@@ -265,13 +265,13 @@
3.1 At Risk of Poverty Threshold (svyarpt)
-✔️ commonly used by statistical agencies in the european union working group on statistics on income & living conditions (eurostat)
-✔️ not tied to the inflation rate nor to a basket of goods or consumable products
-✔️ generic calculation that can be broadly applied to different nations or regions
-✔️ easy to understand: defaults to 60% of median income
-❌ the 60% of median income used in ARPT might appear arbitrary for non-EU analyses
-❌ does not account for the intensity/severity of poverty
-❌ not really a poverty measure, but an estimated poverty threshold/poverty line
+✔️ commonly used by statistical agencies in the european union working group on statistics on income & living conditions (eurostat)
+✔️ not tied to the inflation rate nor to a basket of goods or consumable products
+✔️ generic calculation that can be broadly applied to different nations or regions
+✔️ easy to understand: defaults to 60% of median income
+❌ the 60% of median income used in ARPT might appear arbitrary for non-EU analyses
+❌ does not account for the intensity/severity of poverty
+❌ not really a poverty measure, but an estimated poverty threshold/poverty line
The at-risk-of-poverty threshold (ARPT) is a measure used to define the people whose incomes imply a low standard of living in comparison to the general living standards. Even though some people are not below the effective poverty line, those below the ARPT can be considered “almost deprived”.
This measure is defined as \(0.6\) times the median income for the entire population:
\[
@@ -283,142 +283,142 @@ 3.1 At Risk of Poverty Threshold
3.1.1 Replication Example
The R vardpoor
package (Breidaks, Liberts, and Ivanova 2016), created by researchers at the Central Statistical Bureau of Latvia, includes an ARPT coefficient calculation using the ultimate cluster method. The example below reproduces those statistics.
Load and prepare the same data set:
-# load the convey package
-library(convey)
-
-# load the survey library
-library(survey)
-
-# load the vardpoor library
-library(vardpoor)
-
-# load the laeken library
-library(laeken)
-
-# load the synthetic EU statistics on income & living conditions
-data(eusilc)
-
-# make all column names lowercase
-names(eusilc) <- tolower(names(eusilc))
-
-# add a column with the row number
-dati <- data.table::data.table(IDd = 1:nrow(eusilc), eusilc)
-
-# calculate the arpt coefficient
-# using the R vardpoor library
-varpoord_arpt_calculation <-
- varpoord(
- # analysis variable
- Y = "eqincome",
-
- # weights variable
- w_final = "rb050",
-
- # row number variable
- ID_level1 = "IDd",
-
- # row number variable
- ID_level2 = "IDd",
-
- # strata variable
- H = "db040",
-
- N_h = NULL ,
-
- # clustering variable
- PSU = "rb030",
-
- # data.table
- dataset = dati,
-
- # arpt coefficient function
- type = "linarpt",
-
- # get linearized variable
- outp_lin = TRUE
- )
-
-
-# construct a survey.design
-# using our recommended setup
-des_eusilc <-
- svydesign(
- ids = ~ rb030 ,
- strata = ~ db040 ,
- weights = ~ rb050 ,
- data = eusilc
- )
-
-# immediately run the convey_prep function on it
-des_eusilc <- convey_prep(des_eusilc)
-
-# coefficients do match
-varpoord_arpt_calculation$all_result$value
+# load the convey package
+library(convey)
+
+# load the survey library
+library(survey)
+
+# load the vardpoor library
+library(vardpoor)
+
+# load the laeken library
+library(laeken)
+
+# load the synthetic EU statistics on income & living conditions
+data(eusilc)
+
+# make all column names lowercase
+names(eusilc) <- tolower(names(eusilc))
+
+# add a column with the row number
+dati <- data.table::data.table(IDd = 1:nrow(eusilc), eusilc)
+
+# calculate the arpt coefficient
+# using the R vardpoor library
+varpoord_arpt_calculation <-
+ varpoord(
+ # analysis variable
+ Y = "eqincome",
+
+ # weights variable
+ w_final = "rb050",
+
+ # row number variable
+ ID_level1 = "IDd",
+
+ # row number variable
+ ID_level2 = "IDd",
+
+ # strata variable
+ H = "db040",
+
+ N_h = NULL ,
+
+ # clustering variable
+ PSU = "rb030",
+
+ # data.table
+ dataset = dati,
+
+ # arpt coefficient function
+ type = "linarpt",
+
+ # get linearized variable
+ outp_lin = TRUE
+ )
+
+
+# construct a survey.design
+# using our recommended setup
+des_eusilc <-
+ svydesign(
+ ids = ~ rb030 ,
+ strata = ~ db040 ,
+ weights = ~ rb050 ,
+ data = eusilc
+ )
+
+# immediately run the convey_prep function on it
+des_eusilc <- convey_prep(des_eusilc)
+
+# coefficients do match
+varpoord_arpt_calculation$all_result$value
## [1] 10859.24
-
+
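Presumably the matching convey call, a sketch assuming the des_eusilc object above:
# convey coefficient
coef(svyarpt( ~ eqincome , des_eusilc))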
## eqincome
## 10859.24
-# linearized variables do match
-# vardpoor
-lin_arpt_varpoord <- varpoord_arpt_calculation$lin_out$lin_arpt
-# convey
-lin_arpt_convey <- attr(svyarpt( ~ eqincome , des_eusilc), "lin")
-
-# check equality
-all.equal(lin_arpt_varpoord, lin_arpt_convey)
+# linearized variables do match
+# vardpoor
+lin_arpt_varpoord <- varpoord_arpt_calculation$lin_out$lin_arpt
+# convey
+lin_arpt_convey <- attr(svyarpt( ~ eqincome , des_eusilc), "lin")
+
+# check equality
+all.equal(lin_arpt_varpoord, lin_arpt_convey)
## [1] TRUE
-
+
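The elided lines apparently contrast the two variance estimates; a sketch of the convey side:
# convey variance estimate
attr(svyarpt( ~ eqincome , des_eusilc) , 'var')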
## eqincome
## eqincome 2564.027
-
+
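And presumably the vardpoor side:
# vardpoor variance estimate
varpoord_arpt_calculation$all_result$var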
## [1] 2559.442
-
+
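Then the standard errors, presumably vardpoor first:
# vardpoor SE estimate
varpoord_arpt_calculation$all_result$se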
## [1] 50.59093
-
+
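And convey:
# convey SE estimate
SE(svyarpt( ~ eqincome , des_eusilc))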
## eqincome
## eqincome 50.63622
The variance estimate is computed by using the approximation defined in 2, while the linearized variable \(z\) is defined by 2.1. The functions convey::svyarpt
and vardpoor::linarpt
produce the same linearized variable \(z\).
However, the measures of uncertainty do not line up, because library(vardpoor)
defaults to an ultimate cluster method that can be replicated with an alternative setup of the survey.design
object.
-# within each strata, sum up the weights
-cluster_sums <-
- aggregate(eusilc$rb050 , list(eusilc$db040) , sum)
-
-# name the within-strata sums of weights the `cluster_sum`
-names(cluster_sums) <- c("db040" , "cluster_sum")
-
-# merge this column back onto the data.frame
-eusilc <- merge(eusilc , cluster_sums)
-
-# construct a survey.design
-# with the fpc using the cluster sum
-des_eusilc_ultimate_cluster <-
- svydesign(
- ids = ~ rb030 ,
- strata = ~ db040 ,
- weights = ~ rb050 ,
- data = eusilc ,
- fpc = ~ cluster_sum
- )
-
-# again, immediately run the convey_prep function on the `survey.design`
-des_eusilc_ultimate_cluster <-
- convey_prep(des_eusilc_ultimate_cluster)
-
-
-
-# matches
-stopifnot(all.equal(
- attr(svyarpt( ~ eqincome , des_eusilc_ultimate_cluster) , 'var')[1] ,
- varpoord_arpt_calculation$all_result$var
-))
-
-# matches
-stopifnot(all.equal(varpoord_arpt_calculation$all_result$se ,
- SE(
- svyarpt( ~ eqincome , des_eusilc_ultimate_cluster)
- )[1]))
+# within each stratum, sum up the weights
+cluster_sums <-
+ aggregate(eusilc$rb050 , list(eusilc$db040) , sum)
+
+# name the within-strata sums of weights the `cluster_sum`
+names(cluster_sums) <- c("db040" , "cluster_sum")
+
+# merge this column back onto the data.frame
+eusilc <- merge(eusilc , cluster_sums)
+
+# construct a survey.design
+# with the fpc using the cluster sum
+des_eusilc_ultimate_cluster <-
+ svydesign(
+ ids = ~ rb030 ,
+ strata = ~ db040 ,
+ weights = ~ rb050 ,
+ data = eusilc ,
+ fpc = ~ cluster_sum
+ )
+
+# again, immediately run the convey_prep function on the `survey.design`
+des_eusilc_ultimate_cluster <-
+ convey_prep(des_eusilc_ultimate_cluster)
+
+
+
+# matches
+stopifnot(all.equal(
+ attr(svyarpt( ~ eqincome , des_eusilc_ultimate_cluster) , 'var')[1] ,
+ varpoord_arpt_calculation$all_result$var
+))
+
+# matches
+stopifnot(all.equal(varpoord_arpt_calculation$all_result$se ,
+ SE(
+ svyarpt( ~ eqincome , des_eusilc_ultimate_cluster)
+ )[1]))
For additional usage examples of svyarpt
, type ?convey::svyarpt
in the R console.
diff --git a/docs/3.2-at-risk-of-poverty-ratio-svyarpr.html b/docs/3.2-at-risk-of-poverty-ratio-svyarpr.html
index 60cd017..73f6015 100644
--- a/docs/3.2-at-risk-of-poverty-ratio-svyarpr.html
+++ b/docs/3.2-at-risk-of-poverty-ratio-svyarpr.html
@@ -265,12 +265,12 @@
3.2 At Risk of Poverty Ratio (svyarpr)
-✔️ EU standard like ARPT, easy to understand, interpret, and implement
-✔️ proportion of individuals below ARPT -- a "companion" function that uses svyarpt() internally
-✔️ measure is easy to understand
-❌ does not account for the intensity or inequality among the poor
-❌ not very common outside of the EU
-❌ just another name for the `svyfgt( g = 0 , thresh = "relq" )`
+✔️ EU standard like ARPT, easy to understand, interpret, and implement
+✔️ proportion of individuals below ARPT -- a "companion" function that uses svyarpt() internally
+✔️ measure is easy to understand
+❌ does not account for the intensity or inequality among the poor
+❌ not very common outside of the EU
+❌ just another name for the `svyfgt( g = 0 , thresh = "relq" )`
The at-risk-of-poverty rate (ARPR) is the share of persons with an income below the at-risk-of-poverty threshold (ARPT). The logic behind this measure is that although most people below the ARPT cannot be considered “poor”, they are the ones most vulnerable to becoming poor in the event of a negative economic phenomenon like a recession.
The ARPR is a composite estimate, taking into account both the sampling error in the proportion itself and that in the ARPT estimate. The details of the linearization of the ARPR are discussed by Deville (1999) and Osier (2009).
@@ -278,270 +278,270 @@ 3.2 At Risk of Poverty Ratio (svy
3.2.1 Replication Example
The R vardpoor
package (Breidaks, Liberts, and Ivanova 2016), created by researchers at the Central Statistical Bureau of Latvia, includes an ARPR coefficient calculation using the ultimate cluster method. The example below reproduces those statistics.
Load and prepare the same data set:
-# load the convey package
-library(convey)
-
-# load the survey library
-library(survey)
-
-# load the vardpoor library
-library(vardpoor)
-
-# load the vardpoor library
-library(laeken)
-
-# load the synthetic EU statistics on income & living conditions
-data(eusilc)
-
-# make all column names lowercase
-names(eusilc) <- tolower(names(eusilc))
-
-# add a column with the row number
-dati <- data.table::data.table(IDd = 1:nrow(eusilc), eusilc)
-
-# calculate the arpr coefficient
-# using the R vardpoor library
-varpoord_arpr_calculation <-
- varpoord(
- # analysis variable
- Y = "eqincome",
-
- # weights variable
- w_final = "rb050",
-
- # row number variable
- ID_level1 = "IDd",
-
- # row number variable
- ID_level2 = "IDd",
-
- # strata variable
- H = "db040",
-
- N_h = NULL ,
-
- # clustering variable
- PSU = "rb030",
-
- # data.table
- dataset = dati,
-
- # arpr coefficient function
- type = "linarpr",
-
- # get linearized variable
- outp_lin = TRUE
-
- )
-
-
-# construct a survey.design
-# using our recommended setup
-des_eusilc <-
- svydesign(
- ids = ~ rb030 ,
- strata = ~ db040 ,
- weights = ~ rb050 ,
- data = eusilc
- )
-
-# immediately run the convey_prep function on it
-des_eusilc <- convey_prep(des_eusilc)
-
-# coefficients do match
-varpoord_arpr_calculation$all_result$value
+# load the convey package
+library(convey)
+
+# load the survey library
+library(survey)
+
+# load the vardpoor library
+library(vardpoor)
+
+# load the laeken library
+library(laeken)
+
+# load the synthetic EU statistics on income & living conditions
+data(eusilc)
+
+# make all column names lowercase
+names(eusilc) <- tolower(names(eusilc))
+
+# add a column with the row number
+dati <- data.table::data.table(IDd = 1:nrow(eusilc), eusilc)
+
+# calculate the arpr coefficient
+# using the R vardpoor library
+varpoord_arpr_calculation <-
+ varpoord(
+ # analysis variable
+ Y = "eqincome",
+
+ # weights variable
+ w_final = "rb050",
+
+ # row number variable
+ ID_level1 = "IDd",
+
+ # row number variable
+ ID_level2 = "IDd",
+
+ # strata variable
+ H = "db040",
+
+ N_h = NULL ,
+
+ # clustering variable
+ PSU = "rb030",
+
+ # data.table
+ dataset = dati,
+
+ # arpr coefficient function
+ type = "linarpr",
+
+ # get linearized variable
+ outp_lin = TRUE
+
+ )
+
+
+# construct a survey.design
+# using our recommended setup
+des_eusilc <-
+ svydesign(
+ ids = ~ rb030 ,
+ strata = ~ db040 ,
+ weights = ~ rb050 ,
+ data = eusilc
+ )
+
+# immediately run the convey_prep function on it
+des_eusilc <- convey_prep(des_eusilc)
+
+# coefficients do match
+varpoord_arpr_calculation$all_result$value
## [1] 14.44422
-
+
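Presumably the matching convey call, rescaled to percent since svyarpr returns a proportion:
# convey coefficient, as a percentage
coef(svyarpr( ~ eqincome , des_eusilc)) * 100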
## eqincome
## 14.44422
-# linearized variables do not match
-# because Fprime is the derivative wrt
-# to the estimated threshold, not the estimated quantile
-# for more details, see
-# https://github.com/ajdamico/convey/issues/372#issuecomment-1656264143
-#
-# vardpoor
-lin_arpr_varpoord <- varpoord_arpr_calculation$lin_out$lin_arpr
-# convey
-lin_arpr_convey <- attr(svyarpr( ~ eqincome , des_eusilc), "lin")
-
-# check equality
-all.equal(lin_arpr_varpoord, 100 * lin_arpr_convey)
+# linearized variables do not match
+# because Fprime is the derivative with respect
+# to the estimated threshold, not the estimated quantile
+# for more details, see
+# https://github.com/ajdamico/convey/issues/372#issuecomment-1656264143
+#
+# vardpoor
+lin_arpr_varpoord <- varpoord_arpr_calculation$lin_out$lin_arpr
+# convey
+lin_arpr_convey <- attr(svyarpr( ~ eqincome , des_eusilc), "lin")
+
+# check equality
+all.equal(lin_arpr_varpoord, 100 * lin_arpr_convey)
## [1] "Mean relative difference: 0.2264738"
-
+
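The elided lines apparently contrast the two variance estimates; a sketch of the convey side, using the same rescaling as the ultimate cluster code further below:
# convey variance estimate, rescaled
attr(svyarpr( ~ eqincome , des_eusilc) , 'var') * 10000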
## eqincome
## eqincome 0.07599778
-
+
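And presumably the vardpoor side:
varpoord_arpr_calculation$all_result$var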
## [1] 0.08718569
-
+
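Then the standard errors, presumably vardpoor first:
varpoord_arpr_calculation$all_result$se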
## [1] 0.2952722
-
+
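And convey, rescaled to percent:
SE(svyarpr( ~ eqincome , des_eusilc)) * 100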
## eqincome
## eqincome 0.2756769
The variance estimate is computed by using the approximation defined in 2, while the linearized variable \(z\) is defined by 2.1. The functions convey::svyarpr
and vardpoor::linarpr
produce the same linearized variable \(z\).
However, the measures of uncertainty do not line up. One of the reasons is that library(vardpoor)
defaults to an ultimate cluster method that can be replicated with an alternative setup of the survey.design
object.
-# within each strata, sum up the weights
-cluster_sums <-
- aggregate(eusilc$rb050 , list(eusilc$db040) , sum)
-
-# name the within-strata sums of weights the `cluster_sum`
-names(cluster_sums) <- c("db040" , "cluster_sum")
-
-# merge this column back onto the data.frame
-eusilc <- merge(eusilc , cluster_sums)
-
-# construct a survey.design
-# with the fpc using the cluster sum
-des_eusilc_ultimate_cluster <-
- svydesign(
- ids = ~ rb030 ,
- strata = ~ db040 ,
- weights = ~ rb050 ,
- data = eusilc ,
- fpc = ~ cluster_sum
- )
-
-# again, immediately run the convey_prep function on the `survey.design`
-des_eusilc_ultimate_cluster <-
- convey_prep(des_eusilc_ultimate_cluster)
-
-# does not match
-attr(svyarpr( ~ eqincome , des_eusilc_ultimate_cluster) , 'var') * 10000
+# within each stratum, sum up the weights
+cluster_sums <-
+ aggregate(eusilc$rb050 , list(eusilc$db040) , sum)
+
+# name the within-strata sums of weights the `cluster_sum`
+names(cluster_sums) <- c("db040" , "cluster_sum")
+
+# merge this column back onto the data.frame
+eusilc <- merge(eusilc , cluster_sums)
+
+# construct a survey.design
+# with the fpc using the cluster sum
+des_eusilc_ultimate_cluster <-
+ svydesign(
+ ids = ~ rb030 ,
+ strata = ~ db040 ,
+ weights = ~ rb050 ,
+ data = eusilc ,
+ fpc = ~ cluster_sum
+ )
+
+# again, immediately run the convey_prep function on the `survey.design`
+des_eusilc_ultimate_cluster <-
+ convey_prep(des_eusilc_ultimate_cluster)
+
+# does not match
+attr(svyarpr( ~ eqincome , des_eusilc_ultimate_cluster) , 'var') * 10000
## eqincome
## eqincome 0.07586194
-
+
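Presumably the vardpoor variance again, for comparison:
varpoord_arpr_calculation$all_result$var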
## [1] 0.08718569
-
+
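The vardpoor SE:
varpoord_arpr_calculation$all_result$se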
## [1] 0.2952722
-
+
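And a sketch of the convey SE under the ultimate cluster setup, rescaled to percent:
SE(svyarpr( ~ eqincome , des_eusilc_ultimate_cluster)) * 100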
## eqincome
## eqincome 0.2754305
Still, there is a difference in the estimates. This is discussed in detail in this issue.
In order to provide additional validation for our code, we proceed with a Monte Carlo experiment.
Using the eusilcP
data from the simPop
package (Templ et al. 2017), we can compute the actual value of the at risk of poverty rate for that population:
-# load libraries
-library(sampling)
-library(survey)
-library(convey)
-library(parallel)
-
-# load pseudo population data
-data("eusilcP" , package = "simPop")
-
-# compute population median
-q50.pop <-
- convey:::computeQuantiles(eusilcP$eqIncome , rep(1 , length(eusilcP$eqIncome)) , .50)
-
-# compute population poverty threshold
-# as 60% of the median
-thresh.pop <- .60 * q50.pop
-
-# compute population at risk of poverty rate
-(theta.pop <-
- mean(eusilcP$eqIncome <= thresh.pop , na.rm = TRUE))
+# load libraries
+library(sampling)
+library(survey)
+library(convey)
+library(parallel)
+
+# load pseudo population data
+data("eusilcP" , package = "simPop")
+
+# compute population median
+q50.pop <-
+ convey:::computeQuantiles(eusilcP$eqIncome , rep(1 , length(eusilcP$eqIncome)) , .50)
+
+# compute population poverty threshold
+# as 60% of the median
+thresh.pop <- .60 * q50.pop
+
+# compute population at risk of poverty rate
+(theta.pop <-
+ mean(eusilcP$eqIncome <= thresh.pop , na.rm = TRUE))
## [1] 0.1469295
Now, to study the distribution of the estimator under a particular sampling design, we select 5000 samples under one-stage cluster sampling of 1000 households using the cluster
function from the sampling
package (Tillé and Matei 2021), and use the svyarpr
function to estimate the ARPR for each of those samples:
-# define the number of monte carlo replicates
-mc.rep <- 5000L
-
-# simulation function
-arpr_sim_fun <- function(this.iter) {
-
- set.seed(this.iter)
-
-
- library(survey)
- library(convey)
- library(sampling)
-
- # load pseudo population data
- data("eusilcP" , package = "simPop")
-
- # compute size-like variable for PPS sampling design
- eusilcP$aux <-
- log(ifelse(eusilcP$eqIncome >= 1000 , eusilcP$eqIncome , 1000))
-
-
- # select sample
- tt <-
- sampling::cluster(
- data = eusilcP[sample.int(nrow(eusilcP) , nrow(eusilcP) , replace = FALSE) , ] ,
- clustername = "hid" ,
- size = 1000L ,
- method = "systematic" ,
- pik = eusilcP$aux
- )
-
- # collect data
- this.sample <- getdata(eusilcP , tt)
-
- # create survey design object
- this.desobj <-
- svydesign(
- ids = ~ hid ,
- probs = ~ Prob ,
- data = this.sample ,
- nest = FALSE
- )
-
- # prepare for convey functions
- this.desobj <- convey_prep(this.desobj)
-
- # compute estimates
- svyarpr( ~ eqIncome , this.desobj)
-
-}
-
-# run replications
-cl <- makeCluster(detectCores() - 1)
-
-arpr.estimate.list <-
- clusterApply(cl, seq_len(mc.rep) , arpr_sim_fun)
-
-stopCluster(cl)
+# define the number of monte carlo replicates
+mc.rep <- 5000L
+
+# simulation function
+arpr_sim_fun <- function(this.iter) {
+
+ set.seed(this.iter)
+
+
+ library(survey)
+ library(convey)
+ library(sampling)
+
+ # load pseudo population data
+ data("eusilcP" , package = "simPop")
+
+ # compute size-like variable for PPS sampling design
+ eusilcP$aux <-
+ log(ifelse(eusilcP$eqIncome >= 1000 , eusilcP$eqIncome , 1000))
+
+
+ # select sample
+ tt <-
+ sampling::cluster(
+ data = eusilcP[sample.int(nrow(eusilcP) , nrow(eusilcP) , replace = FALSE) , ] ,
+ clustername = "hid" ,
+ size = 1000L ,
+ method = "systematic" ,
+ pik = eusilcP$aux
+ )
+
+ # collect data
+ this.sample <- getdata(eusilcP , tt)
+
+ # create survey design object
+ this.desobj <-
+ svydesign(
+ ids = ~ hid ,
+ probs = ~ Prob ,
+ data = this.sample ,
+ nest = FALSE
+ )
+
+ # prepare for convey functions
+ this.desobj <- convey_prep(this.desobj)
+
+ # compute estimates
+ svyarpr( ~ eqIncome , this.desobj)
+
+}
+
+# run replications
+cl <- makeCluster(detectCores() - 1)
+
+arpr.estimate.list <-
+ clusterApply(cl, seq_len(mc.rep) , arpr_sim_fun)
+
+stopCluster(cl)
Then, we evaluate the Percentage Relative Bias (PRB) of the ARPR estimator. Under this scenario, the PRB of the ARPR estimator is -0.19830%.
-# estimate the expected value of the ARPR estimator
-# using the average of the estimates
-(theta.exp <- mean(sapply(arpr.estimate.list , coef)))
+# estimate the expected value of the ARPR estimator
+# using the average of the estimates
+(theta.exp <- mean(sapply(arpr.estimate.list , coef)))
## [1] 0.1466381
-# estimate the percentage relative bias
-(percentage_relative_bias_arpr <- 100 * (theta.exp / theta.pop - 1))
+# estimate the percentage relative bias
+(percentage_relative_bias_arpr <- 100 * (theta.exp / theta.pop - 1))
## [1] -0.1982981
-
+
For the variance estimator, we have:
-# estimate the variance of the ARPR estimator
-# using the empirical variance of the estimates
-(vartheta.popest <- var(sapply(arpr.estimate.list , coef)))
+# estimate the variance of the ARPR estimator
+# using the empirical variance of the estimates
+(vartheta.popest <- var(sapply(arpr.estimate.list , coef)))
## [1] 0.0001048728
-# estimate the expected value of the ARPR variance estimator
-# using the (estimated) expected value of the variance estimates
-(vartheta.exp <- mean(sapply(arpr.estimate.list , vcov)))
+# estimate the expected value of the ARPR variance estimator
+# using the (estimated) expected value of the variance estimates
+(vartheta.exp <- mean(sapply(arpr.estimate.list , vcov)))
## [1] 0.0001081994
-# estimate the percentage relative bias of the variance estimator
-( percentage_relative_bias_variance <- 100 * (vartheta.exp / vartheta.popest - 1) )
+# estimate the percentage relative bias of the variance estimator
+( percentage_relative_bias_variance <- 100 * (vartheta.exp / vartheta.popest - 1) )
## [1] 3.172023
-
+
Under this scenario, the PRB of the ARPR variance estimator is 3.1720%.
Our simulation shows that the Bias Ratio of this estimator is approximately 2%:
-
+
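A sketch of that calculation, taking the bias ratio as absolute bias over the empirical sampling standard error, using quantities computed above:
# bias ratio, in percent
100 * abs(theta.exp - theta.pop) / sqrt(vartheta.popest)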
## [1] 2.84509
If the normal approximation holds, a small bias ratio still allows for approximately valid estimates of the confidence intervals.
Next, we evaluate the Percentage Coverage Rate (PCR). In theory, under repeated sampling, the estimated 95% CIs should cover the population parameter approximately 95% of the time. We can evaluate that using:
-# estimate confidence intervals of the ARPR
-# for each of the samples
-est.coverage <-
- sapply(arpr.estimate.list, function(this.stat)
- confint(this.stat)[, 1] <= theta.pop &
- confint(this.stat)[, 2] >= theta.pop)
-
-# evaluate empirical coverage
-(empirical_coverage <- mean(est.coverage))
+# estimate confidence intervals of the ARPR
+# for each of the samples
+est.coverage <-
+ sapply(arpr.estimate.list, function(this.stat)
+ confint(this.stat)[, 1] <= theta.pop &
+ confint(this.stat)[, 2] >= theta.pop)
+
+# evaluate empirical coverage
+(empirical_coverage <- mean(est.coverage))
## [1] 0.9516
-
+
Our coverages are not too far from the nominal coverage level of 95%.
For additional usage examples of svyarpr
, type ?convey::svyarpr
in the R console.
diff --git a/docs/3.3-relative-median-income-ratio-svyrmir.html b/docs/3.3-relative-median-income-ratio-svyrmir.html
index 2dec577..928e722 100644
--- a/docs/3.3-relative-median-income-ratio-svyrmir.html
+++ b/docs/3.3-relative-median-income-ratio-svyrmir.html
@@ -265,15 +265,15 @@
3.3 Relative Median Income Ratio (svyrmir)
-✔️ mainly useful for studies of the income of the elderly following EU definitions
-✔️ a ratio of medians
-✔️ less sensitive to outliers
-❌ solely a measure of medians and does not fully account for the income distribution
-❌ not very common outside of the EU
-❌ hard to interpret
-❌ not (necessarily) a dependency measure
-❌ not an inequality measure (fails the Pigou-Dalton principle)
-❌ not (exactly) a poverty measure (fails the poverty-focus axiom)
+✔️ mainly useful for studies of the income of the elderly following EU definitions
+✔️ a ratio of medians
+✔️ less sensitive to outliers
+❌ solely a measure of medians and does not fully account for the income distribution
+❌ not very common outside of the EU
+❌ hard to interpret
+❌ not (necessarily) a dependency measure
+❌ not an inequality measure (fails the Pigou-Dalton principle)
+❌ not (exactly) a poverty measure (fails the poverty-focus axiom)
The relative median income ratio (RMIR) is the ratio of the median income of people older than a given age (65 here) to the median income of people at or below that age. In mathematical terms,
\[
rmir = \frac{median\{y_i; age_i >65 \}}{median\{y_i; age_i \leq 65 \}}.
@@ -284,149 +284,149 @@ 3.3 Relative Median Income Ratio
3.3.1 Replication Example
The R vardpoor
package (Breidaks, Liberts, and Ivanova 2016), created by researchers at the Central Statistical Bureau of Latvia, includes an RMIR coefficient calculation using the ultimate cluster method. The example below reproduces those statistics.
Load and prepare the same data set:
-# load the convey package
-library(convey)
-
-# load the survey library
-library(survey)
-
-# load the vardpoor library
-library(vardpoor)
-
-# load the vardpoor library
-library(laeken)
-
-# load the synthetic EU statistics on income & living conditions
-data(eusilc)
-
-# make all column names lowercase
-names(eusilc) <- tolower(names(eusilc))
-
-# add a column with the row number
-dati <- data.table::data.table(IDd = 1:nrow(eusilc), eusilc)
-
-# calculate the rmir coefficient
-# using the R vardpoor library
-varpoord_rmir_calculation <-
- varpoord(
- # analysis variable
- Y = "eqincome",
-
- # weights variable
- w_final = "rb050",
-
- # row number variable
- ID_level1 = "IDd",
-
- # row number variable
- ID_level2 = "IDd",
-
- # strata variable
- H = "db040",
-
- N_h = NULL ,
-
- # clustering variable
- PSU = "rb030",
-
- # data.table
- dataset = dati,
-
- # age variable
- age = "age",
-
- # rmir coefficient function
- type = "linrmir",
-
- # get linearized variable
- outp_lin = TRUE
-
- )
-
-
-
-# construct a survey.design
-# using our recommended setup
-des_eusilc <-
- svydesign(
- ids = ~ rb030 ,
- strata = ~ db040 ,
- weights = ~ rb050 ,
- data = eusilc
- )
-
-# immediately run the convey_prep function on it
-des_eusilc <- convey_prep(des_eusilc)
-
-# coefficients do match
-varpoord_rmir_calculation$all_result$value
+# load the convey package
+library(convey)
+
+# load the survey library
+library(survey)
+
+# load the vardpoor library
+library(vardpoor)
+
+# load the laeken library
+library(laeken)
+
+# load the synthetic EU statistics on income & living conditions
+data(eusilc)
+
+# make all column names lowercase
+names(eusilc) <- tolower(names(eusilc))
+
+# add a column with the row number
+dati <- data.table::data.table(IDd = 1:nrow(eusilc), eusilc)
+
+# calculate the rmir coefficient
+# using the R vardpoor library
+varpoord_rmir_calculation <-
+ varpoord(
+ # analysis variable
+ Y = "eqincome",
+
+ # weights variable
+ w_final = "rb050",
+
+ # row number variable
+ ID_level1 = "IDd",
+
+ # row number variable
+ ID_level2 = "IDd",
+
+ # strata variable
+ H = "db040",
+
+ N_h = NULL ,
+
+ # clustering variable
+ PSU = "rb030",
+
+ # data.table
+ dataset = dati,
+
+ # age variable
+ age = "age",
+
+ # rmir coefficient function
+ type = "linrmir",
+
+ # get linearized variable
+ outp_lin = TRUE
+
+ )
+
+
+
+# construct a survey.design
+# using our recommended setup
+des_eusilc <-
+ svydesign(
+ ids = ~ rb030 ,
+ strata = ~ db040 ,
+ weights = ~ rb050 ,
+ data = eusilc
+ )
+
+# immediately run the convey_prep function on it
+des_eusilc <- convey_prep(des_eusilc)
+
+# coefficients do match
+varpoord_rmir_calculation$all_result$value
## [1] 0.9330361
-
+
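Presumably the matching convey call, a sketch assuming the des_eusilc object above:
# convey coefficient
coef(svyrmir( ~ eqincome , des_eusilc , age = ~ age))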
## eqincome
## 0.9330361
-# linearized variables do match
-# vardpoor
-lin_rmir_varpoord <- varpoord_rmir_calculation$lin_out$lin_rmir
-# convey
-lin_rmir_convey <-
- attr(svyrmir( ~ eqincome , des_eusilc, age = ~ age), "lin")
-
-# check equality
-all.equal(lin_rmir_varpoord, lin_rmir_convey[, 1])
+# linearized variables do match
+# vardpoor
+lin_rmir_varpoord <- varpoord_rmir_calculation$lin_out$lin_rmir
+# convey
+lin_rmir_convey <-
+ attr(svyrmir( ~ eqincome , des_eusilc, age = ~ age), "lin")
+
+# check equality
+all.equal(lin_rmir_varpoord, lin_rmir_convey[, 1])
## [1] TRUE
-
+
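The elided lines apparently contrast the two variance estimates; a sketch of the convey side:
attr(svyrmir( ~ eqincome , des_eusilc , age = ~ age) , 'var')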
## eqincome
## eqincome 0.000127444
-
+
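And presumably the vardpoor side:
varpoord_rmir_calculation$all_result$var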
## [1] 0.0001272137
-
+
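Then the standard errors, presumably vardpoor first:
varpoord_rmir_calculation$all_result$se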
## [1] 0.0112789
-
+
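And convey:
SE(svyrmir( ~ eqincome , des_eusilc , age = ~ age))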
## eqincome
## eqincome 0.01128911
The variance estimate is computed by using the approximation defined in 2, while the linearized variable \(z\) is defined by 2.1. The functions convey::svyrmir
and vardpoor::linrmir
produce the same linearized variable \(z\).
However, the measures of uncertainty do not line up, because library(vardpoor)
defaults to an ultimate cluster method that can be replicated with an alternative setup of the survey.design
object.
-# within each strata, sum up the weights
-cluster_sums <-
- aggregate(eusilc$rb050 , list(eusilc$db040) , sum)
-
-# name the within-strata sums of weights the `cluster_sum`
-names(cluster_sums) <- c("db040" , "cluster_sum")
-
-# merge this column back onto the data.frame
-eusilc <- merge(eusilc , cluster_sums)
-
-# construct a survey.design
-# with the fpc using the cluster sum
-des_eusilc_ultimate_cluster <-
- svydesign(
- ids = ~ rb030 ,
- strata = ~ db040 ,
- weights = ~ rb050 ,
- data = eusilc ,
- fpc = ~ cluster_sum
- )
-
-# again, immediately run the convey_prep function on the `survey.design`
-des_eusilc_ultimate_cluster <-
- convey_prep(des_eusilc_ultimate_cluster)
-
-# matches
-stopifnot(all.equal(
- attr(
- svyrmir(~ eqincome , des_eusilc_ultimate_cluster , age = ~ age) ,
- 'var'
- )[1],
- varpoord_rmir_calculation$all_result$var
-))
-
-
-# matches
-stopifnot(all.equal(SE(
- svyrmir(~ eqincome , des_eusilc_ultimate_cluster, age = ~ age)
-)[1], varpoord_rmir_calculation$all_result$se))
+# within each stratum, sum up the weights
+cluster_sums <-
+ aggregate(eusilc$rb050 , list(eusilc$db040) , sum)
+
+# name the within-strata sums of weights the `cluster_sum`
+names(cluster_sums) <- c("db040" , "cluster_sum")
+
+# merge this column back onto the data.frame
+eusilc <- merge(eusilc , cluster_sums)
+
+# construct a survey.design
+# with the fpc using the cluster sum
+des_eusilc_ultimate_cluster <-
+ svydesign(
+ ids = ~ rb030 ,
+ strata = ~ db040 ,
+ weights = ~ rb050 ,
+ data = eusilc ,
+ fpc = ~ cluster_sum
+ )
+
+# again, immediately run the convey_prep function on the `survey.design`
+des_eusilc_ultimate_cluster <-
+ convey_prep(des_eusilc_ultimate_cluster)
+
+# matches
+stopifnot(all.equal(
+ attr(
+ svyrmir(~ eqincome , des_eusilc_ultimate_cluster , age = ~ age) ,
+ 'var'
+ )[1],
+ varpoord_rmir_calculation$all_result$var
+))
+
+
+# matches
+stopifnot(all.equal(SE(
+ svyrmir(~ eqincome , des_eusilc_ultimate_cluster, age = ~ age)
+)[1], varpoord_rmir_calculation$all_result$se))
For additional usage examples of svyrmir
, type ?convey::svyrmir
in the R console.
diff --git a/docs/3.4-relative-median-poverty-gap-svyrmpg.html b/docs/3.4-relative-median-poverty-gap-svyrmpg.html
index aedeaa6..58c75b4 100644
--- a/docs/3.4-relative-median-poverty-gap-svyrmpg.html
+++ b/docs/3.4-relative-median-poverty-gap-svyrmpg.html
@@ -265,11 +265,11 @@
3.4 Relative Median Poverty Gap (svyrmpg)
-✔️ how poor are those below the ARPT?
-✔️ median poverty gap expressed as a percentage of the threshold
-✔️ useful for understanding the depth of poverty
-❌ not common outside of the EU
-❌ not immediately interpretable in terms of income
+✔️ how poor are those below the ARPT?
+✔️ median poverty gap expressed as a percentage of the threshold
+✔️ useful for understanding the depth of poverty
+❌ not common outside of the EU
+❌ not immediately interpretable in terms of income
The relative median poverty gap (RMPG) is the relative difference between the median income of people having income below the ARPT and the ARPT itself:
\[
rmpg = \frac{median\{y_i, y_i<arpt\}-arpt}{arpt}
@@ -280,144 +280,144 @@ 3.4 Relative Median Poverty Gap (
3.4.1 Replication Example
The R vardpoor
package (Breidaks, Liberts, and Ivanova 2016Breidaks, Juris, Martins Liberts, and Santa Ivanova. 2016. “Vardpoor: Estimation of Indicators on Social Exclusion and Poverty and Its Linearization, Variance Estimation.” Riga, Latvia: CSB.), created by researchers at the Central Statistical Bureau of Latvia, includes a RMPG coefficient calculation using the ultimate cluster method. The example below reproduces those statistics.
Load and prepare the same data set:
-# load the convey package
-library(convey)
-
-# load the survey library
-library(survey)
-
-# load the vardpoor library
-library(vardpoor)
-
-# load the laeken library
-library(laeken)
-
-# load the synthetic EU statistics on income & living conditions
-data(eusilc)
-
-# make all column names lowercase
-names(eusilc) <- tolower(names(eusilc))
-
-# add a column with the row number
-dati <- data.table::data.table(IDd = 1:nrow(eusilc), eusilc)
-
-# calculate the rmpg coefficient
-# using the R vardpoor library
-varpoord_rmpg_calculation <-
- varpoord(
- # analysis variable
- Y = "eqincome",
-
- # weights variable
- w_final = "rb050",
-
- # row number variable
- ID_level1 = "IDd",
-
- # row number variable
- ID_level2 = "IDd",
-
- # strata variable
- H = "db040",
-
- N_h = NULL ,
-
- # clustering variable
- PSU = "rb030",
-
- # data.table
- dataset = dati,
-
- # rmpg coefficient function
- type = "linrmpg",
-
- # get linearized variable
- outp_lin = TRUE
-
- )
-
-
-
-# construct a survey.design
-# using our recommended setup
-des_eusilc <-
- svydesign(
- ids = ~ rb030 ,
- strata = ~ db040 ,
- weights = ~ rb050 ,
- data = eusilc
- )
-
-# immediately run the convey_prep function on it
-des_eusilc <- convey_prep(des_eusilc)
-
-# coefficients do match
-varpoord_rmpg_calculation$all_result$value
+# load the convey package
+library(convey)
+
+# load the survey library
+library(survey)
+
+# load the vardpoor library
+library(vardpoor)
+
+# load the laeken library
+library(laeken)
+
+# load the synthetic EU statistics on income & living conditions
+data(eusilc)
+
+# make all column names lowercase
+names(eusilc) <- tolower(names(eusilc))
+
+# add a column with the row number
+dati <- data.table::data.table(IDd = 1:nrow(eusilc), eusilc)
+
+# calculate the rmpg coefficient
+# using the R vardpoor library
+varpoord_rmpg_calculation <-
+ varpoord(
+ # analysis variable
+ Y = "eqincome",
+
+ # weights variable
+ w_final = "rb050",
+
+ # row number variable
+ ID_level1 = "IDd",
+
+ # row number variable
+ ID_level2 = "IDd",
+
+ # strata variable
+ H = "db040",
+
+ N_h = NULL ,
+
+ # clustering variable
+ PSU = "rb030",
+
+ # data.table
+ dataset = dati,
+
+ # rmpg coefficient function
+ type = "linrmpg",
+
+ # get linearized variable
+ outp_lin = TRUE
+
+ )
+
+
+
+# construct a survey.design
+# using our recommended setup
+des_eusilc <-
+ svydesign(
+ ids = ~ rb030 ,
+ strata = ~ db040 ,
+ weights = ~ rb050 ,
+ data = eusilc
+ )
+
+# immediately run the convey_prep function on it
+des_eusilc <- convey_prep(des_eusilc)
+
+# coefficients do match
+varpoord_rmpg_calculation$all_result$value
## [1] 18.9286
-
+
## eqincome
## 18.9286
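The matching convey estimate printed above was presumably produced by a call along these lines, scaled to percent like the checks further below:
# sketch (an assumption): svyrmpg returns a proportion, hence the 100x scaling
coef(svyrmpg(~ eqincome , des_eusilc)) * 100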
-# linearized variables do match
-# vardpoor
-lin_rmpg_varpoord <- varpoord_rmpg_calculation$lin_out$lin_rmpg
-# convey
-lin_rmpg_convey <- attr(svyrmpg( ~ eqincome , des_eusilc), "lin")
-
-# check equality
-all.equal(lin_rmpg_varpoord, 100 * lin_rmpg_convey[, 1])
+# linearized variables do match
+# vardpoor
+lin_rmpg_varpoord <- varpoord_rmpg_calculation$lin_out$lin_rmpg
+# convey
+lin_rmpg_convey <- attr(svyrmpg( ~ eqincome , des_eusilc), "lin")
+
+# check equality
+all.equal(lin_rmpg_varpoord, 100 * lin_rmpg_convey[, 1])
## [1] TRUE
-
+
## eqincome
## eqincome 0.332234
-
+
## [1] 0.3316454
-
+
## [1] 0.5758866
-
+
## eqincome
## eqincome 0.5763974
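These four outputs compare the variance and standard-error estimates across the two packages. The hidden chunks were presumably along these lines, using the same 10000x and 100x scaling factors as the stopifnot() checks below:
# sketch of the hidden chunks (an assumption based on the outputs above)
# convey variance (scaled), then vardpoor variance
attr(svyrmpg(~ eqincome , des_eusilc) , 'var') * 10000
varpoord_rmpg_calculation$all_result$var
# vardpoor standard error, then convey standard error (scaled)
varpoord_rmpg_calculation$all_result$se
SE(svyrmpg(~ eqincome , des_eusilc)) * 100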
The variance estimate is computed by using the approximation defined in 2, while the linearized variable \(z\) is defined by 2.1. The functions convey::svyrmpg
and vardpoor::linrmpg
produce the same linearized variable \(z\).
However, the measures of uncertainty do not line up, because library(vardpoor)
defaults to an ultimate cluster method that can be replicated with an alternative setup of the survey.design
object.
-# within each strata, sum up the weights
-cluster_sums <-
- aggregate(eusilc$rb050 , list(eusilc$db040) , sum)
-
-# name the within-strata sums of weights the `cluster_sum`
-names(cluster_sums) <- c("db040" , "cluster_sum")
-
-# merge this column back onto the data.frame
-eusilc <- merge(eusilc , cluster_sums)
-
-# construct a survey.design
-# with the fpc using the cluster sum
-des_eusilc_ultimate_cluster <-
- svydesign(
- ids = ~ rb030 ,
- strata = ~ db040 ,
- weights = ~ rb050 ,
- data = eusilc ,
- fpc = ~ cluster_sum
- )
-
-# again, immediately run the convey_prep function on the `survey.design`
-des_eusilc_ultimate_cluster <-
- convey_prep(des_eusilc_ultimate_cluster)
-
-
-
-# matches
-stopifnot(all.equal(
- attr(svyrmpg( ~ eqincome , des_eusilc_ultimate_cluster) , 'var')[1] * 10000 ,
- varpoord_rmpg_calculation$all_result$var
-))
-
-# matches
-stopifnot(all.equal(SE(
- svyrmpg(~ eqincome , des_eusilc_ultimate_cluster)
-)[1] * 100 ,
-varpoord_rmpg_calculation$all_result$se))
+# within each strata, sum up the weights
+cluster_sums <-
+ aggregate(eusilc$rb050 , list(eusilc$db040) , sum)
+
+# name the within-strata sums of weights the `cluster_sum`
+names(cluster_sums) <- c("db040" , "cluster_sum")
+
+# merge this column back onto the data.frame
+eusilc <- merge(eusilc , cluster_sums)
+
+# construct a survey.design
+# with the fpc using the cluster sum
+des_eusilc_ultimate_cluster <-
+ svydesign(
+ ids = ~ rb030 ,
+ strata = ~ db040 ,
+ weights = ~ rb050 ,
+ data = eusilc ,
+ fpc = ~ cluster_sum
+ )
+
+# again, immediately run the convey_prep function on the `survey.design`
+des_eusilc_ultimate_cluster <-
+ convey_prep(des_eusilc_ultimate_cluster)
+
+
+
+# matches
+stopifnot(all.equal(
+ attr(svyrmpg( ~ eqincome , des_eusilc_ultimate_cluster) , 'var')[1] * 10000 ,
+ varpoord_rmpg_calculation$all_result$var
+))
+
+# matches
+stopifnot(all.equal(SE(
+ svyrmpg(~ eqincome , des_eusilc_ultimate_cluster)
+)[1] * 100 ,
+varpoord_rmpg_calculation$all_result$se))
For additional usage examples of svyrmpg
, type ?convey::svyrmpg
in the R console.
diff --git a/docs/3.5-median-income-below-the-at-risk-of-poverty-threshold-svypoormed.html b/docs/3.5-median-income-below-the-at-risk-of-poverty-threshold-svypoormed.html
index 30b2bdd..0c8b122 100644
--- a/docs/3.5-median-income-below-the-at-risk-of-poverty-threshold-svypoormed.html
+++ b/docs/3.5-median-income-below-the-at-risk-of-poverty-threshold-svypoormed.html
@@ -265,11 +265,11 @@
3.5 Median Income Below the At Risk of Poverty Threshold (svypoormed)
-✔️ median income among those below the threshold
-✔️ useful for understanding the depth of poverty
-✔️ related to the RMPG
-❌ not very common outside the EU
-❌ not immediately interpretable in terms of the poverty gap
+✔️ median income among those below the threshold
+✔️ useful for understanding the depth of poverty
+✔️ related to the RMPG
+❌ not very common outside the EU
+❌ not immediately interpretable in terms of the poverty gap
The median income below the at-risk-of-poverty threshold (POORMED) is the median income of people whose income falls below the ARPT:
\[
poormed = median\{y_i; y_i< arpt\}
@@ -280,145 +280,145 @@ 3.5 Median Income Below the At Ri
3.5.1 Replication Example
The R vardpoor
package (Breidaks, Liberts, and Ivanova 2016Breidaks, Juris, Martins Liberts, and Santa Ivanova. 2016. “Vardpoor: Estimation of Indicators on Social Exclusion and Poverty and Its Linearization, Variance Estimation.” Riga, Latvia: CSB.), created by researchers at the Central Statistical Bureau of Latvia, includes a POORMED coefficient calculation using the ultimate cluster method. The example below reproduces those statistics.
Load and prepare the same data set:
-# load the convey package
-library(convey)
-
-# load the survey library
-library(survey)
-
-# load the vardpoor library
-library(vardpoor)
-
-# load the laeken library
-library(laeken)
-
-# load the synthetic EU statistics on income & living conditions
-data(eusilc)
-
-# make all column names lowercase
-names(eusilc) <- tolower(names(eusilc))
-
-# add a column with the row number
-dati <- data.table::data.table(IDd = 1:nrow(eusilc), eusilc)
-
-# calculate the poormed coefficient
-# using the R vardpoor library
-varpoord_poormed_calculation <-
- varpoord(
- # analysis variable
- Y = "eqincome",
-
- # weights variable
- w_final = "rb050",
-
- # row number variable
- ID_level1 = "IDd",
-
- # row number variable
- ID_level2 = "IDd",
-
- # strata variable
- H = "db040",
-
- N_h = NULL ,
-
- # clustering variable
- PSU = "rb030",
-
- # data.table
- dataset = dati,
-
- # poormed coefficient function
- type = "linpoormed",
-
- # get linearized variable
- outp_lin = TRUE
-
- )
-
-
-
-# construct a survey.design
-# using our recommended setup
-des_eusilc <-
- svydesign(
- ids = ~ rb030 ,
- strata = ~ db040 ,
- weights = ~ rb050 ,
- data = eusilc
- )
-
-# immediately run the convey_prep function on it
-des_eusilc <- convey_prep(des_eusilc)
-
-# coefficients do match
-varpoord_poormed_calculation$all_result$value
+# load the convey package
+library(convey)
+
+# load the survey library
+library(survey)
+
+# load the vardpoor library
+library(vardpoor)
+
+# load the laeken library
+library(laeken)
+
+# load the synthetic EU statistics on income & living conditions
+data(eusilc)
+
+# make all column names lowercase
+names(eusilc) <- tolower(names(eusilc))
+
+# add a column with the row number
+dati <- data.table::data.table(IDd = 1:nrow(eusilc), eusilc)
+
+# calculate the poormed coefficient
+# using the R vardpoor library
+varpoord_poormed_calculation <-
+ varpoord(
+ # analysis variable
+ Y = "eqincome",
+
+ # weights variable
+ w_final = "rb050",
+
+ # row number variable
+ ID_level1 = "IDd",
+
+ # row number variable
+ ID_level2 = "IDd",
+
+ # strata variable
+ H = "db040",
+
+ N_h = NULL ,
+
+ # clustering variable
+ PSU = "rb030",
+
+ # data.table
+ dataset = dati,
+
+ # poormed coefficient function
+ type = "linpoormed",
+
+ # get linearized variable
+ outp_lin = TRUE
+
+ )
+
+
+
+# construct a survey.design
+# using our recommended setup
+des_eusilc <-
+ svydesign(
+ ids = ~ rb030 ,
+ strata = ~ db040 ,
+ weights = ~ rb050 ,
+ data = eusilc
+ )
+
+# immediately run the convey_prep function on it
+des_eusilc <- convey_prep(des_eusilc)
+
+# coefficients do match
+varpoord_poormed_calculation$all_result$value
## [1] 8803.735
-
+
## eqincome
## 8803.735
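The matching convey estimate printed above was presumably produced with:
# sketch (an assumption based on the output above)
coef(svypoormed(~ eqincome , des_eusilc))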
-# linearized variables do match
-# vardpoor
-lin_poormed_varpoord <-
- varpoord_poormed_calculation$lin_out$lin_poormed
-# convey
-lin_poormed_convey <-
- attr(svypoormed( ~ eqincome , des_eusilc), "lin")
-
-# check equality
-all.equal(lin_poormed_varpoord, lin_poormed_convey)
+# linearized variables do match
+# vardpoor
+lin_poormed_varpoord <-
+ varpoord_poormed_calculation$lin_out$lin_poormed
+# convey
+lin_poormed_convey <-
+ attr(svypoormed( ~ eqincome , des_eusilc), "lin")
+
+# check equality
+all.equal(lin_poormed_varpoord, lin_poormed_convey)
## [1] "names for current but not for target"
-
+
## eqincome
## eqincome 5311.47
-
+
## [1] 5302.086
-
+
## [1] 72.81542
-
+
## eqincome
## eqincome 72.87983
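These four outputs compare the variance and standard-error estimates across the two packages; the hidden chunks were presumably:
# sketch of the hidden chunks (an assumption based on the outputs above)
# convey variance, then vardpoor variance
attr(svypoormed(~ eqincome , des_eusilc) , 'var')
varpoord_poormed_calculation$all_result$var
# vardpoor standard error, then convey standard error
varpoord_poormed_calculation$all_result$se
SE(svypoormed(~ eqincome , des_eusilc))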
The variance estimate is computed by using the approximation defined in 2, while the linearized variable \(z\) is defined by 2.1. The functions convey::svypoormed
and vardpoor::linpoormed
produce the same linearized variable \(z\).
However, the measures of uncertainty do not line up, because library(vardpoor)
defaults to an ultimate cluster method that can be replicated with an alternative setup of the survey.design
object.
-# within each strata, sum up the weights
-cluster_sums <-
- aggregate(eusilc$rb050 , list(eusilc$db040) , sum)
-
-# name the within-strata sums of weights the `cluster_sum`
-names(cluster_sums) <- c("db040" , "cluster_sum")
-
-# merge this column back onto the data.frame
-eusilc <- merge(eusilc , cluster_sums)
-
-# construct a survey.design
-# with the fpc using the cluster sum
-des_eusilc_ultimate_cluster <-
- svydesign(
- ids = ~ rb030 ,
- strata = ~ db040 ,
- weights = ~ rb050 ,
- data = eusilc ,
- fpc = ~ cluster_sum
- )
-
-# again, immediately run the convey_prep function on the `survey.design`
-des_eusilc_ultimate_cluster <-
- convey_prep(des_eusilc_ultimate_cluster)
-
-
-# matches
-stopifnot(all.equal(
- attr(svypoormed(~ eqincome , des_eusilc_ultimate_cluster) , 'var')[1],
- varpoord_poormed_calculation$all_result$var
-))
-
-# matches
-stopifnot(all.equal(
- SE(svypoormed(~ eqincome , des_eusilc_ultimate_cluster))[1],
- varpoord_poormed_calculation$all_result$se
-))
+# within each strata, sum up the weights
+cluster_sums <-
+ aggregate(eusilc$rb050 , list(eusilc$db040) , sum)
+
+# name the within-strata sums of weights the `cluster_sum`
+names(cluster_sums) <- c("db040" , "cluster_sum")
+
+# merge this column back onto the data.frame
+eusilc <- merge(eusilc , cluster_sums)
+
+# construct a survey.design
+# with the fpc using the cluster sum
+des_eusilc_ultimate_cluster <-
+ svydesign(
+ ids = ~ rb030 ,
+ strata = ~ db040 ,
+ weights = ~ rb050 ,
+ data = eusilc ,
+ fpc = ~ cluster_sum
+ )
+
+# again, immediately run the convey_prep function on the `survey.design`
+des_eusilc_ultimate_cluster <-
+ convey_prep(des_eusilc_ultimate_cluster)
+
+
+# matches
+stopifnot(all.equal(
+ attr(svypoormed(~ eqincome , des_eusilc_ultimate_cluster) , 'var')[1],
+ varpoord_poormed_calculation$all_result$var
+))
+
+# matches
+stopifnot(all.equal(
+ SE(svypoormed(~ eqincome , des_eusilc_ultimate_cluster))[1],
+ varpoord_poormed_calculation$all_result$se
+))
For additional usage examples of svypoormed
, type ?convey::svypoormed
in the R console.
diff --git a/docs/3.6-foster-greer-thorbecke-class-svyfgt-svyfgtdec.html b/docs/3.6-foster-greer-thorbecke-class-svyfgt-svyfgtdec.html
index 55d7d0a..b8f17cb 100644
--- a/docs/3.6-foster-greer-thorbecke-class-svyfgt-svyfgtdec.html
+++ b/docs/3.6-foster-greer-thorbecke-class-svyfgt-svyfgtdec.html
@@ -265,13 +265,13 @@
3.6 Foster-Greer-Thorbecke class (svyfgt, svyfgtdec)
-✔️ widely used because it encompasses interpretable measures
-✔️ allows an arbitrary poverty threshold
-✔️ can incorporate intensity and inequality among the poor
-✔️ when `g >= 2`, the measure can be decomposed into interpretable measures of the extension, intensity, and inequality of poverty
-❌ scales depend on `g`
-❌ becomes increasingly difficult to interpret as the `g` parameter grows
-❌ no component that allows for time to exit poverty, unlike the Watts index
+✔️ widely used because it encompasses interpretable measures
+✔️ allows an arbitrary poverty threshold
+✔️ can incorporate intensity and inequality among the poor
+✔️ when `g >= 2`, the measure can be decomposed into interpretable measures of the extension, intensity, and inequality of poverty
+❌ scales depend on `g`
+❌ becomes increasingly difficult to interpret as the `g` parameter grows
+❌ no component that allows for time to exit poverty, unlike the Watts index
Foster, Greer, and Thorbecke (1984Foster, James, Joel Greer, and Erik Thorbecke. 1984. “A Class of Decomposable Poverty Measures.” Econometrica 52 (3): 761–66. http://www.jstor.org/stable/1913475.) proposed a family of indicators to measure poverty. This class of \(FGT\) measures can be defined as
\[
p=\frac{1}{N}\sum_{k\in U}h(y_{k},\theta ),
@@ -300,23 +300,23 @@ 3.6 Foster-Greer-Thorbecke class
The quantile and the mean involved in the definition of the threshold are estimated for the whole population. When \(\gamma=0\) and \(\theta= .6*MED\), this measure is equal to the indicator ARPR computed by the function svyarpr
. The linearization of the FGT(0) is presented in Berger and Skinner (2003Berger, Yves G., and Chris J. Skinner. 2003. “Variance Estimation for a Low Income Proportion.” Journal of the Royal Statistical Society: Series C (Applied Statistics) 52 (4): 457–68. https://doi.org/10.1111/1467-9876.00417.).
Next, we give some examples of the function svyfgt
to estimate the values of the FGT poverty index.
Consider first the case \(\gamma=0\) with the poverty threshold fixed at \(10000\). The headcount ratio (FGT(0)) is
-
+
fgt0 SE
eqincome 0.11444 0.0027
The poverty gap ratio FGT(1) (\(\gamma=1\)) for the poverty threshold fixed at the same value is
-
+
fgt1 SE
eqincome 0.032085 0.0011
To estimate the FGT(0) with the poverty threshold fixed at \(0.6* MED\), we first fix the argument type_thresh="relq"
and then use the default values for percent
and order
:
-
+
fgt0 SE
eqincome 0.14444 0.0028
This matches the estimate obtained by:
-
+
arpr SE
eqincome 0.14444 0.0028
To estimate the poverty gap ratio with the poverty threshold equal to \(0.6*MEAN\), we use:
-
+
fgt1 SE
eqincome 0.051187 0.0011
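The chunks behind the five outputs above are not rendered, but calls along the following lines reproduce them -- a sketch, assuming the convey_prep()-prepared des_eusilc design used throughout these EU-SILC examples:
# headcount ratio, threshold fixed at 10000
svyfgt(~ eqincome , des_eusilc , g = 0 , abs_thresh = 10000)
# poverty gap ratio, same fixed threshold
svyfgt(~ eqincome , des_eusilc , g = 1 , abs_thresh = 10000)
# headcount ratio, threshold at 0.6*MED (percent and order left at their defaults)
svyfgt(~ eqincome , des_eusilc , g = 0 , type_thresh = "relq")
# the matching at-risk-of-poverty rate
svyarpr(~ eqincome , des_eusilc)
# poverty gap ratio, threshold at 0.6*MEAN
svyfgt(~ eqincome , des_eusilc , g = 1 , type_thresh = "relm")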
@@ -325,59 +325,59 @@ 3.6.1 Replication Example
In July 2006, Jenkins (2008Jenkins, Stephen. 2008. “Estimation and Interpretation of Measures of Inequality, Poverty, and Social Welfare Using Stata.” North American Stata Users' Group Meetings 2006. Stata Users Group. http://EconPapers.repec.org/RePEc:boc:asug06:16.) presented at the North American Stata Users’ Group Meetings on estimating measures of inequality, poverty, and social welfare with stata. The example below reproduces the poverty statistics from that presentation.
In order to match the presentation’s results using the svyfgt
function from the convey
library, the poverty threshold was considered absolute despite being directly estimated from the survey sample. This effectively treats the variance of the estimated poverty threshold as zero; svyfgt
does not account for the uncertainty of the poverty threshold when the level has been stated as absolute with the abs_thresh=
parameter. In general, we would instead recommend using either relq
or relm
in the type_thresh=
parameter in order to account for the added uncertainty of the poverty threshold calculation. This example serves only to show that svyfgt
behaves properly as compared to other software.
Load and prepare the same data set:
-# load the convey package
-library(convey)
-
-# load the survey library
-library(survey)
-
-# load the foreign library
-library(foreign)
-
-# create a temporary file on the local disk
-tf <- tempfile()
-
-# store the location of the presentation file
-presentation_zip <-
- "https://web.archive.org/web/20150928053959/http://repec.org/nasug2006/nasug2006_jenkins.zip"
-
-# download jenkins' presentation to the temporary file
-download.file(presentation_zip , tf , mode = 'wb')
-
-# unzip the contents of the archive
-presentation_files <- unzip(tf , exdir = tempdir())
-
-# load the institute for fiscal studies' 1981, 1985, and 1991 data.frame objects
-x81 <-
- read.dta(grep("ifs81" , presentation_files , value = TRUE))
-x85 <-
- read.dta(grep("ifs85" , presentation_files , value = TRUE))
-x91 <-
- read.dta(grep("ifs91" , presentation_files , value = TRUE))
-
-# NOTE: we recommend using ?convey::svyarpt rather than this unweighted calculation #
-
-# calculate 60% of the unweighted median income in 1981
-unwtd_arpt81 <- quantile(x81$eybhc0 , 0.5) * .6
-
-# calculate 60% of the unweighted median income in 1985
-unwtd_arpt85 <- quantile(x85$eybhc0 , 0.5) * .6
-
-# calculate 60% of the unweighted median income in 1991
-unwtd_arpt91 <- quantile(x91$eybhc0 , 0.5) * .6
-
-# stack each of these three years of data into a single data.frame
-x <- rbind(x81 , x85 , x91)
+# load the convey package
+library(convey)
+
+# load the survey library
+library(survey)
+
+# load the foreign library
+library(foreign)
+
+# create a temporary file on the local disk
+tf <- tempfile()
+
+# store the location of the presentation file
+presentation_zip <-
+ "https://web.archive.org/web/20150928053959/http://repec.org/nasug2006/nasug2006_jenkins.zip"
+
+# download jenkins' presentation to the temporary file
+download.file(presentation_zip , tf , mode = 'wb')
+
+# unzip the contents of the archive
+presentation_files <- unzip(tf , exdir = tempdir())
+
+# load the institute for fiscal studies' 1981, 1985, and 1991 data.frame objects
+x81 <-
+ read.dta(grep("ifs81" , presentation_files , value = TRUE))
+x85 <-
+ read.dta(grep("ifs85" , presentation_files , value = TRUE))
+x91 <-
+ read.dta(grep("ifs91" , presentation_files , value = TRUE))
+
+# NOTE: we recommend using ?convey::svyarpt rather than this unweighted calculation #
+
+# calculate 60% of the unweighted median income in 1981
+unwtd_arpt81 <- quantile(x81$eybhc0 , 0.5) * .6
+
+# calculate 60% of the unweighted median income in 1985
+unwtd_arpt85 <- quantile(x85$eybhc0 , 0.5) * .6
+
+# calculate 60% of the unweighted median income in 1991
+unwtd_arpt91 <- quantile(x91$eybhc0 , 0.5) * .6
+
+# stack each of these three years of data into a single data.frame
+x <- rbind(x81 , x85 , x91)
Replicate the author’s survey design statement from stata code..
. ge poor = (year==1981)*(x < $z_81)+(year==1985)*(x < $z_85)+(year==1991)*(x < $z_91)
. * account for clustering within HHs
. svyset hrn [pweight = wgt]
.. into R code:
-# initiate a linearized survey design object
-y <- svydesign( ~ hrn , data = x , weights = ~ wgt)
-
-# immediately run the `convey_prep` function on the survey design
-z <- convey_prep(y)
+# initiate a linearized survey design object
+y <- svydesign( ~ hrn , data = x , weights = ~ wgt)
+
+# immediately run the `convey_prep` function on the survey design
+z <- convey_prep(y)
Replicate the author’s headcount ratio results with stata..
. svy: mean poor if year == 1981
(running mean on estimation sample)
@@ -427,52 +427,52 @@ 3.6.1 Replication Example
poor | .2021312 .0062077 .1899615 .2143009
--------------------------------------------------------------
..using R code:
-headcount_81 <-
- svyfgt( ~ eybhc0 ,
- subset(z , year == 1981) ,
- g = 0 ,
- abs_thresh = unwtd_arpt81)
-
-stopifnot(round(coef(headcount_81) , 5) == .14101)
-stopifnot(round(SE(headcount_81) , 5) == .00449)
-
-headcount_81_ci <-
- confint(headcount_81 , df = degf(subset(z , year == 1981)))
-
-stopifnot(round(headcount_81_ci[1] , 5) == .13222)
-stopifnot(round(headcount_81_ci[2] , 5) == .14981)
-
-headcount_85 <-
- svyfgt(~ eybhc0 ,
- subset(z , year == 1985) ,
- g = 0 ,
- abs_thresh = unwtd_arpt85)
-
-
-stopifnot(round(coef(headcount_85) , 5) == .13764)
-stopifnot(round(SE(headcount_85) , 5) == .00465)
-
-headcount_85_ci <-
- confint(headcount_85 , df = degf(subset(z , year == 1985)))
-
-stopifnot(round(headcount_85_ci[1] , 5) == .12852)
-stopifnot(round(headcount_85_ci[2] , 5) == .14677)
-
-headcount_91 <-
- svyfgt(~ eybhc0 ,
- subset(z , year == 1991) ,
- g = 0 ,
- abs_thresh = unwtd_arpt91)
-
-
-stopifnot(round(coef(headcount_91) , 5) == .20213)
-stopifnot(round(SE(headcount_91) , 5) == .00621)
-
-headcount_91_ci <-
- confint(headcount_91 , df = degf(subset(z , year == 1991)))
-
-stopifnot(round(headcount_91_ci[1] , 5) == .18996)
-stopifnot(round(headcount_91_ci[2] , 5) == .21430)
+headcount_81 <-
+ svyfgt( ~ eybhc0 ,
+ subset(z , year == 1981) ,
+ g = 0 ,
+ abs_thresh = unwtd_arpt81)
+
+stopifnot(round(coef(headcount_81) , 5) == .14101)
+stopifnot(round(SE(headcount_81) , 5) == .00449)
+
+headcount_81_ci <-
+ confint(headcount_81 , df = degf(subset(z , year == 1981)))
+
+stopifnot(round(headcount_81_ci[1] , 5) == .13222)
+stopifnot(round(headcount_81_ci[2] , 5) == .14981)
+
+headcount_85 <-
+ svyfgt(~ eybhc0 ,
+ subset(z , year == 1985) ,
+ g = 0 ,
+ abs_thresh = unwtd_arpt85)
+
+
+stopifnot(round(coef(headcount_85) , 5) == .13764)
+stopifnot(round(SE(headcount_85) , 5) == .00465)
+
+headcount_85_ci <-
+ confint(headcount_85 , df = degf(subset(z , year == 1985)))
+
+stopifnot(round(headcount_85_ci[1] , 5) == .12852)
+stopifnot(round(headcount_85_ci[2] , 5) == .14677)
+
+headcount_91 <-
+ svyfgt(~ eybhc0 ,
+ subset(z , year == 1991) ,
+ g = 0 ,
+ abs_thresh = unwtd_arpt91)
+
+
+stopifnot(round(coef(headcount_91) , 5) == .20213)
+stopifnot(round(SE(headcount_91) , 5) == .00621)
+
+headcount_91_ci <-
+ confint(headcount_91 , df = degf(subset(z , year == 1991)))
+
+stopifnot(round(headcount_91_ci[1] , 5) == .18996)
+stopifnot(round(headcount_91_ci[2] , 5) == .21430)
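As discussed above, abs_thresh= treats the threshold as fixed. For contrast, the recommended relative-threshold call would also propagate the uncertainty of the threshold estimate into the standard error. A hypothetical sketch -- not part of Jenkins' replication, so its figures will differ from his:
# hypothetical alternative: threshold at 60% of the weighted median,
# estimated from the sample itself
svyfgt(~ eybhc0 , subset(z , year == 1981) , g = 0 , type_thresh = "relq" , percent = 0.6)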
Confirm this replication applies for the normalized poverty gap as well, comparing stata code..
. ge ngap = poor*($z_81- x)/$z_81 if year == 1981
@@ -492,199 +492,199 @@ 3.6.1 Replication Example
ngap | .0271577 .0013502 .0245109 .0298044
--------------------------------------------------------------
..to R code:
-norm_pov_81 <-
- svyfgt( ~ eybhc0 ,
- subset(z , year == 1981) ,
- g = 1 ,
- abs_thresh = unwtd_arpt81)
-
-
-stopifnot(round(coef(norm_pov_81) , 5) == .02716)
-stopifnot(round(SE(norm_pov_81) , 5) == .00135)
-
-norm_pov_81_ci <-
- confint(norm_pov_81 , df = degf(subset(z , year == 1981)))
-
-stopifnot(round(norm_pov_81_ci[1] , 5) == .02451)
-stopifnot(round(norm_pov_81_ci[2] , 5) == .02980)
+norm_pov_81 <-
+ svyfgt( ~ eybhc0 ,
+ subset(z , year == 1981) ,
+ g = 1 ,
+ abs_thresh = unwtd_arpt81)
+
+
+stopifnot(round(coef(norm_pov_81) , 5) == .02716)
+stopifnot(round(SE(norm_pov_81) , 5) == .00135)
+
+norm_pov_81_ci <-
+ confint(norm_pov_81 , df = degf(subset(z , year == 1981)))
+
+stopifnot(round(norm_pov_81_ci[1] , 5) == .02451)
+stopifnot(round(norm_pov_81_ci[2] , 5) == .02980)
3.6.2 Monte Carlo Simulation
To provide an example of our code in use, we proceed with a Monte Carlo experiment.
Using the eusilcP
data from the simPop
package (Templ et al. 2017Templ, Matthias, Bernhard Meindl, Alexander Kowarik, and Olivier Dupriez. 2017. “Simulation of Synthetic Complex Data: The R Package simPop.” Journal of Statistical Software 79 (10): 1–38. https://doi.org/10.18637/jss.v079.i10.), we can compute the actual values of the FGT components for that population:
-# load libraries
-library(sampling)
-library(survey)
-library(convey)
-library(parallel)
-
-# load pseudo population data
-data("eusilcP" , package = "simPop")
-
-# compute population value of the FGT(2) index decomposition
-inc.all <- eusilcP$eqIncome
-gfun <- function(y , thresh)
- (thresh - y) / thresh
-hfun <-
- function(y , thresh , g)
- (((thresh - y) / thresh) ^ g) * (y <= thresh)
-fgt2 <- mean(hfun(inc.all , 10000 , 2) , na.rm = TRUE)
-fgt1 <- mean(hfun(inc.all , 10000 , 1) , na.rm = TRUE)
-fgt0 <- mean(hfun(inc.all , 10000 , 0) , na.rm = TRUE)
-igr.fgt <-
- mean(hfun(inc.all , 10000 , 1)[inc.all <= 10000] , na.rm = TRUE)
-gei.poor <-
- convey:::CalcGEI(gfun(inc.all , 10000) , ifelse(inc.all < 10000 , 1 , 0) , 2)
-theta.pop <-
- c(
- "fgt2" = fgt2 ,
- "fgt0" = fgt0 ,
- "fgt1" = fgt1 ,
- "igr" = igr.fgt ,
- "gei(poor;epsilon=2)" = gei.poor
- )
-theta.pop
+# load libraries
+library(sampling)
+library(survey)
+library(convey)
+library(parallel)
+
+# load pseudo population data
+data("eusilcP" , package = "simPop")
+
+# compute population value of the FGT(2) index decomposition
+inc.all <- eusilcP$eqIncome
+gfun <- function(y , thresh)
+ (thresh - y) / thresh
+hfun <-
+ function(y , thresh , g)
+ (((thresh - y) / thresh) ^ g) * (y <= thresh)
+fgt2 <- mean(hfun(inc.all , 10000 , 2) , na.rm = TRUE)
+fgt1 <- mean(hfun(inc.all , 10000 , 1) , na.rm = TRUE)
+fgt0 <- mean(hfun(inc.all , 10000 , 0) , na.rm = TRUE)
+igr.fgt <-
+ mean(hfun(inc.all , 10000 , 1)[inc.all <= 10000] , na.rm = TRUE)
+gei.poor <-
+ convey:::CalcGEI(gfun(inc.all , 10000) , ifelse(inc.all < 10000 , 1 , 0) , 2)
+theta.pop <-
+ c(
+ "fgt2" = fgt2 ,
+ "fgt0" = fgt0 ,
+ "fgt1" = fgt1 ,
+ "igr" = igr.fgt ,
+ "gei(poor;epsilon=2)" = gei.poor
+ )
+theta.pop
## fgt2 fgt0 fgt1 igr
## 0.01571192 0.11412691 0.03259544 0.28560699
## gei(poor;epsilon=2)
## 0.34386588
Now, to study the distribution of the estimator under a particular sampling design, we select 5000 samples under one-stage cluster sampling of 1000 households using the cluster
function from the sampling
package (Tillé and Matei 2021Tillé, Yves, and Alina Matei. 2021. Sampling: Survey Sampling. https://CRAN.R-project.org/package=sampling.), and use the svyfgtdec
function to estimate the FGT components for each of those samples:
-# define the number of monte carlo replicates
-mc.rep <- 5000L
-
-
-
-# simulation function
-fgtdec_sim_fun <- function(this.iter) {
- set.seed(this.iter)
-
- library(sampling)
- library(survey)
- library(convey)
-
- # load pseudo population data
- data("eusilcP" , package = "simPop")
-
-
- # compute size-like variable for PPS sampling design
- eusilcP$aux <-
- log(ifelse(eusilcP$eqIncome >= 1000 , eusilcP$eqIncome , 1000))
-
-
- # select sample
- tt <-
- sampling::cluster(
- data = eusilcP[sample.int(nrow(eusilcP) , nrow(eusilcP) , replace = FALSE) , ] ,
- clustername = "hid" ,
- size = 1000L ,
- method = "systematic" ,
- pik = eusilcP$aux
- )
-
- # collect data
- this.sample <- getdata(eusilcP , tt)
-
- # create survey design object
- this.desobj <-
- svydesign(
- ids = ~ hid ,
- probs = ~ Prob ,
- data = this.sample ,
- nest = FALSE
- )
-
- # prepare for convey functions
- this.desobj <- convey_prep(this.desobj)
-
- # compute estimates
- svyfgtdec( ~ eqIncome , this.desobj , abs_thresh = 10000 , g = 2)
-
-}
-
-# run replications
-cl <- makeCluster(detectCores() - 1)
-
-fgtdec.estimate.list <-
- clusterApply(cl, seq_len(mc.rep) , fgtdec_sim_fun)
-
-stopCluster(cl)
+# define the number of monte carlo replicates
+mc.rep <- 5000L
+
+
+
+# simulation function
+fgtdec_sim_fun <- function(this.iter) {
+ set.seed(this.iter)
+
+ library(sampling)
+ library(survey)
+ library(convey)
+
+ # load pseudo population data
+ data("eusilcP" , package = "simPop")
+
+
+ # compute size-like variable for PPS sampling design
+ eusilcP$aux <-
+ log(ifelse(eusilcP$eqIncome >= 1000 , eusilcP$eqIncome , 1000))
+
+
+ # select sample
+ tt <-
+ sampling::cluster(
+ data = eusilcP[sample.int(nrow(eusilcP) , nrow(eusilcP) , replace = FALSE) , ] ,
+ clustername = "hid" ,
+ size = 1000L ,
+ method = "systematic" ,
+ pik = eusilcP$aux
+ )
+
+ # collect data
+ this.sample <- getdata(eusilcP , tt)
+
+ # create survey design object
+ this.desobj <-
+ svydesign(
+ ids = ~ hid ,
+ probs = ~ Prob ,
+ data = this.sample ,
+ nest = FALSE
+ )
+
+ # prepare for convey functions
+ this.desobj <- convey_prep(this.desobj)
+
+ # compute estimates
+ svyfgtdec( ~ eqIncome , this.desobj , abs_thresh = 10000 , g = 2)
+
+}
+
+# run replications
+cl <- makeCluster(detectCores() - 1)
+
+fgtdec.estimate.list <-
+ clusterApply(cl, seq_len(mc.rep) , fgtdec_sim_fun)
+
+stopCluster(cl)
The PRB of each component is estimated using the code below. Notice that the PRBs are relatively small, with absolute values below 1%; the largest bias occurs in the GEI index component.
-
+
## fgt2 fgt0 fgt1 igr
## 0.01571192 0.11412691 0.03259544 0.28560699
## gei(poor;epsilon=2)
## 0.34386588
-# estimate the expected values of the components estimators
-# using the average of the estimates
-(theta.exp <- rowMeans(sapply(fgtdec.estimate.list , coef)))
+# estimate the expected values of the components estimators
+# using the average of the estimates
+(theta.exp <- rowMeans(sapply(fgtdec.estimate.list , coef)))
## fgt2 fgt0 fgt1 igr
## 0.01577844 0.11432776 0.03269125 0.28591312
## gei(poor;epsilon=2)
## 0.34234370
-# estimate the percentage relative bias
-(percentage_relative_bias <- 100 * (theta.exp / theta.pop - 1))
+# estimate the percentage relative bias
+(percentage_relative_bias <- 100 * (theta.exp / theta.pop - 1))
## fgt2 fgt0 fgt1 igr
## 0.4233956 0.1759893 0.2939132 0.1071867
## gei(poor;epsilon=2)
## -0.4426679
-
+
For the variance estimators, we estimate the PRB using the code below.
Note that the bias is still relatively small, with absolute values of the PRB below 5%.
-# estimate the variance of the components estimators
-# using the empirical variance of the estimates
-(vartheta.popest <-
- diag(var(t(
- sapply(fgtdec.estimate.list , coef)
- ))))
+# estimate the variance of the components estimators
+# using the empirical variance of the estimates
+(vartheta.popest <-
+ diag(var(t(
+ sapply(fgtdec.estimate.list , coef)
+ ))))
## fgt2 fgt0 fgt1 igr
## 6.447720e-06 1.015546e-04 1.468049e-05 4.753320e-04
## gei(poor;epsilon=2)
## 1.841360e-03
-# estimate the expected value of the Watts index variance estimator
-# using the (estimated) expected value of the variance estimates
-(vartheta.exp <-
- rowMeans(sapply(fgtdec.estimate.list , function(z)
- diag(vcov(
- z
- )))))
+# estimate the expected value of the Watts index variance estimator
+# using the (estimated) expected value of the variance estimates
+(vartheta.exp <-
+ rowMeans(sapply(fgtdec.estimate.list , function(z)
+ diag(vcov(
+ z
+ )))))
## fgt2 fgt0 fgt1 igr
## 6.462904e-06 1.014859e-04 1.473047e-05 4.923412e-04
## gei(poor;epsilon=2)
## 1.911158e-03
-# estimate the percentage relative bias of the variance estimators
-(percentage_relative_bias_variance <- 100 * (vartheta.exp / vartheta.popest - 1))
+# estimate the percentage relative bias of the variance estimators
+(percentage_relative_bias_variance <- 100 * (vartheta.exp / vartheta.popest - 1))
## fgt2 fgt0 fgt1 igr
## 0.23548684 -0.06759586 0.34047392 3.57838870
## gei(poor;epsilon=2)
## 3.79052600
-
+
Regarding the MSE of the decomposition estimator, the squared bias accounts for less than 0.13% of the MSE for every component.
This means that, with a good estimate of the variance, we can obtain a good approximation of the MSE.
-# estimate MSE
-theta.bias2 <- (theta.exp - theta.pop) ^ 2
-(theta.mse <- theta.bias2 + vartheta.popest)
+# estimate MSE
+theta.bias2 <- (theta.exp - theta.pop) ^ 2
+(theta.mse <- theta.bias2 + vartheta.popest)
## fgt2 fgt0 fgt1 igr
## 6.452146e-06 1.015949e-04 1.468967e-05 4.754257e-04
## gei(poor;epsilon=2)
## 1.843678e-03
-
+
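The share of the squared bias in the MSE, shown below in percent, was presumably computed with a chunk along these lines (the same computation appears explicitly in the Watts chapter):
# sketch (an assumption based on the output below)
(squared_bias_over_mse <- 100 * (theta.bias2 / theta.mse))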
## fgt2 fgt0 fgt1 igr
## 0.06858780 0.03970787 0.06247986 0.01971227
## gei(poor;epsilon=2)
## 0.12567511
-
+
The CIs based on the normal approximation work reasonably well for all components. The code below shows that the coverage rates are close to the 95% nominal coverage rate.
-# estimate confidence intervals of the Watts index
-# for each of the samples
-est.coverage <-
- sapply(fgtdec.estimate.list, function(this.stat)
- confint(this.stat)[, 1] <= theta.pop &
- confint(this.stat)[, 2] >= theta.pop)
-
-# evaluate empirical coverage
-stopifnot( abs( rowMeans(est.coverage) - 0.95 ) < 0.015 )
+# estimate confidence intervals of the Watts index
+# for each of the samples
+est.coverage <-
+ sapply(fgtdec.estimate.list, function(this.stat)
+ confint(this.stat)[, 1] <= theta.pop &
+ confint(this.stat)[, 2] >= theta.pop)
+
+# evaluate empirical coverage
+stopifnot( abs( rowMeans(est.coverage) - 0.95 ) < 0.015 )
For additional usage examples of svyfgt
and svyfgtdec
, type ?convey::svyfgt
or ?convey::svyfgtdec
in the R console.
diff --git a/docs/3.7-watts-poverty-measure-svywatts-svywattsdec.html b/docs/3.7-watts-poverty-measure-svywatts-svywattsdec.html
index 3e7e9b4..9598350 100644
--- a/docs/3.7-watts-poverty-measure-svywatts-svywattsdec.html
+++ b/docs/3.7-watts-poverty-measure-svywatts-svywattsdec.html
@@ -265,12 +265,12 @@
3.7 Watts poverty measure (svywatts, svywattsdec)
-✔️ time-to-exit-poverty interpretation: the Watts score divided by a growth rate
-✔️ sensitive to intensity and inequality among the poor
-✔️ can be decomposed into interpretable measures
-❌ not defined for individuals with zero or negative incomes
-❌ interpretation is not very straightforward
-❌ less common than the FGT measures nowadays
+✔️ time-to-exit-poverty interpretation: the Watts score divided by a growth rate
+✔️ sensitive to intensity and inequality among the poor
+✔️ can be decomposed into interpretable measures
+❌ not defined for individuals with zero or negative incomes
+❌ interpretation is not very straightforward
+❌ less common than the FGT measures nowadays
The measure proposed in Watts (1968Watts, Harold W. 1968. “An Economic Definition of Poverty.” Discussion Papers 5. Institute For Research on Poverty. https://www.irp.wisc.edu/publications/dps/pdfs/dp568.pdf.) satisfies a number of desirable poverty measurement axioms and is known to be one of the first distribution-sensitive poverty measures, as noted by Haughton and Khandker (2009Haughton, Jonathan, and Shahidur Khandker. 2009. Handbook on Poverty and Inequality. World Bank Training Series. World Bank Publications. https://openknowledge.worldbank.org/bitstream/handle/10986/11985/9780821376133.pdf.). It is defined as:
\[
Watts = \frac{1}{N} \sum_{i \in U} \log{ \bigg( \frac{\theta}{y_i} \bigg) } \delta ( y_i \leqslant \theta ).
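As the advantages list above notes, the Watts score divided by a growth rate has a time-to-exit-poverty reading. As a worked illustration (not from the original text): a Watts index of about 0.05 -- the population value computed in the simulation below -- combined with a sustained 2% annual income growth among the poor implies an average time to exit poverty of roughly \(0.05 / 0.02 = 2.5\) years.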
@@ -297,294 +297,294 @@ 3.7 Watts poverty measure (svywat
3.7.1 Monte Carlo Simulation
To provide an example of our code in use, we proceed with a Monte Carlo experiment.
Using the eusilcP
data from the simPop
package (Templ et al. 2017Templ, Matthias, Bernhard Meindl, Alexander Kowarik, and Olivier Dupriez. 2017. “Simulation of Synthetic Complex Data: The R Package simPop.” Journal of Statistical Software 79 (10): 1–38. https://doi.org/10.18637/jss.v079.i10.), we can compute the actual value of the Watts index for that population:
-# load libraries
-library(sampling)
-library(survey)
-library(convey)
-library(parallel)
-
-# load pseudo population data
-data("eusilcP" , package = "simPop")
-
-# compute population value of the Watts index decomposition
-inc.pos <- eusilcP$eqIncome[eusilcP$eqIncome > 0]
-(theta.pop <-
- mean(ifelse(inc.pos <= 10000 , log(10000 / inc.pos) , 0) , na.rm = TRUE))
+# load libraries
+library(sampling)
+library(survey)
+library(convey)
+library(parallel)
+
+# load pseudo population data
+data("eusilcP" , package = "simPop")
+
+# compute population value of the Watts index decomposition
+inc.pos <- eusilcP$eqIncome[eusilcP$eqIncome > 0]
+(theta.pop <-
+ mean(ifelse(inc.pos <= 10000 , log(10000 / inc.pos) , 0) , na.rm = TRUE))
## [1] 0.05025374
Now, to study the distribution of the estimator under a particular sampling design, we select 5000 samples under one-stage cluster sampling of 1000 households using the cluster
function from the sampling
package (Tillé and Matei 2021Tillé, Yves, and Alina Matei. 2021. Sampling: Survey Sampling. https://CRAN.R-project.org/package=sampling.), and use the svywatts
function to estimate the Watts index for each of those samples:
-# define the number of monte carlo replicates
-mc.rep <- 5000L
-
-
-# simulation function
-watts_sim_fun <- function(this.iter) {
- set.seed(this.iter)
-
- library(survey)
- library(convey)
- library(sampling)
-
-
- # load pseudo population data
- data("eusilcP" , package = "simPop")
-
- # compute size-like variable for PPS sampling design
- eusilcP$aux <-
- log(ifelse(eusilcP$eqIncome >= 1000 , eusilcP$eqIncome , 1000))
-
- # select sample
- tt <-
- sampling::cluster(
- data = eusilcP[sample.int(nrow(eusilcP) , nrow(eusilcP) , replace = FALSE) ,] ,
- clustername = "hid" ,
- size = 1000L ,
- method = "systematic" ,
- pik = eusilcP$aux
- )
-
- # collect data
- this.sample <- getdata(eusilcP , tt)
-
- # create survey design object
- this.desobj <-
- svydesign(
- ids = ~ hid ,
- probs = ~ Prob ,
- data = this.sample ,
- nest = FALSE
- )
-
- # prepare for convey functions
- this.desobj <- convey_prep(this.desobj)
-
- # filter positive incomes
- this.desobj <- subset(this.desobj , eqIncome > 0)
-
- # compute estimates
- svywatts( ~ eqIncome , this.desobj , abs_thresh = 10000)
-
-}
-
-# run replications
-cl <- makeCluster(detectCores() - 1)
-
-watts.estimate.list <-
- clusterApply(cl, seq_len(mc.rep) , watts_sim_fun)
-
-stopCluster(cl)
+# define the number of monte carlo replicates
+mc.rep <- 5000L
+
+
+# simulation function
+watts_sim_fun <- function(this.iter) {
+ set.seed(this.iter)
+
+ library(survey)
+ library(convey)
+ library(sampling)
+
+
+ # load pseudo population data
+ data("eusilcP" , package = "simPop")
+
+ # compute size-like variable for PPS sampling design
+ eusilcP$aux <-
+ log(ifelse(eusilcP$eqIncome >= 1000 , eusilcP$eqIncome , 1000))
+
+ # select sample
+ tt <-
+ sampling::cluster(
+ data = eusilcP[sample.int(nrow(eusilcP) , nrow(eusilcP) , replace = FALSE) ,] ,
+ clustername = "hid" ,
+ size = 1000L ,
+ method = "systematic" ,
+ pik = eusilcP$aux
+ )
+
+ # collect data
+ this.sample <- getdata(eusilcP , tt)
+
+ # create survey design object
+ this.desobj <-
+ svydesign(
+ ids = ~ hid ,
+ probs = ~ Prob ,
+ data = this.sample ,
+ nest = FALSE
+ )
+
+ # prepare for convey functions
+ this.desobj <- convey_prep(this.desobj)
+
+ # filter positive incomes
+ this.desobj <- subset(this.desobj , eqIncome > 0)
+
+ # compute estimates
+ svywatts( ~ eqIncome , this.desobj , abs_thresh = 10000)
+
+}
+
+# run replications
+cl <- makeCluster(detectCores() - 1)
+
+watts.estimate.list <-
+ clusterApply(cl, seq_len(mc.rep) , watts_sim_fun)
+
+stopCluster(cl)
Then, we evaluate the Percentage Relative Bias (PRB) of the Watts index estimator. Under this scenario, its PRB is 0.3772%.
-# estimate the expected value of the Watts index estimator
-# using the average of the estimates
-(theta.exp <- mean(sapply(watts.estimate.list , coef)))
+# estimate the expected value of the Watts index estimator
+# using the average of the estimates
+(theta.exp <- mean(sapply(watts.estimate.list , coef)))
## [1] 0.05044329
-# estimate the percentage relative bias
-(percentage_relative_bias <- 100 * (theta.exp / theta.pop - 1) )
+# estimate the percentage relative bias
+(percentage_relative_bias <- 100 * (theta.exp / theta.pop - 1) )
## [1] 0.377195
-
+
For the variance estimator, we have:
-# estimate the variance of the Watts index estimator
-# using the empirical variance of the estimates
-(vartheta.popest <- var(sapply(watts.estimate.list , coef)))
+# estimate the variance of the Watts index estimator
+# using the empirical variance of the estimates
+(vartheta.popest <- var(sapply(watts.estimate.list , coef)))
## [1] 6.141434e-05
-# estimate the expected value of the Watts index variance estimator
-# using the (estimated) expected value of the variance estimates
-(vartheta.exp <- mean(sapply(watts.estimate.list , vcov)))
+# estimate the expected value of the Watts index variance estimator
+# using the (estimated) expected value of the variance estimates
+(vartheta.exp <- mean(sapply(watts.estimate.list , vcov)))
## [1] 6.100902e-05
-# estimate the percentage relative bias of the variance estimator
-(percentage_relative_bias_variance <- 100 * (vartheta.exp / vartheta.popest - 1))
+# estimate the percentage relative bias of the variance estimator
+(percentage_relative_bias_variance <- 100 * (vartheta.exp / vartheta.popest - 1))
## [1] -0.6599717
-
+
Under this scenario, the PRB of the Watts index variance estimator is -0.6600%.
Our simulations show that the Squared Bias of this estimator accounts for less than 0.1% of its Mean Squared Error:
-theta.bias2 <- (theta.exp - theta.pop) ^ 2
-theta.mse <- theta.bias2 + vartheta.popest
-(squared_bias_over_mse <- 100 * (theta.bias2 / theta.mse))
+theta.bias2 <- (theta.exp - theta.pop) ^ 2
+theta.mse <- theta.bias2 + vartheta.popest
+(squared_bias_over_mse <- 100 * (theta.bias2 / theta.mse))
## [1] 0.05847158
-
+
Next, we evaluate the Percentage Coverage Rate (PCR). In theory, under repeated sampling, the estimated 95% CIs should cover the population parameter approximately 95% of the time. We can evaluate that using:
-# estimate confidence intervals of the Watts index
-# for each of the samples
-est.coverage <-
- sapply(watts.estimate.list, function(this.stat)
- confint(this.stat)[, 1] <= theta.pop &
- confint(this.stat)[, 2] >= theta.pop)
-
-# evaluate empirical coverage
-(empirical_coverage <- mean(est.coverage))
+# estimate confidence intervals of the Watts index
+# for each of the samples
+est.coverage <-
+ sapply(watts.estimate.list, function(this.stat)
+ confint(this.stat)[, 1] <= theta.pop &
+ confint(this.stat)[, 2] >= theta.pop)
+
+# evaluate empirical coverage
+(empirical_coverage <- mean(est.coverage))
## [1] 0.9268
-
+
At roughly 92.7%, the empirical coverage is somewhat below, but not too far from, the nominal coverage level of 95%.
For the Watts index decomposition, we start by computing the (true) population values of the components:
-# compute population value of the Watts index decomposition
-inc.pos <- eusilcP$eqIncome[eusilcP$eqIncome > 0]
-wdec1 <-
- mean(ifelse(inc.pos <= 10000 , log(10000 / inc.pos) , 0) , na.rm = TRUE)
-wdec2 <- mean(inc.pos <= 10000 , na.rm = TRUE)
-mu.poor <- mean(inc.pos [inc.pos <= 10000])
-wdec3 <- log(10000 / mu.poor)
-wdec4 <-
- -mean(log(inc.pos[inc.pos <= 10000] / mu.poor) , na.rm = TRUE)
-theta.pop <-
- c(
- "watts" = wdec1 ,
- "fgt0" = wdec2 ,
- "watts pov. gap ratio" = wdec3 ,
- "theil(poor)" = wdec4
- )
-theta.pop
+# compute population value of the Watts index decomposition
+inc.pos <- eusilcP$eqIncome[eusilcP$eqIncome > 0]
+wdec1 <-
+ mean(ifelse(inc.pos <= 10000 , log(10000 / inc.pos) , 0) , na.rm = TRUE)
+wdec2 <- mean(inc.pos <= 10000 , na.rm = TRUE)
+mu.poor <- mean(inc.pos [inc.pos <= 10000])
+wdec3 <- log(10000 / mu.poor)
+wdec4 <-
+ -mean(log(inc.pos[inc.pos <= 10000] / mu.poor) , na.rm = TRUE)
+theta.pop <-
+ c(
+ "watts" = wdec1 ,
+ "fgt0" = wdec2 ,
+ "watts pov. gap ratio" = wdec3 ,
+ "theil(poor)" = wdec4
+ )
+theta.pop
## watts fgt0 watts pov. gap ratio
## 0.05025374 0.11399096 0.33497664
## theil(poor)
## 0.10588056
Then, using the same sampling strategy of the svywatts
, we compute the svywattsdec
for each sample:
-# simulation function
-wattsdec_sim_fun <- function(this.iter) {
-
- set.seed(this.iter)
-
- library(survey)
- library(convey)
- library(sampling)
-
-
- # load pseudo population data
- data("eusilcP" , package = "simPop")
-
- # compute size-like variable for PPS sampling design
- eusilcP$aux <-
- log(ifelse(eusilcP$eqIncome >= 1000 , eusilcP$eqIncome , 1000))
-
- # select sample
- tt <-
- sampling::cluster(
- data = eusilcP[sample.int(nrow(eusilcP) , nrow(eusilcP) , replace = FALSE) , ] ,
- clustername = "hid" ,
- size = 1000L ,
- method = "systematic" ,
- pik = eusilcP$aux
- )
-
- # collect data
- this.sample <- getdata(eusilcP , tt)
-
- # create survey design object
- this.desobj <-
- svydesign(
- ids = ~ hid ,
- probs = ~ Prob ,
- data = this.sample ,
- nest = FALSE
- )
-
- # prepare for convey functions
- this.desobj <- convey_prep(this.desobj)
-
- # filter positive incomes
- this.desobj <- subset(this.desobj , eqIncome > 0)
-
- # compute estimates
- svywattsdec(~ eqIncome , this.desobj , abs_thresh = 10000)
-
-}
-
-# run replications
-cl <- makeCluster(detectCores() - 1)
-
-wattsdec.estimate.list <-
- clusterApply(cl, seq_len(mc.rep) , wattsdec_sim_fun)
-
-stopCluster(cl)
+# simulation function
+wattsdec_sim_fun <- function(this.iter) {
+
+ set.seed(this.iter)
+
+ library(survey)
+ library(convey)
+ library(sampling)
+
+
+ # load pseudo population data
+ data("eusilcP" , package = "simPop")
+
+ # compute size-like variable for PPS sampling design
+ eusilcP$aux <-
+ log(ifelse(eusilcP$eqIncome >= 1000 , eusilcP$eqIncome , 1000))
+
+ # select sample
+ tt <-
+ sampling::cluster(
+ data = eusilcP[sample.int(nrow(eusilcP) , nrow(eusilcP) , replace = FALSE) , ] ,
+ clustername = "hid" ,
+ size = 1000L ,
+ method = "systematic" ,
+ pik = eusilcP$aux
+ )
+
+ # collect data
+ this.sample <- getdata(eusilcP , tt)
+
+ # create survey design object
+ this.desobj <-
+ svydesign(
+ ids = ~ hid ,
+ probs = ~ Prob ,
+ data = this.sample ,
+ nest = FALSE
+ )
+
+ # prepare for convey functions
+ this.desobj <- convey_prep(this.desobj)
+
+ # filter positive incomes
+ this.desobj <- subset(this.desobj , eqIncome > 0)
+
+ # compute estimates
+ svywattsdec(~ eqIncome , this.desobj , abs_thresh = 10000)
+
+}
+
+# run replications
+cl <- makeCluster(detectCores() - 1)
+
+wattsdec.estimate.list <-
+ clusterApply(cl, seq_len(mc.rep) , wattsdec_sim_fun)
+
+stopCluster(cl)
The PRB of each component is estimated using the code below. Notice that the PRBs are relatively small, with absolute values below 1%; the largest bias occurs in the Theil index component.
-
+
## watts fgt0 watts pov. gap ratio
## 0.05025374 0.11399096 0.33497664
## theil(poor)
## 0.10588056
-# estimate the expected values of the components estimators
-# using the average of the estimates
-(theta.exp <- rowMeans(sapply(wattsdec.estimate.list , coef)))
+# estimate the expected values of the components estimators
+# using the average of the estimates
+(theta.exp <- rowMeans(sapply(wattsdec.estimate.list , coef)))
## watts fgt0 watts pov. gap ratio
## 0.05044329 0.11418992 0.33584759
## theil(poor)
## 0.10584416
-# estimate the percentage relative bias
-(percentage_relative_bias <- 100 * (theta.exp / theta.pop - 1))
+# estimate the percentage relative bias
+(percentage_relative_bias <- 100 * (theta.exp / theta.pop - 1))
## watts fgt0 watts pov. gap ratio
## 0.37719501 0.17453750 0.26000223
## theil(poor)
## -0.03437576
-
+
For the variance estimators, we estimate the PRB using the code below.
Note that the bias of the variance estimators is still relatively small: the absolute value of the Watts variance estimator’s PRB is below 1%, and those of all four components’ variance estimators are below 5%.
-# estimate the variance of the components estimators
-# using the empirical variance of the estimates
-(vartheta.popest <-
- diag(var(t(
- sapply(wattsdec.estimate.list , coef)
- ))))
+# estimate the variance of the components estimators
+# using the empirical variance of the estimates
+(vartheta.popest <-
+ diag(var(t(
+ sapply(wattsdec.estimate.list , coef)
+ ))))
## watts fgt0 watts pov. gap ratio
## 6.141434e-05 1.015260e-04 9.241259e-04
## theil(poor)
## 1.108525e-03
-# estimate the expected value of the Watts index variance estimator
-# using the (estimated) expected value of the variance estimates
-(vartheta.exp <-
- rowMeans(sapply(wattsdec.estimate.list , function(z)
- diag(vcov(
- z
- )))))
+# estimate the expected value of the Watts index variance estimator
+# using the (estimated) expected value of the variance estimates
+(vartheta.exp <-
+ rowMeans(sapply(wattsdec.estimate.list , function(z)
+ diag(vcov(
+ z
+ )))))
## watts fgt0 watts pov. gap ratio
## 6.100902e-05 1.013968e-04 9.613831e-04
## theil(poor)
## 1.070018e-03
-# estimate the percentage relative bias of the variance estimators
-(percentage_relative_bias <-
- 100 * (vartheta.exp / vartheta.popest - 1))
+# estimate the percentage relative bias of the variance estimators
+(percentage_relative_bias <-
+ 100 * (vartheta.exp / vartheta.popest - 1))
## watts fgt0 watts pov. gap ratio
## -0.6599717 -0.1273107 4.0316155
## theil(poor)
## -3.4736907
-
+
Regarding the MSE, the squared bias accounts for less than 0.1% of the MSE for every component.
This means that, with a good estimate of the variance, we can obtain a good approximation of the MSE.
-# estimate MSE
-theta.bias2 <- (theta.exp - theta.pop) ^ 2
-(theta.mse <- theta.bias2 + vartheta.popest)
+# estimate MSE
+theta.bias2 <- (theta.exp - theta.pop) ^ 2
+(theta.mse <- theta.bias2 + vartheta.popest)
## watts fgt0 watts pov. gap ratio
## 6.145027e-05 1.015656e-04 9.248844e-04
## theil(poor)
## 1.108526e-03
-
+
## watts fgt0 watts pov. gap ratio
## 0.0584715789 0.0389737000 0.0820154610
## theil(poor)
## 0.0001195063
-
+
However, the CIs based on the normal approximation might not work very well for some components. The code below shows that the coverage rate for the Theil index component differs significantly from the 95% nominal coverage rate.
-# estimate confidence intervals of the Watts index
-# for each of the samples
-est.coverage <-
- sapply(wattsdec.estimate.list, function(this.stat)
- confint(this.stat)[, 1] <= theta.pop &
- confint(this.stat)[, 2] >= theta.pop)
-
-# evaluate empirical coverage
-(empirical_coverage <- rowMeans(est.coverage))
+# estimate confidence intervals of the Watts index
+# for each of the samples
+est.coverage <-
+ sapply(wattsdec.estimate.list, function(this.stat)
+ confint(this.stat)[, 1] <= theta.pop &
+ confint(this.stat)[, 2] >= theta.pop)
+
+# evaluate empirical coverage
+(empirical_coverage <- rowMeans(est.coverage))
## watts fgt0 watts pov. gap ratio
## 0.9268 0.9494 0.9476
## theil(poor)
## 0.8452
-
+
One of the reasons for this is that the sample might not be large enough for the central limit theorem to take effect: the distribution of the estimator shows substantial asymmetry, which undermines the normal approximation.
-hist(
- sapply(wattsdec.estimate.list , coef)[4, ] ,
- main = "Histogram of Theil component estimator" ,
- xlim = c(0, .30) ,
- ylim = c(0 , 1500) ,
- xlab = "Estimate"
-)
+hist(
+ sapply(wattsdec.estimate.list , coef)[4, ] ,
+ main = "Histogram of Theil component estimator" ,
+ xlim = c(0, .30) ,
+ ylim = c(0 , 1500) ,
+ xlab = "Estimate"
+)
For additional usage examples of svywatts
and svywattsdec
, type ?convey::svywatts
or ?convey::svywattsdec
in the R console.
diff --git a/docs/4.11-generalized-entropy-and-decomposition-svygei-svygeidec.html b/docs/4.11-generalized-entropy-and-decomposition-svygei-svygeidec.html
index 21fd23c..f2743fa 100644
--- a/docs/4.11-generalized-entropy-and-decomposition-svygei-svygeidec.html
+++ b/docs/4.11-generalized-entropy-and-decomposition-svygei-svygeidec.html
@@ -265,12 +265,12 @@
4.11 Generalized Entropy and Decomposition (svygei, svygeidec)
-✔️ flexible inequality-aversion parameter -- varying epsilon can highlight the effect of inequality in different parts of the income distribution
-✔️ can be group-decomposed into within-inequality and between-inequality
-✔️ this parameter can also be (somewhat) tuned to be less affected by outliers
-❌ does not handle zero or negative incomes
-❌ hard to interpret
-❌ can be very sensitive to outliers
+✔️ flexible inequality-aversion parameter -- varying epsilon can highlight the effect of inequality in different parts of the income distribution
+✔️ can be group-decomposed into within-inequality and between-inequality
+✔️ this parameter can also be (somewhat) tuned to be less affected by outliers
+❌ does not handle zero or negative incomes
+❌ hard to interpret
+❌ can be very sensitive to outliers
Using a generalization of the information function, now defined as:
\[
g(f) = \frac{1}{\alpha-1} [ 1 - f^{\alpha - 1} ]
@@ -306,38 +306,38 @@ 4.11 Generalized Entropy and Deco
4.11.1 Replication Example
In July 2006, Jenkins (2008Jenkins, Stephen. 2008. “Estimation and Interpretation of Measures of Inequality, Poverty, and Social Welfare Using Stata.” North American Stata Users' Group Meetings 2006. Stata Users Group. http://EconPapers.repec.org/RePEc:boc:asug06:16.) presented at the North American Stata Users’ Group Meetings on the stata Generalized Entropy Index command. The example below reproduces those statistics.
Load and prepare the same data set:
-# load the convey package
-library(convey)
-
-# load the survey library
-library(survey)
-
-# load the foreign library
-library(foreign)
-
-# create a temporary file on the local disk
-tf <- tempfile()
-
-# store the location of the presentation file
-presentation_zip <-
- "https://web.archive.org/web/20150928053959/http://repec.org/nasug2006/nasug2006_jenkins.zip"
-
-# download jenkins' presentation to the temporary file
-download.file(presentation_zip , tf , mode = 'wb')
-
-# unzip the contents of the archive
-presentation_files <- unzip(tf , exdir = tempdir())
-
-# load the institute for fiscal studies' 1981, 1985, and 1991 data.frame objects
-x81 <-
- read.dta(grep("ifs81" , presentation_files , value = TRUE))
-x85 <-
- read.dta(grep("ifs85" , presentation_files , value = TRUE))
-x91 <-
- read.dta(grep("ifs91" , presentation_files , value = TRUE))
-
-# stack each of these three years of data into a single data.frame
-x <- rbind(x81 , x85 , x91)
+# load the convey package
+library(convey)
+
+# load the survey library
+library(survey)
+
+# load the foreign library
+library(foreign)
+
+# create a temporary file on the local disk
+tf <- tempfile()
+
+# store the location of the presentation file
+presentation_zip <-
+ "https://web.archive.org/web/20150928053959/http://repec.org/nasug2006/nasug2006_jenkins.zip"
+
+# download jenkins' presentation to the temporary file
+download.file(presentation_zip , tf , mode = 'wb')
+
+# unzip the contents of the archive
+presentation_files <- unzip(tf , exdir = tempdir())
+
+# load the institute for fiscal studies' 1981, 1985, and 1991 data.frame objects
+x81 <-
+ read.dta(grep("ifs81" , presentation_files , value = TRUE))
+x85 <-
+ read.dta(grep("ifs85" , presentation_files , value = TRUE))
+x91 <-
+ read.dta(grep("ifs91" , presentation_files , value = TRUE))
+
+# stack each of these three years of data into a single data.frame
+x <- rbind(x81 , x85 , x91)
Replicate the author’s survey design statement from stata code..
. * account for clustering within HHs
. version 8: svyset [pweight = wgt], psu(hrn)
@@ -345,11 +345,11 @@ 4.11.1 Replication Example
psu is hrn
.. into R code:
+# initiate a linearized survey design object
+y <- svydesign( ~ hrn , data = x , weights = ~ wgt)
+
+# immediately run the `convey_prep` function on the survey design
+z <- convey_prep(y)
Replicate the author’s subset statement and each of his svygei results..
. svygei x if year == 1981
@@ -371,21 +371,21 @@ 4.11.1 Replication Example
GE(3) | .1739994 .00662015 26.28 0.000 .1610242 .1869747
---------------------------------------------------------------------------
..using R code:
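The calls that produced the next five outputs are not shown here; a plausible reconstruction, assuming the design object z built above and convey's svygei (whose epsilon argument selects the GE parameter), is:
# generalized entropy indices for 1981 at epsilon = -1, 0, 1, 2, 3
# (a sketch; the subset mirrors the author's stata `if` clause,
# keeping positive incomes only)
svygei( ~ eybhc0 , subset(z , year == 1981 & eybhc0 > 0) , epsilon = -1)
svygei( ~ eybhc0 , subset(z , year == 1981 & eybhc0 > 0) , epsilon = 0)
svygei( ~ eybhc0 , subset(z , year == 1981 & eybhc0 > 0) , epsilon = 1)
svygei( ~ eybhc0 , subset(z , year == 1981 & eybhc0 > 0) , epsilon = 2)
svygei( ~ eybhc0 , subset(z , year == 1981 & eybhc0 > 0) , epsilon = 3)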
+
## gei SE
## eybhc0 0.19021 0.0247
+
## gei SE
## eybhc0 0.11429 0.0028
+
## gei SE
## eybhc0 0.11169 0.0023
+
## gei SE
## eybhc0 0.12879 0.0033
+
## gei SE
## eybhc0 0.174 0.0066
Confirm this replication applies to subsetted objects as well. Compare Stata output..
@@ -407,83 +407,83 @@ 4.11.1 Replication Example
GE(3) | .2609507 .01850689 14.10 0.000 .2246779 .2972235
---------------------------------------------------------------------------
..to R code:
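Again, only the outputs survive here; presumably the same calls were repeated on the 1991 subset:
# generalized entropy indices for 1991 at epsilon = -1, 0, 1, 2, 3 (a sketch)
svygei( ~ eybhc0 , subset(z , year == 1991 & eybhc0 > 0) , epsilon = -1)
svygei( ~ eybhc0 , subset(z , year == 1991 & eybhc0 > 0) , epsilon = 0)
svygei( ~ eybhc0 , subset(z , year == 1991 & eybhc0 > 0) , epsilon = 1)
svygei( ~ eybhc0 , subset(z , year == 1991 & eybhc0 > 0) , epsilon = 2)
svygei( ~ eybhc0 , subset(z , year == 1991 & eybhc0 > 0) , epsilon = 3)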
+
## gei SE
## eybhc0 0.16024 0.0094
+
## gei SE
## eybhc0 0.12762 0.0033
+
## gei SE
## eybhc0 0.13372 0.0041
+
## gei SE
## eybhc0 0.16764 0.0073
+
## gei SE
## eybhc0 0.26095 0.0185
Replicate the author’s decomposition by population subgroup (work status) shown on PDF page 57..
+# define work status (PDF page 22)
+z <-
+ update(z , wkstatus = c(1 , 1 , 1 , 1 , 2 , 3 , 2 , 2)[as.numeric(esbu)])
+z <-
+ update(z , wkstatus = factor(wkstatus , labels = c("1+ ft working" , "no ft working" , "elderly")))
+
+# subset to 1991 and remove records with zero income
+z91 <- subset(z , year == 1991 & eybhc0 > 0)
+
+# population share
+svymean( ~ wkstatus, z91)
## mean SE
## wkstatus1+ ft working 0.61724 0.0067
## wkstatusno ft working 0.20607 0.0059
## wkstatuselderly 0.17669 0.0046
+
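The group means shown next are consistent with a svyby call over svymean (an assumption based on the output layout):
# average income within each work status group
svyby( ~ eybhc0 , ~ wkstatus , z91 , svymean)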
## wkstatus eybhc0 se
## 1+ ft working 1+ ft working 278.8040 3.703790
## no ft working no ft working 151.6317 3.153968
## elderly elderly 176.6045 4.661740
+
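The four blocks that follow look like within-group GEI estimates, presumably along the lines of:
# GE(-1) within each work status group;
# apparently repeated below with epsilon = 0, 1 and 2
svyby( ~ eybhc0 , ~ wkstatus , z91 , svygei , epsilon = -1)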
## wkstatus eybhc0 se
## 1+ ft working 1+ ft working 0.2300708 0.02853959
## no ft working no ft working 10.9231761 10.65482557
## elderly elderly 0.1932164 0.02571991
+
## wkstatus eybhc0 se
## 1+ ft working 1+ ft working 0.1536921 0.006955506
## no ft working no ft working 0.1836835 0.014740510
## elderly elderly 0.1653658 0.016409770
+
## wkstatus eybhc0 se
## 1+ ft working 1+ ft working 0.1598558 0.008327994
## no ft working no ft working 0.1889909 0.016766120
## elderly elderly 0.2023862 0.027787224
+
## wkstatus eybhc0 se
## 1+ ft working 1+ ft working 0.2130664 0.01546521
## no ft working no ft working 0.2846345 0.06016394
## elderly elderly 0.3465088 0.07362898
+
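The decomposition blocks match the structure of convey's svygeidec, which splits the GEI into within-group and between-group components. A sketch of the presumable first call:
# decompose GE(-1) by work status;
# apparently repeated below with epsilon = 0, 1 and 2
svygeidec( ~ eybhc0 , ~ wkstatus , z91 , epsilon = -1)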
## gei decomposition SE
## total 3.682893 3.3999
## within 3.646572 3.3998
## between 0.036321 0.0028
+
## gei decomposition SE
## total 0.195236 0.0065
## within 0.161935 0.0061
## between 0.033301 0.0025
+
## gei decomposition SE
## total 0.200390 0.0079
## within 0.169396 0.0076
## between 0.030994 0.0022
+
## gei decomposition SE
## total 0.274325 0.0167
## within 0.245067 0.0164
diff --git a/docs/4.12-j-divergence-and-decomposition-svyjdiv-svyjdivdec.html b/docs/4.12-j-divergence-and-decomposition-svyjdiv-svyjdivdec.html
index 7fed43b..f13f5ce 100644
--- a/docs/4.12-j-divergence-and-decomposition-svyjdiv-svyjdivdec.html
+++ b/docs/4.12-j-divergence-and-decomposition-svyjdiv-svyjdivdec.html
@@ -265,12 +265,12 @@
4.12 J-Divergence and Decomposition (svyjdiv, svyjdivdec)
+✔️ can be interpreted in terms of GEI indices
+✔️ can be group-decomposed into within-inequality and between-inequality
+✔️ does not need to (explicitly) choose inequality aversion parameters
+❌ does not handle zero or negative incomes
+❌ hard to interpret
+❌ the decomposition interpretation is not exactly the same as the GEI, but very similar
The J-divergence measure (Rohde 2016; Rohde, Nicholas. 2016. “J-Divergence Measurements of Economic Inequality.” Journal of the Royal Statistical Society: Series A (Statistics in Society) 179 (3): 847–70. https://doi.org/10.1111/rssa.12153) can be seen as the sum of \(GE_0\) and \(GE_1\), satisfying axioms that, individually, those two indices do not. The J-divergence measure is defined as:
\[
J = \frac{1}{N} \sum_{i \in U} \bigg( \frac{ y_i }{ \mu } -1 \bigg) \ln \bigg( \frac{y_i}{\mu} \bigg)
\]
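To see why, add the usual \(GE^{(0)}\) (mean log deviation) and \(GE^{(1)}\) (Theil) expressions; since \(\ln ( \mu / y_i ) = - \ln ( y_i / \mu )\), the two sums collapse into the formula above:
\[
GE^{(0)} + GE^{(1)} = \frac{1}{N} \sum_{i \in U} \ln \bigg( \frac{\mu}{y_i} \bigg) + \frac{1}{N} \sum_{i \in U} \frac{y_i}{\mu} \ln \bigg( \frac{y_i}{\mu} \bigg) = \frac{1}{N} \sum_{i \in U} \bigg( \frac{y_i}{\mu} - 1 \bigg) \ln \bigg( \frac{y_i}{\mu} \bigg) = J
\]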
@@ -287,136 +287,136 @@ 4.12 J-Divergence and Decompositi
4.12.1 Monte Carlo Simulation
First, we should check that the finite-population values make sense. The J-divergence can be seen as the sum of \(GE^{(0)}\) and \(GE^{(1)}\). So, taking the starting population from the svyzenga section of this text, we have:
+# compute finite population J-divergence
+(jdivt.pop <-
+ (convey:::CalcJDiv(pop.df$x , ifelse(pop.df$x > 0 , 1 , 0))))
## [1] 0.4332649
+# compute finite population GE indices
+(gei0.pop <-
+ convey:::CalcGEI(pop.df$x , ifelse(pop.df$x > 0 , 1 , 0) , 0))
## [1] 0.2215037
+
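The value below is consistent with the matching \(GE^{(1)}\) computation, mirroring the \(GE^{(0)}\) call above (a sketch):
# compute the finite population GE(1) index
(gei1.pop <-
   convey:::CalcGEI(pop.df$x , ifelse(pop.df$x > 0 , 1 , 0) , 1))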
## [1] 0.2117612
+
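And the TRUE below suggests a simple equality check, presumably:
# does the J-divergence equal GE(0) + GE(1)?
all.equal(jdivt.pop , gei0.pop + gei1.pop)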
## [1] TRUE
As expected, the J-divergence matches the sum of the GE indices in the finite population. Since we have already checked the GE measures, the J-divergence computation function seems safe.
In order to assess the estimators implemented in svyjdiv and svyjdivdec, we can run a Monte Carlo experiment. Using the same samples we used in the svyzenga replication example, we have:
+# estimate J-divergence with each sample
+jdiv.estimate.list <-
+ lapply(survey.list ,
+ function(this.sample) {
+ svyjdiv( ~ x ,
+ subset(this.sample , x > 0) ,
+ na.rm = TRUE)
+ })
+
+# compute the (finite population overall) J-divergence
+(theta.pop <-
+ convey:::CalcJDiv(pop.df$x , ifelse(pop.df$x > 0 , 1 , 0)))
## [1] 0.4332649
+# estimate the expected value of the J-divergence estimator
+# using the average of the estimates
+(theta.exp <- mean(sapply(jdiv.estimate.list , coef)))
## [1] 0.4327823
+
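The next value matches the percentage relative bias (PRB) of the point estimator, presumably computed as:
# estimate the percentage relative bias of the estimator
100 * (theta.exp / theta.pop - 1)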
## [1] -0.1113765
+# estimate the variance of the J-divergence estimator
+# using the variance of the estimates
+(vartheta.popest <- var(sapply(jdiv.estimate.list , coef)))
## [1] 0.0005434848
+# estimate the expected value of the J-divergence index variance estimator
+# using the average of the variance estimates
+(vartheta.exp <- mean(sapply(jdiv.estimate.list , vcov)))
## [1] 0.0005342947
+# estimate the percentage relative bias of the variance estimator
+100 * (vartheta.exp / vartheta.popest - 1)
## [1] -1.690964
For the decomposition, we repeat the same procedure:
+# estimate J-divergence decomposition with each sample
+jdivdec.estimate.list <-
+ lapply(survey.list ,
+ function(this.sample) {
+ svyjdivdec( ~ x ,
+ ~ SEX ,
+ subset(this.sample , x > 0) ,
+ na.rm = TRUE)
+ })
+
+# compute the (finite population) J-divergence decomposition per sex
+jdivt.pop <-
+ convey:::CalcJDiv(pop.df$x , ifelse(pop.df$x > 0 , 1 , 0))
+
+overall.mean <- mean(pop.df$x[pop.df$x > 0])
+
+group.mean <-
+ c(by(pop.df$x[pop.df$x > 0] , list("SEX" = factor(pop.df$SEX[pop.df$x > 0])) , FUN = mean))
+
+group.pshare <-
+ c(prop.table(by(rep(1 , sum(
+ pop.df$x > 0
+ )) , list(
+ "SEX" = factor(pop.df$SEX[pop.df$x > 0])
+ ) , FUN = sum)))
+
+jdivb.pop <-
+ sum(group.pshare * (group.mean / overall.mean - 1) * log(group.mean / overall.mean))
+
+jdivw.pop <- jdivt.pop - jdivb.pop
+
+(theta.pop <- c(jdivt.pop , jdivw.pop , jdivb.pop))
## [1] 0.433264877 0.428398928 0.004865949
+# estimate the expected value of the J-divergence decomposition estimator
+# using the average of the estimates
+(theta.exp <- rowMeans(sapply(jdivdec.estimate.list , coef)))
## total within between
## 0.432782322 0.427480257 0.005302065
+
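The PRB vector below is again consistent with:
# estimate the percentage relative bias of each component
100 * (theta.exp / theta.pop - 1)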
## total within between
## -0.1113765 -0.2144430 8.9626164
The estimated PRB for the total is the same as before, so we will focus on the within and between components. While the within component has a small relative bias (-0.21%), the between component PRB is substantial, amounting to 8.96%.
For the variance estimator, we do:
+# estimate the variance of the J-divergence estimator
+# using the variance of the estimates
+(vartheta.popest <-
+ diag(var(t(
+ sapply(jdivdec.estimate.list , coef)
+ ))))
## total within between
## 5.434848e-04 5.391901e-04 8.750879e-06
+# estimate the expected value of the J-divergence index variance estimator
+# using the average of the variance estimates
+(vartheta.exp <-
+   rowMeans(sapply(jdivdec.estimate.list , function(z)
+     diag(vcov(
+       z
+     )))))
## total within between
## 5.342947e-04 5.286750e-04 8.891772e-06
+# estimate the percentage relative bias of the variance estimator
+100 * (vartheta.exp / vartheta.popest - 1)
## total within between
## -1.690964 -1.950177 1.610034
The PRBs of the variance estimators for both components are small: -1.95% for the within component and 1.61% for the between component.
Now, how much should we care about the bias of the between component estimator? Our simulations show that the squared bias of this estimator accounts for only about 2% of its mean squared error:
+theta.bias2 <- (theta.exp - theta.pop) ^ 2
+theta.mse <- theta.bias2 + vartheta.popest
+100 * (theta.bias2 / theta.mse)
## total within between
## 0.04282728 0.15627854 2.12723212
Next, we evaluate the Percentage Coverage Rate (PCR). In theory, under repeated sampling, the estimated 95% CIs should cover the population parameter 95% of the time. We can evaluate that using:
+# estimate confidence intervals of the J-divergence decomposition
+# for each of the samples
+est.coverage <-
+  t(sapply(jdivdec.estimate.list, function(this.stat)
+    confint(this.stat)[, 1] <= theta.pop &
+      confint(this.stat)[, 2] >= theta.pop))
+
+# evaluate the empirical coverage rates
+colMeans(est.coverage)
## total within between
## 0.9390 0.9376 0.9108
Our coverages are not too far from the nominal coverage level of 95%; however, the bias of the between component estimator can affect its coverage rate.
diff --git a/docs/4.2-the-gender-pay-gap-svygpg.html b/docs/4.2-the-gender-pay-gap-svygpg.html
index 349c76e..f7ac493 100644
--- a/docs/4.2-the-gender-pay-gap-svygpg.html
+++ b/docs/4.2-the-gender-pay-gap-svygpg.html
@@ -265,12 +265,12 @@
4.2 The Gender Pay Gap (svygpg)
+✔️ easy to understand
+✔️ the difference between men's and women's average wages, expressed as a fraction of the men's average wage
+✔️ alternatively: the average women's wage is `( 1 - GPG ) x average men's wage`
+❌ not an inequality measure in the Pigou-Dalton Principle sense
+❌ ignores the within-group inequality among men and among women
+❌ binary gender only
Although the Gender Pay Gap (GPG) is not an inequality measure in the usual sense, it can still be a useful instrument to evaluate the effects of gender discrimination. Put simply, it expresses the relative difference between the average hourly earnings of men and women, presented as a percentage of the average hourly earnings of men. Like some other functions described in this text, the GPG can also be calculated using wealth or assets in place of earnings.
In mathematical terms, this index can be described as,
\[ GPG = \frac{ \bar{y}_{male} - \bar{y}_{female} }{ \bar{y}_{male} } \]
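For intuition, a quick numeric check (the wages here are hypothetical, not drawn from any survey):
# hypothetical average hourly wages, for illustration only
ybar_male <- 20
ybar_female <- 16

# GPG = 0.2: women earn, on average,
# ( 1 - 0.2 ) = 80% of the male average
(gpg <- (ybar_male - ybar_female) / ybar_male)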
@@ -282,148 +282,148 @@ 4.2 The Gender Pay Gap (svygpg)
4.2.1 Replication Example
The R vardpoor package (Breidaks, Liberts, and Ivanova 2016; Breidaks, Juris, Martins Liberts, and Santa Ivanova. 2016. “Vardpoor: Estimation of Indicators on Social Exclusion and Poverty and Its Linearization, Variance Estimation.” Riga, Latvia: CSB), created by researchers at the Central Statistical Bureau of Latvia, includes a GPG coefficient calculation using the ultimate cluster method. The example below reproduces those statistics.
Load and prepare the same data set:
+# load the convey package
+library(convey)
+
+# load the survey library
+library(survey)
+
+# load the vardpoor library
+library(vardpoor)
+
+# load the laeken library
+library(laeken)
+
+# load the synthetic EU statistics on income & living conditions
+data(eusilc)
+
+# make all column names lowercase
+names(eusilc) <- tolower(names(eusilc))
+
+# coerce the gender variable to numeric 1 or 2
+eusilc$one_two <- as.numeric(eusilc$rb090 == "female") + 1
+
+# add a column with the row number
+dati <- data.table::data.table(IDd = 1:nrow(eusilc), eusilc)
+
+# calculate the gpg coefficient
+# using the R vardpoor library
+varpoord_gpg_calculation <-
+ varpoord(
+ # analysis variable
+ Y = "eqincome",
+
+ # weights variable
+ w_final = "rb050",
+
+ # row number variable
+ ID_level1 = "IDd",
+
+ # row number variable
+ ID_level2 = "IDd",
+
+ # strata variable
+ H = "db040",
+
+ N_h = NULL ,
+
+ # clustering variable
+ PSU = "rb030",
+
+ # data.table
+ dataset = dati,
+
+ # gpg coefficient function
+ type = "lingpg" ,
+
+ # gender variable
+ gender = "one_two",
+
+ # get linearized variable
+ outp_lin = TRUE
+ )
+
+
+
+# construct a survey.design
+# using our recommended setup
+des_eusilc <-
+ svydesign(
+ ids = ~ rb030 ,
+ strata = ~ db040 ,
+ weights = ~ rb050 ,
+ data = eusilc
+ )
+
+# immediately run the convey_prep function on it
+des_eusilc <- convey_prep(des_eusilc)
+
+# coefficients do match
+varpoord_gpg_calculation$all_result$value
## [1] 7.645389
+
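The labeled value below looks like convey's estimate rescaled to percentage points; the call that produced it is not shown, but it is presumably something like:
# convey's gpg estimate, rescaled (an assumption)
coef(svygpg( ~ eqincome , des_eusilc , sex = ~ rb090)) * 100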
## eqincome
## -8.278297
+# compare the linearized variables
+# vardpoor
+lin_gpg_varpoord <- varpoord_gpg_calculation$lin_out$lin_gpg
+# convey
+lin_gpg_convey <-
+  attr(svygpg( ~ eqincome , des_eusilc, sex = ~ rb090), "lin")
+
+# check equality
+all.equal(lin_gpg_varpoord, 100 * lin_gpg_convey[, 1])
## [1] "Mean relative difference: 2.172419"
+# variances do not match exactly
+attr(svygpg( ~ eqincome , des_eusilc , sex = ~ rb090) , 'var') * 10000
## eqincome
## eqincome 0.8926311
+
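The three values that follow line up with vardpoor's variance, vardpoor's standard error, and convey's standard error. Sketches of the presumable calls, starting with the variance (the all_result column names are assumptions):
# vardpoor's variance estimate
varpoord_gpg_calculation$all_result$var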
## [1] 0.6482346
+
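Then its square root, apparently:
# vardpoor's standard error estimate
varpoord_gpg_calculation$all_result$se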
## [1] 0.8051301
+
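And the final value equals the square root of the convey variance printed above, i.e. presumably:
# convey's standard error, rescaled
SE(svygpg( ~ eqincome , des_eusilc , sex = ~ rb090)) * 100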
## eqincome
## eqincome 0.9447916
The variance estimate is computed using the approximation defined in 2, while the linearized variable \(z\) is defined by 2.1. The functions convey::svygpg and vardpoor::lingpg produce the same linearized variable \(z\). However, the measures of uncertainty do not line up, because library(vardpoor) defaults to an ultimate cluster method that can be replicated with an alternative setup of the survey.design object.
+# within each stratum, sum up the weights
+cluster_sums <-
+  aggregate(eusilc$rb050 , list(eusilc$db040) , sum)
+
+# name the within-stratum sums of weights `cluster_sum`
+names(cluster_sums) <- c("db040" , "cluster_sum")
+
+# merge this column back onto the data.frame
+eusilc <- merge(eusilc , cluster_sums)
+
+# construct a survey.design
+# with the fpc using the cluster sum
+des_eusilc_ultimate_cluster <-
+ svydesign(
+ ids = ~ rb030 ,
+ strata = ~ db040 ,
+ weights = ~ rb050 ,
+ data = eusilc ,
+ fpc = ~ cluster_sum
+ )
+
+# again, immediately run the convey_prep function on the `survey.design`
+des_eusilc_ultimate_cluster <-
+ convey_prep(des_eusilc_ultimate_cluster)
+
+# matches
+attr(svygpg( ~ eqincome , des_eusilc_ultimate_cluster , sex = ~ rb090) ,
+ 'var') * 10000
## eqincome
## eqincome 0.8910413
+
## [1] 0.6482346
+
## [1] 0.8051301
+
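After re-printing the vardpoor figures, the final value is presumably convey's standard error under the ultimate cluster setup:
# convey's standard error with the ultimate cluster design
SE(svygpg( ~ eqincome , des_eusilc_ultimate_cluster , sex = ~ rb090)) * 100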
## eqincome
## eqincome 0.9439499
For additional usage examples of svygpg, type ?convey::svygpg in the R console.
diff --git a/docs/4.3-quintile-share-ratio-svyqsr.html b/docs/4.3-quintile-share-ratio-svyqsr.html
index f373d63..8628d08 100644
--- a/docs/4.3-quintile-share-ratio-svyqsr.html
+++ b/docs/4.3-quintile-share-ratio-svyqsr.html
@@ -265,11 +265,11 @@