diff --git a/ReBespokeIV/README.md b/ReBespokeIV/README.md
new file mode 100644
index 0000000..6c09305
--- /dev/null
+++ b/ReBespokeIV/README.md
@@ -0,0 +1,32 @@
+# RE: "Estimating the Effect of a Treatment When There Is Nonadherence in a Trial"
+
+### Paul N Zivich
+
+-----------------------------------
+
+This folder includes the code to replicate the simulations (Python only) and the example (Python, R, SAS) for the
+two-stage regression bespoke instrumental variable (BSIV) approach described in the letter and the referenced paper.
+
+The data are available from https://www.causeweb.org/tshs/category/dataset/
+
+-----------------------------------
+
+## File Manifest
+
+`estfunc.py`
+- Python code for the estimating functions for the two-stage regression BSIV.
+
+`example.py`
+- Python code to replicate the Obstetrics and Periodontal Therapy Study example.
+
+`example.R`
+- R code to replicate the Obstetrics and Periodontal Therapy Study example.
+
+`example.sas`
+- SAS code to replicate the Obstetrics and Periodontal Therapy Study example.
+
+`process_data.py`
+- Python code to process the publicly available Obstetrics and Periodontal Therapy Study data for the replications.
+
+`simulation.py`
+- Python code to run the simulation experiment described in the letter.
diff --git a/ReBespokeIV/estfunc.py b/ReBespokeIV/estfunc.py
new file mode 100644
index 0000000..b05158c
--- /dev/null
+++ b/ReBespokeIV/estfunc.py
@@ -0,0 +1,39 @@
+####################################################################################################################
+# Bespoke Instrumental Variable via two-stage regression as an M-estimator
+# Estimating function for the Python simulations and example
+#
+# Paul Zivich (2023/11/22)
+####################################################################################################################
+
+import numpy as np
+from delicatessen.estimating_equations import ee_regression
+
+
+def ee_2sr_bsiv(theta, y, r, a, L):
+    # Parameters
+    idxL = L.shape[1] + 2
+    beta = theta[0:2]                                # Parameters of interest
+    alpha = theta[2:idxL]                            # Nuisance parameters for E[Y | L]
+    gamma = theta[idxL:]                             # Nuisance parameters for E[A | L]
+
+    # Control arm model (E[Y | L])
+    ee_control = ee_regression(alpha, L, y,          # Regression for Y given L
+                               model='linear')       # ... via linear model
+    ee_control = ee_control * (1 - r)                # Restricting to control arm for fit
+    yhat = np.dot(L, alpha)                          # Predicted values for Y
+
+    # Received treatment model (E[A | L, R=1])
+    ee_receipt = ee_regression(gamma, L, a,          # Regression for A given L
+                               model='linear')       # ... via linear model
+    ee_receipt = ee_receipt * r                      # Restricting to non-control arm for fit
+    ahat = np.dot(L, gamma)                          # Predicted values for A
+
+    # Controlled direct effect
+    X = np.asarray([np.ones(ahat.shape), ahat]).T    # Stacking a new design matrix together
+    ee_cde = ee_regression(beta, X=X, y=y,           # Regression for Y given A-hat
+                           model='linear',           # ... via linear model
+                           offset=yhat)              # ... with Y-hat offset term
+    ee_cde = ee_cde * r                              # Restricting to non-control arm for fit
+
+    # Returning stacked estimating functions
+    return np.vstack([ee_cde, ee_control, ee_receipt])
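The stacked estimating functions in `estfunc.py` encode a two-stage procedure: fit E[Y | L] by linear regression in the control arm, fit E[A | L] in the arm assigned to treatment, then regress Y on the predicted A among those assigned to treatment, with the control-arm prediction of Y entering as an offset. The sketch below is a point-estimate-only illustration of those stages using plain NumPy least squares; the data frame `d`, the covariate list, and the helper name `two_stage_bsiv_points` are placeholders, and the M-estimator formulation above is what additionally delivers the sandwich variance.

```python
import numpy as np
import pandas as pd


def two_stage_bsiv_points(d, covariates):
    """Two OLS fits plus an offset regression; mirrors the stacked
    estimating functions in estfunc.py, but returns point estimates only."""
    L = np.column_stack([np.ones(len(d))] + [d[c] for c in covariates])
    r, a, y = d['R'].to_numpy(), d['A'].to_numpy(), d['Y'].to_numpy()

    # Stage 1a: E[Y | L] fit in the control arm (R = 0)
    alpha = np.linalg.lstsq(L[r == 0], y[r == 0], rcond=None)[0]
    yhat = L @ alpha

    # Stage 1b: E[A | L] fit in the arm assigned to treatment (R = 1)
    gamma = np.linalg.lstsq(L[r == 1], a[r == 1], rcond=None)[0]
    ahat = L @ gamma

    # Stage 2: regress Y - Yhat on Ahat among R = 1 (Yhat acts as the offset)
    X = np.column_stack([np.ones(int(np.sum(r))), ahat[r == 1]])
    beta = np.linalg.lstsq(X, (y - yhat)[r == 1], rcond=None)[0]
    return beta  # (beta_0, beta_1)


# Hypothetical usage with the columns written by process_data.py:
# two_stage_bsiv_points(pd.read_csv("processed.csv"), covariates=['L1', 'L2'])
```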
diff --git a/ReBespokeIV/example.R b/ReBespokeIV/example.R
new file mode 100644
index 0000000..57030a0
--- /dev/null
+++ b/ReBespokeIV/example.R
@@ -0,0 +1,84 @@
+###############################################################################
+# Bespoke Instrumental Variable via two-stage regression as an M-estimator
+#
+# Paul Zivich (2023/11/22)
+###############################################################################
+
+library(geex)
+
+# Reading in data
+setwd("C:/Users/zivic/Documents/Research/#PZivich/LetterRichardson/")
+d <- read.csv("processed.csv")
+
+
+# The root-finding algorithm has trouble with generic starting values,
+# so we instead 'pre-wash' the nuisance model fits and use those
+# as starting values.
+
+ef_nui <- function(data){
+    r <- data$R
+    a <- data$A
+    y <- data$Y
+    l1 <- data$L1
+    l2 <- data$L2
+    function(theta){
+        yhat <- theta[1] + theta[2]*l1 + theta[3]*l2
+        ahat <- theta[4] + theta[5]*l1 + theta[6]*l2
+
+        c((1-r)*(y - yhat),
+          (1-r)*(y - yhat)*l1,
+          (1-r)*(y - yhat)*l2,
+          r*(a - ahat),
+          r*(a - ahat)*l1,
+          r*(a - ahat)*l2)
+    }
+}
+
+inits <- c(0, 0, 0, 0, 0, 0)
+mestr <- m_estimate(estFUN=ef_nui, data=d,
+                    root_control=setup_root_control(start=inits))
+inits_wash <- roots(mestr)
+
+
+# Solving the overall estimating equations with geex
+
+ef_overall <- function(data){
+    r <- data$R
+    a <- data$A
+    y <- data$Y
+    l1 <- data$L1
+    l2 <- data$L2
+    function(theta){
+        yhat <- theta[3] + theta[4]*l1 + theta[5]*l2
+        ahat <- theta[6] + theta[7]*l1 + theta[8]*l2
+        cde <- theta[1] + theta[2]*ahat
+        ytilde <- y - yhat
+
+        c(r*(ytilde - cde),
+          r*(ytilde - cde)*ahat,
+          (1-r)*(y - yhat),
+          (1-r)*(y - yhat)*l1,
+          (1-r)*(y - yhat)*l2,
+          r*(a - ahat),
+          r*(a - ahat)*l1,
+          r*(a - ahat)*l2)
+    }
+}
+
+inits <- c(0, 0, inits_wash)
+mestr <- geex::m_estimate(estFUN=ef_overall, data=d,
+                          root_control=setup_root_control(start=inits))
+theta <- roots(mestr)            # Point estimates
+se <- sqrt(diag(vcov(mestr)))    # Standard error estimates
+
+# Processing results to display
+beta0 <- theta[1]
+beta0_se <- se[1]
+beta0_ci <- c(beta0 - 1.96*beta0_se, beta0 + 1.96*beta0_se)
+beta1 <- theta[2]
+beta1_se <- se[2]
+beta1_ci <- c(beta1 - 1.96*beta1_se, beta1 + 1.96*beta1_se)
+
+print(c(beta0, beta0_ci))
+print(c(beta1, beta1_ci))
+
diff --git a/ReBespokeIV/example.py b/ReBespokeIV/example.py
new file mode 100644
index 0000000..297abd7
--- /dev/null
+++ b/ReBespokeIV/example.py
@@ -0,0 +1,75 @@
+####################################################################################################################
+# Bespoke Instrumental Variable via two-stage regression as an M-estimator
+#
+# Paul Zivich (2023/11/22)
+####################################################################################################################
+
+import numpy as np
+import pandas as pd
+from delicatessen import MEstimator
+
+from estfunc import ee_2sr_bsiv
+
+# Reading in data
+d = pd.read_csv("processed.csv")
+d['intercept'] = 1
+
+# Processing data into arrays for delicatessen
+r = np.asarray(d['R'])
+a = np.asarray(d['A'])
+L = np.asarray(d[['intercept', 'L1', 'L2']])
+y = np.asarray(d['Y'])
+
+# Running the M-estimator procedure
+
+
+def psi(theta):
+    return ee_2sr_bsiv(theta=theta, y=y, a=a, r=r, L=L)
+
+
+estr = MEstimator(psi, init=[0., ]*8)
+estr.estimate()
+estimates = np.asarray(estr.theta)
+ci = estr.confidence_intervals()
+
+# Results
+print("Beta_0", np.round(estimates[0], 2), np.round(ci[0, :], 6))
+print("Beta_1", np.round(estimates[1], 2), np.round(ci[1, :], 6))
+
+
+# Repeating but with a bootstrap
+bs_iters = 5000
+np.random.seed(20231122)
+beta0_rep, beta1_rep = [], []
+
+for i in range(bs_iters):
+    ds = d.sample(frac=1, replace=True)
+    r = np.asarray(ds['R'])
+    a = np.asarray(ds['A'])
+    L = np.asarray(ds[['intercept', 'L1', 'L2']])
+    y = np.asarray(ds['Y'])
+
+    # Applying M-estimator to resampled data
+    estr = MEstimator(psi, init=[0., ] * 8)
+    estr.estimate()
+    beta0_rep.append(estr.theta[0])
+    beta1_rep.append(estr.theta[1])
+
+beta0_se = np.std(beta0_rep, ddof=0)
+beta1_se = np.std(beta1_rep, ddof=0)
+beta0_ci = [estimates[0] - 1.96*beta0_se,
+            estimates[0] + 1.96*beta0_se]
+beta1_ci = [estimates[1] - 1.96*beta1_se,
+            estimates[1] + 1.96*beta1_se]
+
+print("Beta_0", np.round(estimates[0], 2), np.round(beta0_ci, 6))
+print("Beta_1", np.round(estimates[1], 2), np.round(beta1_ci, 6))
+
+# OUTPUT
+# -------------------------------------------------
+# Sandwich
+# Beta_0 -0.33 [-0.428549 -0.233846]
+# Beta_1 0.18 [0.004587 0.358593]
+# Bootstrap
+# Beta_0 -0.33 [-0.431346 -0.23105 ]
+# Beta_1 0.18 [-0.00159 0.364769]
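`example.py` reports bootstrap intervals built from a normal approximation around the original point estimates. A small optional check, assuming the `beta0_rep` and `beta1_rep` lists from the bootstrap loop above are still in scope, is to read percentile intervals directly off the same resamples:

```python
import numpy as np

# Percentile bootstrap 95% CIs from the replicates collected in example.py
beta0_pct = np.percentile(beta0_rep, q=[2.5, 97.5])
beta1_pct = np.percentile(beta1_rep, q=[2.5, 97.5])
print("Beta_0 percentile 95% CI:", np.round(beta0_pct, 6))
print("Beta_1 percentile 95% CI:", np.round(beta1_pct, 6))
```

With 5000 resamples these should land close to the normal-approximation bootstrap intervals printed above.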
diff --git a/ReBespokeIV/example.sas b/ReBespokeIV/example.sas
new file mode 100644
index 0000000..8ff7f09
--- /dev/null
+++ b/ReBespokeIV/example.sas
@@ -0,0 +1,109 @@
+/**********************************************************************************************************
+Bespoke Instrumental Variable via two-stage regression as an M-estimator
+
+Paul Zivich (2023/11/22)
+**********************************************************************************************************/
+
+* Reading in formatted data;
+PROC IMPORT OUT=dat
+            DATAFILE="C:\Users\zivic\Documents\Research\#PZivich\LetterRichardson\processed.csv"
+            DBMS=CSV REPLACE;
+    GETNAMES=YES;
+    DATAROW=2;
+RUN;
+
+
+* M-estimator via PROC IML;
+PROC IML;                                /*All steps are completed in PROC IML*/
+    /***********************************************
+    Read data into IML */
+    use dat;                             /*Open input data from above*/
+    read all var {r} into r;             /*Read in each column needed as its own vector*/
+    read all var {a} into a;
+    read all var {y} into y;
+    read all var {l1} into l1;
+    read all var {l2} into l2;
+    close dat;
+    n = nrow(r);                         /*Save number of observations in data*/
+
+    /***********************************************
+    Defining estimating equation */
+    q = 8;                               /*Save number of parameters to be estimated*/
+
+    START efunc(beta) global(r, a, y, l1, l2);       /*Start to define estimating function*/
+        /*Processing predicted values from models*/
+        yhat = beta[3] + beta[4]*l1 + beta[5]*l2;    /*hatY = E[Y | L, R=0]*/
+        ahat = beta[6] + beta[7]*l1 + beta[8]*l2;    /*hatA = Pr(A=1 | L, R=1)*/
+        cde = beta[1] + beta[2]*ahat;                /*Parameters of interest*/
+        ytilde = y - yhat;                           /*tildeY = Y - hatY*/
+
+        /*Estimating functions*/
+        ef_1 = r#(ytilde - cde);                     /*EF for \beta_0*/
+        ef_2 = r#(ytilde - cde)#ahat;                /*EF for \beta_1*/
+        ef_3 = (1-r)#(y - yhat);                     /*EF for intercept of hatY*/
+        ef_4 = (1-r)#(y - yhat)#l1;                  /*EF for L1 term of hatY*/
+        ef_5 = (1-r)#(y - yhat)#l2;                  /*EF for L2 term of hatY*/
+        ef_6 = r#(a - ahat);                         /*EF for intercept of hatA*/
+        ef_7 = r#(a - ahat)#l1;                      /*EF for L1 term of hatA*/
+        ef_8 = r#(a - ahat)#l2;                      /*EF for L2 term of hatA*/
+
+        /*Stacking estimating functions together*/
+        ef_mat = ef_1||ef_2||ef_3||ef_4||ef_5||ef_6||ef_7||ef_8;
+        RETURN(ef_mat);                              /*Return n by q matrix of estimating functions*/
+    FINISH efunc;                                    /*End definition of estimating function*/
+
+    START eequat(beta);                  /*Start to define estimating equation (single argument)*/
+        ef = efunc(beta);                /*Compute estimating functions*/
+        RETURN(ef[+,]);                  /*Return column sums, 1 by q vector*/
+    FINISH eequat;                       /*End definition of estimating equation*/
+
+    /***********************************************
+    Root-finding */
+    initial = {0.,0.,0.,0.,0.,0.,0.,0.};     * Initial parameter values;
+    optn = q || 1;                           * Set options for nlplm (q=8 roots requested, 1 - print summary output);
+    tc = j(1, 12, .);                        * Create vector for Termination Criteria options, set all to default using .;
+    tc[6] = 1e-9;                            * Replace 6th option in tc to change default tolerance;
+    CALL nlplm(rc,                           /*Use the Levenberg-Marquardt root-finding method*/
+               beta_hat,                     /*... name of output parameters that give the root*/
+               "eequat",                     /*... function to find the roots of*/
+               initial,                      /*... starting values for root-finding*/
+               optn, ,                       /*... optional arguments for root-finding procedure*/
+               tc);                          /*... update convergence tolerance*/
+
+    /***********************************************
+    Baking the bread (approximate derivative) */
+    par = q||.||.;                           * Set options for nlpfdd (q=8 parameters, . = defaults);
+    CALL nlpfdd(func,                        /*Derivative approximation function*/
+                deriv,                       /*... name of output matrix that gives the derivative*/
+                na,                          /*... name of output matrix that gives the 2nd derivative - we do not need this*/
+                "eequat",                    /*... function to approximate the derivative of*/
+                beta_hat,                    /*... point where to find derivative*/
+                par);                        /*... details for derivative approximation*/
+    bread = - (deriv) / n;                   * Negative derivative, averaged;
+
+    /***********************************************
+    Cooking the filling (matrix algebra) */
+    residuals = efunc(beta_hat);             * Value of estimating functions at beta hat (n by q matrix);
+    outerprod = residuals` * residuals;      * Outer product of residuals;
+    filling = outerprod / n;                 * Divide by n for filling;
+
+    /***********************************************
+    Assembling the sandwich (matrix algebra) */
+    sandwich = ( inv(bread) * filling * inv(bread)` ) / n;
+
+    /***********************************************
+    Formatting results for output */
+    vars = {"beta_0","beta_1","Y|R=0","Y|L1,R=0","Y|L2,R=0","A|R=1","A|L1,R=1","A|L2,R=1"};
+    est = beta_hat`;                         /*Point estimates*/
+    se = sqrt(vecdiag(sandwich));            /*Extract corresponding SE for each parameter*/
+    lcl = est - 1.96*se;                     /*Calculate lcl*/
+    ucl = est + 1.96*se;                     /*Calculate ucl*/
+    PRINT vars est se lcl ucl;               /*Print information to the Results Viewer*/
+
+    CREATE ests_mest VAR {vars est se lcl ucl};   /*Create an output data set called ests_mest*/
+        APPEND;                                   /*... that includes the point estimates, SEs, and CI limits*/
+    CLOSE ests_mest;                              /*Close the output data set*/
+    QUIT;
+RUN;
+
+/*END*/
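`example.sas` assembles the empirical sandwich variance by hand: the bread is the negative averaged numerical derivative of the estimating equations and the filling is the averaged outer product of the estimating functions. The Python sketch below mirrors those same steps, assuming the `psi` function and fitted values from `example.py`; the helper name `sandwich_variance` and the forward-difference step `eps` are choices made here, not part of the original code.

```python
import numpy as np


def sandwich_variance(psi, theta_hat, eps=1e-6):
    """Bread / filling / sandwich assembly mirroring the PROC IML steps."""
    theta_hat = np.asarray(theta_hat, dtype=float)
    ef = psi(theta_hat)                      # q-by-n stacked estimating functions
    q, n = ef.shape
    sums = ef.sum(axis=1)                    # estimating equations at theta_hat

    # Bread: negative average forward-difference derivative of the equations
    deriv = np.empty((q, q))
    for j in range(q):
        shifted = theta_hat.copy()
        shifted[j] += eps
        deriv[:, j] = (psi(shifted).sum(axis=1) - sums) / eps
    bread = -deriv / n

    # Filling: average outer product of the estimating functions
    filling = ef @ ef.T / n

    # Sandwich: inv(bread) * filling * inv(bread)`, divided by n
    bread_inv = np.linalg.inv(bread)
    return bread_inv @ filling @ bread_inv.T / n


# Hypothetical usage after running example.py:
# np.sqrt(np.diag(sandwich_variance(psi, estr.theta)))  # should track the SEs above
```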
diff --git a/ReBespokeIV/process_data.py b/ReBespokeIV/process_data.py
new file mode 100644
index 0000000..b51d4df
--- /dev/null
+++ b/ReBespokeIV/process_data.py
@@ -0,0 +1,37 @@
+####################################################################################################################
+# Bespoke Instrumental Variable via two-stage regression as an M-estimator
+# Process publicly available data from the Teaching of Statistics in the Health Sciences Resources Portal
+# (https://www.causeweb.org/tshs/category/dataset/) to replicate the analysis.
+#
+# Paul Zivich (2023/11/22)
+####################################################################################################################
+
+import numpy as np
+import pandas as pd
+
+# Reading in original data
+d = pd.read_csv("OPT_Study_Person-level_Data.csv")
+
+# Restricting to only columns needed
+d = d[['Group', 'Tx comp?', 'OFIBRIN1', 'ETXU_CAT1', 'V5 %BOP']].copy()
+
+# Recoding some variables from the Excel formatting
+d['assign'] = np.where(d['Group'] == 'T', 1, 0)
+d['complete'] = np.where(d['Tx comp?'] == 'Yes', 1, 0)
+d['OFIBRIN1'] = pd.to_numeric(d['OFIBRIN1'], errors='coerce')
+d['ETXU_CAT1'] = pd.to_numeric(d['ETXU_CAT1'], errors='coerce')
+d['V5 %BOP'] = d['V5 %BOP'] / 100
+
+# Dropping observations with missing data
+d = d.dropna(subset=['V5 %BOP', 'OFIBRIN1', 'ETXU_CAT1']).copy()
+
+# Renaming variables for consistency with the paper
+d['R'] = d['assign']
+d['A'] = d['complete']
+d['Y'] = d['V5 %BOP']
+d['L1'] = d['OFIBRIN1']
+d['L2'] = d['ETXU_CAT1']
+
+# Outputting as a CSV for use across programs
+d[['R', 'A', 'Y', 'L1', 'L2']].to_csv("processed.csv")
+d.info()
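Since `processed.csv` is shared across the Python, R, and SAS programs, a quick optional sanity check (a sketch, run after `process_data.py`) can confirm the recoding behaved as intended: R and A binary, Y rescaled to a proportion, and no missingness left in the analysis columns.

```python
import pandas as pd

d = pd.read_csv("processed.csv")
cols = ['R', 'A', 'Y', 'L1', 'L2']

assert d[cols].notna().all().all(), "unexpected missing values"
assert set(d['R'].unique()) <= {0, 1}, "R should be binary"
assert set(d['A'].unique()) <= {0, 1}, "A should be binary"
assert d['Y'].between(0, 1).all(), "Y should be a proportion in [0, 1]"
print(d[cols].describe())
```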
diff --git a/ReBespokeIV/simulation.py b/ReBespokeIV/simulation.py
new file mode 100644
index 0000000..19915a2
--- /dev/null
+++ b/ReBespokeIV/simulation.py
@@ -0,0 +1,140 @@
+####################################################################################################################
+# Bespoke Instrumental Variable via two-stage regression as an M-estimator
+# Replication of the simulation study but with the sandwich variance estimator
+#
+# Paul Zivich (2023/11/22)
+####################################################################################################################
+
+import numpy as np
+import pandas as pd
+from delicatessen import MEstimator
+
+from estfunc import ee_2sr_bsiv
+
+
+# Setup
+sims = 1000
+exclude_restrict = False
+n = 5000
+
+
+def generate_data(n, exr):
+    d = pd.DataFrame()
+    d['L1'] = np.random.uniform(-1, 1, size=n)
+    d['L2'] = np.random.binomial(n=1, p=0.5, size=n)
+    d['R'] = np.random.binomial(n=1, p=0.5, size=n)
+    pr_a = 0.25 + 0.25*d['L1'] + 0.25*d['L2']
+    a_flip = np.random.binomial(n=1, p=pr_a, size=n)
+    d['A'] = np.where(d['R'] == 1, a_flip, 0)
+    d['C'] = 1
+
+    # Outcome generation for scenarios
+    if exr:
+        y_det = 1 + 1*d['L1'] + 1*d['L2'] + 1*d['A']
+    else:
+        y_det = 1 + 1*d['L1'] + 1*d['L2'] + 1*d['A'] + 1*d['R']
+    d['Y'] = y_det + np.random.normal(size=n)
+
+    return d
+
+
+def psi(theta):
+    return ee_2sr_bsiv(theta=theta, y=y, a=a, r=r, L=L)
+
+
+# Creating result storage
+results = pd.DataFrame(columns=['beta0', 'beta0_var', 'beta0_ci',
+                                'beta1', 'beta1_var', 'beta1_ci'])
+
+if exclude_restrict:
+    beta0_truth = 0
+    np.random.seed(7777777)
+else:
+    beta0_truth = 1
+    np.random.seed(96369)
+beta1_truth = 1
+
+for i in range(sims):
+    row = []
+
+    # Sample data and process for estimating equations
+    d = generate_data(n=n, exr=exclude_restrict)
+    y = np.asarray(d['Y'])
+    a = np.asarray(d['A'])
+    r = np.asarray(d['R'])
+    L = np.asarray(d[['C', 'L1']])
+
+    # Apply M-estimator
+    estr = MEstimator(psi, init=[0., 0.,
+                                 0., 0., 0., 0., ])
+    estr.estimate(deriv_method='exact')
+    beta0ci = estr.confidence_intervals()[0, :]
+    beta1ci = estr.confidence_intervals()[1, :]
+
+    # Processing output to save
+    beta0, beta1 = estr.theta[0] - beta0_truth, estr.theta[1] - beta1_truth
+    beta0v, beta1v = np.diag(estr.variance)[0:2]
+    if beta0ci[0] < beta0_truth < beta0ci[1]:
+        beta0c = 1
+    else:
+        beta0c = 0
+    if beta1ci[0] < beta1_truth < beta1ci[1]:
+        beta1c = 1
+    else:
+        beta1c = 0
+
+    row = row + [beta0, beta0v, beta0c]
+    row = row + [beta1, beta1v, beta1c]
+    results.loc[len(results.index)] = row
+
+
+def calculate_metrics(data, estimator):
+    bias = np.mean(data[estimator])
+    ase = np.mean(np.sqrt(data[estimator + '_var']))
+    ese = np.std(data[estimator])
+    ser = ase / ese
+    cover = np.mean(data[estimator + '_ci'])
+    return bias, ser, cover
+
+
+def create_table(data):
+    """Function to create a table of the simulation results
+
+    Parameters
+    ----------
+    data : pandas.DataFrame
+        Simulation output from the loop above
+
+    Returns
+    -------
+    pandas.DataFrame
+    """
+    # Creating blank DataFrame
+    table = pd.DataFrame(columns=['Estimator', 'Bias', 'SER', 'Coverage'])
+
+    # String indicators for the parameters of interest
+    estrs = ['beta0', 'beta1']
+
+    # Calculating metrics for each parameter, then adding as a row in the table
+    for estr in estrs:
+        bias, ser, cover = calculate_metrics(data=data, estimator=estr)
+        table.loc[len(table.index)] = [estr, bias, ser, cover]
+
+    # Return processed simulation results
+    return table
+
+
+table = create_table(data=results)
+print(table)
+
+# OUTPUT
+# -------------------------------------------------
+# With exclusion restriction
+#   Estimator      Bias       SER  Coverage
+# 0     beta0 -0.000898  1.026384     0.948
+# 1     beta1  0.002159  1.031067     0.954
+#
+# Without exclusion restriction
+#   Estimator      Bias       SER  Coverage
+# 0     beta0  0.000286  1.000151     0.953
+# 1     beta1  0.002314  0.975548     0.942
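The OUTPUT block reports both scenarios, but `simulation.py` runs one scenario per execution: set `exclude_restrict = True` for the "With exclusion restriction" rows and `False` for the "Without exclusion restriction" rows (the matching seed is chosen inside the script). A small optional addition, assuming the `results` data frame and `exclude_restrict` flag from the script, would persist each run so the two scenarios can be tabulated without rerunning:

```python
# Optional: save per-iteration results for the scenario that was just run
label = 'with_ER' if exclude_restrict else 'without_ER'
results.to_csv(f"sim_results_{label}.csv", index=False)
```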