# RE: "Estimating the Effect of a Treatment When There Is Nonadherence in a Trial"

### Paul N Zivich

-----------------------------------

This folder includes the code to replicate the simulations (Python only) and the example (Python, R, SAS) for the two-stage regression bespoke instrumental variable (BSIV) approach described in the letter and the referenced paper.

Data are available from https://www.causeweb.org/tshs/category/dataset/

-----------------------------------

## File Manifest

`estfunc.py`
- Python code for the estimating functions of the two-stage regression BSIV.

`example.py`
- Python code to replicate the Obstetrics and Periodontal Therapy Study example.

`example.R`
- R code to replicate the Obstetrics and Periodontal Therapy Study example.

`example.sas`
- SAS code to replicate the Obstetrics and Periodontal Therapy Study example.

`process_data.py`
- Python code to process the publicly available Obstetrics and Periodontal Therapy Study data for the replications.

`simulation.py`
- Python code to run the simulation experiment described in the letter.
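For readers who want the gist of the two-stage regression before reading the M-estimation code, the point estimation can be sketched with two ordinary-least-squares fits on simulated data. This is only an illustration: the data-generating process, variable names, and effect size below are made up, not taken from the study.

```python
import numpy as np

rng = np.random.default_rng(0)
n = 5000
l1 = rng.normal(size=n)                          # hypothetical continuous covariate
l2 = rng.binomial(1, 0.4, size=n)                # hypothetical binary covariate
L = np.column_stack([np.ones(n), l1, l2])
r = rng.binomial(1, 0.5, size=n)                 # randomized arm indicator
a = r * rng.binomial(1, 0.4 + 0.4*l2, size=n)    # treatment received only if assigned
y = 1.0 + 0.5*l1 - 0.3*l2 - 0.4*a + rng.normal(scale=0.5, size=n)

# Stage 1a: fit E[Y | L] by least squares in the control arm (R=0)
alpha = np.linalg.lstsq(L[r == 0], y[r == 0], rcond=None)[0]
yhat = L @ alpha
# Stage 1b: fit E[A | L] by least squares in the assigned arm (R=1)
gamma = np.linalg.lstsq(L[r == 1], a[r == 1], rcond=None)[0]
ahat = L @ gamma

# Stage 2: regress Y - Yhat on Ahat among R=1; the slope is the effect estimate
X = np.column_stack([np.ones(n), ahat])
beta = np.linalg.lstsq(X[r == 1], (y - yhat)[r == 1], rcond=None)[0]
print(beta)                                      # beta[1] should be near the true -0.4
```

The files below do the same estimation via stacked estimating equations, which additionally yields sandwich variance estimates that account for the nuisance-model fitting.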
-----------------------------------

## `estfunc.py`
```python
#####################################################################################################################
# Bespoke Instrumental Variable via two-stage regression as an M-estimator
# Estimating function for Python simulations and example
#
# Paul Zivich (2023/11/22)
#####################################################################################################################

import numpy as np
from delicatessen.estimating_equations import ee_regression


def ee_2sr_bsiv(theta, y, r, a, L):
    """Stacked estimating functions for the two-stage regression BSIV.

    theta: parameters (beta, alpha, gamma); y: outcome; r: randomized arm
    indicator; a: treatment received; L: design matrix of covariates.
    """
    # Parameters
    idxL = L.shape[1] + 2
    beta = theta[0:2]                                 # Parameters of interest
    alpha = theta[2:idxL]                             # Nuisance parameters for E[Y | L]
    gamma = theta[idxL:]                              # Nuisance parameters for E[A | L]

    # Control arm model (E[Y | L, R=0])
    ee_control = ee_regression(alpha, L, y,           # Regression for Y given L
                               model='linear')        # ... via linear model
    ee_control = ee_control * (1 - r)                 # Restricting to control arm for fit
    yhat = np.dot(L, alpha)                           # Predicted values for Y

    # Received treatment model (E[A | L, R=1])
    ee_receipt = ee_regression(gamma, L, a,           # Regression for A given L
                               model='linear')        # ... via linear model
    ee_receipt = ee_receipt * r                       # Restricting to non-control arm for fit
    ahat = np.dot(L, gamma)                           # Predicted values for A

    # Controlled direct effect
    X = np.asarray([np.ones(ahat.shape), ahat]).T     # Stacking a new design matrix together
    ee_cde = ee_regression(beta, X=X, y=y,            # Regression for Y given A-hat
                           model='linear',            # ... via linear model
                           offset=yhat)               # ... with Y-hat offset term
    ee_cde = ee_cde * r                               # Restricting to non-control arm for fit

    # Returning stacked estimating functions
    return np.vstack([ee_cde, ee_control, ee_receipt])
```
-----------------------------------

## `example.R`
```r
###############################################################################
# Bespoke Instrumental Variable via two-stage regression as an M-estimator
#
# Paul Zivich (2023/11/22)
###############################################################################

library(geex)

# Reading in data
setwd("C:/Users/zivic/Documents/Research/#PZivich/LetterRichardson/")
d <- read.csv("processed.csv")


# The root-finding algorithm has trouble with generic starting values,
# so we instead 'pre-wash' the nuisance model fits and use those
# as starting values.

ef_nui <- function(data){
    r <- data$R
    a <- data$A
    y <- data$Y
    l1 <- data$L1
    l2 <- data$L2
    function(theta){
        yhat <- theta[1] + theta[2]*l1 + theta[3]*l2
        ahat <- theta[4] + theta[5]*l1 + theta[6]*l2

        c((1-r)*(y - yhat),
          (1-r)*(y - yhat)*l1,
          (1-r)*(y - yhat)*l2,
          r*(a - ahat),
          r*(a - ahat)*l1,
          r*(a - ahat)*l2)
    }
}

inits <- c(0, 0, 0, 0, 0, 0)
mestr <- m_estimate(estFUN=ef_nui, data=d,
                    root_control=setup_root_control(start=inits))
inits_wash <- roots(mestr)


# Solving the overall estimating equations with geex

ef_overall <- function(data){
    r <- data$R
    a <- data$A
    y <- data$Y
    l1 <- data$L1
    l2 <- data$L2
    function(theta){
        yhat <- theta[3] + theta[4]*l1 + theta[5]*l2
        ahat <- theta[6] + theta[7]*l1 + theta[8]*l2
        cde <- theta[1] + theta[2]*ahat
        ytilde <- y - yhat

        c(r*(ytilde - cde),
          r*(ytilde - cde)*ahat,
          (1-r)*(y - yhat),
          (1-r)*(y - yhat)*l1,
          (1-r)*(y - yhat)*l2,
          r*(a - ahat),
          r*(a - ahat)*l1,
          r*(a - ahat)*l2)
    }
}

inits <- c(0, 0, inits_wash)
mestr <- geex::m_estimate(estFUN=ef_overall, data=d,
                          root_control=setup_root_control(start=inits))
theta <- roots(mestr)              # Point estimates
se <- sqrt(diag(vcov(mestr)))      # Standard error estimates

# Processing results to display
beta0 <- theta[1]
beta0_se <- se[1]
beta0_ci <- c(beta0 - 1.96*beta0_se, beta0 + 1.96*beta0_se)
beta1 <- theta[2]
beta1_se <- se[2]
beta1_ci <- c(beta1 - 1.96*beta1_se, beta1 + 1.96*beta1_se)

print(c(beta0, beta0_ci))
print(c(beta1, beta1_ci))
```
-----------------------------------

## `example.py`
```python
#####################################################################################################################
# Bespoke Instrumental Variable via two-stage regression as an M-estimator
#
# Paul Zivich (2023/11/22)
#####################################################################################################################

import numpy as np
import pandas as pd
from delicatessen import MEstimator

from estfunc import ee_2sr_bsiv

# Reading in data
d = pd.read_csv("processed.csv")
d['intercept'] = 1

# Processing data into arrays for delicatessen
r = np.asarray(d['R'])
a = np.asarray(d['A'])
L = np.asarray(d[['intercept', 'L1', 'L2']])
y = np.asarray(d['Y'])


# Running the M-estimator procedure

def psi(theta):
    return ee_2sr_bsiv(theta=theta, y=y, a=a, r=r, L=L)


estr = MEstimator(psi, init=[0., ]*8)
estr.estimate()
estimates = np.asarray(estr.theta)
ci = estr.confidence_intervals()

# Results
print("Beta_0", np.round(estimates[0], 2), np.round(ci[0, :], 6))
print("Beta_1", np.round(estimates[1], 2), np.round(ci[1, :], 6))


# Repeating but with a bootstrap
bs_iters = 5000
np.random.seed(20231122)
beta0_rep, beta1_rep = [], []

for i in range(bs_iters):
    ds = d.sample(frac=1, replace=True)
    # Note: psi reads the module-level arrays, so reassigning them here
    # points the M-estimator at the resampled data
    r = np.asarray(ds['R'])
    a = np.asarray(ds['A'])
    L = np.asarray(ds[['intercept', 'L1', 'L2']])
    y = np.asarray(ds['Y'])

    # Applying M-estimator to resampled data
    estr = MEstimator(psi, init=[0., ]*8)
    estr.estimate()
    beta0_rep.append(estr.theta[0])
    beta1_rep.append(estr.theta[1])

beta0_se = np.std(beta0_rep, ddof=0)
beta1_se = np.std(beta1_rep, ddof=0)
beta0_ci = [estimates[0] - 1.96*beta0_se,
            estimates[0] + 1.96*beta0_se]
beta1_ci = [estimates[1] - 1.96*beta1_se,
            estimates[1] + 1.96*beta1_se]

print("Beta_0", np.round(estimates[0], 2), np.round(beta0_ci, 6))
print("Beta_1", np.round(estimates[1], 2), np.round(beta1_ci, 6))

# OUTPUT
# -------------------------------------------------
# Sandwich
# Beta_0 -0.33 [-0.428549 -0.233846]
# Beta_1 0.18 [0.004587 0.358593]
# Bootstrap
# Beta_0 -0.33 [-0.431346 -0.23105 ]
# Beta_1 0.18 [-0.00159  0.364769]
```
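The bootstrap interval in `example.py` pairs the point estimate with a normal approximation using the bootstrap standard error; a nonparametric percentile interval is a common alternative. A sketch with simulated replicates standing in for `beta1_rep` (the location and spread below are made up for illustration):

```python
import numpy as np

rng = np.random.default_rng(20231122)
# Hypothetical bootstrap replicates of a parameter estimate
reps = rng.normal(loc=0.18, scale=0.09, size=5000)

# Percentile interval: the 2.5th and 97.5th percentiles of the replicates
lo, hi = np.percentile(reps, [2.5, 97.5])
print(lo, hi)
```

For approximately normal replicate distributions the two intervals nearly coincide; they diverge when the replicates are skewed.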
-----------------------------------

## `example.sas`
```sas
/**********************************************************************************************************
Bespoke Instrumental Variable via two-stage regression as an M-estimator
Paul Zivich (2023/11/22)
**********************************************************************************************************/

* Reading in formatted data;
PROC IMPORT OUT=dat
	DATAFILE="C:\Users\zivic\Documents\Research\#PZivich\LetterRichardson\processed.csv"
	DBMS=CSV REPLACE;
	GETNAMES=YES;
	DATAROW=2;
RUN;


* M-estimator via PROC IML;
PROC IML;                            /*All steps are completed in PROC IML*/
	/***********************************************
	Read data into IML */
	use dat;                         /*Open input data from above*/
		read all var {r} into r;     /*Read in each column needed as its own vector*/
		read all var {a} into a;
		read all var {y} into y;
		read all var {l1} into l1;
		read all var {l2} into l2;
	close dat;
	n = nrow(r);                     /*Save number of observations in data*/

	/***********************************************
	Defining estimating equation */
	q = 8;                           /*Save number of parameters to be estimated*/

	START efunc(beta) global(r, a, y, l1, l2);      /*Start to define estimating function*/
		/*Processing predicted values from models*/
		yhat = beta[3] + beta[4]*l1 + beta[5]*l2;   /*hatY = E[Y | L, R=0]*/
		ahat = beta[6] + beta[7]*l1 + beta[8]*l2;   /*hatA = Pr(A=1 | L, R=1)*/
		cde = beta[1] + beta[2]*ahat;               /*Parameters of interest*/
		ytilde = y - yhat;                          /*tildeY = Y - hatY*/

		/*Estimating functions*/
		ef_1 = r#(ytilde - cde);                    /*EF for \beta_0*/
		ef_2 = r#(ytilde - cde)#ahat;               /*EF for \beta_1*/
		ef_3 = (1-r)#(y - yhat);                    /*EF for intercept of hatY*/
		ef_4 = (1-r)#(y - yhat)#l1;                 /*EF for L1 term of hatY*/
		ef_5 = (1-r)#(y - yhat)#l2;                 /*EF for L2 term of hatY*/
		ef_6 = r#(a - ahat);                        /*EF for intercept of hatA*/
		ef_7 = r#(a - ahat)#l1;                     /*EF for L1 term of hatA*/
		ef_8 = r#(a - ahat)#l2;                     /*EF for L2 term of hatA*/

		/*Stacking estimating functions together*/
		ef_mat = ef_1||ef_2||ef_3||ef_4||ef_5||ef_6||ef_7||ef_8;
		RETURN(ef_mat);                             /*Return n by q matrix of estimating functions*/
	FINISH efunc;                                   /*End definition of estimating function*/

	START eequat(beta);              /*Start to define estimating equation (single argument)*/
		ef = efunc(beta);            /*Compute estimating functions*/
		RETURN(ef[+,]);              /*Return column sums (1 by q vector)*/
	FINISH eequat;                   /*End definition of estimating equation*/

	/***********************************************
	Root-finding */
	initial = {0.,0.,0.,0.,0.,0.,0.,0.};   * Initial parameter values;
	optn = q || 1;                   * Options for nlplm (q - number of roots, 1 - print summary output);
	tc = j(1, 12, .);                * Termination criteria options, all set to default with .;
	tc[6] = 1e-9;                    * Replace 6th option to tighten the default tolerance;
	CALL nlplm(rc,                   /*Use the Levenberg-Marquardt root-finding method*/
	           beta_hat,             /*... name of output parameters that give the root*/
	           "eequat",             /*... function to find the roots of*/
	           initial,              /*... starting values for root-finding*/
	           optn, ,               /*... optional arguments for root-finding procedure*/
	           tc);                  /*... update convergence tolerance*/

	/***********************************************
	Baking the bread (approximate derivative) */
	par = q||.||.;                   * Options for nlpfdd (q parameters, . = default);
	CALL nlpfdd(func,                /*Derivative approximation function*/
	            deriv,               /*... name of output matrix that gives the derivative*/
	            na,                  /*... name of output matrix for the 2nd derivative - not needed here*/
	            "eequat",            /*... function to approximate the derivative of*/
	            beta_hat,            /*... point where to find derivative*/
	            par);                /*... details for derivative approximation*/
	bread = - (deriv) / n;           * Negative derivative, averaged;

	/***********************************************
	Cooking the filling (matrix algebra) */
	residuals = efunc(beta_hat);     * Estimating functions at beta_hat (n by q matrix);
	outerprod = residuals` * residuals;   * Outer product of the estimating functions;
	filling = outerprod / n;         * Divide by n for the filling;

	/***********************************************
	Assembling the sandwich (matrix algebra) */
	sandwich = ( inv(bread) * filling * inv(bread)` ) / n;

	/***********************************************
	Formatting results for output */
	vars = {"beta_0","beta_1","Y|R=0","Y|L1,R=0","Y|L2,R=0","A|R=1","A|L1,R=1","A|L2,R=1"};
	est = beta_hat`;                 /*Point estimates*/
	se = sqrt(vecdiag(sandwich));    /*Extract corresponding SE for each parameter*/
	lcl = est - 1.96*se;             /*Calculate lower confidence limit*/
	ucl = est + 1.96*se;             /*Calculate upper confidence limit*/
	PRINT vars est se lcl ucl;       /*Print results to the Results Viewer*/

	CREATE ests_mest VAR {vars est se lcl ucl};   /*Create an output data set called ests_mest*/
		APPEND;                      /*... that includes the estimates, SEs, and confidence limits*/
	CLOSE ests_mest;                 /*Close the output*/
QUIT;
RUN;

/*END*/
```
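The bread/filling/sandwich steps in the IML code can be mirrored in a few lines of NumPy. As a sketch, here is the same construction for the simplest estimating function, psi(theta) = Y - theta (whose root is the sample mean); the data are simulated for illustration:

```python
import numpy as np

rng = np.random.default_rng(1)
n = 4000
y = rng.normal(loc=2.0, scale=3.0, size=n)

theta_hat = y.mean()                       # root of sum(y - theta) = 0

# Bread: negative average derivative of psi = (y - theta) w.r.t. theta, which is 1
bread = np.array([[1.0]])
# Filling: average outer product of the estimating functions at theta_hat
psi = (y - theta_hat).reshape(-1, 1)
filling = (psi.T @ psi) / n
# Sandwich: B^{-1} F B^{-T} / n
sandwich = np.linalg.inv(bread) @ filling @ np.linalg.inv(bread).T / n
se = float(np.sqrt(sandwich[0, 0]))
print(theta_hat, se)                       # se equals y.std() / sqrt(n) here
```

For the mean, the sandwich collapses to the familiar standard error; the value of the machinery is that the same recipe handles the stacked eight-parameter system above, propagating the nuisance-model uncertainty into the confidence intervals for beta_0 and beta_1.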
-----------------------------------

## `process_data.py`
```python
#####################################################################################################################
# Bespoke Instrumental Variable via two-stage regression as an M-estimator
# Process publicly available data from the Teaching of Statistics in the Health Sciences Resources Portal
# (https://www.causeweb.org/tshs/category/dataset/) to replicate the analysis.
#
# Paul Zivich (2023/11/22)
#####################################################################################################################

import numpy as np
import pandas as pd

# Reading in original data
d = pd.read_csv("OPT_Study_Person-level_Data.csv")

# Restricting to only the columns needed
d = d[['Group', 'Tx comp?', 'OFIBRIN1', 'ETXU_CAT1', 'V5 %BOP']].copy()

# Recoding some variables from the Excel formatting
d['assign'] = np.where(d['Group'] == 'T', 1, 0)
d['complete'] = np.where(d['Tx comp?'] == 'Yes', 1, 0)
d['OFIBRIN1'] = pd.to_numeric(d['OFIBRIN1'], errors='coerce')
d['ETXU_CAT1'] = pd.to_numeric(d['ETXU_CAT1'], errors='coerce')
d['V5 %BOP'] = d['V5 %BOP'] / 100

# Dropping observations with missing data
d = d.dropna(subset=['V5 %BOP', 'OFIBRIN1', 'ETXU_CAT1']).copy()

# Renaming variables for consistency with the paper
d['R'] = d['assign']
d['A'] = d['complete']
d['Y'] = d['V5 %BOP']
d['L1'] = d['OFIBRIN1']
d['L2'] = d['ETXU_CAT1']

# Outputting as a CSV for use across programs
d[['R', 'A', 'Y', 'L1', 'L2']].to_csv("processed.csv")
d.info()
```
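A note on the `errors='coerce'` calls above: entries that cannot be parsed as numbers (for example, blanks or placeholder strings in the spreadsheet export) become `NaN` and are then removed by the `dropna` step. A small illustration with made-up values:

```python
import pandas as pd

s = pd.Series(["1.5", "2", ".", ""])     # "." and "" are not parseable numbers
out = pd.to_numeric(s, errors="coerce")  # unparseable entries become NaN
print(out.tolist())                      # [1.5, 2.0, nan, nan]
```

With the default `errors="raise"`, the same input would raise an exception instead, so `coerce` is what lets the missing-data handling happen downstream.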