##### Week5 Computing Corner: twang package, IPTW with boosted regression for propensity R version 3.2.2 (2015-08-14) -- "Fire Safety" > install.packages("twang") > library(twang) > set.seed(1) # for tutorial match > data(lalonde) # like week1 ComCo > # Details IPTW for later > # Weights for ATT are 1 for the treatment cases and p/(1-p) for the control cases. > # Weights for ATE are 1/p for the treatment cases and 1/(1-p) for the control cases ### propensity score from boosted regression (calls gbm ) ## tuning params etc from tutorial, ATT is goal here > ps.lalonde <- ps(treat ~ age + educ + black + hispan + nodegree + + married + re74 + re75, data = lalonde, + n.trees=5000, interaction.depth=2, shrinkage=0.01, perm.test.iters=0, + stop.method=c("es.mean","ks.max"), estimand = "ATT", verbose=FALSE) ########### note from tutorial default stopping rules in twang use two balance metrics: absolute standardized bias (also referred to as the absolute standardized mean di erence or the E ect Size) and the Kolmogorov-Smirnov (KS) statistic. ################ # plots page 10 tutorial > plot(ps.lalonde, plots = 2) # propen boxplots (es and ks from stop.method) poor overlap > plot(ps.lalonde, plots = 3) # standardized imbalance plot like from MatchIt > lalonde.balance = bal.table(ps.lalonde) # like MatchIt tables > lalonde.balance $unw tx.mn tx.sd ct.mn ct.sd std.eff.sz stat p ks ks.pval age 25.816 7.155 28.030 10.787 -0.309 -2.994 0.003 0.158 0.003 educ 10.346 2.011 10.235 2.855 0.055 0.547 0.584 0.111 0.074 black 0.843 0.365 0.203 0.403 1.757 19.371 0.000 0.640 0.000 hispan 0.059 0.237 0.142 0.350 -0.349 -3.413 0.001 0.083 0.317 nodegree 0.708 0.456 0.597 0.491 0.244 2.716 0.007 0.111 0.074 married 0.189 0.393 0.513 0.500 -0.824 -8.607 0.000 0.324 0.000 re74 2095.574 4886.620 5619.237 6788.751 -0.721 -7.254 0.000 0.447 0.000 re75 1532.055 3219.251 2466.484 3291.996 -0.290 -3.282 0.001 0.288 0.000 $es.mean.ATT tx.mn tx.sd ct.mn ct.sd std.eff.sz stat p ks ks.pval age 25.816 7.155 25.802 7.279 0.002 0.015 0.988 0.122 0.892 educ 10.346 2.011 10.573 2.089 -0.113 -0.706 0.480 0.099 0.977 black 0.843 0.365 0.842 0.365 0.003 0.027 0.978 0.001 1.000 hispan 0.059 0.237 0.042 0.202 0.072 0.804 0.421 0.017 1.000 nodegree 0.708 0.456 0.609 0.489 0.218 0.967 0.334 0.099 0.977 married 0.189 0.393 0.189 0.392 0.002 0.012 0.990 0.001 1.000 re74 2095.574 4886.620 1556.930 3801.566 0.110 1.027 0.305 0.066 1.000 re75 1532.055 3219.251 1211.575 2647.615 0.100 0.833 0.405 0.103 0.969 $ks.max.ATT tx.mn tx.sd ct.mn ct.sd std.eff.sz stat p ks ks.pval age 25.816 7.155 25.764 7.408 0.007 0.055 0.956 0.107 0.919 educ 10.346 2.011 10.572 2.140 -0.113 -0.712 0.477 0.107 0.919 black 0.843 0.365 0.835 0.371 0.022 0.187 0.852 0.008 1.000 hispan 0.059 0.237 0.043 0.203 0.069 0.779 0.436 0.016 1.000 nodegree 0.708 0.456 0.601 0.490 0.235 1.100 0.272 0.107 0.919 married 0.189 0.393 0.199 0.400 -0.024 -0.169 0.866 0.010 1.000 re74 2095.574 4886.620 1673.666 3944.600 0.086 0.800 0.424 0.054 1.000 re75 1532.055 3219.251 1257.242 2674.922 0.085 0.722 0.471 0.094 0.971 > summary(ps.lalonde) # note ess (effective sample size) !! n.treat n.ctrl ess.treat ess.ctrl max.es mean.es max.ks max.ks.p mean.ks iter unw 185 429 185 429.00000 1.7567745 0.56872589 0.6404460 NA 0.27024507 NA es.mean.ATT 185 429 185 22.96430 0.2177817 0.07746175 0.1223384 NA 0.06361021 2127 ks.max.ATT 185 429 185 27.05472 0.2348846 0.08025994 0.1070761 NA 0.06282432 1756 # boosted regression look # bar chart for free > summary(ps.lalonde$gbm.obj) var rel.inf black black 52.5951837 re74 re74 17.4828827 age age 16.8346839 re75 re75 6.3199135 educ educ 3.4137190 married married 2.8185068 nodegree nodegree 0.4291914 hispan hispan 0.1059190 > attach(lalonde) > cor(treat,lalonde) # black has highest cor with treatment treat age educ black hispan married nodegree re74 re75 re78 [1,] 1 -0.1028929 0.01930817 0.6009066 -0.1179833 -0.3013337 0.1058572 -0.249779 -0.1301972 -0.03903271 > detach(lalonde) > propen1 = ps.lalonde$ps # extract propensity scores--note this is a data frame: both es and ks criteria > str(propen1) 'data.frame': 614 obs. of 2 variables: $ es.mean.ATT: num 0.595 0.738 0.927 0.959 0.953 ... $ ks.max.ATT : num 0.615 0.692 0.924 0.953 0.948 ... > fivenum(propen1$es.mean.ATT) [1] 0.0006532284 0.0329630726 0.1080491150 0.6055660930 0.9768987928 > boxplot(propen1$es.mean.ATT) # replicates twang plot # add treatment and outcome to my little data frame > propen1$treat = lalonde$treat > propen1$re78 = lalonde$re78 > head(propen1) es.mean.ATT ks.max.ATT treat re78 1 0.5945568 0.6151896 1 9930.0460 2 0.7382721 0.6917407 1 3595.8940 3 0.9272562 0.9235889 1 24909.4500 4 0.9587267 0.9529411 1 7506.1460 5 0.9534908 0.9484507 1 289.7899 6 0.9591846 0.9529411 1 4056.4940 > boxplot(propen1$es.mean.ATT ~ propen1$treat) > # do by hand the ATT estimation (see cc_3 session) > propen1$weight.ATT = ifelse(propen1$treat ==1, 1, propen1$es.mean.ATT/(1 - propen1$es.mean.ATT)) > lm.ATT = lm(propen1$re78 ~ propen1$treat, data = propen1, weights = (propen1$weight.ATT)) > summary(lm.ATT) Call: lm(formula = propen1$re78 ~ propen1$treat, data = propen1, weights = (propen1$weight.ATT)) Weighted Residuals: Min 1Q Median 3Q Max -20052 -1947 -284 1478 53959 Coefficients: Estimate Std. Error t value Pr(>|t|) (Intercept) 5616.6 430.4 13.051 <2e-16 *** propen1$treat 732.5 574.4 1.275 0.203 --- Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 Residual standard error: 5175 on 612 degrees of freedom Multiple R-squared: 0.00265, Adjusted R-squared: 0.001021 F-statistic: 1.626 on 1 and 612 DF, p-value: 0.2027 > # point estimate matches tutorial which uses weighted regression from survey package > # is the standard IPTW method too optimistic? survey gives se of 1057! # smoking paper, itn_4 used bootstrap of ATE regression to get s.e. ## week 4 cc did a bca bootstrap # try a surveyreg in RQ > confint(lm.ATT) 2.5 % 97.5 % (Intercept) 4771.4763 6461.778 propen1$treat -395.5411 1860.574 > plot(lm.ATT) # standard regression diagnostics > # tutorial sec2.5 accomodates logistic propen; dx.wts, bal.table give nice covariate balance statistics > # tutorial sec 2.4 repeats week1 ComCo-- not done till ancova is run > # tutorial section 3 does Lindner, with ATE on lifepres ######################################################################### ## Dabblings with rpart and gbm (see practical guide for good use of gbm) ##### Alternative Propensity score estimation > library(rpart) > rfit = rpart(abcix ~ stent + height + female + diabetic + acutemi + ejecfrac + ves1proc, data = m2full.dat) > detach(m2full.dat) > m2full.dat$rpropen = predict(rfit) > attach(m2full.dat) > boxplot(rpropen ~ abcix) # not rich variety of scores, need to tune the tree better > tapply(rpropen, abcix, fivenum) $`0` [1] 0.1250000 0.6221837 0.6221837 0.6221837 0.8681319 $`1` [1] 0.1250000 0.6221837 0.6221837 0.8269231 0.8681319 # these will show you my lack of success > rpropen > hist(rpropen) # do the propensity ancova anyway, get about the same results > pancR = lm(log(cardbill) ~ abcix + rpropen, data =m2full.dat) > summary(pancR) Call: lm(formula = log(cardbill) ~ abcix + rpropen, data = m2full.dat) Residuals: Min 1Q Median 3Q Max -1.6905 -0.3004 -0.1215 0.1974 2.6986 Coefficients: Estimate Std. Error t value Pr(>|t|) (Intercept) 9.28241 0.08106 114.519 < 2e-16 *** abcix 0.16936 0.03319 5.102 4.02e-07 *** rpropen 0.17923 0.11851 1.512 0.131 --- Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 Residual standard error: 0.4605 on 993 degrees of freedom Multiple R-squared: 0.03444, Adjusted R-squared: 0.0325 F-statistic: 17.71 on 2 and 993 DF, p-value: 2.769e-08 > # about same result as glm propensity ancova, even though few discrete values from rpart > ### now try boosted regression > install.packages("gbm") package ‘gbm’ successfully unpacked and MD5 sums checked > library(gbm) # first try, no specifications > gbs = gbm(abcix ~ stent + height + female + diabetic + acutemi + ejecfrac + ves1proc, distribution = "bernoulli", data = m2full.dat) > summary(gbs) var rel.inf ves1proc ves1proc 69.395903 acutemi acutemi 20.766530 stent stent 6.805945 ejecfrac ejecfrac 3.031623 height height 0.000000 female female 0.000000 diabetic diabetic 0.000000 # now try with 'standard' specifications > gbs_2 = gbm(abcix ~ stent + height + female + diabetic + acutemi + ejecfrac + ves1proc, distribution = "bernoulli", data = m2full.dat, n.trees = 100, interaction.depth = 4, train.fraction = .8, shrinkage = .0005) > summary(gbs_2) var rel.inf stent stent 45.170811 ves1proc ves1proc 27.683077 acutemi acutemi 10.746116 diabetic diabetic 8.733826 ejecfrac ejecfrac 5.018815 female female 1.412828 height height 1.234526 # get quite a different participation # try to get some classification probabilities out of this > ?predict.gbm > boostP = predict(gbs, type = "response", n.trees = 100) > fivenum(boostP) # not much range [1] 0.6938479 0.6964024 0.6965995 0.7078838 0.7143611 > boostP = predict(gbs_2, m2full.dat, type = "response", n.trees = 100) > fivenum(boostP) [1] 0.8528079 0.8807155 0.8812766 0.8815080 0.8816121 > m2full.dat$boost_propen = boostP > attach(m2full.dat) > boxplot(boost_propen ~ abcix)