##### Week5 Computing Corner: twang package, IPTW with boosted regression for propensity
R version 3.2.2 (2015-08-14) -- "Fire Safety"

> install.packages("twang")
> library(twang)

> set.seed(1) # for tutorial match
> data(lalonde) # like week1 ComCo

> # Details IPTW for later
> # Weights for ATT are 1 for the treatment cases and p/(1-p) for the control cases.
> # Weights for ATE are 1/p for the treatment cases and 1/(1-p) for the control cases

### propensity score from boosted regression (calls gbm )
 ## tuning params etc from tutorial, ATT is goal here
> ps.lalonde <- ps(treat ~ age + educ + black + hispan + nodegree + + married + re74 + re75, data = lalonde, 
+     n.trees=5000, interaction.depth=2, shrinkage=0.01, perm.test.iters=0,
+  stop.method=c("es.mean","ks.max"), estimand = "ATT", verbose=FALSE)

########### note from tutorial
default stopping rules in twang use two balance metrics: absolute standardized
bias (also referred to as the absolute standardized mean dierence or the Eect Size) and the
Kolmogorov-Smirnov (KS) statistic.
################

# plots page 10 tutorial
> plot(ps.lalonde, plots = 2) # propen boxplots (es and ks from stop.method) poor overlap
> plot(ps.lalonde, plots = 3) # standardized imbalance plot like from MatchIt
> lalonde.balance = bal.table(ps.lalonde) # like MatchIt tables
> lalonde.balance
$unw
            tx.mn    tx.sd    ct.mn    ct.sd std.eff.sz   stat     p    ks ks.pval
age        25.816    7.155   28.030   10.787     -0.309 -2.994 0.003 0.158   0.003
educ       10.346    2.011   10.235    2.855      0.055  0.547 0.584 0.111   0.074
black       0.843    0.365    0.203    0.403      1.757 19.371 0.000 0.640   0.000
hispan      0.059    0.237    0.142    0.350     -0.349 -3.413 0.001 0.083   0.317
nodegree    0.708    0.456    0.597    0.491      0.244  2.716 0.007 0.111   0.074
married     0.189    0.393    0.513    0.500     -0.824 -8.607 0.000 0.324   0.000
re74     2095.574 4886.620 5619.237 6788.751     -0.721 -7.254 0.000 0.447   0.000
re75     1532.055 3219.251 2466.484 3291.996     -0.290 -3.282 0.001 0.288   0.000

$es.mean.ATT
            tx.mn    tx.sd    ct.mn    ct.sd std.eff.sz   stat     p    ks ks.pval
age        25.816    7.155   25.802    7.279      0.002  0.015 0.988 0.122   0.892
educ       10.346    2.011   10.573    2.089     -0.113 -0.706 0.480 0.099   0.977
black       0.843    0.365    0.842    0.365      0.003  0.027 0.978 0.001   1.000
hispan      0.059    0.237    0.042    0.202      0.072  0.804 0.421 0.017   1.000
nodegree    0.708    0.456    0.609    0.489      0.218  0.967 0.334 0.099   0.977
married     0.189    0.393    0.189    0.392      0.002  0.012 0.990 0.001   1.000
re74     2095.574 4886.620 1556.930 3801.566      0.110  1.027 0.305 0.066   1.000
re75     1532.055 3219.251 1211.575 2647.615      0.100  0.833 0.405 0.103   0.969

$ks.max.ATT
            tx.mn    tx.sd    ct.mn    ct.sd std.eff.sz   stat     p    ks ks.pval
age        25.816    7.155   25.764    7.408      0.007  0.055 0.956 0.107   0.919
educ       10.346    2.011   10.572    2.140     -0.113 -0.712 0.477 0.107   0.919
black       0.843    0.365    0.835    0.371      0.022  0.187 0.852 0.008   1.000
hispan      0.059    0.237    0.043    0.203      0.069  0.779 0.436 0.016   1.000
nodegree    0.708    0.456    0.601    0.490      0.235  1.100 0.272 0.107   0.919
married     0.189    0.393    0.199    0.400     -0.024 -0.169 0.866 0.010   1.000
re74     2095.574 4886.620 1673.666 3944.600      0.086  0.800 0.424 0.054   1.000
re75     1532.055 3219.251 1257.242 2674.922      0.085  0.722 0.471 0.094   0.971

> summary(ps.lalonde)  # note ess (effective sample size) !!
            n.treat n.ctrl ess.treat  ess.ctrl    max.es    mean.es    max.ks max.ks.p    mean.ks iter
unw             185    429       185 429.00000 1.7567745 0.56872589 0.6404460       NA 0.27024507   NA
es.mean.ATT     185    429       185  22.96430 0.2177817 0.07746175 0.1223384       NA 0.06361021 2127
ks.max.ATT      185    429       185  27.05472 0.2348846 0.08025994 0.1070761       NA 0.06282432 1756


# boosted regression look # bar chart for free
> summary(ps.lalonde$gbm.obj)
              var    rel.inf
black       black 52.5951837
re74         re74 17.4828827
age           age 16.8346839
re75         re75  6.3199135
educ         educ  3.4137190
married   married  2.8185068
nodegree nodegree  0.4291914
hispan     hispan  0.1059190

> attach(lalonde)
> cor(treat,lalonde) # black has highest cor with treatment
     treat        age       educ     black     hispan    married  nodegree      re74       re75        re78
[1,]     1 -0.1028929 0.01930817 0.6009066 -0.1179833 -0.3013337 0.1058572 -0.249779 -0.1301972 -0.03903271
> detach(lalonde)

> propen1 = ps.lalonde$ps # extract propensity scores--note this is a data frame: both es and ks criteria
> str(propen1)
'data.frame':   614 obs. of  2 variables:
 $ es.mean.ATT: num  0.595 0.738 0.927 0.959 0.953 ...
 $ ks.max.ATT : num  0.615 0.692 0.924 0.953 0.948 ...
> fivenum(propen1$es.mean.ATT)
[1] 0.0006532284 0.0329630726 0.1080491150 0.6055660930 0.9768987928
> boxplot(propen1$es.mean.ATT) # replicates twang plot
# add treatment and outcome to my little data frame
> propen1$treat = lalonde$treat
> propen1$re78 = lalonde$re78
> head(propen1)
  es.mean.ATT ks.max.ATT treat       re78
1   0.5945568  0.6151896     1  9930.0460
2   0.7382721  0.6917407     1  3595.8940
3   0.9272562  0.9235889     1 24909.4500
4   0.9587267  0.9529411     1  7506.1460
5   0.9534908  0.9484507     1   289.7899
6   0.9591846  0.9529411     1  4056.4940

> boxplot(propen1$es.mean.ATT ~ propen1$treat)

> # do by hand the ATT estimation (see cc_3 session)
> propen1$weight.ATT = 
      ifelse(propen1$treat ==1, 1, propen1$es.mean.ATT/(1 - propen1$es.mean.ATT))

> lm.ATT = lm(propen1$re78 ~ propen1$treat, data = propen1, weights = (propen1$weight.ATT))
> summary(lm.ATT)
Call: lm(formula = propen1$re78 ~ propen1$treat, data = propen1, weights = (propen1$weight.ATT))

Weighted Residuals:
   Min     1Q Median     3Q    Max 
-20052  -1947   -284   1478  53959 

Coefficients:
              Estimate Std. Error t value Pr(>|t|)    
(Intercept)     5616.6      430.4  13.051   <2e-16 ***
propen1$treat    732.5      574.4   1.275    0.203    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 5175 on 612 degrees of freedom
Multiple R-squared:  0.00265,   Adjusted R-squared:  0.001021 
F-statistic: 1.626 on 1 and 612 DF,  p-value: 0.2027

> # point estimate matches tutorial which uses weighted regression from survey package
> # is the standard IPTW method too optimistic? survey gives se of 1057!
# smoking paper, itn_4 used bootstrap of ATE regression to get s.e.
## week 4 cc did a bca bootstrap
# try a surveyreg in RQ

> confint(lm.ATT)
                  2.5 %   97.5 %
(Intercept)   4771.4763 6461.778
propen1$treat -395.5411 1860.574
> plot(lm.ATT) # standard regression diagnostics

> # tutorial sec2.5 accomodates logistic propen; dx.wts, bal.table give nice covariate balance statistics
> # tutorial sec 2.4 repeats week1 ComCo-- not done till ancova is run
> # tutorial section 3 does Lindner, with ATE on lifepres

#########################################################################

## Dabblings with rpart and gbm (see practical guide for good use of gbm)

##### Alternative Propensity score estimation

> library(rpart)
> rfit = rpart(abcix ~ stent + height + female + diabetic + acutemi + ejecfrac + ves1proc, data = m2full.dat)
> detach(m2full.dat)
> m2full.dat$rpropen = predict(rfit)
> attach(m2full.dat)
> boxplot(rpropen ~ abcix) # not rich variety of scores, need to tune the tree better

> tapply(rpropen, abcix, fivenum)
$`0` [1] 0.1250000 0.6221837 0.6221837 0.6221837 0.8681319
$`1` [1] 0.1250000 0.6221837 0.6221837 0.8269231 0.8681319

# these will show you my lack of success
> rpropen
> hist(rpropen)

# do the propensity ancova anyway, get about the same results
> pancR = lm(log(cardbill) ~ abcix + rpropen, data =m2full.dat)
> summary(pancR)

Call:
lm(formula = log(cardbill) ~ abcix + rpropen, data = m2full.dat)
Residuals:
    Min      1Q  Median      3Q     Max 
-1.6905 -0.3004 -0.1215  0.1974  2.6986 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)  9.28241    0.08106 114.519  < 2e-16 ***
abcix        0.16936    0.03319   5.102 4.02e-07 ***
rpropen      0.17923    0.11851   1.512    0.131    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 0.4605 on 993 degrees of freedom
Multiple R-squared:  0.03444,   Adjusted R-squared:  0.0325 
F-statistic: 17.71 on 2 and 993 DF,  p-value: 2.769e-08

> # about same result as glm propensity ancova, even though few discrete values from rpart

> ### now try boosted regression
> install.packages("gbm")
package ‘gbm’ successfully unpacked and MD5 sums checked
> library(gbm)

# first try, no specifications
> gbs = gbm(abcix ~ stent + height + female + diabetic + acutemi + ejecfrac + ves1proc, distribution = "bernoulli", data = m2full.dat)
> summary(gbs)
              var   rel.inf
ves1proc ves1proc 69.395903
acutemi   acutemi 20.766530
stent       stent  6.805945
ejecfrac ejecfrac  3.031623
height     height  0.000000
female     female  0.000000
diabetic diabetic  0.000000

# now try with 'standard' specifications
> gbs_2 = gbm(abcix ~ stent + height + female + diabetic + acutemi + ejecfrac + ves1proc, distribution = "bernoulli", data = m2full.dat, n.trees = 100, interaction.depth = 4, train.fraction = .8, shrinkage = .0005)
> summary(gbs_2)
              var   rel.inf
stent       stent 45.170811
ves1proc ves1proc 27.683077
acutemi   acutemi 10.746116
diabetic diabetic  8.733826
ejecfrac ejecfrac  5.018815
female     female  1.412828
height     height  1.234526

# get quite a different participation

# try to get some classification probabilities out of this
> ?predict.gbm
> boostP = predict(gbs, type = "response", n.trees = 100)
> fivenum(boostP)  # not much range
[1] 0.6938479 0.6964024 0.6965995 0.7078838 0.7143611
> boostP = predict(gbs_2, m2full.dat, type = "response", n.trees = 100)
> fivenum(boostP)
[1] 0.8528079 0.8807155 0.8812766 0.8815080 0.8816121
> m2full.dat$boost_propen = boostP
> attach(m2full.dat)
> boxplot(boost_propen ~ abcix)