############ Computing Corner Week 4
##### Alternative Propensity score estimation

> library(rpart)
> rfit = rpart(abcix ~ stent + height + female + diabetic + acutemi + ejecfrac + ves1proc, data = m2full.dat)
> detach(m2full.dat)
> m2full.dat$rpropen = predict(rfit)
> attach(m2full.dat)
> boxplot(rpropen ~ abcix) # not rich variety of scores, need to tune the tree better

> tapply(rpropen, abcix, fivenum)
$`0` [1] 0.1250000 0.6221837 0.6221837 0.6221837 0.8681319
$`1` [1] 0.1250000 0.6221837 0.6221837 0.8269231 0.8681319

# these will show you my lack of success
> rpropen
> hist(rpropen)

# do the propensity ancova anyway, get about the same results
> pancR = lm(log(cardbill) ~ abcix + rpropen, data =m2full.dat)
> summary(pancR)

Call:
lm(formula = log(cardbill) ~ abcix + rpropen, data = m2full.dat)
Residuals:
    Min      1Q  Median      3Q     Max 
-1.6905 -0.3004 -0.1215  0.1974  2.6986 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)  9.28241    0.08106 114.519  < 2e-16 ***
abcix        0.16936    0.03319   5.102 4.02e-07 ***
rpropen      0.17923    0.11851   1.512    0.131    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 0.4605 on 993 degrees of freedom
Multiple R-squared:  0.03444,   Adjusted R-squared:  0.0325 
F-statistic: 17.71 on 2 and 993 DF,  p-value: 2.769e-08

> # about same result as glm propensity ancova, even though few discrete values from rpart

> ### now try boosted regression
> install.packages("gbm")
package ‘gbm’ successfully unpacked and MD5 sums checked
> library(gbm)

# first try, no specifications
> gbs = gbm(abcix ~ stent + height + female + diabetic + acutemi + ejecfrac + ves1proc, distribution = "bernoulli", data = m2full.dat)
> summary(gbs)
              var   rel.inf
ves1proc ves1proc 69.395903
acutemi   acutemi 20.766530
stent       stent  6.805945
ejecfrac ejecfrac  3.031623
height     height  0.000000
female     female  0.000000
diabetic diabetic  0.000000

# now try with 'standard' specifications
> gbs_2 = gbm(abcix ~ stent + height + female + diabetic + acutemi + ejecfrac + ves1proc, distribution = "bernoulli", data = m2full.dat, n.trees = 100, interaction.depth = 4, train.fraction = .8, shrinkage = .0005)
> summary(gbs_2)
              var   rel.inf
stent       stent 45.170811
ves1proc ves1proc 27.683077
acutemi   acutemi 10.746116
diabetic diabetic  8.733826
ejecfrac ejecfrac  5.018815
female     female  1.412828
height     height  1.234526

# get quite a different participation

# try to get some classification probabilities out of this
> ?predict.gbm
> boostP = predict(gbs, type = "response", n.trees = 100)
> fivenum(boostP)  # not much range
[1] 0.6938479 0.6964024 0.6965995 0.7078838 0.7143611
> boostP = predict(gbs_2, m2full.dat, type = "response", n.trees = 100)
> fivenum(boostP)
[1] 0.8528079 0.8807155 0.8812766 0.8815080 0.8816121
> m2full.dat$boost_propen = boostP
> attach(m2full.dat)
> boxplot(boost_propen ~ abcix)
> # try twang in RQ