############ Computing Corner Week 4 ##### Alternative Propensity score estimation > library(rpart) > rfit = rpart(abcix ~ stent + height + female + diabetic + acutemi + ejecfrac + ves1proc, data = m2full.dat) > detach(m2full.dat) > m2full.dat$rpropen = predict(rfit) > attach(m2full.dat) > boxplot(rpropen ~ abcix) # not rich variety of scores, need to tune the tree better > tapply(rpropen, abcix, fivenum) $`0` [1] 0.1250000 0.6221837 0.6221837 0.6221837 0.8681319 $`1` [1] 0.1250000 0.6221837 0.6221837 0.8269231 0.8681319 # these will show you my lack of success > rpropen > hist(rpropen) # do the propensity ancova anyway, get about the same results > pancR = lm(log(cardbill) ~ abcix + rpropen, data =m2full.dat) > summary(pancR) Call: lm(formula = log(cardbill) ~ abcix + rpropen, data = m2full.dat) Residuals: Min 1Q Median 3Q Max -1.6905 -0.3004 -0.1215 0.1974 2.6986 Coefficients: Estimate Std. Error t value Pr(>|t|) (Intercept) 9.28241 0.08106 114.519 < 2e-16 *** abcix 0.16936 0.03319 5.102 4.02e-07 *** rpropen 0.17923 0.11851 1.512 0.131 --- Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 Residual standard error: 0.4605 on 993 degrees of freedom Multiple R-squared: 0.03444, Adjusted R-squared: 0.0325 F-statistic: 17.71 on 2 and 993 DF, p-value: 2.769e-08 > # about same result as glm propensity ancova, even though few discrete values from rpart > ### now try boosted regression > install.packages("gbm") package ‘gbm’ successfully unpacked and MD5 sums checked > library(gbm) # first try, no specifications > gbs = gbm(abcix ~ stent + height + female + diabetic + acutemi + ejecfrac + ves1proc, distribution = "bernoulli", data = m2full.dat) > summary(gbs) var rel.inf ves1proc ves1proc 69.395903 acutemi acutemi 20.766530 stent stent 6.805945 ejecfrac ejecfrac 3.031623 height height 0.000000 female female 0.000000 diabetic diabetic 0.000000 # now try with 'standard' specifications > gbs_2 = gbm(abcix ~ stent + height + female + diabetic + acutemi + ejecfrac + ves1proc, distribution = "bernoulli", data = m2full.dat, n.trees = 100, interaction.depth = 4, train.fraction = .8, shrinkage = .0005) > summary(gbs_2) var rel.inf stent stent 45.170811 ves1proc ves1proc 27.683077 acutemi acutemi 10.746116 diabetic diabetic 8.733826 ejecfrac ejecfrac 5.018815 female female 1.412828 height height 1.234526 # get quite a different participation # try to get some classification probabilities out of this > ?predict.gbm > boostP = predict(gbs, type = "response", n.trees = 100) > fivenum(boostP) # not much range [1] 0.6938479 0.6964024 0.6965995 0.7078838 0.7143611 > boostP = predict(gbs_2, m2full.dat, type = "response", n.trees = 100) > fivenum(boostP) [1] 0.8528079 0.8807155 0.8812766 0.8815080 0.8816121 > m2full.dat$boost_propen = boostP > attach(m2full.dat) > boxplot(boost_propen ~ abcix) > # try twang in RQ