##### Week 7 CC, dose-response functions

R version 3.2.2 (2015-08-14) -- "Fire Safety"
> install.packages("causaldrf")
> library(causaldrf)
### first try the smoking dataset
> data(nmes_data)
> dim(nmes_data)
[1] 9708   12
> nm = nmes_data
> dim(nm)
[1] 9708   12

> summary(nm)
   packyears         AGESMOKE        LASTAGE          MALE        RACE3   
 Min.   :  0.05   Min.   : 9.00   Min.   :19.0   Min.   :0.0000   1: 633  
 1st Qu.:  6.60   1st Qu.:16.00   1st Qu.:32.0   1st Qu.:0.0000   2:1496  
 Median : 17.25   Median :18.00   Median :45.0   Median :1.0000   3:7579  
 Mean   : 24.48   Mean   :18.39   Mean   :47.1   Mean   :0.5159           
 3rd Qu.: 34.50   3rd Qu.:20.00   3rd Qu.:62.0   3rd Qu.:1.0000           
 Max.   :216.00   Max.   :70.00   Max.   :94.0   Max.   :1.0000           
 beltuse  educate  marital  SREGION  POVSTALB    HSQACCWT    
 1:2613   1:2047   1:6188   1:2047   1:1034   Min.   :  908  
 2:2175   2:2451   2: 771   2:2451   2: 470   1st Qu.: 4975  
 3:4920   3:3386   3:1076   3:3386   3:1443   Median : 7075  
          4:1824   4: 333   4:1824   4:3273   Mean   : 8072  
                   5:1340            5:3488   3rd Qu.:10980  
                                              Max.   :35172  
    TOTALEXP       
 Min.   :     0.0  
 1st Qu.:    90.0  
 Median :   406.1  
 Mean   :  2042.0  
 3rd Qu.:  1350.3  
 Max.   :175096.0  

> plot(nm$packyears, nm$TOTALEXP)
# not much dose response evident??
> plot(nm$packyears, log(nm$TOTALEXP))
> 
###################################################################
## Artificial data example
# true dose-response
> ?curve
> curve(x + 2/(1 + x)^3, 0,4)  # plot shown in CC materials

> # I read in the supplied sim data rather than create it
> data(hi_sim_data)
> ?hi_sim_data
starting httpd help server ... done

> sim = hi_sim_data #simplify my typing
> head(sim)
         X1         X2         T        gps         Y
1 2.8762787 0.52729990 0.2223654 1.59678021 1.0393833
2 0.4875109 0.18797037 0.5856397 0.45479046 0.5546942
3 0.7407761 0.22908956 0.3763913 0.67324450 0.3727181
4 0.6561316 1.29076597 2.0496851 0.03599808 3.2768007
5 0.2495930 1.02818788 1.0473338 0.33516304 1.6742432
6 0.3888915 0.07456587 1.4867275 0.23268359 1.8758940



> # run the Imbens estimator
> hi_estimate <- hi_est(Y = Y,
+                       treat = T,
+                       treat_formula = T ~ X1 + X2,
+                       outcome_formula = Y ~ T + I(T^2) +
+                         gps + I(gps^2) + T * gps,
+                       data = sim,
+                       grid_val = quantile(hi_sim_data$T,
+                                   probs = seq(0, .95, by = 0.01)),
+                       treat_mod = "Gamma",
+                       link_function = "inverse")
> summary(hi_estimate)

Estimated values:
 [1] 1.844885 1.892549 1.885924 1.872564 1.859171 1.843147 1.833042 1.821880
 [9] 1.810837 1.789285 1.771371 1.753162 1.740413 1.715755 1.697731 1.681189
[17] 1.654436 1.641968 1.623282 1.601950 1.589434 1.568637 1.559566 1.548223
[25] 1.531785 1.513656 1.489437 1.475388 1.454679 1.445420 1.429200 1.419578
[33] 1.398187 1.377899 1.366134 1.349114 1.333631 1.311605 1.295041 1.282153
[41] 1.266689 1.249200 1.236905 1.225240 1.213712 1.207096 1.198966 1.194443
[49] 1.186686 1.181235 1.173873 1.162262 1.155129 1.145633 1.141084 1.135167
[57] 1.132318 1.129410 1.127848 1.127688 1.128432 1.130149 1.133215 1.136869
[65] 1.141953 1.146036 1.156082 1.165157 1.174271 1.188479 1.207558 1.222418
[73] 1.236831 1.253292 1.269864 1.284218 1.317895 1.338323 1.366325 1.412826
[81] 1.446591 1.483681 1.532863 1.613043 1.668634 1.738570 1.809313 1.875484
[89] 1.977283 2.064272 2.206672 2.328933 2.515111 2.726571 2.860258 3.308236

Treatment Summary:

Call:
glm(formula = formula_t, family = Gamma(link = link_function), 
    data = samp_dat)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-3.2278  -1.0433  -0.3420   0.3783   2.8016  

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept) -0.01262    0.02422  -0.521    0.602    
X1           0.99870    0.06560  15.223   <2e-16 ***
X2           0.97901    0.06462  15.151   <2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for Gamma family taken to be 0.9431856)

    Null deviance: 1888.7  on 999  degrees of freedom
Residual deviance: 1135.7  on 997  degrees of freedom
AIC: 1187

Number of Fisher Scoring iterations: 6


Outcome Summary:

Call:
lm(formula = outcome_formula, data = tempdat)

Residuals:
    Min      1Q  Median      3Q     Max 
-4.0078 -0.6943  0.0184  0.6469  3.1011 

Coefficients:
              Estimate Std. Error t value Pr(>|t|)    
(Intercept) -1.094e-01  1.331e-01  -0.822    0.411    
T            1.022e+00  5.696e-02  17.949   <2e-16 ***
I(T^2)      -2.309e-03  4.504e-03  -0.513    0.608    
gps          1.069e+00  1.045e-01  10.232   <2e-16 ***
I(gps^2)    -5.769e-06  2.115e-02   0.000    1.000    
T:gps        3.214e-01  3.088e-01   1.041    0.298    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 1.02 on 994 degrees of freedom
Multiple R-squared:  0.7032,    Adjusted R-squared:  0.7017 
F-statistic: 471.1 on 5 and 994 DF,  p-value: < 2.2e-16

# Vignette plot compares handfull of estimates, here's HI

> x <- hi_sim_data$T
> quantile_grid <- quantile(x, probs = seq(0, .95, by = 0.01))
>  # quantile_grid <- quantile_grid[1:100]
>  true_hi_fun <- function(t){t + 2/(1 + t)^3}
> plot(quantile_grid,
+       true_hi_fun(quantile_grid),
+       pch = ".",
+       main = "true ADRF",
+       xlab = "treat",
+       ylab = "Y",
+       col = "black")
> 
>  lines(quantile_grid,
+        true_hi_fun(quantile_grid),
+        col = "black",
+        lty = 1,
+        lwd = 2)
> lines(quantile_grid,
+        hi_estimate$param,
+        lty = 4,
+        col = "green",
+        lwd = 2)
> # adapt code from vignette .Rnw to get this reduced plot