# Regression Methods in Biostatistics, Vittinghoff, Glidden, Shiboski
# and McCulloch (Springer, 2005)
#
# Chapter 3. Basic Statistical Methods
#
# Time-stamp: <2008-02-10 13:41:06 chl>

postscript(file="RMB_c3.ps")

##############################################################################
### t-test

# first, we load the data and keep only women without diabetes
HERS <- read.table("hersdata.txt",header=TRUE,sep="\t")
HERS.women.wht_diab <- subset(HERS,diabetes==0)
HERS.women.wht_diab$exercise <- factor(HERS.women.wht_diab$exercise,labels=c("No","Yes"))
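
# quick sanity check on the two variables we are about to use (assuming the
# columns are named as in hersdata.txt)
summary(HERS.women.wht_diab[,c("glucose","exercise")])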

# before carrying out the t-test, we shall look at the data with the help of
# some graphics
boxplot(glucose~exercise,data=HERS.women.wht_diab)
title(xlab="Exercise",ylab="Glucose Level (mg/dL)")

op <- par(mfrow=c(2,2))
with(HERS.women.wht_diab, hist(glucose[exercise=="No"],
                               main="Exercice: No",xlab="Glucose Level (mg/dL)"))
with(HERS.women.wht_diab, hist(glucose[exercise=="Yes"],
                               main="Exercice: Yes",xlab="Glucose Level (mg/dL)"))
with(HERS.women.wht_diab, qqnorm(glucose[exercise=="No"]))
with(HERS.women.wht_diab, qqline(glucose[exercise=="No"]))
with(HERS.women.wht_diab, qqnorm(glucose[exercise=="Yes"]))
with(HERS.women.wht_diab, qqline(glucose[exercise=="Yes"]))
par(op)

# Note:
# the Q-Q plots are not very useful here since the sample size is large enough
# for the t-test to be valid anyway (central limit theorem)

# compute overall and group means
mean(HERS.women.wht_diab$glucose,na.rm=TRUE)
group.means <- with(HERS.women.wht_diab, tapply(glucose,exercise,mean,na.rm=TRUE))
(dobs <- diff(group.means))

# now compute t statistic, assuming equal variance (two-sided test)
(glucose.ttest <- t.test(glucose~exercise,data=HERS.women.wht_diab,var.equal=TRUE))

# compare with the Welch-corrected t-test (the default in R)
t.test(glucose~exercise,data=HERS.women.wht_diab,var.equal=FALSE)

# given the sample size, one could reasonably rely on the asymptotic
# convergence of the t distribution to the standard normal; the corresponding
# two-sided p-value is
2*pnorm(-abs(as.numeric(glucose.ttest$statistic)))

##############################################################################
### one-way ANOVA
HERS$raceth <- factor(HERS$raceth,labels=c("White","Afr Amer","Other"))
summary(HERS$raceth)

# get a numerical summary tabulated by the factor of interest (ethnicity)
my.summary <- function(x,digits=3) {
  tmp <- summary(x)[1:6]  # drop the NA count summary() appends when NAs are present
  tmp <- c(tmp,sd(x,na.rm=TRUE),length(x),sum(is.na(x)))
  names(tmp)[7:9] <- c("Sd","N","NAs")
  return(round(tmp,digits))
}
with(HERS, by(SBP,raceth,my.summary,digits=2))

# plot the between-group dispersion around the overall mean
plot.design(SBP~raceth,data=HERS)

# perform the ANOVA and display the anova table
hers.aov <- aov(SBP~raceth,data=HERS)
summary(hers.aov)

model.tables(hers.aov)

# we can check that there are no clear departures from the ANOVA assumptions
op <- par(mfrow=c(2,2))
plot(hers.aov)
par(op)

# Notes:
# 1. some individuals have large residuals, so we should check whether anything
#    else is wrong with these observations (916, 2161 and 175) among the other
#    covariates...
# 2. pairwise comparisons should follow (a sketch is given below)
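
# a minimal sketch of such pairwise comparisons, using Tukey's Honest
# Significant Differences on the fitted aov object
TukeyHSD(hers.aov)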

# As suggested by the authors, we could perform an ANOVA which does not assume
# equal variances among groups, as we did for the t-test. B. L. Welch proposed
# correcting the df in a way that is close to what is done in the two-sample
# case.
#
# Welch, B.L. On the comparison of several mean values: An alternative approach,
# Biometrika 38: 330--336, 1951.
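
# in R, Welch's heteroscedastic one-way ANOVA is directly available through
# oneway.test() (var.equal=FALSE being the default)
oneway.test(SBP~raceth,data=HERS)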

# though this is not really recommended here, we could perform a nonparametric
# (rank-based) ANOVA, namely the Kruskal-Wallis test
kruskal.test(SBP~raceth,data=HERS)

plot.design(SBP~raceth,data=HERS,fun=median)

# Note:
# the test is now non-significant!

##############################################################################
### simple linear regression

# first, we plot the data
plot(SBP~age,data=HERS,cex=.5,xlab="Age (years)", ylab="Systolic Blood Pressure (mmHg)")

# and we add average SBP for each decile of age
age.decile <- quantile(HERS$age,probs=seq(0,1,.1))

tmp <- as.numeric(age.decile)
xx <- (tmp[-length(tmp)]+tmp[-1])/2 # mid-interval points
HERS$ageq <- cut(HERS$age,breaks=age.decile,labels=FALSE)
points(xx,as.numeric(tapply(HERS$SBP,HERS$ageq,mean)),pch=15,cex=1.2)

# compute the linear regression
HERS.lm <- lm(SBP~age,data=HERS)
summary(HERS.lm)
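
# normal-theory confidence intervals for the intercept and slope, as a
# complement to the summary table above
confint(HERS.lm)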

# add the regression line to the previous plot
abline(HERS.lm)

# some diagnostics plots
op <- par(mfrow=c(2,2))
plot(HERS.lm)
par(op)

# Note:
# Results differ from those presented by the authors because they consider only
# a 10% random sample of the total sample (cf. chap3.do on the website)

# doing the analysis with a random sample (10% of the total size); we sample
# row indices rather than id values, so the subset remains valid even if id
# does not run from 1 to nrow(HERS)
set.seed(12386)
idx <- sample(nrow(HERS),size=round(nrow(HERS)/10))
HERS.sample10 <- HERS[idx,]
HERS.lm2 <- lm(SBP~age,data=HERS.sample10)
summary(HERS.lm2)

# we can now predict SBP for a woman of average age
as.numeric(HERS.lm$coefficients[1] + HERS.lm$coefficients[2]*mean(HERS$age))
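
# the same prediction, more conveniently obtained with predict()
predict(HERS.lm,newdata=data.frame(age=mean(HERS$age)))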

# but we could also create a new data frame containing fitted values for each
# age, along with prediction intervals (wider than the CIs for the mean response)
HERS.lm.fit <- data.frame(age=seq(min(HERS$age),max(HERS$age),by=1))
fitted.values <- predict(HERS.lm,HERS.lm.fit,interval="prediction")
fitted.values2 <- predict(HERS.lm,HERS.lm.fit,interval="confidence")
HERS.lm.fit <- cbind(HERS.lm.fit,fitted.values,fitted.values2[,2:3])
colnames(HERS.lm.fit)[5:6] <- c("lwr2","upr2")

with(HERS.lm.fit, {
  matplot(age,cbind(lwr,upr),type="l",xlab="Age",ylab="Predicted SBP",col=2,lty=2)
  matlines(age,cbind(lwr2,upr2),type="l",col=3,lty=2)
  lines(age,fit)
  points(mean(HERS$age),mean(HERS$SBP))
})
legend("topleft",c("Prediction","Confidence"),col=2:3,lty=2,title="CI",bty="n")

# computing confidence intervals for the regression parameters using the bootstrap
library(boot)
# statistic function: refit the model on the resampled rows (index vector x)
# and return the slope
boot.fun <- function(data,x) return(as.numeric(lm(data[x,2]~data[x,1])$coefficients[2]))

sbp.age <- cbind(HERS$age,HERS$SBP)
res <- boot(sbp.age,boot.fun,1000)
res

# two-sided bootstrap percentile confidence interval for the slope
quantile(res$t,c(0.025,0.975))

# bootstrap standard error of the slope
sqrt(diag(cov(res$t)))

# other types of bootstrap confidence intervals
# Note:
# the boot.ci() function takes a boot object and can generate five different
# types of two-sided nonparametric confidence intervals: the first-order
# normal approximation, the basic bootstrap interval, the studentized
# bootstrap interval, the bootstrap percentile interval, and the adjusted
# bootstrap percentile (BCa) interval
boot.ci(res,type=c("norm","perc"))  # normal approx. + percentile method
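
# the BCa interval mentioned in the note above is usually preferable; it can
# be requested explicitly
boot.ci(res,type="bca")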

# the following displays a graphical summary of the bootstrap procedure
plot(res)

# Note:
# there is a great tutorial on bootstrapping techniques written by John Fox,
# which is available at the following address:
# http://cran.r-project.org/doc/contrib/Fox-Companion/appendix-bootstrapping.pdf


##############################################################################
### two-way contingency table

# loading the data
WCGS <- read.table("wcgs.txt",header=TRUE,sep="\t")
WCGS$chd69 <- factor(WCGS$chd69,labels=c("Noncases","Cases"))
WCGS$arcus <- factor(WCGS$arcus,labels=c("Unexposed","Exposed"))
                     
# display a basic two-way table
chd69.tab <- with(WCGS, table(chd69,arcus))
chd69.tab
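
# adding marginal totals makes the table easier to read
addmargins(chd69.tab)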

# interesting summaries are provided by the `vcd` package;
# however, we are more interested in association measures
library(vcd)
assocstats(chd69.tab)

# here are the functions we need (but see the `Epi` and `epitools` packages)
odds.ratio <- function(x,alpha=.05) {
  if (!is.table(x)) x <- as.table(x)
  or <- (x[1,1]*x[2,2]) / (x[1,2]*x[2,1])
  lor.se <- sqrt(sum(1/x))  # standard error of log(OR)
  or.ci <- exp(log(or) + c(-1,1)*qnorm(1-alpha/2)*lor.se)
  return(list(OR=as.numeric(or),SE=lor.se,CI=or.ci))
}

# Note:
# we don't use the Cornfield approximation when computing the confidence
# interval (the Cornfield approximation in fact relates to the SE calculation).
#
# See http://www.stata.com/help.cgi?cci for the Stata on-line help,
#
# and the 'official' reference for CI computation with matched case-control
# designs:
# J J Schlesselman. Case-Control Studies. Design, Conduct, Analysis.
# Oxford University Press, 1982.
#
# Also see
# C-Y Lin & M-C Yang, Improved Exact Confidence Intervals for the
# Odds Ratio in Two Independent Binomial Samples, Biometrical Journal,
# 48(6): 1008-1019, 2006.
#
# C C Brown, The Validity of Approximation Methods for Interval
# Estimation of the Odds Ratio, American Journal of Epidemiology,
# 113(4): 474-480, 1981.

relative.risk <- function(x,srow=2,scol=1) {
  # we assume that the Cases are in row `srow`; the risk of being a case is
  # computed within each exposure column (here Unexposed, then Exposed)
  out <- c(as.numeric(x[srow,scol]/sum(x[,scol])),
           as.numeric(x[srow,scol+1]/sum(x[,scol+1])))
  names(out) <- colnames(x)[c(scol,scol+1)]
  return(out)
}

# The following functions are adapted from
# S. BOLBOACA and A. ACHIMAS CADARIU. Binomial Distribution
# Sample Confidence Intervals Estimation. 6. Excess Risk
# http://lejpt.academicdirect.org/A04/01_21.htm
#
# Here, we are expressing the excess risk as
# ER = a/(a+b) - c/(c+d) = y/n - x/m
# for the classical 2x2 table (rows: exposure +/-, columns: outcome +/-)
#
#       +-----+-----+
#       |  +  |  -  |
# +-----+-----+-----+
# |  +  |  a  |  b  |
# +-----+-----+-----+
# |  -  |  c  |  d  |
# +-----+-----+-----+
#

# Wald CI for the excess risk y/n - x/m, truncated to [-1,1]
dwald <- function(x,m,y,n,z=1.96) {
  lwr <- max(c(y/n-x/m-z*sqrt((x*(m-x))/m^3+(y*(n-y))/n^3),-1))
  upr <- min(c(y/n-x/m+z*sqrt((x*(m-x))/m^3+(y*(n-y))/n^3),1))
  return(c(lwr,upr))
}

# adjusted Wald CI (the DAC variant from the reference above, which shifts
# the counts before applying the Wald formula)
dac <- function(x,m,y,n,z=1.96) {
  return(dwald(x+z^2/(4*sqrt(2)),m+z^2/(2*sqrt(2)),y+z^2/(4*sqrt(2)),n+z^2/(4*sqrt(2)),z))
}

# odds-ratio
odds.ratio(chd69.tab)

# relative risk and excess risk; the risks are computed within exposure
# groups, i.e. with the column totals as denominators
chd69.rr <- relative.risk(chd69.tab)
excess.risk <- as.numeric(diff(chd69.rr))  # risk(Exposed) - risk(Unexposed)
dac(chd69.tab["Cases","Unexposed"],sum(chd69.tab[,"Unexposed"]),
    chd69.tab["Cases","Exposed"],sum(chd69.tab[,"Exposed"]))
risk.ratio <- as.numeric(chd69.rr["Exposed"]/chd69.rr["Unexposed"])

# global association test
# given the sample size, we don't need the correction for continuity in the
# chi-squared test
chisq.test(chd69.tab,correct=FALSE)

# the `epitools` package offers facilities for computing the above
# estimates, together with a test of association
# Note:
# CIs for the odds-ratio are estimated using the Wald approximation;
# we can add the option verbose=TRUE for more detailed output
library(epitools)
epitab(chd69.tab,pvalue="chi2")

# the HIV Status by AIDS Diagnosis of Male Partner data
HIV <- matrix(c(3,2,4,22),nrow=2)
dimnames(HIV) <- list(c("Cases","Noncases"),c("Exposed","Unexposed"))
HIV

epitab(HIV)

# compare OR estimate with that obtained using mid-p method
# Kenneth J. Rothman and Sander Greenland (1998), Modern Epidemiology,
# Lippincott-Raven Publishers (p. 251)
or.midp(c(3,4,2,22))

# alternatively, we can directly make use of Fisher's exact test
fisher.test(HIV)
fisher.test(HIV,alternative="greater")

# Now, if we use the `epicalc` package, we get everything in one go
library(epicalc)
HIV.tab <- make2x2(3,2,4,22)
# csi(3,2,4,22)
cc(cctable=HIV.tab)
cs(cctable=HIV.tab)

# is the OR constant across subgroups of age (Mantel-Haenszel test)?
# WCGS$agec is a 5-level factor

# first we look at a simple two-way classification
(chd69.tab2 <- with(WCGS, table(chd69,agec)))
# column profiles (Noncases/Cases percentages within each age class)
round(prop.table(chd69.tab2,margin=2)*100,2)

# global association test with chi square
chisq.test(chd69.tab2)

# a more detailed inspection of association estimates can be obtained
# using
# cc(cctable=chd69.tab2)

# Note:
# the score test for trend of odds is not computed here; a rough substitute
# follows
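
# a sketch of a trend test: prop.trend.test() performs a chi-squared test for
# trend in proportions (Cochran-Armitage type) on the CHD counts across the
# ordered age classes; this is not necessarily identical to Stata's score test
prop.trend.test(chd69.tab2["Cases",],apply(chd69.tab2,2,sum))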
 
# the Twenty-year Vital Status by Smoking Behavior (Stratified by Age) data
# are not available in raw form, so we use the WCGS study instead
chd69.tab3 <- xtabs(~chd69+arcus+agec,data=WCGS)

# we can get OR by strata as follows
apply(chd69.tab3,3,function(x) (x[1,1]*x[2,2])/(x[1,2]*x[2,1]))

# run the Mantel-Haenszel test and a test of homogeneity of the ORs
mhor(mhtable=chd69.tab3)

# alternatively, one can use mantelhaen.test() (without correction for
# continuity) to get similar results
with(WCGS, mantelhaen.test(chd69,arcus,agec,correct=FALSE))


##############################################################################
### survival data

LEUK <- read.table("leuk.txt",header=TRUE)

library(survival)
# create a survival object
LEUK.surv <- Surv(LEUK$time,LEUK$cens)

# Kaplan-Meier estimate
LEUK.km <- survfit(LEUK.surv~group,data=LEUK)
plot(LEUK.km[1])
lines(LEUK.km[2],col="gray60")
legend("topright",c("6-MP","Placebo"),lty=1,col=c("black","gray60"),bty="n")
title(xlab="Weeks since randomization",ylab="Proportion in remission")

# numerical summary
LEUK.km
summary(LEUK.km)
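
# remission probabilities at a few selected follow-up times
summary(LEUK.km,times=seq(0,30,10))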

# comparing the two groups with the log-rank test using survdiff();
# rho=0 (the default) gives the log-rank test, while rho=1 gives the
# Peto & Peto modification of the Gehan-Wilcoxon test
survdiff(LEUK.surv~group,data=LEUK) 
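
# for comparison, the rho=1 variant, which gives more weight to early
# event times
survdiff(LEUK.surv~group,data=LEUK,rho=1)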

dev.off() # release the postscript device