# Regression Methods in Biostatistics, Vittinghoff, Glidden, Shiboski # and McCulloch (Springer, 2005) # # Chapter 3. Basic Statistical Methods # # Time-stamp: <2008-02-10 13:41:06 chl> postscript(file="RMB_c3.ps") ############################################################################## ### t-test # first, we load the data HERS <- read.table("hersdata.txt",header=TRUE,sep="\t") HERS.women.wht_diab <- subset(HERS,diabetes==0) HERS.women.wht_diab$exercise <- factor(HERS.women.wht_diab$exercise,labels=c("No","Yes")) # before carrying out the t-test, we shall look at the data with the help of # some graphics boxplot(glucose~exercise,data=HERS.women.wht_diab) title(xlab="Exercise",ylab="Glucose Level (mg/dL)") op <- par(mfrow=c(2,2)) with(HERS.women.wht_diab, hist(glucose[exercise=="No"], main="Exercice: No",xlab="Glucose Level (mg/dL)")) with(HERS.women.wht_diab, hist(glucose[exercise=="Yes"], main="Exercice: Yes",xlab="Glucose Level (mg/dL)")) with(HERS.women.wht_diab, qqnorm(glucose[exercise=="No"])) with(HERS.women.wht_diab, qqline(glucose[exercise=="No"])) with(HERS.women.wht_diab, qqnorm(glucose[exercise=="Yes"])) with(HERS.women.wht_diab, qqline(glucose[exercise=="Yes"])) par(op) # Note: # the Q-Q plots are not very useful here since sample size is large enough # for valid t-test # compute overall and group means mean(HERS.women.wht_diab$glucose,na.rm=TRUE) group.means <- with(HERS.women.wht_diab, tapply(glucose,exercise,mean,na.rm=TRUE)) (dobs <- diff(group.means)) # now compute t statistic, assuming equal variance (two-sided test) (glucose.ttest <- t.test(glucose~exercise,data=HERS.women.wht_diab,var.equal=TRUE)) # compare with Welch corrected t-test (default within R) t.test(glucose~exercise,data=HERS.women.wht_diab,var.equal=FALSE) # given the sample size, one should reasonnably make use of asymptotic # convergence of the t distribution to the standard normal 1-pnorm(as.numeric(glucose.ttest$statistic)) ############################################################################## ### one-way ANOVA HERS$raceth <- factor(HERS$raceth,labels=c("White","Afr Amer","Other")) summary(HERS$raceth) # get a numerical summary tabulated by the factor of interest (ethnicity) my.summary <- function(x,digits=3) { tmp <- summary(x) tmp <- c(tmp,sd(x),length(x),sum(is.na(x))) names(tmp)[7:9] <- c("Sd","N","NAs") return(round(tmp,digits)) } with(HERS, by(SBP,raceth,my.summary,digits=2)) # plot the between-group dispersion around the overall mean plot.design(SBP~raceth,data=HERS) # perform the ANOVA and display the anova table hers.aov <- aov(SBP~raceth,data=HERS) summary(hers.aov) model.tables(hers.aov) # we can check that there is no clear departures from ANOVA assumptions op <- par(mfrow=c(2,2)) plot(hers.aov) par(op) # Notes: # 1. some individual have high residual so we should check if anything else goes # wrong with these observations (916, 2161 and 175) among other covariates... # 2. pairwise comparisons should follow # As suggested by the authors, we could perform an ANOVA which does not assume # equal variance among groups, as we did for the t-test. R. L. Welch proposes to # correct the df in a way that is close to what is done in the case of two # samples comparison. # # Welch, R.L. On the comparison of several mean values: An alternative approach, # Biometrika 38: 330--336, 1951. # though this is not really recommended here, we could perform a non parametric # ANOVA (based on ranks) kruskal.test(SBP~raceth,data=HERS) plot.design(SBP~raceth,data=HERS,fun=median) # Note: # the test is now non significant! ############################################################################## ### simple linear regression # first, we plot the data plot(SBP~age,data=HERS,cex=.5,xlab="Age (years)", ylab="Systolic Blood Pressure (mmHg)") # and we add average SBP for each decile of age age.decile <- quantile(HERS$age,probs=seq(0,1,.1)) tmp <- as.numeric(age.decile) xx <- numeric(length(tmp)-1) # mid-interval points for (i in 1:(length(tmp)-1)) xx[i] <- (tmp[i]+tmp[i+1])/2 HERS$ageq <- cut(HERS$age,breaks=age.decile,labels=FALSE) points(xx,as.numeric(tapply(HERS$SBP,HERS$ageq,mean)),pch=15,cex=1.2) # compute the linear regression HERS.lm <- lm(SBP~age,data=HERS) summary(HERS.lm) # add the regression line to the previous plot abline(HERS.lm) # some diagnostics plots op <- par(mfrow=c(2,2)) plot(HERS.lm) par(op) # Note: # Results differ from those presented by the authors because they consider only # a 10% random sample of the total sample (cf. chap3.do on the website) # doing the analysis with a random sample (10% of the total size) set.seed(12386) idx <- sample(HERS$id,size=nrow(HERS)/10) HERS.sample10 <- HERS[idx,] HERS.lm2 <- lm(SBP~age,data=HERS.sample10) summary(HERS.lm2) # we now can predict SBP for a woman of average age as.numeric(HERS.lm$coefficients[1] + HERS.lm$coefficients[2]*mean(HERS$age)) # but we could also create a new dataframe containing fitted values for each age # along with confidence interval for prediction (larger than CI for estimation) HERS.lm.fit <- data.frame(age=seq(min(HERS$age),max(HERS$age),by=1)) fitted.values <- predict(HERS.lm,HERS.lm.fit,interval="prediction") fitted.values2 <- predict(HERS.lm,HERS.lm.fit,interval="confidence") HERS.lm.fit <- cbind(HERS.lm.fit,fitted.values,fitted.values2[,2:3]) colnames(HERS.lm.fit)[5:6] <- c("lwr2","upr2") with(HERS.lm.fit, { matplot(age,cbind(lwr,upr),type="l",xlab="Age",ylab="Predicted SBP",col=2,lty=2) matlines(age,cbind(lwr2,upr2),type="l",xlab="Age",ylab="Predicted SBP",col=3,lty=2) lines(age,fit) points(mean(HERS$age),mean(HERS$SBP)) }) legend("topleft",c("Prediction","Confidence"),col=2:3,lty=2,title="CI",bty="n") # computing confidence interval for regression parameters using bootstrap library(boot) boot.fun <- function(data,x) return(as.numeric(lm(data[x,2]~data[x,1])$coefficients[2])) sbp.age <- cbind(HERS$age,HERS$SBP) res <- boot(sbp.age,boot.fun,1000) res # two-sided bootstrapped confidence interval of Cronbach's alpha quantile(res$t,c(0.025,0.975)) # std error sqrt(diag(cov(res$t))) # adjusted bootstrap percentile (BCa) confidence interval (better) # Note: # The boot.ci( ) function takes a bootobject and generates 5 different types # of two-sided nonparametric confidence intervals. These include the first # order normal approximation, the basic bootstrap interval, the studentized # bootstrap interval, the bootstrap percentile interval, and the adjusted # bootstrap percentile (BCa) interval. boot.ci(res,type=c("norm","perc")) # normal approx. + percentile method # the following displays a graphical summary of the bootstrap procedure plot(res) # Note: # There is a great tutorial on boostrapping techniques written by John Fox, # which is available at the following address: # http://cran.r-project.org/doc/contrib/Fox-Companion/appendix-bootstrapping.pdf ############################################################################## ### two-way contingency table # loading the data WCGS <- read.table("wcgs.txt",header=TRUE,sep="\t") WCGS$chd69 <- factor(WCGS$chd69,labels=c("Noncases","Cases")) WCGS$arcus <- factor(WCGS$arcus,labels=c("Unexposed","Exposed")) # display a basic two-way table chd69.tab <- with(WCGS, table(chd69,arcus)) chd69.tab # interesting summary are provided by the `vcd` package # however, we are more interested by association measures library(vcd) assocstats(chd69.tab) # here are the functions we need (but see the `epi` and `epitools` packages) odds.ratio <- function(x,alpha=.05) { if (!is.table(x)) x <- as.table(x) pow <- function(x,a=-1) x^a or <- (x[1,1]*x[2,2]) / (x[1,2]*x[2,1]) or.se <- sqrt(sum(pow(x))) or.ci <- exp(log(or) + c(-1,1)*qnorm(1-alpha/2)*or.se) return(list(OR=or,SE=or.se,CI=or.ci)) } # Note: # we don't use Cornfield approximation when computing confidence # interval (Cornfield approximation relates in fact to SE calculation). # # See http://www.stata.com/help.cgi?cci for the Stata on-line help # # and the 'official' reference for CI computation with matched cases # design: # J J Schlesselman. Case-Control Studies. Design, Conduct, Analysis. # Oxford University Press, 1982. # # Also see # C-Y Lin & M-C Yang, Improved Exact Confidence Intervals for the # Odds Ratio in Two Independent Binomial Samples, Clinical Trials, # 48(6): 1008-1019, 2006. # # C C Brown, The Validity of Approximation Methods for Interval # Estimation of the Odds Ratio, American Journal of Epidemiology, # 113(4): 474-480, 1981. # relative.risk <- function(x,srow=1,scol=1) { # we assume that Cases/Exposed is contained in cell [srow,scol] out <- c(as.numeric(x[srow,scol]/apply(x,2,sum)[scol]),as.numeric(x[srow,scol+1]/apply(x,2,sum)[scol+1])) names(out) <- c("Cases+","Cases-") return(out) } # The following functions are adapted from # S. BOLBOACA and A. ACHIMAS CADARIU. Binomial Distribution # Sample Confidence Intervals Estimation. 6. Excess Risk # http://lejpt.academicdirect.org/A04/01_21.htm # # Here, we are expressing excess risk as # ER = a/(a+b) - c/(c+d) = y/n - x/m # for the classical 2x2 table # # ----------- # | + | - | # |-------------- # | + | a | b | # |-------------- # | - | c | d | # |-------------- # dwald <- function(x,m,y,n,z=1.96) { lwr <- max(c(y/n-x/m-z*sqrt((x*(m-x))/m^3+(y*(n-y))/n^3),-1)) upr <- min(c(y/n-x/m+z*sqrt((x*(m-x))/m^3+(y*(n-y))/n^3),1)) return(c(lwr,upr)) } dac <- function(x,m,y,n,z=1.96) { return(dwald(x+z^2/(4*sqrt(2)),m+z^2/(2*sqrt(2)),y+z^2/(4*sqrt(2)),n+z^2/(4*sqrt(2)),z)) } # odds-ratio odds.ratio(chd69.tab) # relative risk chd69.rr <- relative.risk(chd69.tab) excess.risk <- as.numeric(abs(diff(chd69.rr))) dac(chd69.tab[1,2],chd69.tab[1,1]+chd69.tab[1,2],chd69.tab[2,2],chd69.tab[2,1]+chd69.tab[2,2]) # Note: # I'm quite confuse for the moment since this doesn't seem to provide # a logical result... risk.ratio <- as.numeric(chd69.rr[1]/chd69.rr[2]) # global association test # we don't need the correction for continuity when using chi square test chisq.test(chd69.tab,correct=F) # the `epitools` package offers facilities for computing the above # estimates, together with test of association # Note: # CI for odds-ratio are estimated using Wald approximation # We can add the option verbose=TRUE for a more detailed output library(epitools) epitab(chd69.tab,pvalue="chi2") # the HIV Status by AIDS Diagnosis of Male Partner data HIV <- matrix(c(3,2,4,22),nrow=2) dimnames(HIV) <- list(c("Cases","Noncases"),c("Exposed","Unexposed")) HIV epitab(HIV) # compare OR estimate with that obtained using mid-p method # Kenneth J. Rothman and Sander Greenland (1998), Modern Epidemiology, # Lippincott-Raven Publishers (p. 251) or.midp(c(3,4,2,22)) # alternatively, we can directly make use of Fisher exact test fisher.test(HIV) fisher.test(HIV,alternative="greater") # Now, if we use the `epicalc` package, we get all in one library(epicalc) HIV.tab <- make2x2(3,2,4,22) # csi(3,2,4,22) cc(cctable=HIV.tab) cs(cctable=HIV.tab) # are the OR constant across subgroups of age (Mantel-Haenzel test) # WCGS$agec is a 5-level factor # first we look at a simple two-way classification (chd69.tab2 <- with(WCGS, table(chd69,agec))) # column profile apply(chd69.tab2,1,function(x) round(x/apply(chd69.tab2,2,sum)*100,2)) # global association test with chi square chisq.test(chd69.tab2) # a more detailed inspection of association estimates can be obtained # using # cc(cctable=chd69.tab2) # Note: # I have not yet calculated the score test trend of odds # Twenty-year Vital Status by Smoking Behavior, Stratified by Age # are not available as raw data, so we will use the WCGS study chd69.tab3 <- xtabs(~chd69+arcus+agec,data=WCGS) # we can get OR by strata as follows apply(chd69.tab3,3,function(x) (x[1,1]*x[2,2])/(x[1,2]*x[2,1])) # run the Mantel-Haenszel test and an Homogeneity of OR test mhor(mhtable=chd69.tab3) # alternatively, one can use (wht correction for continuity to get # similar result) with(WCGS, mantelhaen.test(chd69,arcus,agec,correct=FALSE)) ############################################################################## ### survival data LEUK <- read.table("leuk.txt",header=TRUE) library(survival) # create a survival object LEUK.surv <- Surv(LEUK$time,LEUK$cens) # Kaplan-Meier estimate LEUK.km <- survfit(LEUK.surv~group,data=LEUK) plot(LEUK.km[1]) lines(LEUK.km[2],col="gray60") legend("topright",c("6-MP","Placebo"),lty=1,col=c("black","gray60"),bty="n") title(xlab="Weeks since randomization",ylab="Proportion in remission") # numerical summary LEUK.km summary(LEUK.km) # comparing the two groups with log-rank test using survdiff() # rho = 0 (default) gives logrank test while rho=1 gives Wilcoxon test survdiff(LEUK.surv~group,data=LEUK) dev.off() # release the postscript device