RMB_c6.R

# Regression Methods in Biostatistics, Vittinghoff, Glidden, Shiboski
# and McCulloch (Springer, 2005)
#
# Chapter 6. Logistic Regression
#
# Time-stamp: <2008-08-26 20:46:21 chl>

postscript(file="RMB_c6.ps")

##############################################################################
# WCGS Sudy: Relationship between CHD and Age

# loading the data
WCGS <- read.table("wcgs.txt",header=TRUE,sep="\t")
WCGS$chd69 <- factor(WCGS$chd69,labels=c("Noncases","Cases"))

# plot the data
stripchart(age~as.numeric(chd69),data=WCGS,method="jitter",pch="+",
           group.names=levels(WCGS$chd69),las=1,cex.axis=.8,cex=.8)
title("WCGS Study: Coronary disease and Age")


# set up the logistic model with default logit link
WCGS.glm <- glm(chd69~age,data=WCGS,family=binomial())

summary(WCGS.glm)

logLik(WCGS.glm)            # the log-likelihood
exp(coef(WCGS.glm)[2]*10)   # OR associated with a ten-year increase in age
library(MASS)
confint(WCGS.glm)           # 95% CI using profiled likelihood
confint.default(WCGS.glm)   # 95% CI based assuming asymptotic normality

WCGS.glm.fit <- fitted(WCGS.glm)  # fitted values

# estimate the probability of having CHD for age=55
exp(-5.940+0.074*55)/(1+exp(-5.940+0.074*55))
# or, alternatively
pred.df <- data.frame(age=c(55))
predict(WCGS.glm,pred.df,type="response",se=TRUE)

# we could also do the computation the "hard way", but see
# J Faraway, Extending the Linear Model with R. Generalized Linear,
#   Mixed Effects and Nonparametric Regression Models, Chapman & Hall/CRC,
#   2006
#
# to predict the reponse at age 55
x0 <- c(1,55)
eta0 <- sum(x0*coef(WCGS.glm))
library(boot)
inv.logit(eta0)

# to get the 95% CI, we first need to extract the variance-covariance matrix
(vcm <- summary(WCGS.glm)$cov.unscaled)
se <- sqrt(t(x0) %*% vcm %*% x0)         # SE on the logit scale
inv.logit(eta0+c(-1,1)*qnorm(0.975)*se)  # CI on the probability scale

# R does not provide LR test in the summary output, but it could easily be
# obtained from the Deviance Table, e.g.
anova(WCGS.glm,test="Chisq")

# See also lrm() in the Design package
library(Design)
lrm(chd69~age,data=WCGS)

# for model adequacy checking, see residuals.lrm
# e.g.
resid(lrm(chd69~age,data=WCGS,x=T,y=T), "partial", pl="loess")

#
# Note:
# The definitive reference for the Design package is
# F. Harrell, Regression Modeling Strategies, Springer, 2001.
# See Chapter 10 to 12 on binary logistic regression.
#

# assessing linearity in the relationship between CHD Risk and Age (p. 191)
tab.chd69.age <- with(WCGS, table(chd69,age))
obs.logit <- logit(tab.chd69.age[2,]/apply(tab.chd69.age,2,sum))

plot(obs.logit~as.numeric(names(obs.logit)),pch=19,cex=.8,xlab="Age (years)",
     ylab="Logit CHD Risk",ylim=c(-4,-1))
abline(lm(obs.logit~as.numeric(names(obs.logit))),lty=2)
lines(as.numeric(names(obs.logit)),
      predict(loess(obs.logit~as.numeric(names(obs.logit)))),lty=3)
legend("bottom",c("Linear fit","LOWESS smooth"),lty=2:3,horiz=TRUE)

# this suggests to test a model including a quadratic effect for age,
# after centering it to reduce colinearity with the linear term
summary(glm(chd69~age+I(scale(age,scale=F)^2),data=WCGS,family=binomial))


##############################################################################
# WCGS Study: CHD and Arcus Senilis (categorical predictor)

WCGS$arcus <- factor(WCGS$arcus,labels=c("Unexposed","Exposed"))

# we graphically display the Morbidity/Exposition table
mosaicplot(t(table(WCGS$chd69,WCGS$arcus)),col=c(4,2),main="WCGS Study")

WCGS.glm2 <- glm(chd69~arcus,data=WCGS,family=binomial())

summary(WCGS.glm2)

# estimate OR and associated 95% CI
library(epitools)
oddsratio(table(WCGS$chd69,WCGS$arcus),method="wald")  # Wald CI

# Note that we could also use the epitab() function

# estimate a model where predictor has multiple categories
table(WCGS$chd69,WCGS$agec)

WCGS.glm3 <- glm(chd69~as.factor(agec),data=WCGS,family=binomial())
summary(WCGS.glm3)

# estimate OR, together with their SD and associated 95% CI
WCGS.glm3.OR <- as.numeric(exp(coef(WCGS.glm3))[2:5])  # omit the intercept

res <- matrix(NA,nr=4,nc=3)
for (i in 2:5) {
  # we take the first level as the baseline
  test <- oddsratio(table(WCGS$chd69,WCGS$agec)[,c(1,i)],method="wald")
  res[i-1,] <- test$measure[2,]
}
colnames(res) <- c("OR","LB-CI","UB-CI")
rownames(res) <- paste("agec",1:4,sep="_")
print(res,digits=4)

# suppose, we want to change reference level for age
agec2 <- relevel(as.factor(WCGS$agec),ref="3")
table(agec2)
oddsratio(table(WCGS$chd69,agec2)[,c(1,5)],method="wald")  # OR agec_4/agec_3

##############################################################################
# Multipredictor Model

WCGS.glm4 <- glm(chd69~age+chol+sbp+bmi+smoke,data=WCGS,family=binomial())
summary(WCGS.glm4)

# we plot the 95% CI for each coefficient
# rather than using the above method with Wald CI, we use confint
# (it works because each variable is either continuous or binary)
WCGS.glm4.coeff <- coef(WCGS.glm4)
WCGS.glm4.CI <- confint(WCGS.glm4)

opar <- par(mar=c(5,6,4,2),las=1)
plot(exp(WCGS.glm4.coeff)[2:6],1:5,xlim=c(0.5,2.5),axes=FALSE,pch=19,
     cex=1.2,ylab="",xlab="OR",main="WCGS Study")
axis(1)
axis(2,at=1:5,labels=names(WCGS.glm4.coeff)[2:6],tick=FALSE)
segments(exp(WCGS.glm4.CI[2:6,1]),1:5,exp(WCGS.glm4.CI[2:6,2]),1:5,lwd=2)
abline(v=1,lty=2,col="lightgray")
par(opar)

# now we scale the predictors
# Note that age_c is a centered predictor scaled by a factor of 10
plot(WCGS$age_10,scale(WCGS$age,scale=F)/10)

WCGS.glm5 <- glm(chd69~age_10+chol_50+sbp_50+bmi_10+smoke,data=WCGS,
                 family=binomial())
summary(WCGS.glm5)

# next, we add a 4-levels predictor
table(WCGS$behpat)

WCGS.glm6 <- update(WCGS.glm5,.~.+as.factor(behpat))
summary(WCGS.glm6)
exp(confint.default(WCGS.glm6))  # approximate normal-based 95% CI 

# the reduced model without behpat is compared to the above one
anova(WCGS.glm6,WCGS.glm5,test="Chisq")

# or, by hand
2*(logLik(WCGS.glm6)-logLik(WCGS.glm5))

# or, using lrtest from the Design package
lrtest(WCGS.glm6,WCGS.glm5)

# The lmtest package also contains a function called lrtest
detach(package:Design)
library(lmtest)
lrtest(WCGS.glm6,WCGS.glm5)
# or, equivalently
# lrtest(WCGS.glm6,"behpat")

# after replacing the 4-levels behpat by a binary version (dipbat), we get the
# following results
WCGS.glm7 <- glm(chd69~age_10+chol_50+sbp_50+bmi_10+smoke+dibpat,data=WCGS,
                 family=binomial())
summary(WCGS.glm7)
exp(confint.default(WCGS.glm7))

# again, wa compare the two nested models
anova(WCGS.glm7,WCGS.glm6,test="Chisq")
  
##############################################################################
# WCGS Study: Confounding effect (Type A Behavior pattern)

WCGS.glm8 <- glm(dibpat~age_10+chol_50+sbp_50+bmi_10+smoke,data=WCGS,
                 family=binomial())
summary(WCGS.glm8)

# adjusted ORs, omitting the intercept
exp(coef(WCGS.glm8)[-1])

# unadjusted OR for dipbat/chd69
oddsratio(table(WCGS$chd69,WCGS$dibpat))

# while in the 7th model, it was
exp(coef(WCGS.glm7)[["dibpat"]])

## # we drop one at a time each predictor in the 7th model and estimate the
## # change in OR for dipbat/chd69
## mod.pred <- "age_10+chol_50+sbp_50+bmi_10+smoke+dibpat"
## res <- numeric(lenth(pred.var))
## for (i in 1:5) {
##   reduced.model <- update(WCGS.glm7,.~.-unlist(strsplit(mod.pred,"+",fixed=T))[i])
##   res[i] <- exp(coef(reduced.model)[["dibpat"]])
## }

##############################################################################
# WCGS Study: Interaction effect (age under/over 50 vs. arcus senilis)

with(WCGS, table(bage_50,arcus))
xtabs(~as.numeric(chd69)+as.numeric(bage_50)+as.numeric(arcus),data=WCGS)
summary(xtabs(~as.numeric(chd69)+as.numeric(bage_50)+as.numeric(arcus),data=WCGS))

WCGS.glm9 <- glm(chd69~bage_50*arcus,data=WCGS,family=binomial())
summary(WCGS.glm9)

exp(coef(WCGS.glm9))  # OR for coefficient bage_50 is the OR for age 50-59 vs.
                      # age 39-49, for people without arcus

# estimate OR for Arcus-Age interaction
# (we use the contrast() function in the Design package)
library(Design)
WCGS$bage_50 <- as.factor(WCGS$bage_50)
WCGS.glm9bis <- lrm(chd69~bage_50*arcus,data=WCGS)
cc <- contrast(WCGS.glm9bis,list(bage_50='1',arcus='Exposed'),
               list(bage_50='0',arcus='Exposed'))

# finally, we exponentiate the contrast and its 95% CI to work on the
# probability scale
exp(unlist(cc[2:5]))

# we may also compared the full model with a model without interaction
WCGS.glm10 <- update(WCGS.glm9,.~.-bage_50:arcus)
anova(WCGS.glm10,WCGS.glm9,test="Chi")

# if we now consider age as a continuous variable
WCGS.glm11 <- glm(chd69~age*arcus,data=WCGS,family=binomial())
summary(WCGS.glm11)

# estimate the log odds associated with having arcus for age=55
pred.df2 <- expand.grid(age=seq(min(WCGS$age),max(WCGS$age)),
                       arcus=factor(levels(WCGS$arcus)))
WCGS.glm11.pred <- predict(WCGS.glm11,pred.df2,se=TRUE)
# as we expressed the predicted value on the link scale, we don't have to
# take the log of the predicted values before substracting them
OR.11 <- WCGS.glm11.pred$fit[pred.df2$age==55 & pred.df2$arcus=="Exposed"] -
  WCGS.glm11.pred$fit[pred.df2$age==55 & pred.df2$arcus=="Unexposed"]
# thus, OR equals
exp(OR.11)

# we can also do this using the above contrast() function
WCGS.glm11bis <- lrm(chd69~age*arcus,data=WCGS)
cc2 <- contrast(WCGS.glm11bis,list(age=55,arcus='Exposed'),
                list(age=55,arcus='Unexposed'))
exp(unlist(cc2[2:5]))

# plot log odds as a function of age, separately for individuals with
# and without arcus
interaction.plot(pred.df2$age,pred.df2$arcus,WCGS.glm11.pred$fit,legend=FALSE,
                 xlab="Age (years)",ylab="Log CHD Risk")
legend("bottom",c("Unexposed","Exposed"),lty=1:2,horiz=TRUE)

# compare the p-value for Age x Arcus with that obtained from an additive
# model (i.e. where we model log(P) against a linear combination of predictors)
WCGS.glm12 <- glm((as.numeric(chd69)-1)~bage_50*arcus,data=WCGS,family=poisson)
summary(WCGS.glm12)

# however, I cannot get the Wald test p-value of 0.15 as Vittinghoff et al.
# here, it is estimated as 0.032

##############################################################################
# Prediction using an extended logistic model for CHD Events
WCGS.glm13 <- glm(chd69~age_10+chol_50+sbp_50+bmi_10+smoke+dibpat
                  +bmi_10:chol_50+bmi_10:sbp_50,
                  data=WCGS,family=binomial())
summary(WCGS.glm13)

# Note that we get slighlty different estimates

# predictions for the observed sample (5 individuals taken at random among the
# wole dataset)
set.seed(143)
five.indiv <- sample(1:nrow(WCGS),5,replace=F)
WCGS.glm13.pred <- predict(WCGS.glm13,type="response")[five.indiv]
WCGS.five.indiv <- WCGS[five.indiv,c("chd69","age","chol","sbp","bmi","smoke","dibpat")]
cbind(WCGS.five.indiv,WCGS.glm13.pred)

# compute AUC and plot the AUC
library(Epi)
WCGS.glm13.predAll <- predict(WCGS.glm13,type="response")
tmp <- cbind(predicted=WCGS.glm13.predAll,observed=WCGS$chd69[as.numeric(names(WCGS.glm13.predAll))])
ROC(tmp[,"predicted"],tmp[,"observed"]-1,plot="ROC")

# diagnostic plots (as plotted pp. 189-191)

# plot standardized residuals against obs. number
plot(residuals(WCGS.glm13,type="pearson"),ylab="Standardized Pearson Residuals",
     xlab="Observation Number",pch=19,cex=.8)

# dfbetas
ifl.glm13 <- influence(WCGS.glm13,do.coef=FALSE)
dfbetas.glm13 <- dfbetas(WCGS.glm13)

# plot of the residuals vs. fitted values
plot(residuals(WCGS.glm13,type="response")~WCGS.glm13.predAll,pch=19,cex=.8,
     xlab="Fitted values",ylab="Response Residuals")

# Leverage effect
hi <- influence(WCGS.glm13)$hat
plot(hi~WCGS.glm13.predAll,xlab="Fitted values",
     ylab=expression(paste("Leverage (",h[i],")")),pch=19,cex=.8)
hh <- which(hi>0.06)
text(WCGS.glm13.predAll[hh],hi[hh],hh,cex=.8,pos=4)

##############################################################################
# Case-Control Studies: The Ille-et-Villaine study of esopahgeal cancer
# see e.g. http://www.ncbi.nlm.nih.gov/pubmed/861389

#
# Note:
# Data are not available on companion website. Fortunately, the are included
# in the esoph R dataset.
data(esoph)

# recode tobacco consumption as binary variable
esoph$ditob <- ifelse(as.numeric(esoph$tobgp)==1,"O-9g/day","10+ g/day")
esoph$ditob <- factor(esoph$ditob,levels=c("O-9g/day","10+ g/day"),ordered=TRUE)

with(esoph, tapply(ncases,ditob,sum))

# compute odds ratio for smoking and esophageal cancer

# we first need to recode the case/control classification in the 'long' format
# before using epitab() from epitools package

# TODO...

# runnig an GLM on this dataset (here age is categorical)
esoph.glm <- glm(cbind(ncases,ncontrols)~alcgp+ditob+agegp,
                 data=esoph,family=binomial)
summary(esoph.glm)

#
# Note:
# Vittinghoff et al. take age as a continuous variable (in years), but we
# only have a categorical variable
# To approximate their results, we convert agegp to a numerical variable,
# after taking the mid-group value
esoph$age.num <- strsplit(as.character(esoph$agegp),"-")
for (i in 1:77)
  esoph$age.num[[i]] <- (as.numeric(esoph$age.num[[i]][2])
                         +as.numeric(esoph$age.num[[i]][1]))/2
esoph$age.num <- unlist(esoph$age.num)
esoph$age.num[78:88] <- "75"
esoph$age.num <- as.numeric(esoph$age.num)

esoph.glm2 <- glm(cbind(ncases,ncontrols)~alcgp+ditob+age.num,
                  data=esoph,family=binomial)
summary(esoph.glm2)

# Using this coding, we clearly don't the same results


dev.off() # release the postscript device