Les bases du langage

R Foundation for Statistical Computing

# Simulate n points around the line y = 0.9 + 0.65 x with standard normal noise.
n <- 100
x <- runif(n)
y <- 0.9 + 0.65 * x + rnorm(n)
# Fit a simple linear model and print its summary.
summary(lm(y ~ x))
# Basic atomic objects: a scalar, then numeric, logical and character vectors.
nombre <- 3.141593
v <- c(1, 2, 3, 4)
# Chained assignment: `a` and `b` receive the same two random values, and `u`
# copies them. This runs before the logical mask is created so that the mask
# used for indexing below is not overwritten by random numbers.
a <- b <- rnorm(2)
u <- b
b <- c(TRUE, FALSE, TRUE, FALSE)  # TRUE/FALSE: T and F can be reassigned
s <- c("h", "e", "l", "l", "o")
v[1]
v[c(1, 4)]
v[b]     ## values of v where b is TRUE
s[v[3]]  ## v[3] = 3, hence the 3rd value of s
names(v) <- c("toto", "titi", "tata", "tutu")
v["tata"]
# Simulated sample: ages drawn with replacement from 25..45, plus a sex label.
n <- 100
age <- sample(25:45, size = n, replace = TRUE)
sex <- sample(c("M", "F"), size = n, replace = TRUE)
length(age[sex == "F"])             # number of observations with sex "F"
length(sex[age >= 30 & age <= 40])  # number of observations aged 30 to 40
which(age > 30)                     # positions where age exceeds 30
# Positional and logical indexing select the same elements of a named vector.
x <- c(1, 4, 10, 3, 1)
names(x) <- letters[seq_along(x)]  # seq_along() is safe for empty vectors
idx <- c(1, 3, 4)
g <- c(TRUE, FALSE, TRUE, TRUE, FALSE)  # TRUE/FALSE instead of T/F
all(x[idx] == x[g])
# Two random vectors, a named summary, and row-wise / column-wise binding.
x <- rnorm(100)
res <- c(m = mean(x), s = sd(x))
y <- rnorm(100)
rbind(x, y)
cbind(x, y)
# Cross-product of x with itself, scaled by 1 / (n - 1).
t(x) %*% x * 1 / (length(x) - 1)
# Regular sequences and repetitions.
1:5
seq(1, 5)
rep(c(1, 3), times = 5)
rep(c(1, 3), each = 5)  ## same as rep(c(1, 3), c(5, 5))
# Indexing by position.
x <- c(0, 1, 1, 0, 1, 0, 1, 0, 0)
x[3]
x[length(x)]
x[2:4]
x[c(2, 4)]
seq(1, 10)[seq(1, 10, by = 2)]  ## odd numbers
seq(1, 10)[seq(2, 10, by = 2)]  ## even numbers
# Sorting, ordering and ranking a shuffled copy of 1..10.
x <- 1:10
xs <- sample(x)
sort(xs, decreasing = TRUE)
order(xs)
rank(rev(xs))
rank(sample(LETTERS))
# Ties: default handling versus ranks assigned in order of appearance.
rank(c(1, 3, 5, 5, 2))
rank(c(1, 3, 5, 5, 2), ties.method = "first")
# Missing values: NA propagates through sum() unless na.rm = TRUE is given.
x <- 1:10
sum(x)
x[c(1, 3)] <- NA  ## or is.na(x[c(1, 3)]) <- TRUE
sum(x)
sum(x, na.rm = TRUE)
is.na(x)
sum(is.na(x))
summary(x)
# Build a matrix by binding two vectors as columns.
x1 <- sample(1:100, 10)
x2 <- sample(1:100, 10)
x <- cbind(x1, x2)
x[, "x1"]
# Build a matrix directly and name its columns.
x <- matrix(sample(1:100, 10), ncol = 2)
colnames(x) <- paste0("x", 1:2)
dim(x)
ncol(x)  ## or nrow(x)
x[, 1]   ## same as x[, "x1"]
x[1:3, ]
head(x)
str(x)
# Column means: explicit loop, then the apply() equivalent.
x <- matrix(1:10, nrow = 2, byrow = TRUE)
x[1, 2]
for (j in seq_len(ncol(x))) {  # seq_len() is safe when there are no columns
  print(mean(x[, j]))
}
apply(x, 2, mean)  ## or colMeans(x), more efficient
apply(x, 2, mean, na.rm = TRUE)
# Usage template for factor(): the main arguments are named but left empty.
factor(x, levels=, labels=, ordered=)
# Group-wise summaries of y by a two-level factor.
y <- sample(1:100, 20)
x <- factor(rep(c("a", "b"), 10))
tapply(y, x, mean)
by(y, x, mean)
table(x)
# An ordered factor with three levels and how to inspect it.
x <- factor(rep(c("t1", "t2", "t3"), each = 10), ordered = TRUE)
x[c(1, 3)]
levels(x)     ## unique(x)
nlevels(x)    ## length(unique(x))
is.factor(x)
summary(x)    ## same as table(x)
# Create a factor from a character vector.
x <- rep(letters[1:3], each = 10)
x <- as.factor(x)
## or, equivalently
gl(3, 10, labels = letters[1:3])
# `labels` renames levels; `levels` fixes their order.
factor(rep(1:2, 2), labels = 2:1)
factor(rep(1:2, 2), labels = 2:1, levels = 2:1)
# Change the reference level, rename levels, merge the first two levels.
x <- gl(4, 5, 100)
relevel(x, ref = 4)
levels(x) <- 4:1
levels(x)[1:2] <- "3.5"
# as.numeric() on a factor returns the internal codes, not the labels.
as.numeric(x)
as.numeric(f <- factor(1:2, levels = 2:1)[1])
as.numeric(levels(f))[f]
# Subsetting a factor keeps unused levels; droplevels() removes them.
a <- gl(4, 5, 100)
b <- a[a != 1]
all(levels(a) == levels(b))
d <- droplevels(a[a != 1])
# Build a small data frame from four vectors.
score <- c(1, 5, 3, 1, 5, 2, 2)
x1 <- c("M", "M", "F", "F", "F", "F", "M")
x2 <- c(92, 96, 97, 93, 101, 100, 97)
x3 <- c("C", "A", "C", "C", "C", "B", "B")
a <- data.frame(score, gender = x1, IQ = x2, SES = x3)
dim(a)
summary(a)
str(a)
as.matrix(a)
# Addressing rows, columns and subsets.
a[, 1:2]
a[1:3, c(2, 4)]
a$score[1:3]
a$score[a$gender == "M"]
with(a, score[gender == "M"])
## or, equivalently
subset(a, gender == "M")
a$score[a$SES %in% c("A", "C")]
summary(a)
str(a)
dim(a)
colnames(a)
# Adding a column and a row. The new row is given as a one-row data frame so
# that the numeric columns are not coerced to character.
cbind(a, 1:7)
rbind(a, data.frame(score = 3, gender = "M", IQ = 100, SES = "C"))
# `gender` only exists as a column of `a`; calling is.factor(gender) directly
# fails with "object 'gender' not found", so the column is addressed through `a`.
is.factor(a$gender)
within(a, SES <- factor(SES, labels = c("low", "mid", "high")))
# General form of the call: within(data, expr, ...)
# Import external files given by relative path: a whitespace-separated table
# with a header line, and a CSV file.
blood <- read.table("blood.txt", header = TRUE)
cereal <- read.csv("cereal.csv")
# Load the birthwt data set from MASS and inspect it.
data(birthwt, package = "MASS")
str(birthwt)
summary(birthwt)
# Recode the categorical variables as labelled factors.
birthwt <- transform(
  birthwt,
  low = factor(low, labels = c("No", "Yes")),
  race = factor(race, labels = c("White", "Black", "Other")),
  smoke = factor(smoke, labels = c("No", "Yes")),
  ui = factor(ui, labels = c("No", "Yes")),
  ht = factor(ht, labels = c("No", "Yes"))
)
library(Hmisc)
library(lattice)  # provides parallelplot() and splom() used below
# Attach measurement units to two variables.
birthwt <- within(birthwt, {
  units(age) <- "years"
  units(lwt) <- "pounds"
})
# Standardise the numeric columns and display them.
idx <- vapply(birthwt, is.numeric, logical(1))
bwt <- apply(birthwt[, idx], 2, scale)
boxplot(bwt)
# parallelplot() replaces the former lattice::parallel(); the argument name is
# spelled out instead of relying on partial matching of `horiz`.
parallelplot(bwt, groups = birthwt$low, horizontal.axis = FALSE)
# NOTE(review): filter.perc() is not defined in the visible part of this
# script; presumably it returns the indices of values outside the cutoffs.
idx <- apply(bwt, 2, filter.perc, cutoff = c(.01, .99), collate = TRUE)
# Colour code: 2 for rows flagged in `idx`, 1 otherwise.
my.col <- as.numeric(seq_len(nrow(bwt)) %in% unique(unlist(idx))) + 1
splom(~ bwt, pch = 19, col = my.col, alpha = .5, cex = .6)
summary(low ~ ., data = birthwt[, -10], method = "reverse")
library(latticeExtra)
# Marginal distribution of every variable, split by birth-weight status.
marginal.plot(birthwt, data = birthwt, groups = low)
# Dichotomise ftv and ptl (column 10 is dropped first). `labels` is spelled
# out in full rather than relying on partial matching of `lab`.
bwt.df <- transform(birthwt[, -10],
                    ftv = factor(ftv > 0, labels = c("No", "Yes")),
                    ptl = factor(ptl > 0, labels = c("None", "1+")))
summary(low ~ ., data = bwt.df, method = "reverse")
mean(birthwt$lwt / 2.2)  ## mother's weight in kg
summary(lwt / 2.2 ~ low + race, data = birthwt)
stripplot(~ age, data = birthwt, jitter.data = TRUE,
          amount = .3, aspect = .3, cex = .6)
## Numerical summary of age (for a numeric vector summary() reports the mean
## in addition to Tukey's five numbers), then selected quantiles
summary(birthwt$age)
quantile(birthwt$age, probs = c(.1, .25, .5, .75, .9))
# Dispersion summary of a numeric vector.
#   x:   numeric vector
#   dig: number of decimal places kept (default 2)
# Returns a named vector holding the standard deviation ("ety"), the
# interquartile range ("iqr") and the range width ("max-min"), all rounded.
desc <- function(x, dig=2) {
  extremes <- range(x)
  spread <- c(ety = sd(x), iqr = IQR(x),
              "max-min" = extremes[2] - extremes[1])
  round(spread, digits = dig)
}
# Dispersion summary of age, then a quantile plot against the uniform
# distribution. `distribution` is spelled out in full rather than relying on
# partial matching of `dist`.
desc(birthwt$age)
qqmath(~ age, data = birthwt, distribution = qunif)
# Sample median computed by hand.
#   x:     numeric vector
#   na.rm: if TRUE, missing values are removed first; if FALSE (default) and
#          x contains an NA, NA is returned.
# Returns the middle value of the sorted data (odd length), the mean of the two
# middle values (even length), or NA for empty input.
med <- function(x, na.rm = FALSE) {
  if (na.rm) {
    x <- x[!is.na(x)]
  } else if (anyNA(x)) {
    # sort() silently drops NAs, which would shift the middle position.
    return(NA_real_)
  }
  n <- length(x)
  if (n == 0L) {
    return(NA_real_)
  }
  sorted <- sort(x)  # sort once and reuse
  half <- n %/% 2L
  if (n %% 2L == 0L) {
    (sorted[half] + sorted[half + 1L]) / 2
  } else {
    sorted[half + 1L]
  }
}
# Univariate displays: box plot, histogram of counts, skewness and density.
bwplot(~ age, data = birthwt)
histogram(~ lwt, data = birthwt, type = "count")
library(e1071)
skewness(birthwt$lwt)
densityplot(~ lwt, data = birthwt)
# Bivariate numerical summaries: Pearson and Spearman correlation.
with(birthwt, cor(lwt, bwt))
with(birthwt, cor(lwt, bwt, method = "spearman"))
# Summaries by group.
with(birthwt, by(age, low, summary))
with(birthwt, tapply(lwt, race, function(x) c(mean(x), sd(x))))
summary(race ~ lwt, data = birthwt, method = "reverse")
print(summary(race ~ lwt, birthwt, method = "reverse"), prmsd = TRUE)
# Scatter plot with a regression line, then conditional distributions.
xyplot(lwt/2.2 ~ bwt/1000, data = birthwt, type = c("p", "r"))
bwplot(low ~ age, data = birthwt, panel = Hmisc::panel.bpplot)
densityplot(~ age, data = birthwt, groups = low,
            auto.key = list(columns = 2), plot.points = FALSE)
# Mean age per race, shown as a dot plot.
age.by.race <- aggregate(age ~ race, data = birthwt, FUN = mean)
dotplot(race ~ age, data = age.by.race)
# Conditioning syntax used in lattice formulas.
y ~ x | a      ## y as a function of x, conditional on a
y ~ x | a + b  ## y as a function of x, conditional on a and b
xyplot(bwt/1000 ~ lwt/2.2 | smoke, data = birthwt,
       groups = ptl > 0)
bwplot(age ~ low | smoke + race, data = birthwt)
# One panel per race; point size depends on ftv, colour on smoking status, and
# a dashed horizontal line is drawn at 2500.
xyplot(bwt ~ lwt/2.2 | race, data = birthwt, layout = c(3, 1),
       cex = sqrt(birthwt$ftv + .5), col = birthwt$smoke,
       panel = function(...) {
         panel.xyplot(...)
         panel.abline(h = 2500, lty = 2)
       })
# Overlapping age intervals used as a conditioning variable.
Age <- equal.count(birthwt$age)
# The labels oppose no visit to one or more visits, so the split is ftv > 0
# (the same split applied to ftv earlier in this script), not ftv > 1.
ftvc <- factor(birthwt$ftv > 0,
               labels = c("0 visite", "1+ visite"))
xyplot(bwt/1000 ~ lwt/2.2 | Age + ftvc, data = birthwt,
       groups = low, pch = "+")
# Reload birthwt; the package name must be a character string.
data(birthwt, package = "MASS")
birthwt$smoke <- factor(birthwt$smoke,
                        labels = c("No", "Yes"))
# Student t-test (equal variances) of birth weight by smoking status.
t.test(bwt ~ smoke, data = birthwt, var.equal = TRUE)
# Graphical checks: box plots and normal quantile plots per group
# (`groups` spelled out in full instead of the partial `group`).
bwplot(smoke ~ bwt, data = birthwt)
qqmath(~ bwt, data = birthwt, groups = smoke)
# Paired t-test on the sleep data. The two groups are passed as separate
# vectors because recent R versions reject `paired = TRUE` in the formula
# method; `var.equal` plays no role in a paired test and is omitted.
with(sleep, t.test(extra[group == 1], extra[group == 2], paired = TRUE))
# Scatter plot of the group 2 values against the group 1 values, with a grid.
xyplot(extra[group == 2] ~ extra[group == 1], data = sleep,
       type = c("p", "g"))
# Non-parametric alternative: Wilcoxon signed-rank test on paired measurements.
occ1 <- c(1.83, 0.50, 1.62, 2.48, 1.68, 1.88, 1.55, 3.06, 1.30)
occ2 <- c(0.878, 0.647, 0.598, 2.05, 1.06, 1.29, 1.06, 3.14, 1.29)
wilcox.test(occ1, occ2, paired = TRUE)
library(foreign)
# Read the Stata file and drop its first column.
polymsm <- read.dta("polymorphism.dta")[, -1]
with(polymsm, tapply(age, genotype, mean))
# One-way ANOVA of age by genotype.
aov.res <- aov(age ~ genotype, data = polymsm)
summary(aov.res)
model.tables(aov.res)
plot.design(age ~ genotype, data = polymsm)
bwplot(age ~ genotype, data = polymsm)
# With two groups, the ANOVA and the equal-variance t-test can be compared.
summary(aov(bwt ~ smoke, data = birthwt))
t.test(bwt ~ smoke, data = birthwt, var.equal = TRUE)
# Same ANOVA seen as a comparison of nested models.
aov0 <- aov(age ~ 1, data = polymsm)
aov1 <- aov(age ~ genotype, data = polymsm)
anova(aov0, aov1)
# Non-parametric alternative.
kruskal.test(age ~ genotype, data = polymsm)
# Pearson correlation test between HAEMO and PCV, with a scatter plot.
paint <- read.table("PAINT.DAT", header = TRUE)
with(paint, cor.test(HAEMO, PCV))
xyplot(PCV ~ HAEMO, data = paint, type = c("p", "g", "r"))
# Spearman correlation between binge and purge, restricted to time == 1.
library(foreign)
anorex <- read.spss("anorectic.sav", to.data.frame = TRUE)
with(subset(anorex, time == 1),
     cor.test(binge, purge, method = "spearman"))
# 2 x 2 table: treatment in rows, outcome in columns (filled column-wise).
aspirin <- matrix(c(28, 18, 656, 658), nrow = 2)
dimnames(aspirin) <- list(c("Placebo", "Aspirin"), c("Yes", "No"))
library(vcd)
# Odds ratio on the natural scale with its confidence interval, then the
# summary of the default oddsratio() call.
asp.or <- oddsratio(aspirin, log = FALSE)
print(list(or = asp.or, conf.int = confint(asp.or)))
summary(oddsratio(aspirin))
# 3 x 4 contingency table: marital status by consumption class.
coffee <- matrix(c(652, 1537, 598, 242,
                   36, 46, 38, 21,
                   218, 327, 106, 67),
                 nrow = 3, byrow = TRUE)
dimnames(coffee) <- list(
  "marital status" = c("Married", "Prev.married", "Single"),
  consumption = c("0", "1-150", "151-300", ">300")
)
prop.table(coffee, 1)  # row proportions
(chsq <- chisq.test(coffee))
chsq$residuals
# melt() is needed here, but this script attaches reshape only further down;
# attach it now so the call resolves.
# NOTE(review): the `marital.status` column name assumes reshape's melt().
library(reshape)
dotplot(consumption ~ value, data = melt(coffee),
        groups = marital.status)
# 2 x 2 table of paired binary outcomes.
x <- cbind(c(50, 10), c(20, 40))
dimnames(x) <- list(c("+", "-"), c("+", "-"))
margin.table(x, 1)
# NOTE(review): the same McNemar call appears twice; one of them was perhaps
# meant to use different options. Confirm against the slides.
mcnemar.test(x)
mcnemar.test(x)
# Binomial test applied to the row totals of x.
binom.test(apply(x, 1, sum), .5)
# Fisher's exact test, one-sided, on the tea-tasting table.
TeaTasting <- matrix(c(3, 1, 1, 3), nrow = 2,
                     dimnames = list(Guess = c("Milk", "Tea"),
                                     Truth = c("Milk", "Tea")))
fisher.test(TeaTasting, alternative = "greater")
# Cross-tabulate the counts y by the two ratings A and B.
data(pathologist.dat, package = "exactLoglinTest")
patho <- xtabs(y ~ A + B, data = pathologist.dat)
# Proportion of counts on the diagonal (observed agreement).
sum(diag(patho)) / sum(patho)
# Expand the table to one row per observation, then compute Cohen's kappa.
library(reshape)
patho.expand <- untable(pathologist.dat[, 2:3],
                        pathologist.dat$y)
library(irr)
kappa2(patho.expand)
# Simulate 10 points around the line y = 5.1 + 1.8 x and fit a linear model.
n <- 10
x <- runif(n, min = 0, max = 10)
y <- 5.1 + 1.8 * x + rnorm(n)
summary(lm(y ~ x))
# Simple linear regression of PCV on HAEMO.
paint <- read.table("PAINT.DAT", header = TRUE)
xyplot(PCV ~ HAEMO, data = paint, type = c("p", "r"))
lm.fit <- lm(PCV ~ HAEMO, data = paint)
summary(lm.fit)
confint(lm.fit)
anova(lm.fit)
# Residual diagnostics and influence measures.
xyplot(resid(lm.fit) ~ HAEMO, data = paint)
xyplot(resid(lm.fit) ~ fitted(lm.fit))
influence.measures(lm.fit)
# Fitted values, then predictions for HAEMO = 13, 14, ..., 18.
fitted(lm.fit)
predict(lm.fit, data.frame(HAEMO = seq(13, 18, by = 1)))
# Design matrix produced for a five-level factor.
x <- gl(5, 1, 10, labels = letters[1:5])
model.matrix(rnorm(10) ~ x)
# Cut HAEMO into 10 groups and fit the same model with aov() and lm().
haemo.dec <- cut2(paint$HAEMO, g = 10)
fm <- PCV ~ haemo.dec
summary(aov.fit <- aov(fm, data = paint))
summary(lm.fit <- lm(fm, data = paint))
# Residual variance. The summary is recomputed explicitly because .Last.value
# is only set at the interactive top level, not when the script is sourced.
summary(lm.fit)$sigma^2
# Differences between each group mean and the first one, next to the
# coefficients of the linear model.
grp.means <- tapply(paint$PCV, haemo.dec, mean)
grp.means[2:10] - grp.means[1]
coef(lm.fit)
# Contrast matrices for a ten-level factor.
contr.treatment(10)
contr.sum(10)
contr.helmert(10)
# Model formula with an interaction, and how to remove a term with update().
# NOTE(review): `dat` is not defined in the visible part of this script; this
# looks like a template call, so confirm before running.
fm <- y ~ x + a * b
mod1 <- lm(fm, data = dat)
update(mod1, . ~ . - a:b)  ## drops the A x B interaction
# Two-way layout: tooth length by supplement type and dose (dose as a factor).
data(ToothGrowth)
ToothGrowth$dose <- factor(ToothGrowth$dose)
fm <- len ~ supp * dose
replications(fm, data = ToothGrowth)  # number of replicates per term
library(Hmisc)
# Column-wise mean and standard deviation of a matrix-like object.
# Returns a matrix with rows "mean" and "sd" and one column per column of x.
f <- function(x) {
  apply(x, 2, function(column) c(mean = mean(column), sd = sd(column)))
}
# Cell means and standard deviations through the formula summary.
summary(fm, data = ToothGrowth, fun = f)
# Table of mean length: supplement in rows, dose in columns.
library(reshape2)
m <- acast(ToothGrowth, supp ~ dose, mean, value.var = "len")
xyplot(len ~ dose, data = ToothGrowth, groups = supp,
       type = c("p", "a"))
# Two-way ANOVA with interaction and table of cell means.
aov.fit <- aov(fm, data = ToothGrowth)
summary(aov.fit)
model.tables(aov.fit, type = "means", se = TRUE,
             cterms = "supp:dose")
apply(m, 2, diff)  # difference between the two supplements at each dose
# Diagnostics: residual quantile plot, box plots per cell, Bartlett's test.
qqmath(~ resid(aov.fit))
bwplot(len ~ interaction(supp, dose), data = ToothGrowth)
bartlett.test(len ~ interaction(supp, dose), data = ToothGrowth)
# anorexia ships with MASS, which this script never attaches, so the package
# is named explicitly (as done for birthwt elsewhere in the script).
data(anorexia, package = "MASS")
anorexia$Treat <- relevel(anorexia$Treat, ref = "Cont")
# Analysis of covariance without, then with, the Prewt x Treat interaction.
anorex.aov0 <- aov(Postwt ~ Prewt + Treat, data = anorexia)
anorex.aov1 <- aov(Postwt ~ Prewt * Treat, data = anorexia)
summary(anorex.aov0)
xyplot(Postwt ~ Prewt, data = anorexia, groups = Treat,
       aspect = "iso", type = c("p", "r"))
anova(anorex.aov0, anorex.aov1)
summary.lm(anorex.aov1)
# Same additive model with Prewt also entered as an offset.
lm(Postwt ~ Prewt + Treat + offset(Prewt), data = anorexia)
# Repeated measures: reshape to long format with one row per subject and
# variable.
chs <- read.table("cholesterol.txt", header = TRUE)
chs$Subject <- factor(chs$Subject)  ## important
chs <- melt(chs, id.vars = "Subject")
# ANOVA with a subject error stratum.
aov1 <- aov(value ~ variable + Error(Subject), data = chs)
summary(aov1)
# Equivalent random-intercept model.
library(nlme)
lme1 <- lme(value ~ variable, data = chs,
            random = ~ 1 | Subject)
summary(lme1)
anova(lme1)
# NOTE(review): the two constants are presumably the standard deviations
# reported by summary(lme1); confirm.
31.96^2 / (31.96^2 + 7.61^2)  ## ICC
intervals(lme1)
# Keep complete cases whose id is not 9999, and shift sex down by one.
fhs <- read.csv("Framingham.csv")
fhs <- subset(fhs, fhs$id != 9999 & complete.cases(fhs))
fhs$sex <- fhs$sex - 1
fm <- log(sbp) ~ log(bmi) + age + log(scl) + sex +
  sex:log(bmi) + sex:age + sex:log(scl)
mod0 <- lm(log(sbp) ~ 1, data = fhs)  # intercept-only model
mod1 <- lm(fm, data = fhs)            # full model
summary(mod1)
# addterm() belongs to MASS, which this script never attaches, so it is
# called through its namespace.
MASS::addterm(mod0, mod1)  ## or add1()
drop1(mod1, test = "F")
st.fwbw <- step(mod1)      ## backward + forward
# Forward selection: build the upper scope from the term labels of `fm`.
preds <- attr(terms(fm), "term.labels")
preds <- paste("~", paste(preds, collapse = "+"))
step(mod0, scope = as.formula(preds), direction = "forward")
# Three ways of specifying a binomial response: a single response variable, a
# two-column matrix of counts, or a proportion with weights.
# NOTE(review): n1 and n0 are not defined in the visible part of this script;
# these look like template calls.
glm(y ~ x, family = binomial)
glm(cbind(n1, n0) ~ x, family = binomial)
glm(n1/n ~ x, family = binomial, weights = n)
# Grouped data read from file; the two class variables are turned into
# factors labelled with their class intervals.
bp <- read.table("hdis.dat", header = TRUE)
blab <- c("<117", "117-126", "127-136", "137-146",
          "147-156", "157-166", "167-186", ">186")
clab <- c("<200", "200-209", "210-219", "220-244",
          "245-259", "260-284", ">284")
bp <- within(bp, {
  bpress <- factor(bpress, labels = blab)
  chol <- factor(chol, labels = clab)
})
# Ratio hdis/total cross-tabulated by the two factors, rounded to 2 decimals.
round(xtabs(hdis/total ~ bpress + chol, data = bp), 2)
# Midpoint of an interval written as "lower-upper".
#   x: character string such as "117-126"
# Returns half the sum of the numbers found between the "-" separators.
midpoint <- function(x) {
  bounds <- strsplit(x, "-")
  total <- sum(as.numeric(unlist(bounds)))
  total / 2
}
# Midpoints of the closed classes (the first and eighth levels are open-ended).
# vapply() guarantees one numeric value per level.
val <- vapply(levels(bp$bpress)[-c(1, 8)], midpoint, numeric(1))
# Sum columns 3 and 4 of bp within each level of its first column.
dfrm <- aggregate(bp[, 3:4], list(bpress = bp[, 1]), sum)
# Numeric scores: the two open-ended classes are set 10 below the first
# midpoint and 15 above the last one.
dfrm$bpress <- c(val[1] - 10, val, val[6] + 15)
# Logistic regression on grouped data (successes, failures).
mod1 <- glm(cbind(hdis, total - hdis) ~ bpress,
            data = dfrm, family = binomial)
summary(mod1)
# Logistic regression of low birth weight on four predictors.
data(birthwt, package = "MASS")
ethn <- c("White", "Black", "Other")
birthwt$race <- factor(birthwt$race, labels = ethn)
fm <- low ~ age + lwt + race + ftv
glm1 <- glm(fm, data = birthwt, family = binomial)
summary(glm1)
confint(glm1)
# Predicted log-odds at mean age and weight, race "White" and ftv = 0, then
# converted to a probability.
log.odds <- predict(glm1,
                    data.frame(age = mean(birthwt$age),
                               lwt = mean(birthwt$lwt),
                               race = "White", ftv = 0))
exp(log.odds) / (1 + exp(log.odds))  ## 0.2483566
# Reduced model without age and ftv, compared with the full model.
glm2 <- update(glm1, . ~ . - age - ftv)
anova(glm2, glm1, test = "Chisq")
# Contribution of race.
# NOTE(review): the statistic below is presumably copied by hand from the
# anova() output just above.
anova(update(glm2, . ~ . - race), glm2)
pchisq(5.4316, 2, lower.tail = FALSE)
# Marginal model plot and influence measures for the reduced model.
library(car)
mmps(glm2, terms = ~lwt)
influence.measures(glm2)
# Refit with rms::lrm(); datadist is registered through options() first.
library(rms)
ddist <- datadist(birthwt)
options(datadist = "ddist")
glm2b <- lrm(low ~ lwt + race, data = birthwt)
glm2b
exp(coef(glm2b))  # exponentiated coefficients
Predict(glm2b, lwt, race, fun = plogis, conf.type = "mean")
# Cross-table of predictions from glm1 (cut at 0.5) against the observed
# outcome.
table(predict(glm1, type = "resp") >= .5, birthwt$low)

Les bases du langage

Introduction

Représentation des données

Exemple de commandes R

Environnement de travail

Obtenir de l'aide

Ressources bibliographiques

Les objets R

Adresser les éléments d'un vecteur

Application

Recap'

Propriétés des vecteurs

Générer des vecteurs

Opérer sur des vecteurs

Valeurs manquantes

Les matrices

Opérations sur les matrices

Les facteurs

Accéder aux informations d'un facteur

Créer des facteurs

Modifier un facteur

Les erreurs fréquentes

Structure de données avancée

Adresser les éléments d'un data.frame

Propriétés d'un data.frame

Opérations sur un data.frame

Importer des données externes

Ce qu'il faut retenir

Analyse exploratoire des données

Objectifs

Un jeu de données d'exemple

Traitement préalable

Valeurs extrêmes, atypiques ou outliers

Résumé de la structure de données

Synthèse numérique et transformation

Synthèse numérique et transformation (2)

Qu'est-ce qu'une distribution?

Résumé numérique

Fonction de répartition

Approche graphique

Histogramme

Histogramme (variations)

Densité empirique

Résumé numérique bivarié

Diagramme de dispersion

Scatterplot smoother

Distributions conditionnelles

Distributions conditionnelles (2)

Mesures agrégées

Croiser plus de deux variables

Illustrations

Illustrations (2)

Ce qu'il faut retenir

Mesures et tests d'association

Objectifs

Hypothèse nulle, risque d'erreur en inférence fréquentiste

Risque de première et deuxième espèce

En résumé

Comparer deux moyennes

Application

Vérification des conditions d'application

Cas des données non indépendantes

Application

Analyse graphique

Alternatives non-paramétriques

Application

Comparer plus de deux moyennes

Illustration

Tableau d'ANOVA

Application

Distributions conditionnelles

Diagnostic du modèle

Diagramme en barres pour publication

Quelques remarques

Alternative non-paramétrique

Mesures de corrélation

Illustration

Application

Approche graphique

Alternative non-paramétrique