Pour les utilisateurs de R, on commence par charger un certain nombre de modules intéressants.
# le package moderne standard pour manipuler ses données
library(tidyverse)
# graphes multiples
library(cowplot)
# quelques couleurs manuelles
"#3d5468"
bleufonce <- "#5b7c98"
bleuclair <- "#ff5555" rose <-
Pour se focaliser sur les stats et uniquement les stats, on travaille sur des jeux de données idéalisés, qui présentent l’avantage :
L’objectif de ce document est de réaliser les simulations, ainsi que diverses représentations graphiques qui seront incluses dans les présentations. C’est l’occasion de partager des morceaux de code R avec des participants qui seraient intéressés.
Le premier jeu de données simule un RCT (Randomized Controlled Trial) ayant pour but de tester et quantifier l’impact d’une intervention de conseil en nutrition et activité physique sur des patients diabétiques.
Les variables d’outcome sont mesurées 6 mois après le début de l’intervention.
control
et treatment
.sex
, age
, age_group
, height
, weight_t0
, bmi
, bmi_categories
, diabetes
, steps_t0
.diff_weight
.diff_steps
.feels_sleepy
.t_compliance
.n_meals_glycemia_out_target
.Pour ceux que ça intéresse, voici la loi dans laquelle les données sont simulées :
250
n_per_arm <-# data that do not depend on the group
data.frame(id = seq(1,500),
d <-group = as.factor(rep(c("control", "treatment"), each=n_per_arm)),
sex = sample(c("F","M"), size=n_per_arm*2, replace=T),
age = round(runif(n=2*n_per_arm, min=19, max=77)),
height = round(rnorm(n=2*n_per_arm, mean=1.7, sd=0.1), digits=2),
weight_t0 = round(rnorm(n=2*n_per_arm, mean=70, sd=10), 2),
diabetes = sample(c("I", "II"), size=2*n_per_arm, replace=T),
steps_t0 = round(rnorm(n=2*n_per_arm, mean=2500, sd=500)),
t_compliance = round(rexp(n=2*n_per_arm, rate=0.05)) )
# data that do not depend on the group but depend on previously simulated data
d %>%
d <- mutate(age_group = as.factor(ifelse(age < 40,
"moins de 40",
ifelse(age < 60,
"entre 40 et 60",
"plus de 60"))),
bmi = weight_t0 / height^2,
bmi_group = as.factor(ifelse(bmi < 18.5,
"underweight",
ifelse(bmi < 25,
"normal",
ifelse(bmi < 30,
"overweight",
"obese")))) )
# possibilities for the "feels_sleepy" variable
c("morning", "afternoon", "always")
modalites <-
function(x){
f_logistic <-return (1 / (1 + exp(-x)))
}
# data that do depend on the group
d %>% mutate(diff_weight = rnorm(n=n_per_arm*2,
d <-mean=-0.04 * t_compliance * ifelse(group=="treatment",1,0) +
ifelse(diabetes == "I", -0.5, 0) +
-(bmi-20)/10,
sd=2.5),
diff_steps = rnorm(n=n_per_arm*2,
mean=25 * t_compliance * ifelse(group=="treatment", 1, 0),
sd=500),
is_depressed = rbinom(n=2*n_per_arm,
size=1,
prob=sapply(X=(weight_t0-70)/5 + ifelse(group=="treatment", -1, 0),
FUN=f_logistic) ),
feels_sleepy = c( sample(modalites, n_per_arm, prob = c(0.3, 0.4, 0.3), replace = TRUE),
sample(modalites, n_per_arm, prob = c(0.2, 0.3, 0.5), replace = TRUE) ),
n_meals_glycemia_out_of_target = rpois(n=2*n_per_arm,
lambda=exp((bmi-20)/5) *
ifelse(group=="control" & diabetes== "I",
8,
ifelse(group=="control" & diabetes=="II",
5,
ifelse(group=="treatment" & diabetes=="I",
4,
2)))))
# last, the outcome at the end of the trial
d %>%
d <- mutate(weight_tf = weight_t0 + diff_weight,
steps_tf = steps_t0 + diff_steps)
On enregistre le dataset dans le fichier suivant :
write.csv(d, "etude_fictive_diabetes.csv")
On dispose ici de 50 souris, et on s’intéresse à leur système cardio-vasculaire. On souhaite quantifier l’effet de plusieurs interventions qui s’effectuent chacune à l’échelle d’une journée :
Après chaque intervention, on entregistre :
En plus de ça, on possède des mesures de baseline de différentes quantités, qu’on va supposer fixes durant l’expérience :
50
n_mouse <- c("low sugar", "high sugar")
diets <- c("I", "II", "III", "IV")
exercises <-
# data that do not depend on the group
tibble(id = rep(seq(1,n_mouse), each= length(diets)*length(exercises)),
d2 <-diet = rep(rep(diets, each=length(exercises)), n_mouse),
exercise = rep(rep(exercises, length(diets)), n_mouse),
weight = round(rep( rnorm(n=n_mouse, mean=20, sd=4), each=length(diets)*length(exercises))),
lineage = rep( sample(c("wild type", "KO"), size=n_mouse, replace=T), each=length(diets)*length(exercises)),
sex = rep( sample(c("F", "M"), size=n_mouse, replace=T), each=length(diets)*length(exercises)),
crp = round(rep( rnorm(n=n_mouse, mean=2000, sd=500), each=length(diets)*length(exercises))),
alea = rep( rnorm(n=n_mouse, mean=0, sd=1), each=length(diets)*length(exercises)) )
# data that do depend on the group
d2 %>% mutate(fc = rnorm(n=n_mouse*length(diets)*length(exercises),
d2 <-mean= 600 + alea*5 + ifelse(sex=="F", 10, 0) + (weight-20)*10 + ifelse(exercise=="IV", -50, 0) + ifelse(diet=="high sugar", 50, 0) + ifelse(lineage=="KO", 20, 0),
sd = 50),
attitude = rbinom(n=n_mouse*length(diets)*length(exercises),
size=1,
prob=sapply(X=alea + (crp-2000)/500 + ifelse(diet=="high sugar", 1, 0),
FUN=f_logistic) ),
n_extrasystole = rpois(n=n_mouse*length(diets)*length(exercises),
lambda= exp(alea + ifelse(diet=="high sugar", 1, 0) + (weight-20)/10 + ifelse(lineage=="KO", 2,0)) ))
d2 %>%
d2 <- mutate(attitude = ifelse(attitude==1, "sociable", "prostrate")) %>%
select(-alea)
On enregistre le dataset dans le fichier suivant :
write.csv(d2, "etude_fictive_souris.csv")
function(d){
get_barplots <-
d %>% ggplot(aes(x=is_depressed, fill=group)) +
p1 <- geom_bar(alpha=0.5, position="identity") +
theme_bw() +
ylab("effectif") +
xlab("dépression du patient") +
scale_fill_manual(values=c(bleuclair, rose))
d %>%
m <- group_by(group) %>%
summarise(mean=mean(is_depressed),
ci_low=prop.test(sum(is_depressed), n=250)$conf.int[1],
ci_up=prop.test(sum(is_depressed), n=250)$conf.int[2])
p1 +
p2 <- geom_vline(data=m, aes(xintercept=mean, col=group)) +
scale_color_manual(values=c(bleuclair, rose))
p2 +
p3 <- geom_vline(data=m, aes(xintercept=ci_low, col=group), linetype="dashed") +
geom_vline(data=m, aes(xintercept=ci_up, col=group), linetype="dashed")
return(list(without_mean = p1, with_mean = p2, with_ci = p3))
} function(d){
get_histograms <-
d %>% ggplot(aes(x=diff_weight, fill=group)) +
p1 <- geom_histogram(alpha=0.5, position="identity", bins=30) +
theme_bw() +
ylab("effectif") +
xlab("différence de poids à 6 mois") +
scale_fill_manual(values=c(bleuclair, rose))
d %>%
m <- group_by(group) %>%
summarise(mean=mean(diff_weight),
ci_low=t.test(diff_weight)$conf.int[1],
ci_up=t.test(diff_weight)$conf.int[2])
p1 +
p2 <- geom_vline(data=m, aes(xintercept=mean, col=group)) +
scale_color_manual(values=c(bleuclair, rose))
p2 +
p3 <- geom_vline(data=m, aes(xintercept=ci_low, col=group), linetype="dashed") +
geom_vline(data=m, aes(xintercept=ci_up, col=group), linetype="dashed")
return(list(without_mean = p1, with_mean = p2, with_ci = p3))
}
get_barplots(d)
pb <- get_histograms(d)
ph <-
12
w <- 5
h <-
plot_grid(pb$without_mean, ph$without_mean)
plot_raw <- plot_raw
ggsave("../../Figures/illu_diabete_raw_data.pdf", plot=plot_raw, height=h, width=w)
plot_grid(pb$with_mean, ph$with_mean)
plot_mean <- plot_mean
ggsave("../../Figures/illu_diabete_raw_data_mean.pdf", plot=plot_mean, height=h, width=w)
plot_grid(pb$with_ci, ph$with_ci)
plot_ci <- plot_ci
ggsave("../../Figures/illu_diabete_raw_data_ci.pdf", plot=plot_ci, height=h, width=w)
ggsave("../../Figures/illu_diabete_diff_weight_two_means.pdf", plot=ph$with_mean, height=h, width=w)
d %>% ggplot(aes(x=diff_weight)) +
p_raw <- geom_histogram(alpha=0.5, position="identity", bins=30, fill=bleuclair) +
theme_bw() +
ylab("effectif") +
xlab("différence de poids à 6 mois")
d %>%
m <- summarise(mean=mean(diff_weight),
median=median(diff_weight),
sd=sd(diff_weight),
q1=quantile(diff_weight, probs=0.025),
q2=quantile(diff_weight, probs=0.975))
p_raw +
p_mean <- geom_vline(data=m, aes(xintercept=mean), col=bleufonce)
p_mean +
p_mean_median <- geom_vline(data=m, aes(xintercept=median), col=rose)
p_raw +
p_quantiles <- geom_vline(data=m, aes(xintercept=q1), col=rose) +
geom_vline(data=m, aes(xintercept=q2), col=rose)
p_raw
p_mean
p_mean_median
p_quantiles
ggsave("../../Figures/illu_diabete_histogram.pdf", plot=p_raw, height=h, width=w)
ggsave("../../Figures/illu_diabete_mean.pdf", plot=p_mean, height=h, width=w)
ggsave("../../Figures/illu_diabete_median.pdf", plot=p_mean_median, height=h, width=w)
ggsave("../../Figures/illu_diabete_quantiles.pdf", plot=p_quantiles, height=h, width=w)
Pour la variable t_compliance
avec sa moyenne et sa médiane :
d %>% ggplot(aes(x=t_compliance)) +
pt_raw <- geom_histogram(alpha=0.5, position="identity", bins=30, fill=bleuclair) +
theme_bw() +
ylab("effectif") +
xlab("temps de suivi de l'intervention")
d %>%
mt <- summarise(mean=mean(t_compliance),
median=median(t_compliance),
sd=sd(t_compliance),
q1=quantile(t_compliance, probs=0.025),
q2=quantile(t_compliance, probs=0.975))
pt_raw +
pt_mean <- geom_vline(data=mt, aes(xintercept=mean), col=bleufonce)
pt_mean +
pt_mean_median <- geom_vline(data=mt, aes(xintercept=median), col=rose)
pt_raw +
pt_quantiles <- geom_vline(data=mt, aes(xintercept=q1), col=rose) +
geom_vline(data=mt, aes(xintercept=q2), col=rose)
pt_raw
pt_mean
pt_mean_median
pt_quantiles
ggsave("../../Figures/illu_diabete_t_histogram.pdf", plot=pt_raw, height=h, width=w)
ggsave("../../Figures/illu_diabete_t_mean.pdf", plot=pt_mean, height=h, width=w)
ggsave("../../Figures/illu_diabete_t_median.pdf", plot=pt_mean_median, height=h, width=w)
ggsave("../../Figures/illu_diabete_t_quantiles.pdf", plot=pt_quantiles, height=h, width=w)
Les graphes permettant de donner un aperçu de différentes variables de types différents :
d %>% ggplot(aes(x=feels_sleepy, fill=group)) +
pm <- geom_bar(position="dodge", alpha=0.5) +
scale_fill_manual(values=c(bleuclair, rose)) +
theme_bw()
d %>% ggplot(aes(x=n_meals_glycemia_out_of_target, fill=group)) +
pc <- geom_histogram(alpha=0.5, position="identity", bins=30) +
scale_fill_manual(values=c(bleuclair, rose)) +
theme_bw()
$without_mean ph
$without_mean pb
pm
pc
ggsave("../../Figures/illu_diabete_diff_weight.pdf", plot=ph$without_mean, height=h, width=w)
ggsave("../../Figures/illu_diabete_is_depressed.pdf", plot=pb$without_mean, height=h, width=w)
ggsave("../../Figures/illu_diabete_feels_sleepy.pdf", plot=pm, height=h, width=w)
ggsave("../../Figures/illu_diabete_n_meals.pdf", plot=pc, height=h, width=w)
Une série de simulations sous H0 (pas de différence entre les deux groupes), pour observer l’amplitude de différence entre les deux moyennes estimées.
for (i in 1:5){
tibble(group = rep(c("control", "treatment"), each=n_per_arm)) %>%
dh0 <- mutate(diff_weight = c(rnorm(n_per_arm, mean=0, sd=4), rnorm(n_per_arm, mean=0, sd=4)))
get_histograms(dh0)
p <-ggsave(paste("../../Figures/data_diff_weight_h0_",i,".pdf",sep=""),
plot = p$with_mean,
width = w,
height = h)
}
Une série de simulations sous H1 (il y a une différence entre les deux groupes), pour observer l’amplitude de différence entre les deux moyennes estimées.
for (i in 1:5){
tibble(group = rep(c("control", "treatment"), each=n_per_arm)) %>%
dh1 <- mutate(diff_weight = c(rnorm(n_per_arm, mean=0, sd=4), rnorm(n_per_arm, mean=-0.5, sd=4)))
get_histograms(dh1)
p <-ggsave(paste("../../Figures/data_diff_weight_h1_",i,".pdf",sep=""),
plot = p$with_mean,
width = w,
height = h)
}
Pour visualiser l’allure d’une loi binomiale avec 250 tirages de Bernoulli, et une probabilité de succès de 0.5.
tibble(x = seq(90, 160),
dbinom <-y = dbinom(x, size=n_per_arm, prob=0.5))
dbinom %>% ggplot(aes(x=x, y=y)) +
p_illu_binom <- geom_bar(stat="identity", fill=bleuclair) +
xlab("nombre de dépressions sur 250 patients") +
ylab("probabilité") +
theme_bw()
2 <- p_illu_binom +
p_illu_binom_ geom_vline(xintercept = qbinom(c(0.025, 0.975), size=250, prob=0.5), col=rose)
2 p_illu_binom_
ggsave("../../Figures/allure_binomiale_250.pdf", plot=p_illu_binom, width=w, height=h)
ggsave("../../Figures/allure_binomiale_250_quantiles.pdf", plot=p_illu_binom_2, width=w, height=h)
Des figures pour illustrer les résidus, la variance, la covariance, la droite de régression…
50
npoints <- tibble(x = runif(npoints, 0, 10),
dreg <-haty = 2*x -5,
y = haty + rnorm(npoints, sd=3))
0 <- dreg %>% ggplot(aes(x=x, y=y)) +
preg_ geom_point() +
ylab("variable de réponse") +
xlab("variable explicative") +
theme_bw()
0 preg_
1 <- preg_0 +
preg_ geom_abline(slope = 2, intercept = -5, col=rose)
1 preg_
2 <- preg_1 +
preg_ geom_segment(data=dreg, aes(x=x, xend=x, y=y, yend=haty), col=bleuclair)
2 preg_
3 <- preg_0 +
preg_ geom_hline(yintercept = 5, col=rose) +
geom_segment(data=dreg, aes(x=x, xend=x, y=y), yend=5, col=bleuclair)
3 preg_
4 <- preg_3 +
preg_ geom_vline(xintercept = 5, col=rose) +
geom_segment(data=dreg, aes(x=x, yend=y, y=y), xend=5, col=bleuclair)
4 preg_
ggsave("../../Figures/reglin_principe_nuage.pdf", plot=preg_0, width=w, height=h)
ggsave("../../Figures/reglin_principe_nuage_abline.pdf", plot=preg_1, width=w, height=h)
ggsave("../../Figures/reglin_principe_nuage_abline_residuals.pdf", plot=preg_2, width=w, height=h)
ggsave("../../Figures/reglin_principe_nuage_hline_variance.pdf", plot=preg_3, width=w, height=h)
ggsave("../../Figures/reglin_principe_nuage_hline_covariance.pdf", plot=preg_4, width=w, height=h)
ggsave("../../Figures/reglin_principe_2_sce.pdf", plot=plot_grid(preg_3, preg_2), width=w, height=h)