Pour les utilisateurs de R, on commence par charger un certain nombre de modules intéressants.
# le package moderne standard pour manipuler ses données
library(tidyverse)
# graphes multiples
library(cowplot)
# quelques couleurs manuelles
bleufonce <- "#3d5468"
bleuclair <- "#5b7c98"
rose <- "#ff5555"Pour se focaliser sur les stats et uniquement les stats, on travaille sur des jeux de données idéalisés, qui présentent l’avantage :
L’objectif de ce document est de réaliser les simulations, ainsi que diverses représentations graphiques qui seront incluses dans les présentations. C’est l’occasion de partager des morceaux de code R avec des participants qui seraient intéressés.
Le premier jeu de données simule un RCT (Randomized Controlled Trial) ayant pour but de tester et quantifier l’impact d’une intervention de conseil en nutrition et activité physique sur des patients diabétiques.
Les variables d’outcome sont mesurées 6 mois après le début de l’intervention.
control et treatment.sex, age, age_group, height, weight_t0, bmi, bmi_categories, diabetes, steps_t0.diff_weight.diff_steps.feels_sleepy.t_compliance.n_meals_glycemia_out_target.Pour ceux que ça intéresse, voici la loi dans laquelle les données sont simulées :
n_per_arm <- 250
# data that do not depend on the group
d <- data.frame(id = seq(1,500),
group = as.factor(rep(c("control", "treatment"), each=n_per_arm)),
sex = sample(c("F","M"), size=n_per_arm*2, replace=T),
age = round(runif(n=2*n_per_arm, min=19, max=77)),
height = round(rnorm(n=2*n_per_arm, mean=1.7, sd=0.1), digits=2),
weight_t0 = round(rnorm(n=2*n_per_arm, mean=70, sd=10), 2),
diabetes = sample(c("I", "II"), size=2*n_per_arm, replace=T),
steps_t0 = round(rnorm(n=2*n_per_arm, mean=2500, sd=500)),
t_compliance = round(rexp(n=2*n_per_arm, rate=0.05)) )
# data that do not depend on the group but depend on previously simulated data
d <- d %>%
mutate(age_group = as.factor(ifelse(age < 40,
"moins de 40",
ifelse(age < 60,
"entre 40 et 60",
"plus de 60"))),
bmi = weight_t0 / height^2,
bmi_group = as.factor(ifelse(bmi < 18.5,
"underweight",
ifelse(bmi < 25,
"normal",
ifelse(bmi < 30,
"overweight",
"obese")))) )
# possibilities for the "feels_sleepy" variable
modalites <- c("morning", "afternoon", "always")
f_logistic <- function(x){
return (1 / (1 + exp(-x)))
}
# data that do depend on the group
d <- d %>% mutate(diff_weight = rnorm(n=n_per_arm*2,
mean=-0.04 * t_compliance * ifelse(group=="treatment",1,0) +
ifelse(diabetes == "I", -0.5, 0) +
-(bmi-20)/10,
sd=2.5),
diff_steps = rnorm(n=n_per_arm*2,
mean=25 * t_compliance * ifelse(group=="treatment", 1, 0),
sd=500),
is_depressed = rbinom(n=2*n_per_arm,
size=1,
prob=sapply(X=(weight_t0-70)/5 + ifelse(group=="treatment", -1, 0),
FUN=f_logistic) ),
feels_sleepy = c( sample(modalites, n_per_arm, prob = c(0.3, 0.4, 0.3), replace = TRUE),
sample(modalites, n_per_arm, prob = c(0.2, 0.3, 0.5), replace = TRUE) ),
n_meals_glycemia_out_of_target = rpois(n=2*n_per_arm,
lambda=exp((bmi-20)/5) *
ifelse(group=="control" & diabetes== "I",
8,
ifelse(group=="control" & diabetes=="II",
5,
ifelse(group=="treatment" & diabetes=="I",
4,
2)))))
# last, the outcome at the end of the trial
d <- d %>%
mutate(weight_tf = weight_t0 + diff_weight,
steps_tf = steps_t0 + diff_steps)On enregistre le dataset dans le fichier suivant :
write.csv(d, "etude_fictive_diabetes.csv")On dispose ici de 50 souris, et on s’intéresse à leur système cardio-vasculaire. On souhaite quantifier l’effet de plusieurs interventions qui s’effectuent chacune à l’échelle d’une journée :
Après chaque intervention, on entregistre :
En plus de ça, on possède des mesures de baseline de différentes quantités, qu’on va supposer fixes durant l’expérience :
n_mouse <- 50
diets <- c("low sugar", "high sugar")
exercises <- c("I", "II", "III", "IV")
# data that do not depend on the group
d2 <- tibble(id = rep(seq(1,n_mouse), each= length(diets)*length(exercises)),
diet = rep(rep(diets, each=length(exercises)), n_mouse),
exercise = rep(rep(exercises, length(diets)), n_mouse),
weight = round(rep( rnorm(n=n_mouse, mean=20, sd=4), each=length(diets)*length(exercises))),
lineage = rep( sample(c("wild type", "KO"), size=n_mouse, replace=T), each=length(diets)*length(exercises)),
sex = rep( sample(c("F", "M"), size=n_mouse, replace=T), each=length(diets)*length(exercises)),
crp = round(rep( rnorm(n=n_mouse, mean=2000, sd=500), each=length(diets)*length(exercises))),
alea = rep( rnorm(n=n_mouse, mean=0, sd=1), each=length(diets)*length(exercises)) )
# data that do depend on the group
d2 <- d2 %>% mutate(fc = rnorm(n=n_mouse*length(diets)*length(exercises),
mean= 600 + alea*5 + ifelse(sex=="F", 10, 0) + (weight-20)*10 + ifelse(exercise=="IV", -50, 0) + ifelse(diet=="high sugar", 50, 0) + ifelse(lineage=="KO", 20, 0),
sd = 50),
attitude = rbinom(n=n_mouse*length(diets)*length(exercises),
size=1,
prob=sapply(X=alea + (crp-2000)/500 + ifelse(diet=="high sugar", 1, 0),
FUN=f_logistic) ),
n_extrasystole = rpois(n=n_mouse*length(diets)*length(exercises),
lambda= exp(alea + ifelse(diet=="high sugar", 1, 0) + (weight-20)/10 + ifelse(lineage=="KO", 2,0)) ))
d2 <- d2 %>%
mutate(attitude = ifelse(attitude==1, "sociable", "prostrate")) %>%
select(-alea)On enregistre le dataset dans le fichier suivant :
write.csv(d2, "etude_fictive_souris.csv")get_barplots <- function(d){
p1 <- d %>% ggplot(aes(x=is_depressed, fill=group)) +
geom_bar(alpha=0.5, position="identity") +
theme_bw() +
ylab("effectif") +
xlab("dépression du patient") +
scale_fill_manual(values=c(bleuclair, rose))
m <- d %>%
group_by(group) %>%
summarise(mean=mean(is_depressed),
ci_low=prop.test(sum(is_depressed), n=250)$conf.int[1],
ci_up=prop.test(sum(is_depressed), n=250)$conf.int[2])
p2 <- p1 +
geom_vline(data=m, aes(xintercept=mean, col=group)) +
scale_color_manual(values=c(bleuclair, rose))
p3 <- p2 +
geom_vline(data=m, aes(xintercept=ci_low, col=group), linetype="dashed") +
geom_vline(data=m, aes(xintercept=ci_up, col=group), linetype="dashed")
return(list(without_mean = p1, with_mean = p2, with_ci = p3))
}
get_histograms <- function(d){
p1 <- d %>% ggplot(aes(x=diff_weight, fill=group)) +
geom_histogram(alpha=0.5, position="identity", bins=30) +
theme_bw() +
ylab("effectif") +
xlab("différence de poids à 6 mois") +
scale_fill_manual(values=c(bleuclair, rose))
m <- d %>%
group_by(group) %>%
summarise(mean=mean(diff_weight),
ci_low=t.test(diff_weight)$conf.int[1],
ci_up=t.test(diff_weight)$conf.int[2])
p2 <- p1 +
geom_vline(data=m, aes(xintercept=mean, col=group)) +
scale_color_manual(values=c(bleuclair, rose))
p3 <- p2 +
geom_vline(data=m, aes(xintercept=ci_low, col=group), linetype="dashed") +
geom_vline(data=m, aes(xintercept=ci_up, col=group), linetype="dashed")
return(list(without_mean = p1, with_mean = p2, with_ci = p3))
}
pb <- get_barplots(d)
ph <- get_histograms(d)
w <- 12
h <- 5
plot_raw <- plot_grid(pb$without_mean, ph$without_mean)
plot_rawggsave("../../Figures/illu_diabete_raw_data.pdf", plot=plot_raw, height=h, width=w)
plot_mean <- plot_grid(pb$with_mean, ph$with_mean)
plot_meanggsave("../../Figures/illu_diabete_raw_data_mean.pdf", plot=plot_mean, height=h, width=w)
plot_ci <- plot_grid(pb$with_ci, ph$with_ci)
plot_ciggsave("../../Figures/illu_diabete_raw_data_ci.pdf", plot=plot_ci, height=h, width=w)
ggsave("../../Figures/illu_diabete_diff_weight_two_means.pdf", plot=ph$with_mean, height=h, width=w)p_raw <- d %>% ggplot(aes(x=diff_weight)) +
geom_histogram(alpha=0.5, position="identity", bins=30, fill=bleuclair) +
theme_bw() +
ylab("effectif") +
xlab("différence de poids à 6 mois")
m <- d %>%
summarise(mean=mean(diff_weight),
median=median(diff_weight),
sd=sd(diff_weight),
q1=quantile(diff_weight, probs=0.025),
q2=quantile(diff_weight, probs=0.975))
p_mean <- p_raw +
geom_vline(data=m, aes(xintercept=mean), col=bleufonce)
p_mean_median <- p_mean +
geom_vline(data=m, aes(xintercept=median), col=rose)
p_quantiles <- p_raw +
geom_vline(data=m, aes(xintercept=q1), col=rose) +
geom_vline(data=m, aes(xintercept=q2), col=rose)
p_rawp_meanp_mean_medianp_quantilesggsave("../../Figures/illu_diabete_histogram.pdf", plot=p_raw, height=h, width=w)
ggsave("../../Figures/illu_diabete_mean.pdf", plot=p_mean, height=h, width=w)
ggsave("../../Figures/illu_diabete_median.pdf", plot=p_mean_median, height=h, width=w)
ggsave("../../Figures/illu_diabete_quantiles.pdf", plot=p_quantiles, height=h, width=w)Pour la variable t_compliance avec sa moyenne et sa médiane :
pt_raw <- d %>% ggplot(aes(x=t_compliance)) +
geom_histogram(alpha=0.5, position="identity", bins=30, fill=bleuclair) +
theme_bw() +
ylab("effectif") +
xlab("temps de suivi de l'intervention")
mt <- d %>%
summarise(mean=mean(t_compliance),
median=median(t_compliance),
sd=sd(t_compliance),
q1=quantile(t_compliance, probs=0.025),
q2=quantile(t_compliance, probs=0.975))
pt_mean <- pt_raw +
geom_vline(data=mt, aes(xintercept=mean), col=bleufonce)
pt_mean_median <- pt_mean +
geom_vline(data=mt, aes(xintercept=median), col=rose)
pt_quantiles <- pt_raw +
geom_vline(data=mt, aes(xintercept=q1), col=rose) +
geom_vline(data=mt, aes(xintercept=q2), col=rose)
pt_rawpt_meanpt_mean_medianpt_quantilesggsave("../../Figures/illu_diabete_t_histogram.pdf", plot=pt_raw, height=h, width=w)
ggsave("../../Figures/illu_diabete_t_mean.pdf", plot=pt_mean, height=h, width=w)
ggsave("../../Figures/illu_diabete_t_median.pdf", plot=pt_mean_median, height=h, width=w)
ggsave("../../Figures/illu_diabete_t_quantiles.pdf", plot=pt_quantiles, height=h, width=w)Les graphes permettant de donner un aperçu de différentes variables de types différents :
pm <- d %>% ggplot(aes(x=feels_sleepy, fill=group)) +
geom_bar(position="dodge", alpha=0.5) +
scale_fill_manual(values=c(bleuclair, rose)) +
theme_bw()
pc <- d %>% ggplot(aes(x=n_meals_glycemia_out_of_target, fill=group)) +
geom_histogram(alpha=0.5, position="identity", bins=30) +
scale_fill_manual(values=c(bleuclair, rose)) +
theme_bw()
ph$without_meanpb$without_meanpmpcggsave("../../Figures/illu_diabete_diff_weight.pdf", plot=ph$without_mean, height=h, width=w)
ggsave("../../Figures/illu_diabete_is_depressed.pdf", plot=pb$without_mean, height=h, width=w)
ggsave("../../Figures/illu_diabete_feels_sleepy.pdf", plot=pm, height=h, width=w)
ggsave("../../Figures/illu_diabete_n_meals.pdf", plot=pc, height=h, width=w)Une série de simulations sous H0 (pas de différence entre les deux groupes), pour observer l’amplitude de différence entre les deux moyennes estimées.
for (i in 1:5){
dh0 <- tibble(group = rep(c("control", "treatment"), each=n_per_arm)) %>%
mutate(diff_weight = c(rnorm(n_per_arm, mean=0, sd=4), rnorm(n_per_arm, mean=0, sd=4)))
p <- get_histograms(dh0)
ggsave(paste("../../Figures/data_diff_weight_h0_",i,".pdf",sep=""),
plot = p$with_mean,
width = w,
height = h)
}Une série de simulations sous H1 (il y a une différence entre les deux groupes), pour observer l’amplitude de différence entre les deux moyennes estimées.
for (i in 1:5){
dh1 <- tibble(group = rep(c("control", "treatment"), each=n_per_arm)) %>%
mutate(diff_weight = c(rnorm(n_per_arm, mean=0, sd=4), rnorm(n_per_arm, mean=-0.5, sd=4)))
p <- get_histograms(dh1)
ggsave(paste("../../Figures/data_diff_weight_h1_",i,".pdf",sep=""),
plot = p$with_mean,
width = w,
height = h)
}Pour visualiser l’allure d’une loi binomiale avec 250 tirages de Bernoulli, et une probabilité de succès de 0.5.
dbinom <- tibble(x = seq(90, 160),
y = dbinom(x, size=n_per_arm, prob=0.5))
p_illu_binom <- dbinom %>% ggplot(aes(x=x, y=y)) +
geom_bar(stat="identity", fill=bleuclair) +
xlab("nombre de dépressions sur 250 patients") +
ylab("probabilité") +
theme_bw()
p_illu_binom_2 <- p_illu_binom +
geom_vline(xintercept = qbinom(c(0.025, 0.975), size=250, prob=0.5), col=rose)
p_illu_binom_2ggsave("../../Figures/allure_binomiale_250.pdf", plot=p_illu_binom, width=w, height=h)
ggsave("../../Figures/allure_binomiale_250_quantiles.pdf", plot=p_illu_binom_2, width=w, height=h)Des figures pour illustrer les résidus, la variance, la covariance, la droite de régression…
npoints <- 50
dreg <- tibble(x = runif(npoints, 0, 10),
haty = 2*x -5,
y = haty + rnorm(npoints, sd=3))
preg_0 <- dreg %>% ggplot(aes(x=x, y=y)) +
geom_point() +
ylab("variable de réponse") +
xlab("variable explicative") +
theme_bw()
preg_0preg_1 <- preg_0 +
geom_abline(slope = 2, intercept = -5, col=rose)
preg_1preg_2 <- preg_1 +
geom_segment(data=dreg, aes(x=x, xend=x, y=y, yend=haty), col=bleuclair)
preg_2preg_3 <- preg_0 +
geom_hline(yintercept = 5, col=rose) +
geom_segment(data=dreg, aes(x=x, xend=x, y=y), yend=5, col=bleuclair)
preg_3preg_4 <- preg_3 +
geom_vline(xintercept = 5, col=rose) +
geom_segment(data=dreg, aes(x=x, yend=y, y=y), xend=5, col=bleuclair)
preg_4ggsave("../../Figures/reglin_principe_nuage.pdf", plot=preg_0, width=w, height=h)
ggsave("../../Figures/reglin_principe_nuage_abline.pdf", plot=preg_1, width=w, height=h)
ggsave("../../Figures/reglin_principe_nuage_abline_residuals.pdf", plot=preg_2, width=w, height=h)
ggsave("../../Figures/reglin_principe_nuage_hline_variance.pdf", plot=preg_3, width=w, height=h)
ggsave("../../Figures/reglin_principe_nuage_hline_covariance.pdf", plot=preg_4, width=w, height=h)
ggsave("../../Figures/reglin_principe_2_sce.pdf", plot=plot_grid(preg_3, preg_2), width=w, height=h)