We load the table:
load(url("https://zenodo.org/record/7188406/files/human_microbiome_dataset.Rdata"))
and the packages:
library(vegan)
## Loading required package: permute
## Loading required package: lattice
## This is vegan 2.6-4
library("ggplot2")
First we rarefy each sample to 5000 reads:
set.seed(0)
tax_profile_rarefied = t(rrarefy(t(tax_profile),5000))
## Warning in rrarefy(t(tax_profile), 5000): function should be used for observed
## counts, but smallest count is 2
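As a quick sanity check (a minimal sketch), we can verify that after rarefaction every sample was subsampled to the same number of reads:
# every sample (column) should now sum to 5000 reads
# (samples that originally had fewer reads are left unchanged by rrarefy)
head(colSums(tax_profile_rarefied))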
We calculate the beta diversity (Jaccard index) between all pairs of samples:
all_vs_all_beta = as.matrix(vegdist(t(tax_profile_rarefied),method = "jaccard"))
Let’s plot the histogram of all the values:
hist(all_vs_all_beta,main = "Histogram of all vs all samples jaccard index",xlab = "Jaccard index")
The Jaccard indices range from 0 to 1. The data seem roughly normally distributed, with the mean around 0.7, but there is also a noticeable number of values close to 0.
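To see where these low values come from, here is a small sketch (the 0.2 threshold is arbitrary): count the exact zeros, which come from the diagonal (each sample compared to itself), and the off-diagonal pairs with a low Jaccard index:
# exact zeros: at least one per sample, from the diagonal of the matrix
sum(all_vs_all_beta == 0)
# number of sample pairs with a Jaccard index below 0.2 (excluding the diagonal,
# divided by 2 because the matrix is symmetric)
(sum(all_vs_all_beta < 0.2) - nrow(all_vs_all_beta)) / 2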
We can look at the matrix of distances:
all_vs_all_beta[1:6,1:6]
## 700002_T0 700002_T1 700002_T2 700002_T3 700004_T0 700004_T1
## 700002_T0 0.0000000 0.8204765 0.7721022 0.6775985 0.8756465 0.8718412
## 700002_T1 0.8204765 0.0000000 0.5605297 0.5426989 0.5957029 0.5460890
## 700002_T2 0.7721022 0.5605297 0.0000000 0.4716491 0.6852485 0.5881689
## 700002_T3 0.6775985 0.5426989 0.4716491 0.0000000 0.6744433 0.6490138
## 700004_T0 0.8756465 0.5957029 0.6852485 0.6744433 0.0000000 0.5996359
## 700004_T1 0.8718412 0.5460890 0.5881689 0.6490138 0.5996359 0.0000000
We can see that the matrix is symmetrical (i.e. the distance between sample1 and sample2 is the same as the distance between sample2 and sample1) and that the diagonal is zero (i.e. the distance of a sample to itself is zero).
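We can verify both properties directly:
# the matrix should be symmetric and its diagonal should be zero
isSymmetric(all_vs_all_beta)
all(diag(all_vs_all_beta) == 0)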
Let’s compare within-subject versus between-subject beta diversity:
same_subject_jaccard = c()
different_subject_jaccard = c()
for(subject in unique(metadata$Subject)){
#-------------------------------------------------------------------
# we compare T0 to T3, so the same subject to itself after one year
t0_vs_t3 = all_vs_all_beta[paste0(subject,"_T0"),paste0(subject,"_T3")]
same_subject_jaccard = c(same_subject_jaccard,t0_vs_t3)
#------------------------------------------------------------------
# we compare the T0 of this subject to the T0 of all other subjects
all_samples_at_t0 = paste(unique(metadata$Subject),"_T0",sep = "")
this_subject_t0 = paste0(subject,"_T0")
sel_beta_between = all_vs_all_beta[this_subject_t0,all_samples_at_t0]
# we remove the zero, which is the distance of the sample to itself
sel_beta_between = sel_beta_between[sel_beta_between != 0]
different_subject_jaccard = c(different_subject_jaccard,sel_beta_between)
}
Let’s check the medians of the two distance vectors:
median(same_subject_jaccard)
## [1] 0.4722318
median(different_subject_jaccard)
## [1] 0.6734761
As we can see, the distances between samples from the same person are lower than the distances between samples from different people.
We can test whether this difference is significant:
t.test(same_subject_jaccard,different_subject_jaccard)
##
## Welch Two Sample t-test
##
## data: same_subject_jaccard and different_subject_jaccard
## t = -18.032, df = 120.52, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.2054858 -0.1648267
## sample estimates:
## mean of x mean of y
## 0.4884167 0.6735730
It is significant since the p-value is < 0.05.
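Since the Jaccard indices are bounded between 0 and 1 and not necessarily normally distributed, a non-parametric test is a reasonable sanity check (a sketch; the conclusion should be the same):
# Wilcoxon rank-sum test as a non-parametric alternative to the t-test
wilcox.test(same_subject_jaccard, different_subject_jaccard)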
And let’s plot it:
library(ggplot2)
df = data.frame(
jaccard = c(same_subject_jaccard,different_subject_jaccard),
type = c(rep("within-subject",length(same_subject_jaccard)), rep("between-subject",length(different_subject_jaccard)))
)
ggplot(df,aes(type,jaccard)) + geom_boxplot() +
xlab("") + ylab("Jaccard index")
This suggests that the composition of an individual’s gut microbiome is relatively conserved over time.
We load the data:
load(url("https://sunagawalab.ethz.ch/share/teaching_milanese/data_practical2.Rdata"))
This loads tax_profile_gut and meta_gut.
Let’s look at the data:
tax_profile_gut[1:3,1:3]
## HCO02 HCO07 HCO09
## Prevotella copri 19.24827 6.67434 41.36079
## Catenibacterium mitsuokai 10.52756 4.51712 4.53144
## Collinsella aerofaciens 10.31782 18.51913 8.37266
Are these relative abundances or counts? Let’s check the column sums:
head(colSums(tax_profile_gut))
## HCO02 HCO07 HCO09 HCO11 HCO12 HCO53
## 99.68424 99.80712 99.93822 99.86793 97.60920 99.08658
They sum up to approximately 100, so these are relative abundances.
Let’s look at the metadata:
head(meta_gut)
## HCO02 HCO07 HCO09 HCO11
## "non-westernized" "non-westernized" "non-westernized" "non-westernized"
## HCO12 HCO53
## "non-westernized" "non-westernized"
The metadata in meta_gut contains the information on the samples. We will compare the human gut microbiome of urban-industrialized populations from the US (labelled “westernized”) to the human gut microbiome of hunter-gatherer and traditional agriculturalist communities in Peru (labelled “non-westernized”).
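Before the ordination it is useful to check how many samples we have in each group:
# number of westernized and non-westernized samples
table(meta_gut)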
Let’s calculate the PCA:
pca_r <- prcomp(t(tax_profile_gut), center = T, scale. = F)
Let’s now plot the first two components of the PCA, and colour by the type of sample (“westernized” or “non-westernized”):
# create data frame with colors
data_pca = data.frame(
first = pca_r$x[,1],
second = pca_r$x[,2],
type = meta_gut[colnames(tax_profile_gut)]
)
eigs <- pca_r$sdev^2
p1_e = formatC( ((eigs[1] / sum(eigs))*100) , format="f", digits=2)
p2_e = formatC( ((eigs[2] / sum(eigs))*100) , format="f", digits=2)
ggplot(data_pca,aes(first,second,col = type))+
geom_point()+
xlab(paste0("PC1 [",p1_e,"%]")) +
ylab(paste0("PC2 [",p2_e,"%]"))+
theme_bw()
For a Principal Coordinates Analysis (PCoA) we can use different distances between samples. For example, we can use the Jaccard index that we calculated before:
all_vs_all_beta = as.matrix(vegdist(t(tax_profile_gut),method = "jaccard"))
m2 = cmdscale(all_vs_all_beta)
data_PCoA = data.frame(
first = m2[,1],
second = m2[,2],
type = meta_gut[rownames(m2)]
)
ggplot(data_PCoA,aes(first,second,col = type))+
geom_point()+
xlab(paste0("PC1 [",p1_e,"%]")) +
ylab(paste0("PC2 [",p2_e,"%]"))+
theme_bw()
In which ordination plot do you see a better separation between westernised and non-westernised samples?
Visually, the PCoA based on the Jaccard index seems to separate westernised from non-westernised samples better. Note that a PCA is essentially a PCoA in which the distance between samples is the Euclidean distance.
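We can convince ourselves of this with a small sketch: a PCoA computed on Euclidean distances gives the same sample configuration as the PCA above, up to a possible sign flip of the axes.
# PCoA on the Euclidean distances between samples
pcoa_euclidean = cmdscale(dist(t(tax_profile_gut)), k = 2)
# the coordinates match the PCA scores up to sign, so the absolute values agree
head(round(abs(pcoa_euclidean[,1]) - abs(pca_r$x[,1]), 6))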
Let’s try the same plot using the Bray-Curtis distance:
all_vs_all_beta = as.matrix(vegdist(t(tax_profile_gut),method = "bray"))
m2 = cmdscale(all_vs_all_beta)
data_PCoA = data.frame(
first = m2[,1],
second = m2[,2],
type = meta_gut[rownames(m2)]
)
ggplot(data_PCoA,aes(first,second,col = type))+
geom_point()+
xlab(paste0("PC1 [",p1_e,"%]")) +
ylab(paste0("PC2 [",p2_e,"%]"))+
theme_bw()
Also with the Bray-Curtis distance we see a good separation between the two groups.
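If we wanted to quantify this separation rather than only inspect it visually, one common option (a sketch, not required for this practical) is a PERMANOVA with vegan’s adonis2:
# PERMANOVA: how much of the Bray-Curtis variation is explained by the westernization status
group = factor(meta_gut[colnames(tax_profile_gut)])
bray_dist = vegdist(t(tax_profile_gut), method = "bray")
adonis2(bray_dist ~ group)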
We can test which species are differentially abundant between westernised and non-westernised.
Let’s do it using a t-test. For each species, we want to check if it is differentially abundant.
It makes sense to test only species that are detected in at least 20% of the samples:
prevalence = (rowSums(tax_profile_gut > 0) / ncol(tax_profile_gut)) * 100
How many species have a prevalence greater than 20%?
sel_species = names(which(prevalence > 20))
length(sel_species)
## [1] 152
Is the data normally distributed?
Let’s see the distribution of the original data (removing the zeros):
hist(tax_profile_gut[tax_profile_gut != 0],100)
The relative abundances are not normally distributed.
And of the log-transformed data (again, only of the values that are not zero):
log_data = log10(tax_profile_gut[tax_profile_gut != 0])
hist(log_data,100)
When doing a log transformation, the data is closer to a normal distribution. But remember that there are still the zeros: log10(0) gives -Inf (minus infinity), which is not a number we can work with. Hence we add a small pseudocount (0.0001) before taking the logarithm, to avoid -Inf values.
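To see the effect of the pseudocount on the zeros, a small illustration:
# a zero relative abundance becomes a finite value after adding the pseudocount
log10(0 + 0.0001)   # -4
log10(0)            # -Inf, which we want to avoid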
We will transform the data when we calculate the p-values.
We run the t-test per species:
t_test_results = rep(NA,length(sel_species))
names(t_test_results) = sel_species
for(species in sel_species){
west = log10(tax_profile_gut[species,which(meta_gut == "westernized")] + 0.0001)
non_west = log10(tax_profile_gut[species,which(meta_gut == "non-westernized")] + 0.0001)
t_test_results[species] = t.test(west,non_west)$p.value
}
Let’s check the lowest p-values:
head(sort(t_test_results))
## Catenibacterium mitsuokai Ruthenibacterium lactatiformans
## 1.563647e-27 5.678567e-23
## Prevotella sp. AM42-24 Prevotella stercorea
## 4.813555e-20 1.774778e-18
## Flavonifractor plautii Prevotella sp. CAG:520
## 4.013140e-14 4.959058e-14
And the highest:
tail(sort(t_test_results))
## Schaalia odontolytica Eubacterium sp. CAG:180
## 0.6990859 0.7164666
## Ruminococcus callidus Roseburia faecis
## 0.7513641 0.8246538
## Lactococcus lactis Firmicutes bacterium CAG:83
## 0.8590688 0.9161066
How many are significant (p < 0.05)?
length(which(t_test_results<0.05))
## [1] 117
Let’s visualize Catenibacterium mitsuokai, which has the lowest p-value:
df_c_mit = data.frame(
rel_ab = log10(tax_profile_gut["Catenibacterium mitsuokai",] + 0.0001),
type = meta_gut
)
ggplot(df_c_mit,aes(type,rel_ab))+
geom_boxplot() +
ylab("Relative abundance (log10)")+
xlab("") +
ggtitle("Catenibacterium mitsuokai")
Since we did many tests, we need to apply a multiple testing correction. Let’s start with the Bonferroni correction:
adjusted_pvals = sort(p.adjust(t_test_results,method = "bonferroni"))
How many are still significant?
length(which(adjusted_pvals<0.05))
## [1] 81
Let’s try the Benjamini & Hochberg correction, also called the false discovery rate (FDR) correction.
adjusted_pvals = sort(p.adjust(t_test_results,method = "fdr"))
length(which(adjusted_pvals<0.05))
## [1] 113
Many more species remain significant. This means that the FDR correction is less conservative than the Bonferroni correction.
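A small toy example (with made-up p-values, only for illustration) shows the difference between the two corrections:
# made-up p-values
p_toy = c(0.001, 0.01, 0.03, 0.04)
# Bonferroni: each p-value is multiplied by the number of tests (capped at 1)
p.adjust(p_toy, method = "bonferroni")
# Benjamini & Hochberg: controls the false discovery rate, less conservative
p.adjust(p_toy, method = "fdr")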
Let’s load the libraries needed for the random forest analysis:
library(randomForest)
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
##
## margin
library(ROCR)
We train a random forest on the same dataset that we used before (westernised vs non-westernised):
RF<-randomForest(x=t(tax_profile_gut),y=as.factor(meta_gut),importance=TRUE, proximity=TRUE)
RF
##
## Call:
## randomForest(x = t(tax_profile_gut), y = as.factor(meta_gut), importance = TRUE, proximity = TRUE)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 19
##
## OOB estimate of error rate: 1.72%
## Confusion matrix:
## non-westernized westernized class.error
## non-westernized 35 1 0.02777778
## westernized 0 22 0.00000000
We can find the importance of the features:
importance<-as.data.frame(RF$importance)
head(importance[order(importance$MeanDecreaseAccuracy,decreasing = T),],n=10)
## non-westernized westernized
## Catenibacterium mitsuokai 0.02562920 0.036512338
## Ruthenibacterium lactatiformans 0.01516892 0.028697203
## [Clostridium] leptum 0.01717159 0.022020851
## Prevotella sp. AM42-24 0.01537170 0.022699062
## Prevotella copri 0.01782904 0.016654834
## Flavonifractor plautii 0.01635509 0.018243506
## Prevotella stercorea 0.01395222 0.021734776
## Lawsonibacter asaccharolyticus 0.01653545 0.015659019
## Bifidobacterium longum 0.01027489 0.010244012
## Adlercreutzia equolifaciens 0.01085134 0.008771501
## MeanDecreaseAccuracy MeanDecreaseGini
## Catenibacterium mitsuokai 0.029349529 1.4383610
## Ruthenibacterium lactatiformans 0.019316505 0.9962526
## [Clostridium] leptum 0.018659955 1.0491515
## Prevotella sp. AM42-24 0.017812754 1.0330177
## Prevotella copri 0.017162784 1.1416923
## Flavonifractor plautii 0.016703733 0.9808145
## Prevotella stercorea 0.016129271 1.2384761
## Lawsonibacter asaccharolyticus 0.015776590 1.1638086
## Bifidobacterium longum 0.010120564 0.7605456
## Adlercreutzia equolifaciens 0.009970106 0.6727378
And compare these to the adjusted p-values from the t-tests:
head(adjusted_pvals)
## Catenibacterium mitsuokai Ruthenibacterium lactatiformans
## 2.376743e-25 4.315711e-21
## Prevotella sp. AM42-24 Prevotella stercorea
## 2.438868e-18 6.744156e-17
## Flavonifractor plautii Prevotella sp. CAG:520
## 1.219995e-12 1.256295e-12
There is a substantial overlap, as expected, but the order is not exactly the same.
The two procedures are different: the t-test evaluates each species independently, while the random forest takes all features into account at the same time.
For example, if two features carry similar information, they will both have a similar p-value, but the random forest will likely rely on only one of the two.
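We can check the overlap between the two rankings directly (a sketch, using the 10 top species from each approach):
# 10 most important random forest features vs 10 species with the lowest adjusted p-values
top_rf = rownames(importance)[order(importance$MeanDecreaseAccuracy, decreasing = TRUE)][1:10]
top_ttest = names(adjusted_pvals)[1:10] # adjusted_pvals is already sorted
intersect(top_rf, top_ttest)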
Let’s now try to predict new samples. First we load them:
load(url("https://sunagawalab.ethz.ch/share/teaching_milanese/data_practical2_test_RF.Rdata"))
We can evaluate how well we classify these new samples with the following code:
# Compute the ROC curve based on a random forest and new data (a validation dataset).
getROC<-function(RF,newdat,fct){
prediction_for_roc_curve <- predict(RF,newdat,type="prob")
true.values<-as.character(fct)
pred.values<-as.character(apply(prediction_for_roc_curve,1,function(x){names(x)[which.max(x)]}))
tmp<-prediction_for_roc_curve
colnames(tmp)<-paste("prob.",colnames(tmp),sep="")
pred.mat<-data.frame(true=true.values,predicted=pred.values,tmp)
conf.mat<-table(true.values,pred.values)
fpr<-NULL
tpr<-NULL
auc<-NULL
fct_lev<-NULL
for (i in levels(fct)){
true_values <- ifelse(fct==i,1,0)
pred <- prediction(prediction_for_roc_curve[,i],true_values)
perf <- performance(pred, "tpr", "fpr")
fpr<-c(fpr,unlist(perf@x.values))
tpr<-c(tpr,unlist(perf@y.values))
auc<-c(auc,unlist(performance(pred, measure = "auc")@y.values))
fct_lev<-c(fct_lev,rep(i,length(unlist(perf@x.values))))
}
roc.df<-data.frame(fpr,tpr,fct_lev)
names(auc)<-levels(fct)
list(conf.mat=conf.mat,pred.mat=pred.mat,ROC=roc.df,AUC=auc)
}
res<-getROC(RF,t(new_data),factor(meta_new_data))
Here we can visualise the confusion matrix:
res$conf.mat
## pred.values
## true.values non-westernized westernized
## non-westernized 9 1
## westernized 2 8
From the matrix we can see that, of the 10 non-westernised new samples, 9 are correctly classified, and of the 10 westernised samples, 8 are correctly classified.
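From the confusion matrix we can also compute the overall accuracy:
# fraction of correctly classified samples (diagonal of the confusion matrix)
# for the matrix above this is (9 + 8) / 20 = 0.85
sum(diag(res$conf.mat)) / sum(res$conf.mat)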
And the AUC:
res$AUC
## non-westernized westernized
## 0.835 0.835
The AUC can take values between 0 and 1; in practice you will usually obtain values between 0.5 and 1, where 0.5 corresponds to a random classifier and 1 to a perfect classifier.
The AUC we obtained is relatively high; in fact, we make only 3 errors out of 20 predictions. Note also that the two AUC values are identical because we have only two classes. In the project with the different body sites you will obtain different results.
We can also plot the ROC curves (with the AUC values shown in the legend):
ggplot(data=res$ROC,aes(x=fpr,y=tpr,col=fct_lev)) +
geom_abline(linetype=2) +
geom_line() +
scale_color_discrete(label=paste(names(res$AUC)," (AUC: ",round(res$AUC,3),")",sep="")) +
theme(legend.title=element_blank())