We load the table:
load(url("https://zenodo.org/record/7188406/files/human_microbiome_dataset.Rdata"))
and the packages:
library(vegan)
## Loading required package: permute
## Loading required package: lattice
## This is vegan 2.6-4
library("ggplot2")
First we rarefy each sample to 5000 reads:
set.seed(0)
tax_profile_rarefied = t(rrarefy(t(tax_profile),5000))
## Warning in rrarefy(t(tax_profile), 5000): function should be used for observed
## counts, but smallest count is 2
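As a quick sanity check (a minimal sketch), we can verify that after rarefaction every sample was subsampled to the same number of reads:
# every sample (column) should now sum to 5000 reads
# (samples that originally had fewer reads are left unchanged by rrarefy)
head(colSums(tax_profile_rarefied))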
We calculate the beta diversity (Jaccard index) between all pairs of samples:
all_vs_all_beta = as.matrix(vegdist(t(tax_profile_rarefied),method = "jaccard"))
Let’s plot the histogram of all the values:
hist(all_vs_all_beta,main = "Histogram of all vs all samples jaccard index",xlab = "Jaccard index")
The Jaccard indices range from 0 to 1. The data seem roughly normally distributed, with the mean around 0.7, but there is also a noticeable number of values close to 0.
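To see where these low values come from, here is a small sketch (the 0.2 threshold is arbitrary): count the exact zeros, which come from the diagonal (each sample compared to itself), and the off-diagonal pairs with a low Jaccard index:
# exact zeros: at least one per sample, from the diagonal of the matrix
sum(all_vs_all_beta == 0)
# number of sample pairs with a Jaccard index below 0.2 (excluding the diagonal,
# divided by 2 because the matrix is symmetric)
(sum(all_vs_all_beta < 0.2) - nrow(all_vs_all_beta)) / 2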
We can look at the matrix of distances:
all_vs_all_beta[1:6,1:6]
## 700002_T0 700002_T1 700002_T2 700002_T3 700004_T0 700004_T1
## 700002_T0 0.0000000 0.8204765 0.7721022 0.6775985 0.8756465 0.8718412
## 700002_T1 0.8204765 0.0000000 0.5605297 0.5426989 0.5957029 0.5460890
## 700002_T2 0.7721022 0.5605297 0.0000000 0.4716491 0.6852485 0.5881689
## 700002_T3 0.6775985 0.5426989 0.4716491 0.0000000 0.6744433 0.6490138
## 700004_T0 0.8756465 0.5957029 0.6852485 0.6744433 0.0000000 0.5996359
## 700004_T1 0.8718412 0.5460890 0.5881689 0.6490138 0.5996359 0.0000000
We can see that the matrix is symmetrical (i.e. the distance between sample1 and sample2 is the same as the distance between sample2 and sample1) and that the diagonal is zero (i.e. the distance of a sample to itself is zero).
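We can verify both properties directly:
# the matrix should be symmetric and its diagonal should be zero
isSymmetric(all_vs_all_beta)
all(diag(all_vs_all_beta) == 0)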
Let’s compare within-subject versus between-subject beta diversity:
same_subject_jaccard = c()
different_subject_jaccard = c()
for(subject in unique(metadata$Subject)){
#-------------------------------------------------------------------
# we compare T0 to T3, so the same subject to itself after one year
t0_vs_t3 = all_vs_all_beta[paste0(subject,"_T0"),paste0(subject,"_T3")]
same_subject_jaccard = c(same_subject_jaccard,t0_vs_t3)
#------------------------------------------------------------------
# we compare the T0 of this subject to the T0 of all other subjects
all_samples_at_t0 = paste(unique(metadata$Subject),"_T0",sep = "")
this_subject_t0 = paste0(subject,"_T0")
sel_beta_between = all_vs_all_beta[this_subject_t0,all_samples_at_t0]
# we remove the zero, which is the distance of the sample to itself
sel_beta_between = sel_beta_between[sel_beta_between != 0]
different_subject_jaccard = c(different_subject_jaccard,sel_beta_between)
}
Let’s check the medians of the two distance vectors:
median(same_subject_jaccard)
## [1] 0.4722318
median(different_subject_jaccard)
## [1] 0.6734761
As we can see, the distances between samples from the same person are lower than the distances between samples from different people.
We can test whether this difference is significant:
t.test(same_subject_jaccard,different_subject_jaccard)
##
## Welch Two Sample t-test
##
## data: same_subject_jaccard and different_subject_jaccard
## t = -18.032, df = 120.52, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.2054858 -0.1648267
## sample estimates:
## mean of x mean of y
## 0.4884167 0.6735730
It is significant since the p-value is < 0.05.
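Since the Jaccard indices are bounded between 0 and 1 and not necessarily normally distributed, a non-parametric test is a reasonable sanity check (a sketch; the conclusion should be the same):
# Wilcoxon rank-sum test as a non-parametric alternative to the t-test
wilcox.test(same_subject_jaccard, different_subject_jaccard)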
And let’s plot it:
library(ggplot2)
df = data.frame(
jaccard = c(same_subject_jaccard,different_subject_jaccard),
type = c(rep("within-subject",length(same_subject_jaccard)), rep("between-subject",length(different_subject_jaccard)))
)
ggplot(df,aes(type,jaccard)) + geom_boxplot() +
xlab("") + ylab("Jaccard index")
This suggests that the composition of an individual’s gut microbiome is relatively conserved over time.
We load the data:
load(url("https://sunagawalab.ethz.ch/share/teaching_milanese/data_practical2.Rdata"))
This loads tax_profile_gut and meta_gut.
Let’s look at the data:
tax_profile_gut[1:3,1:3]
## HCO02 HCO07 HCO09
## Prevotella copri 19.24827 6.67434 41.36079
## Catenibacterium mitsuokai 10.52756 4.51712 4.53144
## Collinsella aerofaciens 10.31782 18.51913 8.37266
Are these relative abundances or counts? Let’s check the column sums:
head(colSums(tax_profile_gut))
## HCO02 HCO07 HCO09 HCO11 HCO12 HCO53
## 99.68424 99.80712 99.93822 99.86793 97.60920 99.08658
They sum up to approximately 100, so these are relative abundances.
Let’s look at the metadata:
head(meta_gut)
## HCO02 HCO07 HCO09 HCO11
## "non-westernized" "non-westernized" "non-westernized" "non-westernized"
## HCO12 HCO53
## "non-westernized" "non-westernized"
The metadata in meta_gut contains the information on the samples. We will compare the human gut microbiome of urban-industrialized populations from the US (labelled “westernized”) to the human gut microbiome of hunter-gatherer and traditional agriculturalist communities in Peru (labelled “non-westernized”).
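Before the ordination it is useful to check how many samples we have in each group:
# number of westernized and non-westernized samples
table(meta_gut)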
Let’s calculate the PCA:
pca_r <- prcomp(t(tax_profile_gut), center = T, scale. = F)
Let’s now plot the first two components of the PCA, and colour by the type of sample (“westernized” or “non-westernized”):
# create data frame with colors
data_pca = data.frame(
first = pca_r$x[,1],
second = pca_r$x[,2],
type = meta_gut[colnames(tax_profile_gut)]
)
eigs <- pca_r$sdev^2
p1_e = formatC( ((eigs[1] / sum(eigs))*100) , format="f", digits=2)
p2_e = formatC( ((eigs[2] / sum(eigs))*100) , format="f", digits=2)
ggplot(data_pca,aes(first,second,col = type))+
geom_point()+
xlab(paste0("PC1 [",p1_e,"%]")) +
ylab(paste0("PC2 [",p2_e,"%]"))+
theme_bw()
For a Principal Coordinates Analysis (PCoA) we can use different distances between samples. For example, we can use the Jaccard index that we calculated before:
all_vs_all_beta = as.matrix(vegdist(t(tax_profile_gut),method = "jaccard"))
m2 = cmdscale(all_vs_all_beta)
data_PCoA = data.frame(
first = m2[,1],
second = m2[,2],
type = meta_gut[rownames(m2)]
)
ggplot(data_PCoA,aes(first,second,col = type))+
geom_point()+
xlab(paste0("PC1 [",p1_e,"%]")) +
ylab(paste0("PC2 [",p2_e,"%]"))+
theme_bw()
In which ordination plot do you see a better separation between westernised and non-westernised samples?
Visually, the PCoA based on the Jaccard index seems to separate westernised from non-westernised samples better. Note that a PCA is essentially a PCoA in which the distance between samples is the Euclidean distance.
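We can convince ourselves of this with a small sketch: a PCoA computed on Euclidean distances gives the same sample configuration as the PCA above, up to a possible sign flip of the axes.
# PCoA on the Euclidean distances between samples
pcoa_euclidean = cmdscale(dist(t(tax_profile_gut)), k = 2)
# the coordinates match the PCA scores up to sign, so the absolute values agree
head(round(abs(pcoa_euclidean[,1]) - abs(pca_r$x[,1]), 6))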
Let’s try the same plot using the Bray-Curtis distance:
all_vs_all_beta = as.matrix(vegdist(t(tax_profile_gut),method = "bray"))
m2 = cmdscale(all_vs_all_beta)
data_PCoA = data.frame(
first = m2[,1],
second = m2[,2],
type = meta_gut[rownames(m2)]
)
ggplot(data_PCoA,aes(first,second,col = type))+
geom_point()+
xlab(paste0("PC1 [",p1_e,"%]")) +
ylab(paste0("PC2 [",p2_e,"%]"))+
theme_bw()
Also with the Bray-Curtis distance we see a good separation between the two groups.
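If we wanted to quantify this separation rather than only inspect it visually, one common option (a sketch, not required for this practical) is a PERMANOVA with vegan’s adonis2:
# PERMANOVA: how much of the Bray-Curtis variation is explained by the westernization status
group = factor(meta_gut[colnames(tax_profile_gut)])
bray_dist = vegdist(t(tax_profile_gut), method = "bray")
adonis2(bray_dist ~ group)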
We can test which species are differentially abundant between westernised and non-westernised.
Let’s do it using a t-test. For each species, we want to check if it is differentially abundant.
It makes sense to test only species that are detected in at least 20% of the samples:
prevalence = (rowSums(tax_profile_gut > 0) / ncol(tax_profile_gut)) * 100
How many species have a prevalence greater than 20%?
sel_species = names(which(prevalence > 20))
length(sel_species)
## [1] 152
Is the data normally distributed?
Let’s see the distribution of the original data (removing the zeros):
hist(tax_profile_gut[tax_profile_gut != 0],100)
The relative abundances are not normally distributed.
And of the log-transformed data (again, only of the values that are not zero):
log_data = log10(tax_profile_gut[tax_profile_gut != 0])
hist(log_data,100)
When doing a log transformation, the data is closer to a normal distribution. But remember that there are still the zeros: log10(0) gives -Inf (minus infinity), which is not a number we can work with. Hence we add a small pseudocount (0.0001) before taking the logarithm, to avoid -Inf values.
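To see the effect of the pseudocount on the zeros, a small illustration:
# a zero relative abundance becomes a finite value after adding the pseudocount
log10(0 + 0.0001)   # -4
log10(0)            # -Inf, which we want to avoid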
We will transform the data when we calculate the p-values.
We run the t-test per species:
t_test_results = rep(NA,length(sel_species))
names(t_test_results) = sel_species
for(species in sel_species){
west = log10(tax_profile_gut[species,which(meta_gut == "westernized")] + 0.0001)
non_west = log10(tax_profile_gut[species,which(meta_gut == "non-westernized")] + 0.0001)
t_test_results[species] = t.test(west,non_west)$p.value
}
Let’s check the lowest p-values:
head(sort(t_test_results))
## Catenibacterium mitsuokai Ruthenibacterium lactatiformans
## 1.563647e-27 5.678567e-23
## Prevotella sp. AM42-24 Prevotella stercorea
## 4.813555e-20 1.774778e-18
## Flavonifractor plautii Prevotella sp. CAG:520
## 4.013140e-14 4.959058e-14
And the highest:
tail(sort(t_test_results))
## Schaalia odontolytica Eubacterium sp. CAG:180
## 0.6990859 0.7164666
## Ruminococcus callidus Roseburia faecis
## 0.7513641 0.8246538
## Lactococcus lactis Firmicutes bacterium CAG:83
## 0.8590688 0.9161066
How many are significant (p < 0.05)?
length(which(t_test_results<0.05))
## [1] 117
Let’s visualize Catenibacterium mitsuokai, which has the lowest p-value:
df_c_mit = data.frame(
rel_ab = log10(tax_profile_gut["Catenibacterium mitsuokai",] + 0.0001),
type = meta_gut
)
ggplot(df_c_mit,aes(type,rel_ab))+
geom_boxplot() +
ylab("Relative abundance (log10)")+
xlab("") +
ggtitle("Catenibacterium mitsuokai")
Since we did many tests, we need to apply a multiple testing correction. Let’s start with the Bonferroni correction:
adjusted_pvals = sort(p.adjust(t_test_results,method = "bonferroni"))
How many are still significant?
length(which(adjusted_pvals<0.05))
## [1] 81
Let’s try the Benjamini & Hochberg correction, also called the false discovery rate (FDR) correction.
adjusted_pvals = sort(p.adjust(t_test_results,method = "fdr"))
length(which(adjusted_pvals<0.05))
## [1] 113
Many more species remain significant. This means that the FDR correction is less conservative than the Bonferroni correction.
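A small toy example (with made-up p-values, only for illustration) shows the difference between the two corrections:
# made-up p-values
p_toy = c(0.001, 0.01, 0.03, 0.04)
# Bonferroni: each p-value is multiplied by the number of tests (capped at 1)
p.adjust(p_toy, method = "bonferroni")
# Benjamini & Hochberg: controls the false discovery rate, less conservative
p.adjust(p_toy, method = "fdr")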
Let’s load the libraries needed for the random forest analysis:
library(randomForest)
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
##
## margin
library(ROCR)
We train a random forest on the same dataset that we used before (westernised vs non-westernised):
RF<-randomForest(x=t(tax_profile_gut),y=as.factor(meta_gut),importance=TRUE, proximity=TRUE)
RF
##
## Call:
## randomForest(x = t(tax_profile_gut), y = as.factor(meta_gut), importance = TRUE, proximity = TRUE)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 19
##
## OOB estimate of error rate: 1.72%
## Confusion matrix:
## non-westernized westernized class.error
## non-westernized 35 1 0.02777778
## westernized 0 22 0.00000000
We can find the importance of the features:
importance<-as.data.frame(RF$importance)
head(importance[order(importance$MeanDecreaseAccuracy,decreasing = T),],n=10)
## non-westernized westernized
## Catenibacterium mitsuokai 0.02562920 0.036512338
## Ruthenibacterium lactatiformans 0.01516892 0.028697203
## [Clostridium] leptum 0.01717159 0.022020851
## Prevotella sp. AM42-24 0.01537170 0.022699062
## Prevotella copri 0.01782904 0.016654834
## Flavonifractor plautii 0.01635509 0.018243506
## Prevotella stercorea 0.01395222 0.021734776
## Lawsonibacter asaccharolyticus 0.01653545 0.015659019
## Bifidobacterium longum 0.01027489 0.010244012
## Adlercreutzia equolifaciens 0.01085134 0.008771501
## MeanDecreaseAccuracy MeanDecreaseGini
## Catenibacterium mitsuokai 0.029349529 1.4383610
## Ruthenibacterium lactatiformans 0.019316505 0.9962526
## [Clostridium] leptum 0.018659955 1.0491515
## Prevotella sp. AM42-24 0.017812754 1.0330177
## Prevotella copri 0.017162784 1.1416923
## Flavonifractor plautii 0.016703733 0.9808145
## Prevotella stercorea 0.016129271 1.2384761
## Lawsonibacter asaccharolyticus 0.015776590 1.1638086
## Bifidobacterium longum 0.010120564 0.7605456
## Adlercreutzia equolifaciens 0.009970106 0.6727378
And compare these to the adjusted p-values from the t-tests:
head(adjusted_pvals)
## Catenibacterium mitsuokai Ruthenibacterium lactatiformans
## 2.376743e-25 4.315711e-21
## Prevotella sp. AM42-24 Prevotella stercorea
## 2.438868e-18 6.744156e-17
## Flavonifractor plautii Prevotella sp. CAG:520
## 1.219995e-12 1.256295e-12
There is a substantial overlap, as expected, but the order is not exactly the same.
The two procedures are different: the t-test evaluates each species independently, while the random forest takes all features into account at the same time.
For example, if two features carry similar information, they will both have a similar p-value, but the random forest will likely rely on only one of the two.
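We can check the overlap between the two rankings directly (a sketch, using the 10 top species from each approach):
# 10 most important random forest features vs 10 species with the lowest adjusted p-values
top_rf = rownames(importance)[order(importance$MeanDecreaseAccuracy, decreasing = TRUE)][1:10]
top_ttest = names(adjusted_pvals)[1:10] # adjusted_pvals is already sorted
intersect(top_rf, top_ttest)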
Let’s now try to predict new samples. First we load them:
load(url("https://sunagawalab.ethz.ch/share/teaching_milanese/data_practical2_test_RF.Rdata"))
We can evaluate how well we classify these new samples with the following code:
# Compute the ROC curve based on a random forest and new data (a validation dataset).
getROC<-function(RF,newdat,fct){
prediction_for_roc_curve <- predict(RF,newdat,type="prob")
true.values<-as.character(fct)
pred.values<-as.character(apply(prediction_for_roc_curve,1,function(x){names(x)[which.max(x)]}))
tmp<-prediction_for_roc_curve
colnames(tmp)<-paste("prob.",colnames(tmp),sep="")
pred.mat<-data.frame(true=true.values,predicted=pred.values,tmp)
conf.mat<-table(true.values,pred.values)
fpr<-NULL
tpr<-NULL
auc<-NULL
fct_lev<-NULL
for (i in levels(fct)){
true_values <- ifelse(fct==i,1,0)
pred <- prediction(prediction_for_roc_curve[,i],true_values)
perf <- performance(pred, "tpr", "fpr")
fpr<-c(fpr,unlist(perf@x.values))
tpr<-c(tpr,unlist(perf@y.values))
auc<-c(auc,unlist(performance(pred, measure = "auc")@y.values))
fct_lev<-c(fct_lev,rep(i,length(unlist(perf@x.values))))
}
roc.df<-data.frame(fpr,tpr,fct_lev)
names(auc)<-levels(fct)
list(conf.mat=conf.mat,pred.mat=pred.mat,ROC=roc.df,AUC=auc)
}
res<-getROC(RF,t(new_data),factor(meta_new_data))
Here we can visualise the confusion matrix:
res$conf.mat
## pred.values
## true.values non-westernized westernized
## non-westernized 9 1
## westernized 2 8
From the matrix we can see that, of the 10 non-westernised new samples, 9 are correctly classified, and of the 10 westernised samples, 8 are correctly classified.
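From the confusion matrix we can also compute the overall accuracy:
# fraction of correctly classified samples (diagonal of the confusion matrix)
# for the matrix above this is (9 + 8) / 20 = 0.85
sum(diag(res$conf.mat)) / sum(res$conf.mat)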
And the AUC:
res$AUC
## non-westernized westernized
## 0.835 0.835
The AUC can take values between 0 and 1; in practice you will usually obtain values between 0.5 and 1, where 0.5 corresponds to a random classifier and 1 to a perfect classifier.
The AUC we obtained is relatively high; in fact, we make only 3 errors out of 20 predictions. Note also that the two AUC values are identical because we have only two classes. In the project with the different body sites you will obtain different results.
We can also plot the ROC curves (with the AUC values shown in the legend):
ggplot(data=res$ROC,aes(x=fpr,y=tpr,col=fct_lev)) +
geom_abline(linetype=2) +
geom_line() +
scale_color_discrete(label=paste(names(res$AUC)," (AUC: ",round(res$AUC,3),")",sep="")) +
theme(legend.title=element_blank())