# ---- METAGENOMICS PROJECT 1 SOLUTION SCRIPT ---- # Uploading packages required for the analysis. library("ggplot2") library("moments") library("randomForest") library("ROCR") library("vegan") # Loading the data set into the environment. load("/nfs/nas22/fs2202/biol_micro_teaching/551-1299-00L/metagenomics/gut_microbiome_dataset.RData") # Before proceeding with the analysis, we need to rarefy the data. # First, checking what is the lowest sequencing depth in the data set. min(colSums(gut_profiles)) # [1] 305631 # Rarefying all samples to the lowest sequencing depth in the data set. set.seed(0) # setting a random seed so results are directly comparable between code re-runs. gut_profiles_rarefied = t(rrarefy(t(gut_profiles), 305631)) # ---- PROJECT TASK 1 ---- # Calculating observed richness. richness = t(estimateR(t(gut_profiles_rarefied)))[ , "S.obs"] # Calculating Shannon diversity. shannon_index = diversity(t(gut_profiles_rarefied), "shannon") # Calculating Pielou evenness. PielouE <- function(counts_in_sample, consider_zeros = FALSE){ if (consider_zeros == FALSE) counts_in_sample <- as.vector(counts_in_sample[counts_in_sample > 0]) shannon_diversity <- diversity(counts_in_sample) number_taxa <- length(counts_in_sample) evenness <- shannon_diversity/log(number_taxa) return(evenness) } evenness = apply(gut_profiles_rarefied, 2, PielouE) # Adding data on alpha-diversity metrics to a table for plotting. # It's useful to add all metadata so we can plot and split the data in multiple ways. alpha_diversity_table <- gut_metadata alpha_diversity_table$richness = richness[row.names(alpha_diversity_table)] alpha_diversity_table$diversity = shannon_index[row.names(alpha_diversity_table)] alpha_diversity_table$evenness = evenness[row.names(alpha_diversity_table)] # ==== Question 1.1 ==== # Plotting data on country vs richness. ggplot(alpha_diversity_table, aes(x = country, y = richness)) + geom_boxplot() + xlab("") + ylab("Observed Richness") + theme(text = element_text(size = 16)) + theme_bw() # Performing tests to compare these values between three countries. t.test(alpha_diversity_table[alpha_diversity_table["country"] == "Malawi", "richness"], alpha_diversity_table[alpha_diversity_table["country"] == "USA", "richness"]) # not significant t.test(alpha_diversity_table[alpha_diversity_table["country"] == "Malawi", "richness"], alpha_diversity_table[alpha_diversity_table["country"] == "Venezuela", "richness"]) # significant t.test(alpha_diversity_table[alpha_diversity_table["country"] == "USA", "richness"], alpha_diversity_table[alpha_diversity_table["country"] == "Venezuela", "richness"]) # significant # Plotting data on country vs diversity. ggplot(alpha_diversity_table, aes(x = country, y = diversity)) + geom_boxplot() + xlab("") + ylab("Shannon Index") + theme(text = element_text(size = 16)) + theme_bw() # Performing tests to compare these values between three countries. t.test(alpha_diversity_table[alpha_diversity_table["country"] == "Malawi", "diversity"], alpha_diversity_table[alpha_diversity_table["country"] == "USA", "diversity"]) # significant t.test(alpha_diversity_table[alpha_diversity_table["country"] == "Malawi", "diversity"], alpha_diversity_table[alpha_diversity_table["country"] == "Venezuela", "diversity"]) # significant t.test(alpha_diversity_table[alpha_diversity_table["country"] == "USA", "diversity"], alpha_diversity_table[alpha_diversity_table["country"] == "Venezuela", "diversity"]) # not significant # Plotting data on country vs diversity. ggplot(alpha_diversity_table, aes(x = country, y = evenness)) + geom_boxplot() + xlab("") + ylab("Pielou Evenness") + theme(text = element_text(size = 16)) + theme_bw() # Performing tests to compare these values between three countries. t.test(alpha_diversity_table[alpha_diversity_table["country"] == "Malawi", "evenness"], alpha_diversity_table[alpha_diversity_table["country"] == "USA", "evenness"]) # significant t.test(alpha_diversity_table[alpha_diversity_table["country"] == "Malawi", "evenness"], alpha_diversity_table[alpha_diversity_table["country"] == "Venezuela", "evenness"]) # significant t.test(alpha_diversity_table[alpha_diversity_table["country"] == "USA", "evenness"], alpha_diversity_table[alpha_diversity_table["country"] == "Venezuela", "evenness"]) # not significant # Based on these observations, it appears as if the samples from Venezuela have the highest alpha-diversity # according to all three metrics. Samples from the US have similar Shannon index and Pielou evenness # distributions, but have significantly lower richness. The samples from Malawi have the lowest alpha-diversity # on all three metrics. However, before we make any concrete conclusions, let's look at other # factors measured in the metadata. # ==== Question 1.2 ==== # Alpha-diversity metrics versus age ggplot(alpha_diversity_table, aes(x = age, y = richness)) + geom_point() + xlab("Age") + ylab("Observed Richness") + theme(text = element_text(size = 16)) # Using the Spearman test as the relationship is not linear cor.test(alpha_diversity_table$age, alpha_diversity_table$richness, method = "spearman") # significant ggplot(alpha_diversity_table, aes(x = age, y = diversity)) + geom_point() + xlab("Age") + ylab("Shannon Index") + theme(text = element_text(size = 16)) cor.test(alpha_diversity_table$age, alpha_diversity_table$diversity, method = "spearman") # significant ggplot(alpha_diversity_table, aes(x = age, y = evenness)) + geom_point() + xlab("Age") + ylab("Pielou Evenness") + theme(text = element_text(size = 16)) cor.test(alpha_diversity_table$age, alpha_diversity_table$evenness, method = "spearman") # significant # Alpha-diversity metrics versus gender ggplot(alpha_diversity_table, aes(x = gender, y = richness)) + geom_boxplot() + xlab("Gender") + ylab("Observed Richness") + theme(text = element_text(size = 16)) t.test(alpha_diversity_table[alpha_diversity_table["gender"] == "M", "richness"], alpha_diversity_table[alpha_diversity_table["gender"] == "F", "richness"]) # significant ggplot(alpha_diversity_table, aes(x = gender, y = diversity)) + geom_boxplot() + xlab("Gender") + ylab("Shannon Index") + theme(text = element_text(size = 16)) t.test(alpha_diversity_table[alpha_diversity_table["gender"] == "M", "diversity"], alpha_diversity_table[alpha_diversity_table["gender"] == "F", "diversity"]) # significant ggplot(alpha_diversity_table, aes(x = gender, y = evenness)) + geom_boxplot() + xlab("Gender") + ylab("Pielou Evenness") + theme(text = element_text(size = 16)) t.test(alpha_diversity_table[alpha_diversity_table["gender"] == "M", "evenness"], alpha_diversity_table[alpha_diversity_table["gender"] == "F", "evenness"]) # significant # Alpha-diversity metrics versus BMI ggplot(alpha_diversity_table, aes(x = bmi, y = richness)) + geom_point() + xlab("Body Mass Index") + ylab("Observed Richness") + theme(text = element_text(size = 16)) cor.test(alpha_diversity_table$bmi, alpha_diversity_table$richness, method = "spearman") # significant ggplot(alpha_diversity_table, aes(x = bmi, y = diversity)) + geom_point() + xlab("Body Mass Index") + ylab("Shannon Index") + theme(text = element_text(size = 16)) cor.test(alpha_diversity_table$bmi, alpha_diversity_table$diversity, method = "spearman") # significant ggplot(alpha_diversity_table, aes(x = bmi, y = evenness)) + geom_point() + xlab("Body Mass Index") + ylab("Pielou Evenness") + theme(text = element_text(size = 16)) cor.test(alpha_diversity_table$bmi, alpha_diversity_table$evenness, method = "spearman") # significant # Based on these tests, we observe a positive relationship between age and alpha-diversity, # between BMI and diversity. It also appears as if male subjects have less gut diversity than female subjects, # but let's look at factors that can potentially confound this observation. # ==== Question 1.3 ==== # Let's color the dots on the richness versus age plot by country. ggplot(alpha_diversity_table, aes(x = age, y = richness, col = country)) + geom_point() + xlab("Age") + ylab("Observed Richness") + theme(text = element_text(size = 16)) # Now, we see that the samples from the adults from the USA are actually less diverse than those from # the adults from Malawi and Venezuela. # We can look at age distribution by country: ggplot(alpha_diversity_table, aes(x = age, fill = country)) + geom_histogram(position = "identity") + facet_grid(country ~ .) # this will plot the histograms in separate panels so it's easier to compare # We see that there are more samples from USA adults than from the other two countries. # In the first years of life, the gut microbiome is less diverse, so the higher fraction of gut microbiome # samples from Malawian and Venezuelan children bring the average alpha-diversity down. # Similarly, if we look at age distribution by gender: ggplot(alpha_diversity_table, aes(x = age, fill = gender)) + geom_histogram(position = "identity") + facet_grid(gender ~ .) # There are more adult female subjects in the data set, so it appears as if male subjects display # lower gut microbiome diversity. However, if we only plot the data from adults: ggplot(alpha_diversity_table[alpha_diversity_table$age >= 5, ], aes(x = gender, y = richness)) + geom_boxplot() + xlab("Gender") + ylab("Observed Richness") + theme(text = element_text(size = 16)) t.test(alpha_diversity_table[alpha_diversity_table["gender"] == "M" & alpha_diversity_table["age"] >= 5, "richness"], alpha_diversity_table[alpha_diversity_table["gender"] == "F" & alpha_diversity_table["age"] >= 5, "richness"]) # We will observe no more significant difference in gut microbiome diversity between the two genders. # In fact, since all NA values in the gender column come from Malawi and Venezuela, we observe a higher # median richness for NA values in the boxplot. # ---- PROJECT TASK 2 ---- # Calculating Jaccard distance for the samples. gut_profiles_jaccard = as.matrix(vegdist(t(gut_profiles_rarefied), method = "jaccard")) # Since the Jaccard distance matrix is symmetric, we can take only half of the values. gut_profiles_jaccard_lower_tri <- gut_profiles_jaccard gut_profiles_jaccard_lower_tri[lower.tri(gut_profiles_jaccard_lower_tri, diag = TRUE)] <- NA # ==== Question 2.1 ==== # Vectors where we will store the results. jaccard_malawi_vs_malawi = c() jaccard_usa_vs_usa = c() jaccard_venezuela_vs_venezuela = c() jaccard_malawi_vs_usa = c() jaccard_malawi_vs_venezuela = c() jaccard_usa_vs_venezuela = c() # These are all that come from Malawi. malawi_samples = gut_metadata[gut_metadata$country == "Malawi", "sample_id"] # These are all that come from the USA. usa_samples = gut_metadata[gut_metadata$country == "USA", "sample_id"] # These are all that come from Venezuela. venezuela_samples = gut_metadata[gut_metadata$country == "Venezuela", "sample_id"] for(sample_id in unique(gut_metadata$sample_id)){ # First, let's check if the sample is from Malawi if(sample_id %in% malawi_samples) { # If this is a sample from Malawi, we can add its distances to the malawi_vs_malawi, malawi_vs_usa, # and malawi_vs_venezuela comparisons. malawi_comparisons_for_sample = gut_profiles_jaccard_lower_tri[sample_id, malawi_samples] jaccard_malawi_vs_malawi = c(jaccard_malawi_vs_malawi, malawi_comparisons_for_sample) usa_comparisons_for_sample = gut_profiles_jaccard_lower_tri[sample_id, usa_samples] jaccard_malawi_vs_usa = c(jaccard_malawi_vs_usa, usa_comparisons_for_sample) venezuela_comparisons_for_sample = gut_profiles_jaccard_lower_tri[sample_id, venezuela_samples] jaccard_malawi_vs_venezuela = c(jaccard_malawi_vs_venezuela, venezuela_comparisons_for_sample) } else if(sample_id %in% usa_samples) { # Second, let's check if the sample is from the USA # If this is a sample from the USA, we can add its distances to the usa_vs_usa, malawi_vs_usa, # and usa_vs_venezuela comparisons. malawi_comparisons_for_sample = gut_profiles_jaccard_lower_tri[sample_id, malawi_samples] jaccard_malawi_vs_usa = c(jaccard_malawi_vs_usa, malawi_comparisons_for_sample) usa_comparisons_for_sample = gut_profiles_jaccard_lower_tri[sample_id, usa_samples] jaccard_usa_vs_usa = c(jaccard_usa_vs_usa, usa_comparisons_for_sample) venezuela_comparisons_for_sample = gut_profiles_jaccard_lower_tri[sample_id, venezuela_samples] jaccard_usa_vs_venezuela = c(jaccard_usa_vs_venezuela, venezuela_comparisons_for_sample) } else { # If the sample is neither from Malawi or the USA, it comes from Venezuela. # If this is a sample from Venezuela, we can add its distances to the venezuela_vs_venezuela, # malawi_vs_venezuela, and usa_vs_venezuela comparisons. malawi_comparisons_for_sample = gut_profiles_jaccard_lower_tri[sample_id, malawi_samples] jaccard_malawi_vs_venezuela = c(jaccard_malawi_vs_venezuela, malawi_comparisons_for_sample) usa_comparisons_for_sample = gut_profiles_jaccard_lower_tri[sample_id, usa_samples] jaccard_usa_vs_venezuela = c(jaccard_usa_vs_venezuela, usa_comparisons_for_sample) venezuela_comparisons_for_sample = gut_profiles_jaccard_lower_tri[sample_id, venezuela_samples] jaccard_venezuela_vs_venezuela = c(jaccard_venezuela_vs_venezuela, venezuela_comparisons_for_sample) } } # When we took half the values in the distance matrix, we introduced NA values into the data. Let's remove these # before performing the statistical tests. jaccard_malawi_vs_malawi = na.omit(jaccard_malawi_vs_malawi) jaccard_usa_vs_usa = na.omit(jaccard_usa_vs_usa) jaccard_venezuela_vs_venezuela = na.omit(jaccard_venezuela_vs_venezuela) jaccard_malawi_vs_usa = na.omit(jaccard_malawi_vs_usa) jaccard_malawi_vs_venezuela = na.omit(jaccard_malawi_vs_venezuela) jaccard_usa_vs_venezuela = na.omit(jaccard_usa_vs_venezuela) # Calculating average distance for samples from the same country. mean(jaccard_malawi_vs_malawi) # 0.8125623 mean(jaccard_usa_vs_usa) # 0.8259492 mean(jaccard_venezuela_vs_venezuela) # 0.8061796 # Calculating average distance from samples from different countries. mean(jaccard_malawi_vs_usa) # 0.916304 mean(jaccard_malawi_vs_venezuela) # 0.8351879 mean(jaccard_usa_vs_venezuela) # 0.8982677 # Samples from the same country tend to be more similar to each other than to samples from different countries. # However, the beta-diversity values between Malawi and Venezuela are smaller than those of these countries # and the USA. # ==== Advanced Questions ==== # Are samples from the same family more similar to each other than from different families? jaccard_same_family = c() jaccard_diff_family = c() list_of_families = unique(gut_metadata$family_id) for(family in list_of_families) { within_family_samples = gut_metadata[gut_metadata$family_id == family, "sample_id"] outside_family_samples = gut_metadata[gut_metadata$family_id != family, "sample_id"] # Selecting all cells were samples from the same family are compared. within_family_distances = as.vector(gut_profiles_jaccard_lower_tri[within_family_samples, within_family_samples]) jaccard_same_family = c(jaccard_same_family, na.omit(within_family_distances)) # Selecting all cells were samples from the same family are to samples from other families. outside_family_distances = as.vector(gut_profiles_jaccard_lower_tri[within_family_samples, outside_family_samples]) jaccard_diff_family = c(jaccard_diff_family, na.omit(outside_family_distances)) } # Looking at average distances. mean(jaccard_same_family) # 0.7445527 mean(jaccard_diff_family) # 0.8654604 # We can do the same comparison, but separating samples by country. jaccard_same_family_by_country = list() jaccard_diff_family_by_country = list() for(country in c("Malawi", "USA", "Venezuela")) { # Selecting rows that correspond to country. gut_metadata_for_country = gut_metadata[gut_metadata$country == country, ] list_of_families_country = unique(gut_metadata_for_country$family_id) jaccard_same_family_country = c() jaccard_diff_family_country = c() for(family in list_of_families_country) { within_family_samples = gut_metadata_for_country[gut_metadata_for_country$family_id == family, "sample_id"] outside_family_samples = gut_metadata_for_country[gut_metadata_for_country$family_id != family, "sample_id"] # Selecting all cells were samples from the same family are compared. within_family_distances = as.vector(gut_profiles_jaccard_lower_tri[within_family_samples, within_family_samples]) jaccard_same_family_country = c(jaccard_same_family_country, na.omit(within_family_distances)) # Selecting all cells were samples from the same family are to samples from other families. outside_family_distances = as.vector(gut_profiles_jaccard_lower_tri[within_family_samples, outside_family_samples]) jaccard_diff_family_country = c(jaccard_diff_family_country, na.omit(outside_family_distances)) } jaccard_same_family_by_country = c(jaccard_same_family_by_country, list(jaccard_same_family_country)) jaccard_diff_family_by_country = c(jaccard_diff_family_by_country, list(jaccard_diff_family_country)) } # We can put the results into a table for plotting. plot_distances_by_family = data.frame("country" = c(rep("Malawi", length(jaccard_same_family_by_country[[1]]) + length(jaccard_diff_family_by_country[[1]])), rep("USA", length(jaccard_same_family_by_country[[2]]) + length(jaccard_diff_family_by_country[[2]])), rep("Venezuela", length(jaccard_same_family_by_country[[3]]) + length(jaccard_diff_family_by_country[[3]]))), "family" = c(rep("Same", length(jaccard_same_family_by_country[[1]])), rep("Different", length(jaccard_diff_family_by_country[[1]])), rep("Same", length(jaccard_same_family_by_country[[2]])), rep("Different", length(jaccard_diff_family_by_country[[2]])), rep("Same", length(jaccard_same_family_by_country[[3]])), rep("Different", length(jaccard_diff_family_by_country[[3]]))), "jaccard" = c(jaccard_same_family_by_country[[1]], jaccard_diff_family_by_country[[1]], jaccard_same_family_by_country[[2]], jaccard_diff_family_by_country[[2]], jaccard_same_family_by_country[[3]], jaccard_diff_family_by_country[[3]])) ggplot(plot_distances_by_family, aes(x = country, y = jaccard, fill = family)) + geom_boxplot() + ylim(c(0, 1)) + xlab("") + ylab("Jaccard Distance") + scale_fill_discrete(name = "Family") + theme_bw() + theme(text = element_text(size = 16)) # Overall, individuals originating from the same family tend to have more similar microbiomes than # individuals from different families. In the USA samples, the separation is especially clear. # Another question we can test is whether twins have more similar microbiomes than siblings. selected_family_relationships = c("Sibling", "Twin1", "Twin2", "Twin3", "Brother1", "Sister1", "Brother2", "Sister", "Brother") metadata_families_w_siblings = gut_metadata[gut_metadata$family_relationship %in% selected_family_relationships, ] # So it's easier, let's remove the numbers at the end of the words. metadata_families_w_siblings$family_relationship = sapply(metadata_families_w_siblings$family_relationship, function(x) gsub("[0-9]", "", x)) # For some siblings, the age is undetermined, so we set it to a high number that it always return FALSE during comparison. metadata_families_w_siblings[is.na(metadata_families_w_siblings$age), "age"] = 100 jaccard_twins = c() jaccard_siblings = c() for(family in unique(metadata_families_w_siblings$family_id)) { family_data = metadata_families_w_siblings[metadata_families_w_siblings$family_id == family, ] for(sample_1 in family_data$sample_id) { for(sample_2 in family_data$sample_id) { if(sample_1 == sample_2) next if(family_data[sample_1, "age"] == family_data[sample_2, "age"]){ jaccard_twins = c(jaccard_twins, gut_profiles_jaccard_lower_tri[sample_1, sample_2]) } else { jaccard_siblings = c(jaccard_siblings, gut_profiles_jaccard_lower_tri[sample_1, sample_2]) } } } } plot_distances_siblings = data.frame("relationship" = c(rep("Twins", length(jaccard_twins)), rep("Siblings", length(jaccard_siblings))), "jaccard" = c(jaccard_twins, jaccard_siblings)) ggplot(plot_distances_siblings, aes(x = relationship, y = jaccard)) + geom_boxplot() + ylim(c(0, 1)) + xlab("") + ylab("Jaccard Distance") + theme_bw() + theme(text = element_text(size = 16)) # It appears that twins tend to have more similar microbiomes to each other when compared to siblings. # ---- PROJECT TASK 3 ---- # Performing Principle Coordinates Analysis based on the Jaccard distance matrix. jaccard_ordination = cmdscale(gut_profiles_jaccard) # Adding these values to the gut metadata table. ordination_table <- gut_metadata ordination_table$first <- jaccard_ordination[rownames(ordination_table), 1] ordination_table$second <- jaccard_ordination[rownames(ordination_table), 2] # ==== Question 3.1 ==== # Coloring ordination plot by country. ggplot(ordination_table, aes(x = first, y = second, col = country)) + geom_point() + xlab("PCoA1") + ylab("PCoA2") + theme(text = element_text(size = 16)) # In PCoA1, USA samples are well separated from samples from Malawi and Venezuela. # The samples from Malawi and Venezuela show less separation between each other. # Coloring ordination plot by region. ggplot(ordination_table, aes(x = first, y = second, col = region)) + geom_point() + xlab("PCoA1") + ylab("PCoA2") + theme(text = element_text(size = 16)) # We observe that samples originating from different regions of the same country # don't separate into clear clusters. # ==== Question 3.2 ==== # Coloring ordination plot by age. # Since about 1/6 of the age data originates from subjects less than a year old, taking the # logarithm of the age will allow us to see the trends more clearly. ggplot(ordination_table, aes(x = first, y = second, col = log2(age))) + geom_point() + xlab("PCoA1") + ylab("PCoA2") + theme(text = element_text(size = 16)) # In PCoA2, we see some variation associated with age - samples on the bottom tend to come from # young individuals while samples on the top tend to come from adults. # Coloring ordination plot by BMI. ggplot(ordination_table, aes(x = first, y = second, col = bmi)) + geom_point() + xlab("PCoA1") + ylab("PCoA2") + theme(text = element_text(size = 16)) # There appears to be no separation by BMI. # Coloring ordination plot by gender ggplot(ordination_table, aes(x = first, y = second, col = gender)) + geom_point() + xlab("PCoA1") + ylab("PCoA2") + theme(text = element_text(size = 16)) # There appears to be no separation by gender. # ---- PROJECT TASK 4 ---- # Calculating OTU prevalence. prevalence = (rowSums(gut_profiles_rarefied > 0) / ncol(gut_profiles_rarefied)) * 100 # Retaining OTUs that have at least 20% prevalence. selected_otus = names(which(prevalence > 20)) gut_profiles_subset <- gut_profiles_rarefied[selected_otus, ] # Assessing the distribution of values in this table. hist(gut_profiles_subset[gut_profiles_subset != 0]) skewness(gut_profiles_subset[gut_profiles_subset != 0]) # What if we log transform the data? log_data = log10(gut_profiles_subset[gut_profiles_subset != 0]) hist(log_data, breaks = 100) skewness(log_data) # Because the data don't follow a normal distribution, we will use the Wilcoxon rank-sum test. # Create three empty lists of the correct length that the test results can go into. wilcox_malawi_vs_usa = rep(NA, length(selected_otus)) wilcox_malawi_vs_venezuela = rep(NA, length(selected_otus)) wilcox_usa_vs_venezuela = rep(NA, length(selected_otus)) # Then we assign the OTU names as the names in the list. names(wilcox_malawi_vs_usa) = selected_otus names(wilcox_malawi_vs_venezuela) = selected_otus names(wilcox_usa_vs_venezuela) = selected_otus # Run through the dataset in a loop, calculate the log10 abundance of each OTU # for each category and then perform a Wilcoxon test. for(otu in selected_otus) { malawi_data = gut_profiles_subset[otu, malawi_samples] usa_data = gut_profiles_subset[otu, usa_samples] venezuela_data = gut_profiles_subset[otu, venezuela_samples] wilcox_malawi_vs_usa[otu] = wilcox.test(malawi_data, usa_data)$p.value wilcox_malawi_vs_venezuela[otu] = wilcox.test(malawi_data, venezuela_data)$p.value wilcox_usa_vs_venezuela[otu] = wilcox.test(usa_data, venezuela_data)$p.value } # Applying multiple testing correction using the FDR method. wilcox_malawi_vs_usa = p.adjust(wilcox_malawi_vs_usa, method = "fdr") wilcox_malawi_vs_venezuela = p.adjust(wilcox_malawi_vs_venezuela, method = "fdr") wilcox_usa_vs_venezuela = p.adjust(wilcox_usa_vs_venezuela, method = "fdr") # ==== Question 4.1 ==== # Counting the number of differentially abundant OTUs between different countries. sum(wilcox_malawi_vs_usa <= 0.05) # 1482 sum(wilcox_malawi_vs_venezuela <= 0.05, na.rm = TRUE) # 1081 sum(wilcox_usa_vs_venezuela <= 0.05) # 1434 # How many of these OTUs have a species-level taxonomic annotation? # First, let's transform the taxonomic data into a format that's easy to parse. otu_taxonomy_table = as.data.frame(t(sapply(otu_taxonomy, function(x) strsplit(x, ";")[[1]]))) colnames(otu_taxonomy_table) = c("kingdom", "phylum", "class", "order", "family", "genus", "species") # Differentially abundant OTUs with species level annotation. otus_malawi_vs_usa = names(wilcox_malawi_vs_usa[wilcox_malawi_vs_usa <= 0.05]) sum(otu_taxonomy_table[otus_malawi_vs_usa, "species"] != "s__") # 154 otus_malawi_vs_venezuela = names(na.omit(wilcox_malawi_vs_venezuela[wilcox_malawi_vs_venezuela <= 0.05])) sum(otu_taxonomy_table[otus_malawi_vs_venezuela, "species"] != "s__") # 98 otus_usa_vs_venezuela = names(wilcox_usa_vs_venezuela[wilcox_usa_vs_venezuela <= 0.05]) sum(otu_taxonomy_table[otus_usa_vs_venezuela, "species"] != "s__") # 151 # On average, about a tenth of the differentially abundant OTUs have a taxonomic assignment. # ==== Advanced Questions ==== # We will define a function to be able to generate the differential abundance analysis based on # the level input by the user. perform_differential_abundance_analysis <- function(tax_profile, aggregation_vector = c()) { # If aggregation vector is provided, aggregating counts. if (length(aggregation_vector) > 0) { tax_profile = apply(tax_profile, 2, function(sample) { tapply(sample, aggregation_vector, sum) }) } # Retaining only taxa above indicated prevalence. prevalence = (rowSums(tax_profile > 0) / ncol(tax_profile)) * 100 selected_taxa = names(which(prevalence > 20)) tax_profile_subset <- tax_profile[selected_taxa, ] # Create three empty lists of the correct length that the test results can go into. wilcox_malawi_vs_usa = rep(NA, length(selected_taxa)) wilcox_malawi_vs_venezuela = rep(NA, length(selected_taxa)) wilcox_usa_vs_venezuela = rep(NA, length(selected_taxa)) # Then we assign the names as the names in the list. names(wilcox_malawi_vs_usa) = selected_taxa names(wilcox_malawi_vs_venezuela) = selected_taxa names(wilcox_usa_vs_venezuela) = selected_taxa # Run through the dataset in a loop, calculate the log10 abundance of each clade # for each category and then perform a Wilcoxon test. for(taxon in selected_taxa) { malawi_data = tax_profile_subset[taxon, malawi_samples] usa_data = tax_profile_subset[taxon, usa_samples] venezuela_data = tax_profile_subset[taxon, venezuela_samples] wilcox_malawi_vs_usa[taxon] = wilcox.test(malawi_data, usa_data)$p.value wilcox_malawi_vs_venezuela[taxon] = wilcox.test(malawi_data, venezuela_data)$p.value wilcox_usa_vs_venezuela[taxon] = wilcox.test(usa_data, venezuela_data)$p.value } # Applying multiple testing correction using the FDR method. wilcox_malawi_vs_usa = p.adjust(wilcox_malawi_vs_usa, method = "fdr") wilcox_malawi_vs_venezuela = p.adjust(wilcox_malawi_vs_venezuela, method = "fdr") wilcox_usa_vs_venezuela = p.adjust(wilcox_usa_vs_venezuela, method = "fdr") # Returning data s a dataframe. differential_test = data.frame("taxon" = names(wilcox_malawi_vs_usa), "Mal_vs_Usa" = wilcox_malawi_vs_usa, "Mal_vs_Ven" = wilcox_malawi_vs_venezuela, "Usa_vs_Ven" = wilcox_usa_vs_venezuela) return(differential_test) } # As an example, let's run this function on species level. results_species <- perform_differential_abundance_analysis(gut_profiles_rarefied, otu_taxonomy_table$species) # Clades with assigned taxonomy can be easier to interpret when, for example, discussing the results with # someone trained in classical microbiology who may be interested in pursuing a specific microorganism for follow-up. # Its is also more likely that we would have some phenotypic and genomic data for these taxa, however, depending on # how the species has been defined and how large is the within-species variation, this information may be only partially # transferable to the taxon identified in the study. # ---- PROJECT TASK 5 ---- # Aggregating data on species level. gut_profiles_species = apply(gut_profiles_rarefied, 2, function(sample) { tapply(sample, otu_taxonomy_table$species, sum) }) # Removing the unassigned species. gut_profiles_species = gut_profiles_species[row.names(gut_profiles_species) != "s__", ] # We take 10% of samples from each country for the test set. set.seed(0) train_proportion <- 0.9 train_indices_malawi <- sample(1:length(malawi_samples), size = floor(train_proportion*length(malawi_samples))) train_indices_venezuela <- sample(1:length(venezuela_samples), size = floor(train_proportion*length(venezuela_samples))) train_indices_usa <- sample(1:length(usa_samples), size = floor(train_proportion*length(usa_samples))) # Classifier to distinguish samples between Malawi and USA. # Create training and test sets for OTU data and labels. train_malawi_vs_usa <- gut_profiles_species[ , c(malawi_samples[train_indices_malawi], usa_samples[train_indices_usa])] test_malawi_vs_usa <- gut_profiles_species[ , c(malawi_samples[-train_indices_malawi], usa_samples[-train_indices_usa])] train_malawi_vs_usa_labels <- c(rep("Malawi", length(train_indices_malawi)), rep("USA", length(train_indices_usa))) test_malawi_vs_usa_labels <- c(rep("Malawi", length(malawi_samples[-train_indices_malawi])), rep("USA", length(usa_samples[-train_indices_usa]))) # Fit the model using training data classifier_malawi_vs_usa <- randomForest(x = t(train_malawi_vs_usa), y = as.factor(train_malawi_vs_usa_labels)) classifier_malawi_vs_usa # Malawi USA class.error # Malawi 101 1 0.009803922 # USA 0 282 0.000000000 # Extracting the importance of species from the classifier. importance_malawi_vs_usa <- data.frame(Feature = rownames(classifier_malawi_vs_usa$importance), classifier_malawi_vs_usa$importance) # Classifier to distinguish samples between Venezuela and USA. # Create training and test sets for OTU data and labels. train_venezuela_vs_usa <- gut_profiles_species[ , c(venezuela_samples[train_indices_venezuela], usa_samples[train_indices_usa])] test_venezuela_vs_usa <- gut_profiles_species[ , c(venezuela_samples[-train_indices_venezuela], usa_samples[-train_indices_usa])] train_venezuela_vs_usa_labels <- c(rep("Venezuela", length(train_indices_venezuela)), rep("USA", length(train_indices_usa))) test_venezuela_vs_usa_labels <- c(rep("Venezuela", length(venezuela_samples[-train_indices_venezuela])), rep("USA", length(usa_samples[-train_indices_usa]))) # Fit the model using training data classifier_venezuela_vs_usa <- randomForest(x = t(train_venezuela_vs_usa), y = as.factor(train_venezuela_vs_usa_labels)) classifier_venezuela_vs_usa # Confusion matrix: #USA Venezuela class.error #USA 282 0 0.00000000 #Venezuela 7 83 0.07777778 # Extracting the importance of species from the classifier. importance_venezuela_vs_usa <- data.frame(Feature = rownames(classifier_venezuela_vs_usa$importance), classifier_venezuela_vs_usa$importance) # Classifier to distinguish samples between Venezuela and Malawi. # Create training and test sets for OTU data and labels. train_venezuela_vs_malawi <- gut_profiles_species[ , c(venezuela_samples[train_indices_venezuela], malawi_samples[train_indices_malawi])] test_venezuela_vs_malawi <- gut_profiles_species[ , c(venezuela_samples[-train_indices_venezuela], malawi_samples[-train_indices_malawi])] train_venezuela_vs_malawi_labels <- c(rep("Venezuela", length(train_indices_venezuela)), rep("Malawi", length(train_indices_malawi))) test_venezuela_vs_malawi_labels <- c(rep("Venezuela", length(venezuela_samples[-train_indices_venezuela])), rep("Malawi", length(malawi_samples[-train_indices_malawi]))) # Fit the model using training data classifier_venezuela_vs_malawi <- randomForest(x = t(train_venezuela_vs_malawi), y = as.factor(train_venezuela_vs_malawi_labels)) classifier_venezuela_vs_malawi #Confusion matrix: # Malawi Venezuela class.error #Malawi 99 3 0.02941176 #Venezuela 15 75 0.16666667 # Extracting the importance of species from the classifier. importance_venezuela_vs_malawi <- data.frame(Feature = rownames(classifier_venezuela_vs_malawi$importance), classifier_venezuela_vs_malawi$importance) # ==== Question 5.1 ==== # Performing the predictions on the test data using 'predict' function to output probability of classification. malawi_vs_usa_output <- as.data.frame(predict(classifier_malawi_vs_usa, t(test_malawi_vs_usa), type = "prob")) # Assigning predicted label based on the country with higher probability of classification. malawi_vs_usa_output$predicted <- names(malawi_vs_usa_output)[1:2][apply(malawi_vs_usa_output[,1:2], 1, which.max)] malawi_vs_usa_output$observed <- test_malawi_vs_usa_labels # Confusion Matrix table(malawi_vs_usa_output[,3:4]) # observed # predicted Malawi USA # Malawi 12 0 # USA 0 32 # Plotting the ROC curve. malawi_vs_usa_prediction <- prediction(malawi_vs_usa_output[, 2], as.factor(test_malawi_vs_usa_labels)) roc_malawi_vs_us = performance(malawi_vs_usa_prediction, measure = "tpr", x.measure = "fpr") plot(roc_malawi_vs_us) abline(a = 0, b = 1) # Performing the predictions on the test data using 'predict' function to output probability of classification. venezuela_vs_usa_output <- as.data.frame(predict(classifier_venezuela_vs_usa, t(test_venezuela_vs_usa), type = "prob")) # Assigning predicted label based on the country with higher probability of classification. venezuela_vs_usa_output$predicted <- names(venezuela_vs_usa_output)[1:2][apply(venezuela_vs_usa_output[,1:2], 1, which.max)] venezuela_vs_usa_output$observed <- test_venezuela_vs_usa_labels # Confusion Matrix table(venezuela_vs_usa_output[,3:4]) # observed # predicted Venezuela USA # Venezuela 32 1 # USA 0 9 # Plotting the ROC curve. venezuela_vs_usa_prediction <- prediction(venezuela_vs_usa_output[, 2], as.factor(test_venezuela_vs_usa_labels)) roc_venezuela_vs_us = performance(venezuela_vs_usa_prediction, measure = "tpr", x.measure = "fpr") plot(roc_venezuela_vs_us) abline(a = 0, b = 1) # Performing the predictions on the test data using 'predict' function to output probability of classification. venezuela_vs_malawi_output <- as.data.frame(predict(classifier_venezuela_vs_malawi, t(test_venezuela_vs_malawi), type = "prob")) # Assigning predicted label based on the country with higher probability of classification. venezuela_vs_malawi_output$predicted <- names(venezuela_vs_malawi_output)[1:2][apply(venezuela_vs_malawi_output[,1:2], 1, which.max)] venezuela_vs_malawi_output$observed <- test_venezuela_vs_malawi_labels # Confusion Matrix table(venezuela_vs_malawi_output[,3:4]) # observed # predicted Venezuela Malawi # Venezuela 12 1 # Malawi 0 9 # Plotting the ROC curve. venezuela_vs_malawi_prediction <- prediction(venezuela_vs_malawi_output[, 2], as.factor(test_venezuela_vs_malawi_labels)) roc_venezuela_vs_us = performance(venezuela_vs_malawi_prediction, measure = "tpr", x.measure = "fpr") plot(roc_venezuela_vs_us) abline(a = 0, b = 1) performance(venezuela_vs_malawi_prediction, measure = "auc")@y.values[[1]] # [1] 0.975 # We see that all three classifiers can successfully distinguish samples originating from populations in different # countries, correctly predicting either all samples within the test dataset or all but one. # ==== Question 5.2 ==== # Displaying the 10 most important species in each classifier. head(importance_malawi_vs_usa[order(importance_malawi_vs_usa$MeanDecreaseGini, decreasing = TRUE), ], n = 10) head(importance_venezuela_vs_usa[order(importance_venezuela_vs_usa$MeanDecreaseGini, decreasing = TRUE), ], n = 10) head(importance_venezuela_vs_malawi[order(importance_venezuela_vs_malawi$MeanDecreaseGini, decreasing = TRUE), ], n = 10) # You will notice there's quite a bit overlap in the species that distinguish samples originating from non-westernized # countries as opposed to samples from the USA.