# ---- METAGENOMICS PROJECT 1 SOLUTION SCRIPT ----

# Uploading packages required for the analysis.
library("ggplot2")
library("moments")
library("randomForest")
library("ROCR")
library("vegan")

# Loading the data set into the environment.
load("/nfs/nas22/fs2202/biol_micro_teaching/551-1299-00L/metagenomics/gut_microbiome_dataset.RData")

# Before proceeding with the analysis, we need to rarefy the data.

# First, checking what is the lowest sequencing depth in the data set.
min(colSums(gut_profiles))
# [1] 305631

# Rarefying all samples to the lowest sequencing depth in the data set.
set.seed(0)  # setting a random seed so results are directly comparable between code re-runs.
gut_profiles_rarefied = t(rrarefy(t(gut_profiles), 305631))


# ---- PROJECT TASK 1 ----

# Calculating observed richness.
richness = t(estimateR(t(gut_profiles_rarefied)))[ , "S.obs"]

# Calculating Shannon diversity.
shannon_index = diversity(t(gut_profiles_rarefied), "shannon")

# Calculating Pielou evenness.
PielouE <- function(counts_in_sample, consider_zeros = FALSE){
  if (consider_zeros == FALSE) counts_in_sample <- as.vector(counts_in_sample[counts_in_sample > 0])
  
  shannon_diversity <- diversity(counts_in_sample)
  number_taxa <- length(counts_in_sample)
  evenness <- shannon_diversity/log(number_taxa)
  
  return(evenness)
}
evenness = apply(gut_profiles_rarefied, 2, PielouE)

# Adding data on alpha-diversity metrics to a table for plotting.
# It's useful to add all metadata so we can plot and split the data in multiple ways.
alpha_diversity_table <- gut_metadata
alpha_diversity_table$richness  = richness[row.names(alpha_diversity_table)]
alpha_diversity_table$diversity = shannon_index[row.names(alpha_diversity_table)]
alpha_diversity_table$evenness  = evenness[row.names(alpha_diversity_table)]

# ==== Question 1.1 ====

# Plotting data on country vs richness.
ggplot(alpha_diversity_table, aes(x = country, y = richness)) +
  geom_boxplot() +
  xlab("") +
  ylab("Observed Richness") +
  theme(text = element_text(size = 16)) +
  theme_bw()
# Performing tests to compare these values between three countries.
t.test(alpha_diversity_table[alpha_diversity_table["country"] == "Malawi", "richness"], 
       alpha_diversity_table[alpha_diversity_table["country"] == "USA", "richness"])  # not significant
t.test(alpha_diversity_table[alpha_diversity_table["country"] == "Malawi", "richness"], 
       alpha_diversity_table[alpha_diversity_table["country"] == "Venezuela", "richness"])  # significant
t.test(alpha_diversity_table[alpha_diversity_table["country"] == "USA", "richness"], 
       alpha_diversity_table[alpha_diversity_table["country"] == "Venezuela", "richness"])  # significant


# Plotting data on country vs diversity.
ggplot(alpha_diversity_table, aes(x = country, y = diversity)) +
  geom_boxplot() +
  xlab("") +
  ylab("Shannon Index") +
  theme(text = element_text(size = 16)) +
  theme_bw()
# Performing tests to compare these values between three countries.
t.test(alpha_diversity_table[alpha_diversity_table["country"] == "Malawi", "diversity"], 
       alpha_diversity_table[alpha_diversity_table["country"] == "USA", "diversity"])  # significant
t.test(alpha_diversity_table[alpha_diversity_table["country"] == "Malawi", "diversity"], 
       alpha_diversity_table[alpha_diversity_table["country"] == "Venezuela", "diversity"])  # significant
t.test(alpha_diversity_table[alpha_diversity_table["country"] == "USA", "diversity"],   
       alpha_diversity_table[alpha_diversity_table["country"] == "Venezuela", "diversity"])  # not significant


# Plotting data on country vs diversity.
ggplot(alpha_diversity_table, aes(x = country, y = evenness)) +
  geom_boxplot() +
  xlab("") +
  ylab("Pielou Evenness") +
  theme(text = element_text(size = 16)) +
  theme_bw()
# Performing tests to compare these values between three countries.
t.test(alpha_diversity_table[alpha_diversity_table["country"] == "Malawi", "evenness"], 
       alpha_diversity_table[alpha_diversity_table["country"] == "USA", "evenness"])  # significant
t.test(alpha_diversity_table[alpha_diversity_table["country"] == "Malawi", "evenness"], 
       alpha_diversity_table[alpha_diversity_table["country"] == "Venezuela", "evenness"])  # significant
t.test(alpha_diversity_table[alpha_diversity_table["country"] == "USA", "evenness"], 
       alpha_diversity_table[alpha_diversity_table["country"] == "Venezuela", "evenness"])  # not significant


# Based on these observations, it appears as if the samples from Venezuela have the highest alpha-diversity
# according to all three metrics. Samples from the US have similar Shannon index and Pielou evenness
# distributions, but have significantly lower richness. The samples from Malawi have the lowest alpha-diversity
# on all three metrics. However, before we make any concrete conclusions, let's look at other 
# factors measured in the metadata.

# ==== Question 1.2 ====

# Alpha-diversity metrics versus age
ggplot(alpha_diversity_table, aes(x = age, y = richness)) +
  geom_point() +
  xlab("Age") +
  ylab("Observed Richness") +
  theme(text = element_text(size = 16))
# Using the Spearman test as the relationship is not linear
cor.test(alpha_diversity_table$age, alpha_diversity_table$richness, method = "spearman") # significant

ggplot(alpha_diversity_table, aes(x = age, y = diversity)) +
  geom_point() +
  xlab("Age") +
  ylab("Shannon Index") +
  theme(text = element_text(size = 16))
cor.test(alpha_diversity_table$age, alpha_diversity_table$diversity, method = "spearman") # significant

ggplot(alpha_diversity_table, aes(x = age, y = evenness)) +
  geom_point() +
  xlab("Age") +
  ylab("Pielou Evenness") +
  theme(text = element_text(size = 16))
cor.test(alpha_diversity_table$age, alpha_diversity_table$evenness, method = "spearman") # significant


# Alpha-diversity metrics versus gender
ggplot(alpha_diversity_table, aes(x = gender, y = richness)) +
  geom_boxplot() +
  xlab("Gender") +
  ylab("Observed Richness") +
  theme(text = element_text(size = 16))
t.test(alpha_diversity_table[alpha_diversity_table["gender"] == "M", "richness"], 
       alpha_diversity_table[alpha_diversity_table["gender"] == "F", "richness"])  # significant

ggplot(alpha_diversity_table, aes(x = gender, y = diversity)) +
  geom_boxplot() +
  xlab("Gender") +
  ylab("Shannon Index") +
  theme(text = element_text(size = 16))
t.test(alpha_diversity_table[alpha_diversity_table["gender"] == "M", "diversity"], 
       alpha_diversity_table[alpha_diversity_table["gender"] == "F", "diversity"])  # significant

ggplot(alpha_diversity_table, aes(x = gender, y = evenness)) +
  geom_boxplot() +
  xlab("Gender") +
  ylab("Pielou Evenness") +
  theme(text = element_text(size = 16))
t.test(alpha_diversity_table[alpha_diversity_table["gender"] == "M", "evenness"], 
       alpha_diversity_table[alpha_diversity_table["gender"] == "F", "evenness"])  # significant


# Alpha-diversity metrics versus BMI
ggplot(alpha_diversity_table, aes(x = bmi, y = richness)) +
  geom_point() +
  xlab("Body Mass Index") +
  ylab("Observed Richness") +
  theme(text = element_text(size = 16))
cor.test(alpha_diversity_table$bmi, alpha_diversity_table$richness, method = "spearman") # significant

ggplot(alpha_diversity_table, aes(x = bmi, y = diversity)) +
  geom_point() +
  xlab("Body Mass Index") +
  ylab("Shannon Index") +
  theme(text = element_text(size = 16))
cor.test(alpha_diversity_table$bmi, alpha_diversity_table$diversity, method = "spearman") # significant

ggplot(alpha_diversity_table, aes(x = bmi, y = evenness)) +
  geom_point() +
  xlab("Body Mass Index") +
  ylab("Pielou Evenness") +
  theme(text = element_text(size = 16))
cor.test(alpha_diversity_table$bmi, alpha_diversity_table$evenness, method = "spearman") # significant

# Based on these tests, we observe a positive relationship between age and alpha-diversity,
# between BMI and diversity. It also appears as if male subjects have less gut diversity than female subjects,
# but let's look at factors that can potentially confound this observation.

# ==== Question 1.3 ====

# Let's color the dots on the richness versus age plot by country.
ggplot(alpha_diversity_table, aes(x = age, y = richness, col = country)) +
  geom_point() +
  xlab("Age") +
  ylab("Observed Richness") +
  theme(text = element_text(size = 16))
# Now, we see that the samples from the adults from the USA are actually less diverse than those from
# the adults from Malawi and Venezuela.


# We can look at age distribution by country:
ggplot(alpha_diversity_table, aes(x = age, fill = country)) +
  geom_histogram(position = "identity") +
  facet_grid(country ~ .) # this will plot the histograms in separate panels so it's easier to compare
# We see that there are more samples from USA adults than from the other two countries.
# In the first years of life, the gut microbiome is less diverse, so the higher fraction of gut microbiome
# samples from Malawian and Venezuelan children bring the average alpha-diversity down.


# Similarly, if we look at age distribution by gender:
ggplot(alpha_diversity_table, aes(x = age, fill = gender)) +
  geom_histogram(position = "identity") +
  facet_grid(gender ~ .) 
# There are more adult female subjects in the data set, so it appears as if male subjects display
# lower gut microbiome diversity. However, if we only plot the data from adults:
ggplot(alpha_diversity_table[alpha_diversity_table$age >= 5, ], aes(x = gender, y = richness)) +
  geom_boxplot() +
  xlab("Gender") +
  ylab("Observed Richness") +
  theme(text = element_text(size = 16))
t.test(alpha_diversity_table[alpha_diversity_table["gender"] == "M" & alpha_diversity_table["age"] >= 5, "richness"], 
       alpha_diversity_table[alpha_diversity_table["gender"] == "F" & alpha_diversity_table["age"] >= 5, "richness"])

# We will observe no more significant difference in gut microbiome diversity between the two genders.
# In fact, since all NA values in the gender column come from Malawi and Venezuela, we observe a higher
# median richness for NA values in the boxplot.


# ---- PROJECT TASK 2 ----

# Calculating Jaccard distance for the samples.
gut_profiles_jaccard = as.matrix(vegdist(t(gut_profiles_rarefied), method = "jaccard"))

# Since the Jaccard distance matrix is symmetric, we can take only half of the values.
gut_profiles_jaccard_lower_tri <- gut_profiles_jaccard
gut_profiles_jaccard_lower_tri[lower.tri(gut_profiles_jaccard_lower_tri, diag = TRUE)] <- NA

# ==== Question 2.1 ====

# Vectors where we will store the results.
jaccard_malawi_vs_malawi = c()
jaccard_usa_vs_usa = c()
jaccard_venezuela_vs_venezuela = c()
jaccard_malawi_vs_usa = c()
jaccard_malawi_vs_venezuela = c()
jaccard_usa_vs_venezuela = c()

# These are all that come from Malawi.
malawi_samples = gut_metadata[gut_metadata$country == "Malawi", "sample_id"]
# These are all that come from the USA.
usa_samples = gut_metadata[gut_metadata$country == "USA", "sample_id"]
# These are all that come from Venezuela.
venezuela_samples = gut_metadata[gut_metadata$country == "Venezuela", "sample_id"]


for(sample_id in unique(gut_metadata$sample_id)){
  # First, let's check if the sample is from Malawi
  if(sample_id %in% malawi_samples) {
    
    # If this is a sample from Malawi, we can add its distances to the malawi_vs_malawi, malawi_vs_usa,
    # and malawi_vs_venezuela comparisons.
    malawi_comparisons_for_sample = gut_profiles_jaccard_lower_tri[sample_id, malawi_samples]
    jaccard_malawi_vs_malawi = c(jaccard_malawi_vs_malawi, malawi_comparisons_for_sample)
    
    usa_comparisons_for_sample = gut_profiles_jaccard_lower_tri[sample_id, usa_samples]
    jaccard_malawi_vs_usa = c(jaccard_malawi_vs_usa, usa_comparisons_for_sample)
    
    venezuela_comparisons_for_sample = gut_profiles_jaccard_lower_tri[sample_id, venezuela_samples]
    jaccard_malawi_vs_venezuela = c(jaccard_malawi_vs_venezuela, venezuela_comparisons_for_sample)
    
  } else if(sample_id %in% usa_samples) {  # Second, let's check if the sample is from the USA
    
    # If this is a sample from the USA, we can add its distances to the usa_vs_usa, malawi_vs_usa,
    # and usa_vs_venezuela comparisons.
    malawi_comparisons_for_sample = gut_profiles_jaccard_lower_tri[sample_id, malawi_samples]
    jaccard_malawi_vs_usa = c(jaccard_malawi_vs_usa, malawi_comparisons_for_sample)
    
    usa_comparisons_for_sample = gut_profiles_jaccard_lower_tri[sample_id, usa_samples]
    jaccard_usa_vs_usa = c(jaccard_usa_vs_usa, usa_comparisons_for_sample)
    
    venezuela_comparisons_for_sample = gut_profiles_jaccard_lower_tri[sample_id, venezuela_samples]
    jaccard_usa_vs_venezuela = c(jaccard_usa_vs_venezuela, venezuela_comparisons_for_sample)
    
  } else {  # If the sample is neither from Malawi or the USA, it comes from Venezuela.

    # If this is a sample from Venezuela, we can add its distances to the venezuela_vs_venezuela,
    # malawi_vs_venezuela, and usa_vs_venezuela comparisons.
    malawi_comparisons_for_sample = gut_profiles_jaccard_lower_tri[sample_id, malawi_samples]
    jaccard_malawi_vs_venezuela = c(jaccard_malawi_vs_venezuela, malawi_comparisons_for_sample)
    
    usa_comparisons_for_sample = gut_profiles_jaccard_lower_tri[sample_id, usa_samples]
    jaccard_usa_vs_venezuela = c(jaccard_usa_vs_venezuela, usa_comparisons_for_sample)
    
    venezuela_comparisons_for_sample = gut_profiles_jaccard_lower_tri[sample_id, venezuela_samples]
    jaccard_venezuela_vs_venezuela = c(jaccard_venezuela_vs_venezuela, venezuela_comparisons_for_sample)
  }

}

# When we took half the values in the distance matrix, we introduced NA values into the data. Let's remove these
# before performing the statistical tests.
jaccard_malawi_vs_malawi = na.omit(jaccard_malawi_vs_malawi)
jaccard_usa_vs_usa = na.omit(jaccard_usa_vs_usa)
jaccard_venezuela_vs_venezuela = na.omit(jaccard_venezuela_vs_venezuela)
jaccard_malawi_vs_usa = na.omit(jaccard_malawi_vs_usa)
jaccard_malawi_vs_venezuela = na.omit(jaccard_malawi_vs_venezuela)
jaccard_usa_vs_venezuela = na.omit(jaccard_usa_vs_venezuela)

# Calculating average distance for samples from the same country.
mean(jaccard_malawi_vs_malawi)  # 0.8125623
mean(jaccard_usa_vs_usa)  # 0.8259492
mean(jaccard_venezuela_vs_venezuela) # 0.8061796

# Calculating average distance from samples from different countries.
mean(jaccard_malawi_vs_usa)  # 0.916304
mean(jaccard_malawi_vs_venezuela)  # 0.8351879
mean(jaccard_usa_vs_venezuela) # 0.8982677

# Samples from the same country tend to be more similar to each other than to samples from different countries.
# However, the beta-diversity values between Malawi and Venezuela are smaller than those of these countries
# and the USA.


# ==== Advanced Questions ====

# Are samples from the same family more similar to each other than from different families?
jaccard_same_family = c()
jaccard_diff_family = c()

list_of_families = unique(gut_metadata$family_id)
for(family in list_of_families) {
  
  within_family_samples = gut_metadata[gut_metadata$family_id == family, "sample_id"]
  outside_family_samples = gut_metadata[gut_metadata$family_id != family, "sample_id"]
  
  # Selecting all cells were samples from the same family are compared.
  within_family_distances = as.vector(gut_profiles_jaccard_lower_tri[within_family_samples, within_family_samples])
  jaccard_same_family = c(jaccard_same_family, na.omit(within_family_distances))
  
  # Selecting all cells were samples from the same family are to samples from other families.
  outside_family_distances = as.vector(gut_profiles_jaccard_lower_tri[within_family_samples, outside_family_samples])
  jaccard_diff_family = c(jaccard_diff_family, na.omit(outside_family_distances))
  
}
# Looking at average distances.
mean(jaccard_same_family)  # 0.7445527
mean(jaccard_diff_family)  # 0.8654604


# We can do the same comparison, but separating samples by country.
jaccard_same_family_by_country = list()
jaccard_diff_family_by_country = list()

for(country in c("Malawi", "USA", "Venezuela")) {
  
  # Selecting rows that correspond to country.
  gut_metadata_for_country = gut_metadata[gut_metadata$country == country, ]
  list_of_families_country = unique(gut_metadata_for_country$family_id)
  
  jaccard_same_family_country = c()
  jaccard_diff_family_country = c()
  
  for(family in list_of_families_country) {
    
    within_family_samples = gut_metadata_for_country[gut_metadata_for_country$family_id == family, "sample_id"]
    outside_family_samples = gut_metadata_for_country[gut_metadata_for_country$family_id != family, "sample_id"]
    
    # Selecting all cells were samples from the same family are compared.
    within_family_distances = as.vector(gut_profiles_jaccard_lower_tri[within_family_samples, within_family_samples])
    jaccard_same_family_country = c(jaccard_same_family_country, na.omit(within_family_distances))
    
    # Selecting all cells were samples from the same family are to samples from other families.
    outside_family_distances = as.vector(gut_profiles_jaccard_lower_tri[within_family_samples, outside_family_samples])
    jaccard_diff_family_country = c(jaccard_diff_family_country, na.omit(outside_family_distances))
  }
  
  jaccard_same_family_by_country = c(jaccard_same_family_by_country, list(jaccard_same_family_country))
  jaccard_diff_family_by_country = c(jaccard_diff_family_by_country, list(jaccard_diff_family_country))
}

# We can put the results into a table for plotting.
plot_distances_by_family = data.frame("country" = c(rep("Malawi", length(jaccard_same_family_by_country[[1]]) + length(jaccard_diff_family_by_country[[1]])),
                                                    rep("USA", length(jaccard_same_family_by_country[[2]]) + length(jaccard_diff_family_by_country[[2]])),
                                                    rep("Venezuela", length(jaccard_same_family_by_country[[3]]) + length(jaccard_diff_family_by_country[[3]]))),
                                      "family"  = c(rep("Same", length(jaccard_same_family_by_country[[1]])),
                                                    rep("Different", length(jaccard_diff_family_by_country[[1]])),
                                                    rep("Same", length(jaccard_same_family_by_country[[2]])),
                                                    rep("Different", length(jaccard_diff_family_by_country[[2]])),
                                                    rep("Same", length(jaccard_same_family_by_country[[3]])),
                                                    rep("Different", length(jaccard_diff_family_by_country[[3]]))),
                                      "jaccard" = c(jaccard_same_family_by_country[[1]], jaccard_diff_family_by_country[[1]],
                                                    jaccard_same_family_by_country[[2]], jaccard_diff_family_by_country[[2]],
                                                    jaccard_same_family_by_country[[3]], jaccard_diff_family_by_country[[3]]))


ggplot(plot_distances_by_family, aes(x = country, y = jaccard, fill = family)) + 
  geom_boxplot() +
  ylim(c(0, 1)) +
  xlab("") +
  ylab("Jaccard Distance") +
  scale_fill_discrete(name = "Family") +
  theme_bw() +
  theme(text = element_text(size = 16))
# Overall, individuals originating from the same family tend to have more similar microbiomes than
# individuals from different families. In the USA samples, the separation is especially clear.

# Another question we can test is whether twins have more similar microbiomes than siblings.
selected_family_relationships = c("Sibling", "Twin1", "Twin2", "Twin3", "Brother1", "Sister1", "Brother2", "Sister", "Brother")
metadata_families_w_siblings = gut_metadata[gut_metadata$family_relationship %in% selected_family_relationships, ]
# So it's easier, let's remove the numbers at the end of the words.
metadata_families_w_siblings$family_relationship = sapply(metadata_families_w_siblings$family_relationship, function(x) gsub("[0-9]", "", x))
# For some siblings, the age is undetermined, so we set it to a high number that it always return FALSE during comparison.
metadata_families_w_siblings[is.na(metadata_families_w_siblings$age), "age"] = 100

jaccard_twins = c()
jaccard_siblings = c()

for(family in unique(metadata_families_w_siblings$family_id)) {
  
  family_data = metadata_families_w_siblings[metadata_families_w_siblings$family_id == family, ]
  
  for(sample_1 in family_data$sample_id) {
    
    for(sample_2 in family_data$sample_id) {
      
      if(sample_1 == sample_2) next
      
      if(family_data[sample_1, "age"] == family_data[sample_2, "age"]){
        jaccard_twins = c(jaccard_twins, gut_profiles_jaccard_lower_tri[sample_1, sample_2])
      } else {
        jaccard_siblings = c(jaccard_siblings, gut_profiles_jaccard_lower_tri[sample_1, sample_2])
      }
    }
  }
}
plot_distances_siblings = data.frame("relationship" = c(rep("Twins", length(jaccard_twins)),
                                                        rep("Siblings", length(jaccard_siblings))),
                                     "jaccard" = c(jaccard_twins, jaccard_siblings))
ggplot(plot_distances_siblings, aes(x = relationship, y = jaccard)) + 
  geom_boxplot() +
  ylim(c(0, 1)) +
  xlab("") +
  ylab("Jaccard Distance") +
  theme_bw() +
  theme(text = element_text(size = 16))
# It appears that twins tend to have more similar microbiomes to each other when compared to siblings.


# ---- PROJECT TASK 3 ----

# Performing Principle Coordinates Analysis based on the Jaccard distance matrix.
jaccard_ordination = cmdscale(gut_profiles_jaccard)

# Adding these values to the gut metadata table.
ordination_table <- gut_metadata
ordination_table$first <- jaccard_ordination[rownames(ordination_table), 1]
ordination_table$second <- jaccard_ordination[rownames(ordination_table), 2]


# ==== Question 3.1 ====

# Coloring ordination plot by country.
ggplot(ordination_table, aes(x = first, y = second, col = country)) +
  geom_point() +
  xlab("PCoA1") +
  ylab("PCoA2") +
  theme(text = element_text(size = 16))
# In PCoA1, USA samples are well separated from samples from Malawi and Venezuela.
# The samples from Malawi and Venezuela show less separation between each other.

# Coloring ordination plot by region.
ggplot(ordination_table, aes(x = first, y = second, col = region)) +
  geom_point() +
  xlab("PCoA1") +
  ylab("PCoA2") +
  theme(text = element_text(size = 16))
# We observe that samples originating from different regions of the same country
# don't separate into clear clusters.


# ==== Question 3.2 ====

# Coloring ordination plot by age.
# Since about 1/6 of the age data originates from subjects less than a year old, taking the
# logarithm of the age will allow us to see the trends more clearly.
ggplot(ordination_table, aes(x = first, y = second, col = log2(age))) +
  geom_point() +
  xlab("PCoA1") +
  ylab("PCoA2") +
  theme(text = element_text(size = 16))
# In PCoA2, we see some variation associated with age - samples on the bottom tend to come from
# young individuals while samples on the top tend to come from adults.

# Coloring ordination plot by BMI.
ggplot(ordination_table, aes(x = first, y = second, col = bmi)) +
  geom_point() +
  xlab("PCoA1") +
  ylab("PCoA2") +
  theme(text = element_text(size = 16))
# There appears to be no separation by BMI.

# Coloring ordination plot by gender
ggplot(ordination_table, aes(x = first, y = second, col = gender)) +
  geom_point() +
  xlab("PCoA1") +
  ylab("PCoA2") +
  theme(text = element_text(size = 16))
# There appears to be no separation by gender.


# ---- PROJECT TASK 4 ----

# Calculating OTU prevalence.
prevalence = (rowSums(gut_profiles_rarefied > 0) / ncol(gut_profiles_rarefied)) * 100

# Retaining OTUs that have at least 20% prevalence.
selected_otus = names(which(prevalence > 20))
gut_profiles_subset <- gut_profiles_rarefied[selected_otus, ]

# Assessing the distribution of values in this table.
hist(gut_profiles_subset[gut_profiles_subset != 0])
skewness(gut_profiles_subset[gut_profiles_subset != 0])

# What if we log transform the data?
log_data = log10(gut_profiles_subset[gut_profiles_subset != 0])
hist(log_data, breaks = 100)
skewness(log_data)
# Because the data don't follow a normal distribution, we will use the Wilcoxon rank-sum test.

# Create three empty lists of the correct length that the test results can go into.
wilcox_malawi_vs_usa = rep(NA, length(selected_otus))
wilcox_malawi_vs_venezuela = rep(NA, length(selected_otus))
wilcox_usa_vs_venezuela = rep(NA, length(selected_otus))

# Then we assign the OTU names as the names in the list.
names(wilcox_malawi_vs_usa) = selected_otus
names(wilcox_malawi_vs_venezuela) = selected_otus
names(wilcox_usa_vs_venezuela) = selected_otus

# Run through the dataset in a loop, calculate the log10 abundance of each OTU
# for each category and then perform a Wilcoxon test.
for(otu in selected_otus) {
  malawi_data = gut_profiles_subset[otu, malawi_samples]
  usa_data = gut_profiles_subset[otu, usa_samples]
  venezuela_data = gut_profiles_subset[otu, venezuela_samples]

  wilcox_malawi_vs_usa[otu] = wilcox.test(malawi_data, usa_data)$p.value
  wilcox_malawi_vs_venezuela[otu] = wilcox.test(malawi_data, venezuela_data)$p.value
  wilcox_usa_vs_venezuela[otu] = wilcox.test(usa_data, venezuela_data)$p.value
}

# Applying multiple testing correction using the FDR method.
wilcox_malawi_vs_usa = p.adjust(wilcox_malawi_vs_usa, method = "fdr")
wilcox_malawi_vs_venezuela = p.adjust(wilcox_malawi_vs_venezuela, method = "fdr")
wilcox_usa_vs_venezuela = p.adjust(wilcox_usa_vs_venezuela, method = "fdr")


# ==== Question 4.1 ====

# Counting the number of differentially abundant OTUs between different countries.
sum(wilcox_malawi_vs_usa <= 0.05)  # 1482
sum(wilcox_malawi_vs_venezuela <= 0.05, na.rm = TRUE)  # 1081
sum(wilcox_usa_vs_venezuela <= 0.05)  # 1434

# How many of these OTUs have a species-level taxonomic annotation?
# First, let's transform the taxonomic data into a format that's easy to parse.
otu_taxonomy_table = as.data.frame(t(sapply(otu_taxonomy, function(x) strsplit(x, ";")[[1]])))
colnames(otu_taxonomy_table) = c("kingdom", "phylum", "class", "order",
                                 "family", "genus", "species")

# Differentially abundant OTUs with species level annotation.
otus_malawi_vs_usa = names(wilcox_malawi_vs_usa[wilcox_malawi_vs_usa <= 0.05])
sum(otu_taxonomy_table[otus_malawi_vs_usa, "species"] != "s__")  # 154

otus_malawi_vs_venezuela = names(na.omit(wilcox_malawi_vs_venezuela[wilcox_malawi_vs_venezuela <= 0.05]))
sum(otu_taxonomy_table[otus_malawi_vs_venezuela, "species"] != "s__")  # 98

otus_usa_vs_venezuela = names(wilcox_usa_vs_venezuela[wilcox_usa_vs_venezuela <= 0.05])
sum(otu_taxonomy_table[otus_usa_vs_venezuela, "species"] != "s__")  # 151
# On average, about a tenth of the differentially abundant OTUs have a taxonomic assignment.


# ==== Advanced Questions ====

# We will define a function to be able to generate the differential abundance analysis based on
# the level input by the user.
perform_differential_abundance_analysis <- function(tax_profile, aggregation_vector = c()) {
  # If aggregation vector is provided, aggregating counts.
  if (length(aggregation_vector) > 0) {
    tax_profile = apply(tax_profile, 2, function(sample) {
      tapply(sample, aggregation_vector, sum)
    })
  }

  # Retaining only taxa above indicated prevalence.
  prevalence = (rowSums(tax_profile > 0) / ncol(tax_profile)) * 100
  selected_taxa = names(which(prevalence > 20))
  tax_profile_subset <- tax_profile[selected_taxa, ]
  
  # Create three empty lists of the correct length that the test results can go into.
  wilcox_malawi_vs_usa = rep(NA, length(selected_taxa))
  wilcox_malawi_vs_venezuela = rep(NA, length(selected_taxa))
  wilcox_usa_vs_venezuela = rep(NA, length(selected_taxa))
  
  # Then we assign the names as the names in the list.
  names(wilcox_malawi_vs_usa) = selected_taxa
  names(wilcox_malawi_vs_venezuela) = selected_taxa
  names(wilcox_usa_vs_venezuela) = selected_taxa
  
  # Run through the dataset in a loop, calculate the log10 abundance of each clade
  # for each category and then perform a Wilcoxon test.
  for(taxon in selected_taxa) {
    malawi_data = tax_profile_subset[taxon, malawi_samples]
    usa_data = tax_profile_subset[taxon, usa_samples]
    venezuela_data = tax_profile_subset[taxon, venezuela_samples]
    
    wilcox_malawi_vs_usa[taxon] = wilcox.test(malawi_data, usa_data)$p.value
    wilcox_malawi_vs_venezuela[taxon] = wilcox.test(malawi_data, venezuela_data)$p.value
    wilcox_usa_vs_venezuela[taxon] = wilcox.test(usa_data, venezuela_data)$p.value
  }
  
  # Applying multiple testing correction using the FDR method.
  wilcox_malawi_vs_usa = p.adjust(wilcox_malawi_vs_usa, method = "fdr")
  wilcox_malawi_vs_venezuela = p.adjust(wilcox_malawi_vs_venezuela, method = "fdr")
  wilcox_usa_vs_venezuela = p.adjust(wilcox_usa_vs_venezuela, method = "fdr")
  
  # Returning data s a dataframe.
  differential_test = data.frame("taxon" = names(wilcox_malawi_vs_usa),
                                 "Mal_vs_Usa" = wilcox_malawi_vs_usa,
                                 "Mal_vs_Ven" = wilcox_malawi_vs_venezuela,
                                 "Usa_vs_Ven" = wilcox_usa_vs_venezuela)
  
  return(differential_test)
  

}

# As an example, let's run this function on species level.
results_species <- perform_differential_abundance_analysis(gut_profiles_rarefied, otu_taxonomy_table$species)

# Clades with assigned taxonomy can be easier to interpret when, for example, discussing the results with
# someone trained in classical microbiology who may be interested in pursuing a specific microorganism for follow-up.
# Its is also more likely that we would have some phenotypic and genomic data for these taxa, however, depending on
# how the species has been defined and how large is the within-species variation, this information may be only partially
# transferable to the taxon identified in the study.


# ---- PROJECT TASK 5 ----

# Aggregating data on species level.
gut_profiles_species = apply(gut_profiles_rarefied, 2, function(sample) {
    tapply(sample, otu_taxonomy_table$species, sum)
  })
# Removing the unassigned species.
gut_profiles_species = gut_profiles_species[row.names(gut_profiles_species) != "s__", ]

# We take 10% of samples from each country for the test set.
set.seed(0)
train_proportion <- 0.9
train_indices_malawi <- sample(1:length(malawi_samples), size = floor(train_proportion*length(malawi_samples)))
train_indices_venezuela <- sample(1:length(venezuela_samples), size = floor(train_proportion*length(venezuela_samples)))
train_indices_usa <- sample(1:length(usa_samples), size = floor(train_proportion*length(usa_samples)))

# Classifier to distinguish samples between Malawi and USA.
# Create training and test sets for OTU data and labels.
train_malawi_vs_usa <- gut_profiles_species[ , c(malawi_samples[train_indices_malawi], usa_samples[train_indices_usa])]
test_malawi_vs_usa  <- gut_profiles_species[ , c(malawi_samples[-train_indices_malawi], usa_samples[-train_indices_usa])]

train_malawi_vs_usa_labels <- c(rep("Malawi", length(train_indices_malawi)), rep("USA", length(train_indices_usa)))
test_malawi_vs_usa_labels <- c(rep("Malawi", length(malawi_samples[-train_indices_malawi])), rep("USA", length(usa_samples[-train_indices_usa])))

# Fit the model using training data
classifier_malawi_vs_usa <- randomForest(x = t(train_malawi_vs_usa), y = as.factor(train_malawi_vs_usa_labels))
classifier_malawi_vs_usa
#        Malawi USA class.error
# Malawi    101   1 0.009803922
# USA         0 282 0.000000000
# Extracting the importance of species from the classifier.
importance_malawi_vs_usa <- data.frame(Feature = rownames(classifier_malawi_vs_usa$importance), classifier_malawi_vs_usa$importance)

# Classifier to distinguish samples between Venezuela and USA.
# Create training and test sets for OTU data and labels.
train_venezuela_vs_usa <- gut_profiles_species[ , c(venezuela_samples[train_indices_venezuela], usa_samples[train_indices_usa])]
test_venezuela_vs_usa  <- gut_profiles_species[ , c(venezuela_samples[-train_indices_venezuela], usa_samples[-train_indices_usa])]

train_venezuela_vs_usa_labels <- c(rep("Venezuela", length(train_indices_venezuela)), rep("USA", length(train_indices_usa)))
test_venezuela_vs_usa_labels <- c(rep("Venezuela", length(venezuela_samples[-train_indices_venezuela])), rep("USA", length(usa_samples[-train_indices_usa])))

# Fit the model using training data
classifier_venezuela_vs_usa <- randomForest(x = t(train_venezuela_vs_usa), y = as.factor(train_venezuela_vs_usa_labels))
classifier_venezuela_vs_usa
# Confusion matrix:
#USA Venezuela class.error
#USA       282         0  0.00000000
#Venezuela   7        83  0.07777778
# Extracting the importance of species from the classifier.
importance_venezuela_vs_usa <- data.frame(Feature = rownames(classifier_venezuela_vs_usa$importance), classifier_venezuela_vs_usa$importance)

# Classifier to distinguish samples between Venezuela and Malawi.
# Create training and test sets for OTU data and labels.
train_venezuela_vs_malawi <- gut_profiles_species[ , c(venezuela_samples[train_indices_venezuela], malawi_samples[train_indices_malawi])]
test_venezuela_vs_malawi  <- gut_profiles_species[ , c(venezuela_samples[-train_indices_venezuela], malawi_samples[-train_indices_malawi])]

train_venezuela_vs_malawi_labels <- c(rep("Venezuela", length(train_indices_venezuela)), rep("Malawi", length(train_indices_malawi)))
test_venezuela_vs_malawi_labels <- c(rep("Venezuela", length(venezuela_samples[-train_indices_venezuela])), rep("Malawi", length(malawi_samples[-train_indices_malawi])))

# Fit the model using training data
classifier_venezuela_vs_malawi <- randomForest(x = t(train_venezuela_vs_malawi), y = as.factor(train_venezuela_vs_malawi_labels))
classifier_venezuela_vs_malawi
#Confusion matrix:
#  Malawi Venezuela class.error
#Malawi        99         3  0.02941176
#Venezuela     15        75  0.16666667
# Extracting the importance of species from the classifier.
importance_venezuela_vs_malawi <- data.frame(Feature = rownames(classifier_venezuela_vs_malawi$importance), classifier_venezuela_vs_malawi$importance)


# ==== Question 5.1 ====

# Performing the predictions on the test data using 'predict' function to output probability of classification.
malawi_vs_usa_output <- as.data.frame(predict(classifier_malawi_vs_usa, t(test_malawi_vs_usa), type = "prob"))
# Assigning predicted label based on the country with higher probability of classification.
malawi_vs_usa_output$predicted <- names(malawi_vs_usa_output)[1:2][apply(malawi_vs_usa_output[,1:2], 1, which.max)]
malawi_vs_usa_output$observed <- test_malawi_vs_usa_labels

# Confusion Matrix
table(malawi_vs_usa_output[,3:4])
#            observed
# predicted  Malawi USA
#    Malawi      12   0
#       USA       0  32

# Plotting the ROC curve.
malawi_vs_usa_prediction <- prediction(malawi_vs_usa_output[, 2], as.factor(test_malawi_vs_usa_labels))
roc_malawi_vs_us = performance(malawi_vs_usa_prediction, measure = "tpr", x.measure = "fpr")
plot(roc_malawi_vs_us)
abline(a = 0, b = 1)


# Performing the predictions on the test data using 'predict' function to output probability of classification.
venezuela_vs_usa_output <- as.data.frame(predict(classifier_venezuela_vs_usa, t(test_venezuela_vs_usa), type = "prob"))
# Assigning predicted label based on the country with higher probability of classification.
venezuela_vs_usa_output$predicted <- names(venezuela_vs_usa_output)[1:2][apply(venezuela_vs_usa_output[,1:2], 1, which.max)]
venezuela_vs_usa_output$observed <- test_venezuela_vs_usa_labels

# Confusion Matrix
table(venezuela_vs_usa_output[,3:4])
#            observed
# predicted  Venezuela USA
# Venezuela         32   1
#       USA          0   9

# Plotting the ROC curve.
venezuela_vs_usa_prediction <- prediction(venezuela_vs_usa_output[, 2], as.factor(test_venezuela_vs_usa_labels))
roc_venezuela_vs_us = performance(venezuela_vs_usa_prediction, measure = "tpr", x.measure = "fpr")
plot(roc_venezuela_vs_us)
abline(a = 0, b = 1)


# Performing the predictions on the test data using 'predict' function to output probability of classification.
venezuela_vs_malawi_output <- as.data.frame(predict(classifier_venezuela_vs_malawi, t(test_venezuela_vs_malawi), type = "prob"))
# Assigning predicted label based on the country with higher probability of classification.
venezuela_vs_malawi_output$predicted <- names(venezuela_vs_malawi_output)[1:2][apply(venezuela_vs_malawi_output[,1:2], 1, which.max)]
venezuela_vs_malawi_output$observed <- test_venezuela_vs_malawi_labels

# Confusion Matrix
table(venezuela_vs_malawi_output[,3:4])
#            observed
# predicted  Venezuela Malawi
# Venezuela         12      1
#    Malawi          0      9

# Plotting the ROC curve.
venezuela_vs_malawi_prediction <- prediction(venezuela_vs_malawi_output[, 2], as.factor(test_venezuela_vs_malawi_labels))
roc_venezuela_vs_us = performance(venezuela_vs_malawi_prediction, measure = "tpr", x.measure = "fpr")
plot(roc_venezuela_vs_us)
abline(a = 0, b = 1)
performance(venezuela_vs_malawi_prediction, measure = "auc")@y.values[[1]]
# [1] 0.975

# We see that all three classifiers can successfully distinguish samples originating from populations in different
# countries, correctly predicting either all samples within the test dataset or all but one.

# ==== Question 5.2 ====

# Displaying the 10 most important species in each classifier.
head(importance_malawi_vs_usa[order(importance_malawi_vs_usa$MeanDecreaseGini, decreasing = TRUE), ], n = 10)
head(importance_venezuela_vs_usa[order(importance_venezuela_vs_usa$MeanDecreaseGini, decreasing = TRUE), ], n = 10)
head(importance_venezuela_vs_malawi[order(importance_venezuela_vs_malawi$MeanDecreaseGini, decreasing = TRUE), ], n = 10)

# You will notice there's quite a bit overlap in the species that distinguish samples originating from non-westernized
# countries as opposed to samples from the USA.