##### 

### Concepts in Bioinformatics - Ocean project

#####

### Load dataset
load(url("https://sunagawalab.ethz.ch/share/teaching_materials/MGPROJ_TO_data.Rdata"))

### Load libraries

library(vegan)
library(ggplot2)

### Assess the structure of the gene profile and metadata
TO_gene_profile[1:3,1:3]
TO_gene_meta[1:3,1:3]

#####

### Project II - Task I - Data analysis

#####

### Q1: How many genes were identified in the dataset?
# We can use 'dim' to determine the dimensions of our dataframe. With the 
# number of rows representing the number of genes captured
dim(TO_gene_profile)

# Answer: 10,914

### Q2: How many genes are found in all samples? (i.e. 100% prevalence)
# Using the same approach as in Metagenomics I, we can use 
# rowSums to identify the samples where a gene had a relative abundance > 0
# This gives us a boolean result, i.e. TRUE/FALSE
# We then divide the number of 'TRUEs' by the number of columns present and 
# multiple by 100 to place into a percentage. Telling us the proportion of 
# samples that each gene was detected
gene_prevalence = (rowSums(TO_gene_profile > 0) / ncol(TO_gene_profile)) * 100

# Visualise prevalence as histogram
hist(gene_prevalence, xlab="Gene prevalence (%)")

# With our above created 'gene_prevalence' dataframe, it is then easy to assess
# how many genes had a 100% prevalence, by checking for values = 100 and 
# counting the length of the result
genes_100_prev = names(which(gene_prevalence == 100))
length(genes_100_prev)

# Answer: 4870

### Q3: Is there a difference in the proportion of genes captured by the 
### KEGG database in the two depth layers?
# Firstly, we need to calculate the summed relative abundance of genes in each
# sample. We can do this with 'colSums' and use the data.frame to place this
# result into a new dataframe along with the names of each sample
gene_profile_sums <- data.frame(summed_abundance = colSums(TO_gene_profile),
                                Sample_name = names(TO_gene_profile))

# We then can use the 'merge' function to combine this with the metadata table
gene_profile_sums_with_meta <- merge(gene_profile_sums, TO_gene_meta)

# Use the aggregate function to find out the mean proportion captured in the two depth layers
aggregate(summed_abundance~Depth_layer, data=gene_profile_sums_with_meta, FUN=mean)

# Answer: Yes! On average, 50.5% of genes are captured by the KEGG database in Mesopelagic samples compared to 52.7% in surface samples

# We could also visualise this difference in a boxplot
ggplot(gene_profile_sums_with_meta, aes(x=Depth_layer, y=summed_abundance)) + 
  geom_boxplot() +
  labs(y = "Relative abundance (%)") +
  theme_bw() + 
  theme(axis.text.x = element_text(size = 12),
        axis.text.y = element_text(size = 10),
        axis.title.y = element_text(size = 14),
        axis.title.x = element_blank())

# As a bonus, you could also statistically test this difference using a 
# t-test/wilcoxon test (depending upon if the data is normally or non-normally distributed)
# For example:
SRF_abundances = gene_profile_sums_with_meta[gene_profile_sums_with_meta$Depth_layer == "SRF", ]$summed_abundance
MES_abundances = gene_profile_sums_with_meta[gene_profile_sums_with_meta$Depth_layer == "MES", ]$summed_abundance

wilcox.test(SRF_abundances,MES_abundances)

#####

### Project II - Task II - Beta-diversity and ordination

#####

### Task a) Calculate Bray-Curtis dissimilarities between all samples.
# Use the 'vegdist' function from the vegan package with method set to 'bray'
TO_gene_profile_bray = as.matrix(vegdist(t(TO_gene_profile),method = "bray"))

### Task b) Compare and visualise the Bray-Curtis dissimilarities of samples through a Principle Coordinates Analysis.
# Use the cmdscale function from the vegan package to calculate the PCoA
TO_gene_profile_bray_pcoa = cmdscale(TO_gene_profile_bray)

# Extract the sample coordinates along the PCoA axis 1 and 2
TO_gene_profile_bray_pcoa_df = data.frame(
  first = TO_gene_profile_bray_pcoa[,1],
  second = TO_gene_profile_bray_pcoa[,2],
  Sample = rownames(TO_gene_profile_bray))

# Combine this dataframe with the metadata table using 'merge'
TO_gene_profile_bray_pcoa_df_w_meta <- merge(TO_gene_profile_bray_pcoa_df, TO_gene_meta, by="Sample_name")

# Visualise the results using ggplot
ggplot(data=TO_gene_profile_bray_pcoa_df_w_meta,
       aes(x=first, y=second)) +
  geom_point(aes(colour = Depth_layer), size = 4)+
  xlab(paste0("PCoA1")) +
  ylab(paste0("PCoA2"))+
  theme_bw()

### ADVANCED: How do the Bray-Curtis dissimilarities between samples from the same depth compare to those between samples from different depths?
# Reformat Bray-Curtis distance matrix to long format dataframe using:
TO_gene_profile_bray_diss_df <- as.data.frame(as.table(TO_gene_profile_bray))

# Merge with metadata table to add in depth layer information for each sample. As you would of had to provide each sample column with a different name, you can use the 'by.x' and 'by.y' in merge to specify which columns you want to merge the dataset by. Below is an example (you will need to perform two merge steps):
tmp = merge(TO_gene_profile_bray_diss_df, TO_gene_meta, by.x = "Var1", by.y = "Sample_name")
TO_gene_profile_bray_diss_df_w_meta = merge(tmp, TO_gene_meta, by.x = "Var2", by.y = "Sample_name")

# Finally, you can introduce a column that indicates if the depth layer of the samples is the same or different using an 'ifelse' statement:
TO_gene_profile_bray_diss_df_w_meta$Category = ifelse(TO_gene_profile_bray_diss_df_w_meta$Depth_layer.x == TO_gene_profile_bray_diss_df_w_meta$Depth_layer.y, "Same_depth", "Different_depth")

# Statistically test the differences using a Wilcox test and visualise in a boxplot format using the code employed in Metagenomics II
wilcox.test(TO_gene_profile_bray_diss_df_w_meta[TO_gene_profile_bray_diss_df_w_meta$Category == "Same_depth", ]$Bray_dissimilarities, TO_gene_profile_bray_diss_df_w_meta[TO_gene_profile_bray_diss_df_w_meta$Category == "Different_depth", ]$Bray_dissimilarities)

ggplot(TO_gene_profile_bray_diss_df_w_meta, aes(x=Category, y=Bray_dissimilarities)) + 
  geom_boxplot() +
  labs(y = "Bray-Curtis dissimilarities") +
  theme_bw() + 
  theme(axis.text.x = element_text(size = 12),
        axis.text.y = element_text(size = 10),
        axis.title.y = element_text(size = 14),
        axis.title.x = element_blank())


#####

### Project II - Task II - Differential abundance testing

#####

### Task a) Filter the gene profile to retain genes that have >= 20% prevalence
# As we have already calculated the gene prevalence earlier, we can simply
# filter this by our chosen threshold (20%) and keep the names of the genes
# that meet our criteria.
sel_genes = names(which(gene_prevalence >= 20))
length(sel_genes)

# We can then use the list of names to filter the corresponding genes from the 
# relative abundance gene profile 
TO_gene_profile_subset <- as.matrix(TO_gene_profile[rownames(TO_gene_profile) %in% sel_genes, ])


### Task b) Visualise the distribution of genes using a histogram
hist(TO_gene_profile_subset, xlab = "Relative abundance (%)")

### Task c) Prepare the data for differential abundance testing. That is, generate a table for SRF abundances and MES abundances 
### and align the sample indices with those of the metadata table (see code from Metagenomics II).

# Subset metadata dataframe to samples derived from SRF and MES depth layers
SRF_samples <- TO_gene_meta[Depth_layer == "SRF"]
MES_samples <- TO_gene_meta[Depth_layer == "MES"]

# Identify those stations where samples were collected at both depths, as we want to remove spatial influence when we test for differential
# abundance patterns with depth. Then subset the metadata tables to include only samples from those stations
common_stations <- intersect(SRF_samples$Sample_station, MES_samples$Sample_station)
SRF_filtered <- SRF_samples[SRF_samples$Sample_station %in% common_stations, ]
MES_filtered <- MES_samples[MES_samples$Sample_station %in% common_stations, ]

# Now subset the gene profile to create one profile for SRF and one profile for MES samples
SRF_sample_cols <- colnames(TO_gene_profile_subset) %in% SRF_filtered$Sample_name
MES_sample_cols <- colnames(TO_gene_profile_subset) %in% MES_filtered$Sample_name
TO_gene_profile_subset_SRF <- as.matrix(TO_gene_profile_subset[, SRF_sample_cols])
TO_gene_profile_subset_MES <- as.matrix(TO_gene_profile_subset[, MES_sample_cols])

### Task c) Perform differential abundance testing using a paired Wilcox test and identify the five genes with the lowest p-values.
# Prepare an empty vector to store the results. As we have already created a list of genes earlier, we can use this to set the dimensions and names of the vector
wilcox_results = rep(NA,length(sel_genes))
names(wilcox_results) = sel_genes

# Loop through each gene and run a paired Wilcox test
for(gene in sel_genes){
  SRF = TO_gene_profile_subset_SRF[gene,]
  MES = TO_gene_profile_subset_MES[gene,]
  wilcox_results[gene] = wilcox.test(SRF,MES, paired=T)$p.value
}

### Q1: How many genes were identified as differentially abundant?
length(which(wilcox_results<0.05))
# Answer: 8115

### Task d) Apply Bonferroni multiple-testing correction and retain only those genes with a p-value < 0.05.
wilcox_results_p_adj_bon = p.adjust(wilcox_results, method = "bonferroni")

### Q2: How many genes had a p-value <0.05 after multiple testing correction?
length(which(wilcox_results_p_adj_bon<0.05))
# Answer: 4262

#####

### Project II - Task III - Explore results

#####

### Task a) Subset the gene profile to retain the genes that show a statistically significant differential abundance between depth layers.
# Extract genes with adjusted p-value < 0.05
genes_adjust_pvals = data.frame(p_val = wilcox_results_p_adj_bon, Gene=names(wilcox_results_p_adj_bon))
genes_signif_pvals = genes_adjust_pvals[genes_adjust_pvals$p_val < 0.05, ]

# Filter the relative abundance gene profile to only include selected genes
genes_signif_rel = data.frame(TO_gene_profile_subset[rownames(TO_gene_profile_subset) %in% genes_signif_pvals$Gene,], Gene = genes_signif_pvals$Gene)

### Task b) Determine the mean abundance of genes in SRF and MES depths. Then create a dataframe that combines the mean abundance information with the adjusted p-values 
### from the multiple-testing correction and the functional annotation of the genes (provided in TO_gene_annotations).
# Calculate the mean relative abundance in SRF and MES depth layers for each gene
SRF_mean_abundance = rowMeans(genes_signif_rel[, grep("SRF", colnames(genes_signif_rel))])
MES_mean_abundance = rowMeans(genes_signif_rel[, grep("MES", colnames(genes_signif_rel))])

# Combine the lists into a dataframe
genes_signif_rel_mean = data.frame(SRF = SRF_mean_abundance, MES = MES_mean_abundance, Gene = names(SRF_mean_abundance))

# Add the adjusted p-values to the dataframe
genes_signif_rel_mean_w_pal = merge(genes_signif_rel_mean, genes_signif_pvals, by="Gene")

# Merge with functional annotation information
genes_signif_rel_mean_w_pal_and_annot = merge(genes_signif_rel_mean_w_pal, TO_gene_annotations, by = 'Gene')

# Finally, you can introduce a column that indicates the depth layer that the gene was most abundant in using an 'ifelse':
genes_signif_rel_mean_w_pal_and_annot$Enriched_depth = ifelse(genes_signif_rel_mean_w_pal_and_annot$SRF > genes_signif_rel_mean_w_pal_and_annot$MES, "SRF_enriched", "MES_enriched")

### Q1: How many of the differentially abundant genes were enriched in each of the two depth layers?
length(genes_signif_rel_mean_w_pal_and_annot[genes_signif_rel_mean_w_pal_and_annot$Enriched_depth == "SRF_enriched", ]$Gene)
length(genes_signif_rel_mean_w_pal_and_annot[genes_signif_rel_mean_w_pal_and_annot$Enriched_depth == "MES_enriched", ]$Gene)
# Answer: 2168 were more abundant in SRF waters compared to 1230 in MES waters

### ADVANCED What are the metabolisms that have the largest number of differentially abundant genes associated with them in SRF and MES depths?
# For this, we can subset the dataframe to those that are enriched in a certain depth layer, then keep only the Metabolism_Name column and use the 'table' function to calculate the
# number of times each Metabolism_Name is identified.
tail(sort(table(genes_signif_rel_mean_w_pal_and_annot[genes_signif_rel_mean_w_pal_and_annot$Enriched_depth == "SRF_enriched", ]$Metabolism_Name)))
tail(sort(table(genes_signif_rel_mean_w_pal_and_annot[genes_signif_rel_mean_w_pal_and_annot$Enriched_depth == "MES_enriched", ]$Metabolism_Name)))
# Answer: 
# In SRF waters, enriched Metabolisms include: Transporters, Membrane traficking, Peptidases, DNA repair
# In MES waters, enriched Metabolisms include: Transporters, Secretion system, Two-component system 

### ADVANCED: Combine the relative abundances of genes in the gene profile based on the Metabolism they belong to and repeat the differential abundance analysis at the metabolism level
# Merge the TO_gene_profile with the TO_gene_annotations and remove all character columns except for Metabolism_Name
TO_gene_profile$Gene <- rownames(TO_gene_profile)
TO_gene_profile_w_annotations = merge(TO_gene_profile,TO_gene_annotations, by="Gene")
TO_gene_profile_w_annotations_subset = subset(TO_gene_profile_w_annotations, select=-c(Gene,Category_Name,Gene_description))

# You can sum up the relative abundances at the Metabolism_Name level using the aggregate function in R
metabolism_profile =  aggregate(. ~ Metabolism_Name, data=TO_gene_profile_w_annotations_subset, FUN=sum)

# After this, you will just need to move Metabolism_Name to rownames and remove the original column to mimic the original gene profile file