# Shell command, not Python: run it in a terminal (or prefix it with "!" in a
# Jupyter cell) to copy the tutorial material into the current directory.
# cp -r /nfs/nas22/fs2202/biol_micro_teaching/551-1119-00L-2024/s07_hands_on .

# Toy input for the ORF finder: read in triplets from the first base it is
# ATG CGA TAC GCT TGA, i.e. a start codon, three non-stop codons and a stop.
sample_dna = "ATGCGATACGCTTGA"

def find_orfs(dna_sequence):
    """
    Find open reading frames (ORFs) on the forward strand of a DNA sequence.

    This function scans the sequence for every ATG, then continues
    in triplets until it finds the first in-frame stop codon
    (TAA, TAG or TGA). It then extracts and saves each ORF.

    Args:
        dna_sequence: DNA as a string, or any object that ``str()`` turns
            into one. Matching is case-insensitive.

    Returns:
        A list of ORF strings in upper case, start and stop codon included,
        ordered by start position. An ATG with no in-frame stop codon before
        the end of the sequence contributes nothing. An ATG located inside
        another ORF is reported as its own (shorter) ORF.
    """
    start_codon = "ATG"
    stop_codons = {"TAA", "TAG", "TGA"}
    orfs = []

    sequence = str(dna_sequence).upper()
    last_codon_start = len(sequence) - 3  # last index where a full codon fits

    start = sequence.find(start_codon)
    while start != -1:
        # Walk codon by codon in the reading frame set by this ATG.
        for pos in range(start + 3, last_codon_start + 1, 3):
            if sequence[pos:pos + 3] in stop_codons:
                orfs.append(sequence[start:pos + 3])
                break
        # Resume one base further on so overlapping/nested ATGs are seen too.
        start = sequence.find(start_codon, start + 1)

    return orfs

# Test the ORF finder with the sample DNA: run it on sample_dna and print
# every returned ORF on its own line under a header.
orfs = find_orfs(sample_dna)
print("ORFs Found:")
for orf in orfs:
    print(orf)

from Bio.Seq import Seq

# Create a Seq object
# NOTE(review): this literal is 10 nt long, i.e. not a multiple of 3, so the
# last base is a partial codon -- Biopython presumably warns about that when
# translating; confirm this is intended for the demo.
dna_seq = Seq("ATGCGTCTAA")

# Translate the DNA sequence to a protein
protein_seq = dna_seq.translate()
print(f"Protein Sequence: {protein_seq}")

from Bio import SeqIO

# Load the sequence from a FASTA file
# NOTE(review): SeqIO.read presumably requires the file to contain exactly
# one record -- confirm test_genome.fna is a single-sequence FASTA.
with open("./data/test_genome.fna") as file:
    sequence_record = SeqIO.read(file, "fasta")

# Display the sequence and its metadata
# (the last print writes the complete sequence to the output)
print(f"ID: {sequence_record.id}")
print(f"Description: {sequence_record.description}")
print(f"Sequence: {sequence_record.seq}")

from Bio.Seq import Seq

def find_and_translate_orfs(dna_sequence):
    """
    Find forward-strand ORFs in a DNA sequence and translate each to protein.

    The sequence is scanned for every ATG; from each one the function reads
    in triplets until the first in-frame stop codon (TAA, TAG or TGA). The
    stretch from the ATG up to, but not including, that stop codon is
    translated with Biopython's ``Seq.translate``.

    Args:
        dna_sequence: DNA as a string or a Biopython ``Seq``. Matching is
            case-insensitive.

    Returns:
        A list of protein strings, one per ORF, ordered by ORF start
        position. An ATG with no in-frame stop codon before the end of the
        sequence contributes nothing. An ATG located inside another ORF
        yields its own (shorter) protein.
    """
    start_codon = "ATG"
    stop_codons = {"TAA", "TAG", "TGA"}
    orf_proteins = []

    sequence = str(dna_sequence).upper()
    last_codon_start = len(sequence) - 3  # last index where a full codon fits

    start = sequence.find(start_codon)
    while start != -1:
        # Walk codon by codon in the reading frame set by this ATG.
        for pos in range(start + 3, last_codon_start + 1, 3):
            if sequence[pos:pos + 3] in stop_codons:
                # The stop codon is left out, so the slice is a whole number
                # of codons and the protein carries no trailing '*'.
                coding_part = Seq(sequence[start:pos])
                orf_proteins.append(str(coding_part.translate()))
                break
        # Resume one base further on so overlapping/nested ATGs are seen too.
        start = sequence.find(start_codon, start + 1)

    return orf_proteins

# Test with a longer DNA sequence: the record read from the genome FASTA
# file (sequence_record), printing one translated protein per line.
proteins = find_and_translate_orfs(sequence_record.seq)
print("Translated Proteins from ORFs:")
for protein in proteins:
    print(protein)

# Install Biopython if needed: !pip install biopython
from Bio.Seq import Seq
from Bio import SeqIO

# Sample DNA sequence
# NOTE(review): 20 nt long, i.e. not a multiple of 3 -- translate() will
# presumably warn about the trailing partial codon; confirm this is intended.
dna_sequence = Seq("ATGCTAGCTAGCTCGTAGCT")
print("Sequence:", dna_sequence)
print("Reverse Complement:", dna_sequence.reverse_complement())
print("Protein Translation:", dna_sequence.translate())

# Load a FASTA file (replace with actual file path if needed)
# SeqIO.parse is iterated record by record; only the ID and the sequence
# length of each record are printed.
for record in SeqIO.parse("./data/test_contigs.fna", "fasta"):
    print(f"ID: {record.id}, Length: {len(record.seq)}")

import numpy as np

# Define the target sequence
target_motif = "AGGAGG"

# Define PWM for a motif of 6 bases (length of AGGAGG)
# Rows: A, C, G, T. Columns correspond to each position in "AGGAGG"
# NOTE(review): positions 4 and 5 give probability 1 to 'T' and 'A', while
# "AGGAGG" has 'A' and 'G' there -- confirm this matrix is the intended one.
pwm = np.array([
    [1, 0.25, 0.5, 0, 1, 0],  # Probability of 'A' at each position
    [0, 0.25, 0  , 0, 0, 0],  # Probability of 'C' at each position
    [0, 0.25, 0.5, 0, 0, 1],  # Probability of 'G' at each position
    [0, 0.25, 0  , 1, 0, 0]   # Probability of 'T' at each position
], dtype=float)

# To handle probabilities in a real PWM, add a small pseudocount to avoid zeros
# (a single zero would force the product score of a whole window to 0).
# The pseudocount is added BEFORE the column sums are taken, so that every
# column still sums to exactly 1 after normalisation.
pwm = pwm + 0.01
pwm = pwm / pwm.sum(axis=0)  # Normalize each column

def calculate_pwm_score(sequence, pwm):
    """
    Score every window of a sequence against a position weight matrix.

    A window is as wide as the matrix has columns. Its score is the product,
    over the window positions, of the matrix entry for the base found there
    (rows are A, C, G, T in that order). Characters other than the upper-case
    letters A, C, G, T leave the running product untouched.

    Args:
        sequence: Sequence to scan, indexable character by character.
        pwm: 2-D array with one row per base (A, C, G, T) and one column
            per motif position.

    Returns:
        A list with one score per window start, left to right; empty when
        the sequence is shorter than the motif.
    """
    row_of_base = {'A': 0, 'C': 1, 'G': 2, 'T': 3}
    motif_length = pwm.shape[1]

    window_scores = []
    for start in range(len(sequence) - motif_length + 1):
        product = 1.0
        for column, base in enumerate(sequence[start:start + motif_length]):
            row = row_of_base.get(base)
            if row is not None:
                product *= pwm[row, column]
        window_scores.append(product)
    return window_scores

# Define a test sequence
test_sequence = "TTTAGGAGGCTAGGAGGATGGAGGTTAGGAGGGT"

# Calculate the scores for each window
# (one score per window start; the window is as wide as pwm has columns)
scores = calculate_pwm_score(test_sequence, pwm)

import matplotlib.pyplot as plt

# Your code here

import pandas as pd

# Read the file with pandas and store it in blast_results variable

# NOTE(review): exercise placeholder. When this file is executed top to
# bottom, blast_results is not assigned until the read_csv call below, so
# this bare reference raises NameError.
blast_results

# Tab-separated table without a header row; the column names are assigned
# by hand right after loading.
# NOTE(review): the names look like the default BLAST tabular (outfmt 6)
# fields -- confirm against how the search was run.
blast_results = pd.read_csv('./data/omd2_candidate_blast.out', sep='\t', header=None)
blast_results.columns = ["qseqid", "sseqid", "pident", "length", "mismatch", 
                        "gapopen", "qstart", "qend", "sstart", "send", 
                        "evalue", "bitscore"]

blast_results

import pandas as pd
# Load two tables from the s0506_python session's data directory, using the
# first column of each as the index: the gzip-compressed genome summary CSV
# (df) and the tab-separated metadata (md). Both names are rebound further
# down in this file.
df = pd.read_csv('../s0506_python/data/genome_summary.csv.gz', compression='gzip', index_col=0)
md = pd.read_csv('../s0506_python/data/metadata.tsv', sep='\t', index_col=0)

df

import pandas as pd
import pyhmmer
import glob
import pyhmmer.easel as easel
import collections

def retrieve_hits(seqs_path, hmms, fields=("query", "subject", "bitscore", "evalue")):
    """
    Search protein sequences with a set of HMMs and tabulate the included hits.

    Args:
        seqs_path: Path to the protein sequence file, opened with
            ``pyhmmer.easel.SequenceFile`` using the amino-acid alphabet.
        hmms: HMMs passed to ``pyhmmer.hmmsearch`` as queries.
        fields: Column names of the result table. Exactly four names are
            needed, because every row is built from four values, in this
            order: ``hit.name``, the ``query_name`` of the hit list the hit
            belongs to, ``hit.score`` and ``hit.evalue``.

    Returns:
        A pandas DataFrame with one row per hit whose ``included`` flag is
        set, the columns named by ``fields`` and rows numbered from 0.
        Without any such hit the frame is empty but keeps the columns.
    """
    # Work on a private list: the default is an immutable tuple so that no
    # mutable object is shared between calls.
    fields = list(fields)

    # Load cluster proteins
    with pyhmmer.easel.SequenceFile(seqs_path, digital=True, alphabet=easel.Alphabet.amino()) as seqs_file:
        proteins = seqs_file.read_block()

    # Run HMMs
    Result = collections.namedtuple("Result", fields)

    results = []
    for hits in pyhmmer.hmmsearch(hmms, proteins, E=1):
        cog = hits.query_name.decode()
        results.extend(
            Result(hit.name.decode(), cog, hit.score, hit.evalue)
            for hit in hits
            if hit.included
        )

    # Results --> df (rows keyed 0..n-1, in the order the hits were collected)
    rows = {row_number: list(result) for row_number, result in enumerate(results)}
    return pd.DataFrame.from_dict(rows, orient='index', columns=fields)

# Find and load a collection of HMMs
# Every *.hmm file in the nifHDK directory is opened and contributes one
# profile to the HMMS list.
HMMS = []
for fil in glob.glob('./data/hmms_nifHDK/*.hmm'):
    with pyhmmer.plan7.HMMFile(fil) as hmm_file:
        # NOTE(review): a single read() per file -- presumably each file
        # holds exactly one profile; confirm.
        HMMS.append(hmm_file.read())
HMMS

# Search the candidate genome's proteins with the loaded HMMs and keep the
# hit table in `results`.
results = retrieve_hits('./data/omd2_candidate.faa', HMMS)
results

# Same search against the cyanobact.faa proteins; the table is not stored.
retrieve_hits('./data/cyanobact.faa', HMMS)

# antiSMASH table for the candidate genome (tab-separated); show the first
# 10 rows.
antismash = pd.read_csv('./data/omd2_candidate-antismash.tsv', sep='\t')
antismash.head(10)

# Merged BGC table (gzip-compressed CSV). 'biosample' is the second
# '_'-separated token of each GENOME value. The metadata table is loaded
# with its first column as the index.
omd_bgcs = pd.read_csv('./data/bgcs.csv.gz', compression='gzip')
omd_bgcs['biosample'] = [i.split('_')[1] for i in omd_bgcs['GENOME']]
md = pd.read_csv('./data/metadata.tsv', sep='\t', index_col=0)

omd_bgcs

# EggNOG output table (tab-separated).
# NOTE(review): absolute path into one user's home directory, unlike the
# relative ./data/ paths used for the other inputs -- presumably this should
# be ./data/eggnog_output.tsv; confirm the file exists there.
egg = pd.read_csv('/nfs/home/smiravet/bc/bc2024/s07_hands_on/data/eggnog_output.tsv', sep='\t')

from Bio import SeqIO
import pandas as pd
import peptides

# Define a function to load sequences from a FASTA file
def load_sequences(fasta_file):
    """
    Read a FASTA file into a dictionary.

    Args:
        fasta_file: FASTA input handed to ``SeqIO.parse``.

    Returns:
        A dict mapping each record's ``id`` to its sequence as a plain
        string, in file order. If an ID occurs more than once, the last
        record with that ID wins.
    """
    return {
        entry.id: str(entry.seq)
        for entry in SeqIO.parse(fasta_file, "fasta")
    }

# Load sequences from the fasta file
# NOTE(review): IDs containing 'NAMP' are labelled as negatives further down,
# so this set presumably holds both AMP and non-AMP sequences -- confirm.
training_seqs = load_sequences("./data/amps/training_set.faa")  # Training sequences (ID -> sequence)
training_seqs

# The 20 amino-acid one-letter codes, in the order used for the per-residue
# count_* and freq_* features.
aas = ['A', 'R', 'N', 'D', 'C', 'Q', 'E', 'G', 'H', 'I', 'L', 'K', 'M', 'F', 'P', 'S', 'T', 'W', 'Y', 'V']

# Names of the additional features, in the exact order predict_additional
# produces their values.
cols = ['aliphatic_index', 'boman', 'charge'] + [f'count_{i}' for i in aas] + [f'freq_{i}' for i in aas]
cols += ['hydrophobic_moment', 'hydrophobicity', 'instability_index', 'isoelectric_point', 'mass_shift']
cols += ['molecular_weight', 'mz']


def predict_additional(peptide):
    """
    Collect the additional per-peptide features named in ``cols``.

    Args:
        peptide: Object offering the methods called below
            (``aliphatic_index``, ``boman``, ``charge``, ``counts``,
            ``frequencies``, ``hydrophobic_moment``, ``hydrophobicity``,
            ``instability_index``, ``isoelectric_point``, ``mass_shift``,
            ``molecular_weight``, ``mz``); ``counts`` and ``frequencies``
            must return mappings keyed by amino-acid letter.

    Returns:
        A dict mapping each name in ``cols`` to its value. A ``count_*`` or
        ``freq_*`` entry is ``None`` when the corresponding mapping has no
        key for that amino acid.
    """
    # Query each mapping once instead of once per amino acid.
    counts = peptide.counts()
    frequencies = peptide.frequencies()

    values = [peptide.aliphatic_index(), peptide.boman(), peptide.charge()]
    values += [counts.get(aa) for aa in aas]
    values += [frequencies.get(aa) for aa in aas]
    values += [
        peptide.hydrophobic_moment(),
        peptide.hydrophobicity(),
        peptide.instability_index(),
        peptide.isoelectric_point(),
        peptide.mass_shift(),
    ]
    values += [peptide.molecular_weight(), peptide.mz()]
    return dict(zip(cols, values))

def featurize_seq(seq):
    """
    Build the feature dictionary for one peptide sequence.

    Args:
        seq: Amino-acid sequence handed to ``peptides.Peptide``.

    Returns:
        The dict returned by the peptide's ``descriptors()``, extended with
        (and, on a name clash, overridden by) the entries from
        ``predict_additional``.
    """
    pep = peptides.Peptide(seq)
    combined = pep.descriptors()
    for feature_name, feature_value in predict_additional(pep).items():
        combined[feature_name] = feature_value
    return combined

# Try the featurizer on a single peptide sequence.
featurize_seq('MGMRMMFTVFLLVVLATTVVSIPSDRASDGRNAVVHERAPELVVTATTNCCGYNPMTICPPCMCTYSCPPKRKPGRRND')

# Repeats the earlier load of the same training file.
training_seqs = load_sequences("./data/amps/training_set.faa")  # Training sequences (ID -> sequence)

# Predict features for all the training sequences
# (one row per sequence ID, one column per feature)
df = pd.DataFrame.from_dict({k:featurize_seq(v) for k, v in training_seqs.items()}, orient='index')
df

# Add a label +/-
# IDs containing 'NAMP' get label 0; every other ID gets label 1.
df['label'] = [0 if 'NAMP' in i else 1 for i in df.index]
df

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

# Separate features and labels
X = df.drop(columns=["label"])
y = df["label"]

# Split into training and validation sets
# (30% of the rows are held out; the fixed random_state makes the split
# repeatable)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3, random_state=42)

# Initialize and train the Random Forest classifier
# (all settings left at their defaults apart from the fixed random_state)
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)

print("Random Forest model training complete.")

from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt

# Predict probabilities for the validation set
y_probs = rf_model.predict_proba(X_val)[:, 1]  # Probability of the positive class

# Calculate ROC curve
# (the thresholds returned by roc_curve are not needed and are discarded)
fpr, tpr, _ = roc_curve(y_val, y_probs)
roc_auc = auc(fpr, tpr)

# Plot the ROC curve
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (area = {roc_auc:.2f})')
# Dashed diagonal from (0, 0) to (1, 1) as a visual reference line.
plt.plot([0, 1], [0, 1], color='gray', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc="lower right")
plt.show()

# Extract features from the test sequences
to_test = load_sequences("./data/amps/test_set.faa")  # Test sequences (ID -> sequence)

# Compute features for all the test sequences, with the same featurizer
# that was used for the training set
df2 = pd.DataFrame.from_dict({k:featurize_seq(v) for k, v in to_test.items()}, orient='index')

# Predict the probability of each test sequence being positive
test_probs = rf_model.predict_proba(df2)[:, 1]
test_predictions = rf_model.predict(df2)

# Display predictions
# Iterating the dict yields the record IDs, so "Sequence:" prints the ID.
# Position i matches row i of df2 because df2 was built from the same dict
# in the same order.
for i, seq in enumerate(to_test):
    print(f"Sequence: {seq}")
    print(f"Prediction: {'Positive' if test_predictions[i] == 1 else 'Negative'}")
    print(f"Probability of being AMP: {test_probs[i]:.2f}\n")

import matplotlib.pyplot as plt
import numpy as np

# Get feature importance from the model
feature_importances = rf_model.feature_importances_
feature_names = X.columns

# Sort features by importance
# (argsort is ascending, so the index array is reversed to put the most
# important feature first; the first 10 positions are kept)
sorted_idx = np.argsort(feature_importances)[::-1]
top_features = sorted_idx[:10]
top_feature_names = [feature_names[i] for i in top_features]
top_feature_importances = feature_importances[top_features]

# Plot the top 10 important features
# (both sequences are reversed before being passed to barh)
plt.figure(figsize=(10, 6))
plt.barh(top_feature_names[::-1], top_feature_importances[::-1], color='skyblue')
plt.xlabel("Feature Importance Score")
plt.title("Top 10 Most Important Features")
plt.show()

import seaborn as sns

# Subset data to include only top 10 features and labels
# .copy() makes this an independent table, so adding the 'label' column does
# not write into a slice of df2 (which pandas flags with
# SettingWithCopyWarning). The labels are the model's predictions for the
# test set.
top_features_data = df2[top_feature_names].copy()
top_features_data['label'] = test_predictions

# Plot each feature as a boxplot to compare distributions in positive vs. negative sets
# (5 x 2 grid: one panel per top feature)
plt.figure(figsize=(12, 10))
for i, feature in enumerate(top_feature_names, 1):
    plt.subplot(5, 2, i)
    sns.boxplot(x='label', y=feature, data=top_features_data, palette=["#FFA07A", "#8FBC8F"])
    plt.title(f"Distribution of {feature} by Class")
    plt.xlabel("Class (0 = Negative, 1 = Positive)")
    plt.ylabel(feature)

plt.tight_layout()
plt.show()

# Keep only the label and the single most important feature for a two-group
# comparison.
subdf = top_features_data[['label', top_feature_names[0]]]
subdf

from scipy.stats import mannwhitneyu

# Mann-Whitney U test on the top feature: rows with label 1 versus rows with
# label 0. Only the p-value is displayed.
U1, p = mannwhitneyu(subdf[subdf['label']==1][top_feature_names[0]], subdf[subdf['label']==0][top_feature_names[0]], method="exact")
p

# Load the OMD tables for the last section, each with its first column as
# the index. NOTE: omd_bgcs and md are rebound here and replace the tables
# of the same name loaded earlier in this file.
omd_amps = pd.read_csv('./data/amps/smorfs_summary.tsv.gz', compression='gzip', sep='\t', index_col=0)
omd_bgcs = pd.read_csv('./data/amps/morfs_bgcs_amps.tsv.gz', compression='gzip', sep='\t', index_col=0)
md = pd.read_csv('../s0506_python/data/metadata.tsv', sep='\t', index_col=0)

# Hands-on session on genome mining

# Copying this tutorial:

# 1. Gene annotation

# 1.1. Working with DNA Sequences in Python

# 1.2. Basic ORF Finder

# 1.3. Using Biopython for Sequence Handling

# 1.4. Loading a genome with Biopython

# 1.5. Putting It All Together

# 1.6. Using Biopython for multifasta Handling

# 2. Alignment-based approaches

# 2.1. Introduction to Sequence Homology and Identity

# 2.2. Position weight matrices

# 2.3. Running BLAST

# 2.3.1 Running BLAST locally

# 2.4. Parsing BLAST Results and Checking Hits with Pandas

# 3. HMMs-based approaches

# 3.2. Working with antismash outputs

# 3.3. Working with merged antismash outputs

# 3.4. Working with EggNOG outputs

# 4. Feature based approaches

# 4.1. Defining a Random Forest Classifier to identify AMPs

# 4.2. Test the Model on New Data

# 4.3. Exploring importances from a RF model

# Additional: statistical tests

# 4.4. OMD AMPs