Introduction to Python Programming, Data Analysis and Data Visualization¶
The goal of this tutorial is to provide a brief introduction to Python programming and basic data exploratory analyses that can be performed with diverse packages such as pandas, matplotlib, seaborn or plotly.
The index of this notebook includes:
Notes on notebooks usage
You are working in a jupyter notebook. These documents let you both write executable code, for example in python, and document it using markdown, in separate cells.
Some useful shortcuts include:
- Shift+Enter : execute cell
- Esc, b : add a cell below
- Esc, a : add a cell above
- Esc, m : transform cell to markdown
- Esc, d : delete cell
Copying this tutorial:¶
Before starting, run this command from your home to copy the material:
cp -r /nfs/nas22/fs2202/biol_micro_teaching/551-1119-00L-2024/s0506_python .
1. Python Introduction¶
Python is a programming language that has been under development for over 25 years.
This chapter will not cover everything in Python. If you would like to learn more, consider the following resources:
Getting Started with Python:
- https://www.codecademy.com/learn/python
- http://docs.python-guide.org/en/latest/intro/learning/
- https://learnpythonthehardway.org/book/
- https://www.codementor.io/learn-python-online
- https://websitesetup.org/python-cheat-sheet/
Learning Python in notebooks:
The official Python Reference is handy to always have available.
1.1 Statements¶
Python is an imperative language based on statements. That is, programs in Python consist of lines composed of statements. A statement can be:
- a single expression
- an assignment
- a function call
- a function definition
- a statement; statement
1.1.1 Expressions¶
- Numbers
- integers
- floating-point
- complex numbers
- strings
- boolean values
- lists and dicts
1.1.1.1 Numbers¶
1 # This is a comment: 1 is a number
2
-3
1
2
3.14
print(type(1), type(3.14)) # type() tells you the class of the object
1.1.1.2 Strings¶
'ap"ple'
"apple"
1.1.1.3 Boolean Values¶
True
False
1.1.1.4 Iterables¶
Python has very useful iterable data structures built into the language:
- lists: [] --> mutable list of items
- tuples: (item, ...) --> read-only data structure (immutable)
- sets: set([item, ...]) --> like a tuple but keeping only unique items
- dictionaries (hash tables): {} --> Dictionaries can relate a set of unique keys with a value (this can be a list, a number, etc.)
a = [1, 2, 3] # The equals sign assigns the list to the variable 'a'
a
a[::2] # Lists can be accessed using square brackets and the index (it is 0-based!)
a.append(10) # append adds a new element to a list
a
a[0] = 10 # We can edit values in the list
a # No need to use print when working in a notebook
b = (1, 2, 3, 3, 3) # This is a tuple
b
c = set([1,2,3,3,3])
c
Exercise : What happens when you try to change an element in a tuple or a set?
c[0] = 1
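Running the line above raises an error. A minimal sketch that catches it for both container types (the exact error messages vary between Python versions):

```python
b = (1, 2, 3)   # a tuple is immutable
c = {1, 2, 3}   # a set has no indexing at all

try:
    b[0] = 10   # tuples do not support item assignment
except TypeError as err:
    print('tuple:', err)

try:
    c[0] = 1    # sets are not even subscriptable
except TypeError as err:
    print('set:', err)
```

In both cases Python raises a TypeError rather than silently ignoring the assignment.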
1.1.4.1 Dictionaries¶
A dictionary is a container that holds pairs of objects - keys and values.
capitals = {'France': 'Paris', 'England': 'London', 'Canada': 'Toronto'}
capitals['Canada']
capitals['Canada'] = 'Ottawa' # What happens here?
complement = {'A': 'T',
              'C': 'G',
              'T': 'A',
              'G': 'C'}
complement['A']
seq = 'AATTGCAT'
seq[0:3]
len(seq)
reverse_seq = seq[::-1]
reverse_seq
for letter in reverse_seq:
    print(f'{letter} -> {complement[letter]}')
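The loop above prints the complement of each base one at a time; joining the characters gives the reverse complement as a single string. A sketch, redefining `seq` and `complement` so it runs standalone:

```python
complement = {'A': 'T', 'C': 'G', 'T': 'A', 'G': 'C'}
seq = 'AATTGCAT'

# Reverse the sequence, look up each base, and join into one string
rev_comp = ''.join(complement[letter] for letter in seq[::-1])
print(rev_comp)  # ATGCAATT
```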
print(1)
print(2)
print(3)
1.1.2.2 Operators¶
There are two ways to call functions in Python:
- by pre-defined infix operator name
- by function name, followed by parentheses
Infix operator name:
1 + 2
Exercise : What are these operators doing?
# Other operators:
print(1 - 2)
print(1 * 2)
print(1 / 2)
print(4 % 2)
print(4 ** 2)
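One more operator worth knowing, not shown above, is floor division with `//`; a quick sketch of how it differs from true division `/`:

```python
print(7 / 2)    # true division: 3.5 (always a float)
print(7 // 2)   # floor division: 3
print(-7 // 2)  # -4: // floors toward negative infinity, it does not truncate
```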
# Variables can be updated in place
a = 1
a += 10
print(a)
b = 10
b *= 10
print(b)
Exercise : What are these functions doing?
# Other operations
a = [1,3,-3]
abs(a[2]), sum(a), len(a)
1.1.3 Special Values¶
None
1.1.4 Defining Functions¶
def plus(a_l, b):
    a = a_l[2]
    return a + b
plus([3, 4, 5], 4)
Exercise : What happens if you do not use return?
def plus(a, b):
    a + b
a = 1
plus(a, 4)
a
What happened? All functions return something, even if you don't specify it. If you don't specify a return value, then it will default to returning None.
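You can verify this by capturing the return value; a sketch using the same `plus` without a `return` statement:

```python
def plus(a, b):
    a + b  # the sum is computed, but never returned

result = plus(1, 4)
print(result)          # None
print(result is None)  # True
```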
"a" + 1
How to Read Python Error Messages
TypeError: can only concatenate str (not "int") to str
Above the error message is the "traceback", also called the "call stack". This is a representation of the sequence of procedure calls that led to the error. If a procedure call originated from code in a file, the filename is listed after the word "File" on each line. If it originated from a notebook cell, you will instead see something like "ipython-input-#-HEX".
1.2 Equality¶
1.2.1 ==¶
1 != 0
1.2.2 is¶
a = 1
[] is []
list() is list()
tuple() is tuple()
57663463467 == 57663463469
57663463467 != 57663463469
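The difference between `==` (same value) and `is` (same object in memory) shows up clearly with lists; a minimal sketch:

```python
a = [1, 2, 3]
b = [1, 2, 3]
c = a

print(a == b)  # True: same contents
print(a is b)  # False: two distinct list objects
print(a is c)  # True: two names for the same object
```

As a rule of thumb, use `==` to compare values and reserve `is` for identity checks such as `x is None`.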
1.3 Advanced Topics¶
The Zen of Python:
import this
import operator # import can be used to use installed packages or your own functions
operator.add(1, 2)
import operator as op # libraries can be imported to abbreviations
op.add(1,2)
from operator import add # specific libraries can be imported
add(1,10)
2.1 Scope of variables¶
The scope of a variable is not always obvious:
y = 0
for x in range(10):
    y = x
y
{x:x+10 for x in range(10, 20)}
x
2.2 Scope¶
Python follows the LEGB Rule (after https://www.amazon.com/dp/0596513984/):
- L, Local: Names assigned in any way within a function (def or lambda), and not declared global in that function.
- E, Enclosing function locals: Name in the local scope of any and all enclosing functions (def or lambda), from inner to outer.
- G, Global (module): Names assigned at the top-level of a module file, or declared global in a def within the file.
- B, Built-in (Python): Names preassigned in the built-in names module : open, range, SyntaxError,...
x = 3
def foo():
    x = 4
    def bar():
        print(x)  # Accesses x from foo's scope
    bar()  # Prints 4
    x = 5
    bar()  # Prints 5
foo()
a = 0
def test(a):
    a += 1
    return a
test(a)
2.3 Generators¶
def function():
    for i in range(10):
        yield i
[i for i in function()]
for y in function():
    print(f'current_number {y}')
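Generators produce values lazily; you can also pull values one at a time with `next()`, and a generator is exhausted after a full pass. A sketch:

```python
def function():
    for i in range(10):
        yield i

gen = function()
print(next(gen))  # 0
print(next(gen))  # 1
print(list(gen))  # [2, 3, 4, 5, 6, 7, 8, 9] -- the remaining values
print(list(gen))  # [] -- a generator can only be consumed once
```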
2.4 Default arguments¶
def do_something(a, b, c=3):
    return (a, b, c)
do_something(1, 2, 6)
def do_something_else(a=1, b=2, c=3):
    return (a, b, c)
do_something_else()
def some_function(start=[]):
    start.append(1)
    return start
result = some_function()
result
result.append(2)
other_result = some_function()
other_result
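The surprise above happens because the default list is created once, when the function is defined, and then shared across calls. The usual workaround is a None default; a sketch of this common idiom:

```python
def some_function(start=None):
    if start is None:  # create a fresh list on every call
        start = []
    start.append(1)
    return start

print(some_function())  # [1]
print(some_function())  # [1], not [1, 1]
```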
2.5 List comprehension¶
"List comprehension" is the idea of writing some code inside of a list that will generate a list. Using for
we can iterate on a list, string, etc.
Consider the following:
[x ** 2 for x in range(10)]
temp_list = []
for x in range(10):
    temp_list.append(x ** 2)
temp_list
But list comprehension is much more concise.
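Comprehensions can also filter with an `if` clause, which replaces a loop plus a condition in a single line; a sketch:

```python
# Squares of the even numbers only
evens_squared = [x ** 2 for x in range(10) if x % 2 == 0]
print(evens_squared)  # [0, 4, 16, 36, 64]
```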
for i in ['hello']:
    print(i)
2.6 Conditions¶
a = 3
b = 4
if a > b:
    print(f'{a} is greater than {b}')
elif a == b:
    print(f'{a} == {b}')
else:
    print(f'{b} is greater than {a}')
Exercise : write a function to calculate the GC content of the sequence
seq = 'ACGTATTGCTTAGGCTGAGGCTAGGAGAGGGGACCCCCTAGCTAGGATCGT'
Hint: there are two different ways to do this:
- iterating over the sequence and checking each character
- using the string method .count(X)
def gc1(seq):
    c = 0
    for i in seq:
        if i in ['C', 'G']:
            c += 1
    return 100 * c / len(seq)
def gc2(seq):
    c = 0
    for i in range(len(seq)):
        if seq[i] == 'C':
            c += 1
        elif seq[i] == 'G':
            c += 1
        else:
            pass
    return 100 * c / len(seq)
def gc3(seq):
    """
    Calculates the GC content of <seq>
    """
    return 100 * (seq.count('C') + seq.count('G')) / len(seq)
seq = 'ACGTATTGCTTAGGCTGAGGCTAGGAGAGGGGACCCCCTAGCTAGGATCGT'
print(gc1(seq), gc2(seq), gc3(seq))
2. Introduction to data analysis with pandas¶
Thanks Anna Sintsova for the example analysis used as reference in this section!
Libraries¶
Most of the power of a programming language is in its libraries.¶
- A library is a collection of files (called modules) that contains functions for use by other programs.
- May also contain data values (e.g., numerical constants) and other things.
- Library’s contents are supposed to be related, but there’s no way to enforce that.
- The Python standard library is an extensive suite of modules that comes with Python itself.
- Many additional libraries are available from PyPI (the Python Package Index).
A program must import a library module before using it.
- Use import to load a library module into a program's memory.
- Then refer to things from the module as module_name.thing_name.
- Python uses . to mean "part of".
- We will use numpy, one of the most important libraries for scientific computing.
- To use a function or a constant from a module, you always have to refer to that module's name.
import numpy
# Display value of pi
numpy.pi
# Calculate log2 of a number
# log2 is a function from numpy library. It takes one argument.
numpy.log2(8)
# Learn more about the function and it's arguments
?numpy.log2
# argument doesn't have to be a number, can be a list
numpy.log2([8, 16, 32])
Import specific items from a library¶
- Use from ... import ... to load only specific functions/items from a library.
- Then you can refer to them directly in your code.
from numpy import floor
# floor function returns floor of a number (largest integer i, such that i <= x)
floor([4.5, 3.2, 1.9])
Create an alias for a library¶
- Use import ... as ... to give a library an alias, then refer to items using that shortened name.
- Some popular libraries have well-established aliases that are in common use.
import numpy as np
# ceiling function returns ceiling of a number (smallest integer i, such that i >= x)
np.ceil([4.5, 3.2, 1.9])
Reading tabular data into DataFrames¶
- Data for this workshop comes from Cross-cohort gut microbiome associations with immune checkpoint inhibitor response in advanced melanoma
Abstract
The composition of the gut microbiome has been associated with clinical responses to immune checkpoint inhibitor (ICI) treatment, but there is limited consensus on the specific microbiome characteristics linked to the clinical benefits of ICIs. We performed shotgun metagenomic sequencing of stool samples collected before ICI initiation from five observational cohorts recruiting ICI-naive patients with advanced cutaneous melanoma (n = 165). Integrating the dataset with 147 metagenomic samples from previously published studies, we found that the gut microbiome has a relevant, but cohort-dependent, association with the response to ICIs. A machine learning analysis confirmed the link between the microbiome and overall response rates (ORRs) and progression-free survival (PFS) with ICIs but also revealed limited reproducibility of microbiome-based signatures across cohorts. Accordingly, a panel of species, including Bifidobacterium pseudocatenulatum, Roseburia spp. and Akkermansia muciniphila, associated with responders was identified, but no single species could be regarded as a fully consistent biomarker across studies. Overall, the role of the human gut microbiome in ICI response appears more complex than previously thought, extending beyond differing microbial species simply present or absent in responders and nonresponders. Future studies should adopt larger sample sizes and take into account the complex interplay of clinical factors with the gut microbiome over the treatment course.
# import and give an alias
import pandas as pd
%ls data
# Read in a csv file
data = pd.read_csv('data/2022-04-13.LeeKA_2022.colData.csv', index_col=0)
# Look at the top 5 rows of the data
# The columns in a dataframe are the observed variables, and the rows are the observations.
data.head()
# View the last five rows
data.tail()
# Check number of rows and columns
data.shape
# Find out more about a dataframe
data.info()
# Are all the indices (i.e. row names) unique?
data.index
data.index.unique()
len(data.index.unique()) == len(data.index)
data.index.nunique() == len(data.index)
# Use DataFrame.columns to access or change column names
data.columns
# Get summary statistics
# Only for numerical columns
data.describe()
Indexing, slicing, and subsetting¶
- We use square brackets [] to select a subset of a Python object. For example, we can select all data from a column named gender from the data DataFrame by name. There are two ways to do this:
data['gender']
data.gender
data['gender'].unique()
data['gender'].value_counts()
data['gender'].value_counts(normalize=True)
# Calculate what % of subjects belonged to each of the subcohorts (Hint, there is a column 'subcohort')
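One possible answer, sketched on a toy DataFrame so it runs standalone (on the real data you would replace `toy` with `data`, assuming the 'subcohort' column mentioned in the hint exists):

```python
import pandas as pd

# Toy stand-in for the real cohort table
toy = pd.DataFrame({'subcohort': ['A', 'A', 'B', 'B', 'B', 'C']})

# value_counts(normalize=True) gives fractions; multiply by 100 for %
pct = toy['subcohort'].value_counts(normalize=True) * 100
print(pct)
```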
# You can select more than one column
columns_to_show = ['subject_id', 'country', 'gender', 'age_category', 'BMI', 'PFS12']
data = data[columns_to_show]
data.head()
- Slicing using the [ ] operator selects a set of rows and/or columns from a DataFrame. To slice out a set of rows, you use the following syntax: data[start:stop]. When slicing in pandas the start bound is included in the output, while the stop bound is one step BEYOND the last row you want to select. So if you want to select rows 0, 1 and 2 your code would look like this:
data[0:3]
data[-1:]
We can select specific ranges of our data in both the row and column directions using either label- or integer-based indexing:
- loc is primarily label-based indexing; integers may be used, but they are interpreted as labels.
- iloc is primarily integer-based indexing.
To select a subset of rows and columns from our DataFrame, we can use the iloc method. For example, we can select the first two rows and the first two columns like this:
data.iloc[0:2, 0:2]
data.loc[['BCN-01_1', 'BCN-02_1'], ['subject_id', 'country']]
Subsetting Data using Criteria¶
We can use the syntax below when querying data by criteria from a DataFrame. Experiment with selecting various subsets of the data.
- Equals: ==
- Not equals: !=
- Greater than, less than: > or <
- Greater than or equal to: >=
- Less than or equal to: <=
# Look at all the entries from Spain
data[data.country == 'ESP']
# Look at all the entries not from Spain
data[data.country != 'ESP']
# Look at all the female entries from Spain
data[(data.country == 'ESP') & (data.gender == 'female')]
data[(data.BMI < 26) & (data.BMI > 25)]
data[(data.BMI.between(25, 26))]
data[data.subject_id.str.contains('BCN')]
data[data['country'].isin(['GBR', 'ESP'])]
Calculating Statistics From Data In A Pandas¶
# You can calculate any summary statistics on the numeric columns with min(), max(), mean(), std(), count(), etc.
data['BMI'].min()
data['BMI'].median()
- But if we want to summarize by one or more variables, for example gender, we can use pandas' .groupby method. Once we've created a grouped DataFrame, we can quickly calculate summary statistics by a group of our choice.
data.groupby('gender').median(numeric_only=True)
# Calculate BMI for different genders for each country
data.groupby(['country', 'gender']).BMI.mean()
# Calculate the gender distribution for each country
data.groupby(['country', 'gender']).subject_id.count()
# For each country and gender, calculate % with PFS12
data.groupby(['country', 'gender']).PFS12.value_counts(normalize=True)
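The multi-indexed result of a grouped `value_counts` can be easier to read after `.unstack()`, which pivots the inner index level into columns. A sketch on a toy table so it runs standalone (on the real data the same call chain applies to `data`):

```python
import pandas as pd

toy = pd.DataFrame({
    'country': ['ESP', 'ESP', 'GBR', 'GBR'],
    'PFS12':   ['yes', 'no',  'yes', 'yes'],
})

# Fraction of each PFS12 value per country, as a country x PFS12 table
table = toy.groupby('country').PFS12.value_counts(normalize=True).unstack()
print(table)
```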
Missing values¶
data[data.PFS12.isnull()]
# Drop all observations with missing values
data.dropna()
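Dropping is not the only option: `fillna()` replaces missing values instead. A sketch on a toy column (the same method works on any column of `data`):

```python
import pandas as pd
import numpy as np

s = pd.Series([1.0, np.nan, 3.0])
print(s.fillna(0))         # replace NaN with a constant
print(s.fillna(s.mean()))  # or with a summary statistic (here the mean, 2.0)
```

Which strategy is appropriate depends on the analysis; filling categorical columns like PFS12 with a placeholder keeps those rows available for other comparisons.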
Joining dataframes¶
rel_abundance = pd.read_csv("data/2022-04-13.LeeKA_2022.relative_abundance.SAMPLE.csv", index_col=0)
rel_abundance
df = data.merge(rel_abundance, left_index=True, right_index=True)
df.head()
df.groupby('gender').mean(numeric_only=True).T
df.to_csv("data/ploting_ws.csv")
3. Data visualization in Python¶
3.1 Basic graphs with matplotlib¶
import matplotlib.pyplot as plt
%matplotlib inline
To create a simple line plot, just give a list of y-values to the function plt.plot().
plt.plot([5, 8, 2, 6, 1, 8, 2, 3, 4, 5, 6])
plt.xlabel('X')
plt.ylabel('Y')
plt.title('A plot')
plt.boxplot([[1,2,3], [3,4,5]])
# plt.show() # To show the plot, not required in notebooks
plt.savefig('./data/boxplot.svg') # You can save in png, svg, pdf, etc
subdf = df.groupby('gender').mean(numeric_only=True).T.copy()
subdf
plt.boxplot([subdf['female'], subdf['male']], labels=['female', 'male'])
plt.show()
Read the documentation on matplotlib:
3.2 Interactive graphs with plotly¶
import plotly.express as px
df = pd.read_csv("data/ploting_ws.csv", index_col=0)
df.head()
Plotly express¶
Histogram¶
px.histogram(df, x='BMI', color='gender', nbins=100)
Scatter plot¶
fig = px.scatter(df, x="BMI", y="Dorea_longicatena")
fig.show()
fig = px.scatter(df, x="BMI", y="Dorea_longicatena", color='gender',
hover_data=['country', 'subject_id', 'BMI', 'gender', 'age_category'])
fig.show()
Box plot¶
# Is there difference in PFS12 across countries/genders/age categories
fig = px.box(df, x='country', y='BMI')
fig
# Is there difference in PFS12 across countries/gender/age categories
fig = px.box(df, x='country', y='BMI',
facet_col='gender',
facet_row='age_category',
color='PFS12',
width=800, height=600,
template='simple_white')
fig
Bar plots¶
# Pivoting dataframe for plotting
bar_df = df.melt(id_vars = ['subject_id', 'country', 'gender', 'age_category', 'BMI', 'PFS12'], value_name = 'RelAb',
var_name = 'species')
bar_df.head()
new_df = bar_df.groupby(['PFS12', 'species']).RelAb.mean().reset_index()
px.bar(new_df, x='PFS12', y= 'RelAb', color='PFS12', facet_col='species', facet_col_wrap=4, height=1000)
fig = px.bar(bar_df[bar_df.subject_id.isin(df.subject_id.unique()[0:5])], x="subject_id", y="RelAb", color="species",
template='plotly_white',
hover_data =['gender', 'age_category'], width=800, height=600,
)
fig.show()
Strip plot¶
px.strip(bar_df[bar_df.species == 'Akkermansia_muciniphila'], x='country', y='RelAb', color='PFS12',
facet_col='gender', log_y=True, template = 'plotly_white')
import pandas as pd
df = pd.read_csv('./data/genome_summary.csv.gz', compression='gzip', index_col=0)
md = pd.read_csv('./data/metadata.tsv', sep='\t', index_col=0)
df.columns
md.columns
The metadata is organized by sample; let's integrate that information into the genome table. The column biosample relates them:
# Assuming md and df are your dataframes and both contain a 'biosample' column
# We select only the necessary columns from md: 'biosample' plus the environmental variables
md_subset = md[['biosample', 'lat', 'lon', 'date', 'temp', 'depth_str', 'depth', 'log_depth', 'depth_layer', 'size_fraction']]
# Merge md_subset into df based on the 'biosample' column
df = df.merge(md_subset, on='biosample', how='left')
df
Basic inspection:
df['genus']
df.iloc[7]
df[df['genome']=='ZORZ22-1_SAMN30647033_MAG_00000149']
df[(df['completeness']>=90) &
(df['contamination']<5) &
(df['genome'].str.contains('TARA'))]
df.sort_values('completeness', ascending=False).head(100)
Exercise : How many MAGs have completeness > 90, contamination < 5 and gc_content > 40?
Exercise : How many MAGs from the previous subset are in each environment?
from collections import Counter
Counter(df['environment'])
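A possible approach for both exercises, sketched on a toy table so it runs standalone (on the real data you would replace `toy` with `df`; the column names follow the exercise text):

```python
import pandas as pd

toy = pd.DataFrame({
    'completeness':  [95, 92, 80, 99],
    'contamination': [1,  6,  2,  3],
    'gc_content':    [45, 50, 42, 38],
    'environment':   ['marine', 'marine', 'soil', 'marine'],
})

# Combine the three criteria with & (each condition in parentheses)
subset = toy[(toy['completeness'] > 90) &
             (toy['contamination'] < 5) &
             (toy['gc_content'] > 40)]
print(len(subset))                           # number of MAGs passing all filters
print(subset['environment'].value_counts())  # per-environment counts of that subset
```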
3.3.2. Plotting directly from dataframes¶
Finally, seaborn is a library that makes it easy to combine pandas dataframes with matplotlib plotting functions:
import seaborn as sns
sns.scatterplot(x='genome_size', y='completeness', data=df, alpha=0.2)
sns.jointplot(x='genome_size', y='completeness', data=df)
Exercise : How do you interpret the previous plot?
3.3.3. Basic analysis example¶
Is there a relation between the depth at which an organism is found and its GC content?
sns.boxplot(x='depth_layer', y='gc_content', data=df)
sns.violinplot(x='depth_layer', y='gc_content', data=df)
sns.stripplot(x='depth_layer', y='gc_content', data=df, alpha=0.1)
Exercise: Given the three previous plots, which one is the most informative?
You can also give a 'hue' to separate on a categorical variable:
sns.boxplot(x='depth_layer', y='gc_content', hue='domain', data=df.head(10000), palette='viridis')
What can you interpret from the previous plot?
3.3.4. Other visualizations¶
heatmap_data = df[df['environment']=='coral metagenome'][['completeness', 'contamination', 'genome_size']] # What is happening here?
Heatmaps are also useful data representations; we can use seaborn to quickly visualize the quality metrics of the selected genomes:
heatmap_data
import seaborn as sns
normalized_heatmap = (heatmap_data - heatmap_data.min()) / (heatmap_data.max() - heatmap_data.min()) # What is this line doing?
plt.figure(figsize=(10, 8))
sns.heatmap(normalized_heatmap, cmap='YlGnBu')
plt.show()
Exercise : What are the conclusions from the previous graph?
plt.figure(figsize=(10, 8))
sns.clustermap(normalized_heatmap, cmap='YlGnBu')
plt.show()
Exercise : What are the conclusions you can extract from the previous graph?
Exercise : Explore by yourself a question from the data given and produce the code to visualize one figure to explain the most out of your question
Finally, given our context, another useful representation is using maps:
import geopandas as gpd
import matplotlib.pyplot as plt
import pandas as pd
# Direct URL to the shapefile
url = "https://naciscdn.org/naturalearth/110m/cultural/ne_110m_admin_0_countries.zip"
def plot_lat_lon_on_world_map(df):
    # Convert latitude and longitude in DataFrame to GeoDataFrame
    gdf = gpd.GeoDataFrame(df, geometry=gpd.points_from_xy(df['lon'], df['lat']))
    # Load world shapefile directly from URL
    world = gpd.read_file(url)
    # Plotting
    world.plot(figsize=(10, 10), edgecolor=None, color='lightgrey')
    gdf.plot(marker='o', color='#4884af', markersize=5, ax=plt.gca())
    # Labeling the axes and title
    plt.xlabel('Longitude')
    plt.ylabel('Latitude')
    plt.title('Latitude and Longitude Points on World Map')
    plt.show()
# Call the function with your DataFrame, 'md'
plot_lat_lon_on_world_map(md)
# Or modify it to plot more data as color, shape...
import matplotlib.colors as mcolors
def plot_lat_lon_with_temp(df):
    # Create a GeoDataFrame from the input DataFrame
    gdf = gpd.GeoDataFrame(df, geometry=gpd.points_from_xy(df['lon'], df['lat']))
    # Load the world map shapefile
    world = gpd.read_file(url)
    # Plot the world map
    fig, ax = plt.subplots(figsize=(10, 10))
    world.plot(ax=ax, edgecolor=None, color='lightgrey')
    # Plot points with color mapped to the temp column
    norm = mcolors.Normalize(vmin=df['temp'].min(), vmax=df['temp'].max())
    gdf.plot(
        ax=ax,
        marker='o',
        column='temp',    # Use the 'temp' column for color
        cmap='coolwarm',  # You can change the colormap if desired
        markersize=5,
        norm=norm,
        legend=True
    )
    # Set plot labels and title
    plt.xlabel('Longitude')
    plt.ylabel('Latitude')
    plt.title('Latitude and Longitude Points on World Map (Colored by Temp)')
    plt.show()
# Example usage:
plot_lat_lon_with_temp(md)
You can explore other representations in the seaborn page: