In [ ]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import networkx as nx
import brewer2mpl
from datetime import datetime
from Bio import AlignIO, SeqIO
from Bio.SeqRecord import SeqRecord
from Bio.Align.Applications import ClustalOmegaCommandline
from __future__ import division
%matplotlib inline
The FASTA files are aligned, thus, there are gaps in the sequence. Also, there are "partial CDS" sequences present in the alignments. I have already used find-and-replace to remove the gap characters from the FASTA files.
Tasks:
In [2]:
# Remove partial CDS sequences in both the HA and NA fasta files.
ha_sequences = [s for s in SeqIO.parse('H9N2_HA.fasta', 'fasta') if 'complete cds' in s.description]
print(len(ha_sequences))
na_sequences = [s for s in SeqIO.parse('H9N2_NA.fasta', 'fasta') if 'complete cds' in s.description]
print(len(na_sequences))
In [3]:
# Function to find longest ORFs:
def longest_orfs(list_of_seqrecords):
aa_sequences = []
for record in list_of_seqrecords:
longest_protein = SeqRecord(id=record.id, seq='')
for frame in range(3):
length = 3 * ((len(record) - frame) // 3)
for pro in record.seq[frame:frame + length].translate().split("*"):
if len(pro) > len(longest_protein.seq):
longest_protein.seq = pro
aa_sequences.append(longest_protein)
return aa_sequences
In [4]:
# # Grab out longest ORFs and write to disk
na_orfs = longest_orfs(na_sequences)
SeqIO.write(na_orfs, 'H9N2_NA_CDS.fasta','fasta')
ha_orfs = longest_orfs(ha_sequences)
SeqIO.write(ha_orfs, 'H9N2_HA_CDS.fasta', 'fasta')
# na_orfs
Out[4]:
In [5]:
# # Perform multiple sequence alignment using Clustal Omega.
# # Uncomment this cell only if you need to run another multiple sequence alignment.
# genes = ["HA", 'NA']
# for gene in genes:
# in_file = "H9N2_%s_CDS.fasta" % gene
# out_file = "H9N2_%s_CDS_Aligned.fasta" % gene
# cline = ClustalOmegaCommandline(infile=in_file, outfile=out_file, verbose=True, auto=True, force=True)
# cline()
In [6]:
# Read in the aligned sequences
gene = "HA"
df = pd.DataFrame(np.array([s for s in AlignIO.parse('H9N2_%s_CDS_Aligned.fasta' % gene, 'fasta')])[0])
# ha_df
# Capture accession and wild or domestic status as an array.
labels = []
for sequence in SeqIO.parse("H9N2_%s_CDS_Aligned.fasta" % gene, "fasta"):
sequence_id = sequence.id.split("_")
label = sequence_id[2]
if label == 'D':
labels.append('domestic')
if label == 'W':
labels.append('wild')
# Remove any positions that do not contain any variation
for column in df.columns:
if len(np.unique(df[column].values)) == 1:
del df[column]
# df
In [7]:
from sklearn.preprocessing import LabelEncoder
from random import shuffle
# Encode the amino acids with categorial labels 1-20
le = LabelEncoder()
amino_acid_code = (list('ABCDEFGHIJKLMNPQRSTVWXY-'))
shuffle(amino_acid_code)
print(len(amino_acid_code))
le.fit(amino_acid_code)
a_encoded = le.transform(amino_acid_code)
# a_encoded
# Replace amino acids with integer numbers.
encode_dict = {letter:int(encode) for letter, encode in zip(amino_acid_code, a_encoded)}
# encode_dict
encoded_df = df.replace(to_replace=encode_dict.keys(), value=encode_dict.values())
encoded_df
Out[7]:
In [8]:
from sklearn.ensemble import RandomForestClassifier
# Run 100 runs of the Random Forest classifier, and average the feature importances.
feature_importance_dict = {pos:[] for pos in encoded_df.columns}
for i in range(2000):
rf = RandomForestClassifier()
rf.fit_transform(encoded_df, labels)
for pos, importance in zip(encoded_df.columns, rf.feature_importances_):
feature_importance_dict[pos].append(importance)
# feature_importance_dict
In [9]:
# Get the mean for each feature_importance
mean_importances = {pos:None for key in feature_importance_dict.keys()}
for pos, importances in feature_importance_dict.items():
mean_importances[pos] = np.mean(importances)
color = brewer2mpl.get_map('Blues', 'Sequential', 3)
plt.scatter(mean_importances.keys(), mean_importances.values(), color='r')
plt.title("Positional Importance for %s Gene" % gene)
plt.xlabel('Position in Amino Acid Alignment')
plt.ylabel('Feature Importance')
plt.savefig("%s Positional Feature Importances.pdf" % gene)
In [10]:
# What positions are the best predictors?
pos_importance = sorted(mean_importances.items(), key=lambda x:x[1])[::-1]
pos_by_importance = [pos for (pos, score) in pos_importance]
importances_df = pd.DataFrame(pos_importance)
importances_df.columns=['Position', 'Relative Importance']
importances_df.to_csv('%s Relative Importances.csv' % gene)
importances_df
Out[10]:
In [11]:
zip(df[305].values, labels)
Out[11]:
In [12]:
# Create pie charts of porportions for each position
from collections import Counter
counts = dict()
for pos in df.columns:
counts[pos] = Counter(zip(df[pos].values, labels))
counts
Out[12]:
In [13]:
def wild_domestic_proportions_piecharts(counter, pos):
wild_letters = [letter for (letter, state) in counter[pos].keys() if state == 'wild']
domestic_letters = [letter for (letter, state) in counter[pos].keys() if state == 'domestic']
letters = set([letter for (letter, state) in counter[pos].keys()])
wild_proportions = dict()
domestic_proportions = dict()
for letter in letters:
if letter in wild_letters:
wild_proportions[letter] = counter[pos][(letter, 'wild')]
if letter not in wild_letters:
wild_proportions[letter] = 0
if letter in domestic_letters:
domestic_proportions[letter] = counter[pos][(letter, 'domestic')]
if letter not in domestic_letters:
domestic_proportions[letter] = 0
fig = plt.figure(figsize=(6.5,3))
if len(letters) < 3:
bmap = brewer2mpl.get_map('Set1', 'qualitative', 3)
else:
bmap = brewer2mpl.get_map('Set1', 'qualitative', len(letters))
colors = bmap.mpl_colors
# Wild Bird Proportion of AAs
ax1 = fig.add_subplot(121)
plt.pie(wild_proportions.values(), colors=colors)
plt.title('Wild, Pos %s, n=%s' % (pos, sum(wild_proportions.values())))
# Domestic Bird Proportion of AAs
ax2 = fig.add_subplot(122)
plt.pie(domestic_proportions.values(), colors=colors)
plt.title('Domestic, Pos %s, n=%s' % (pos, sum(domestic_proportions.values())))
plt.legend(labels=letters, bbox_to_anchor=[1.3,1])
plt.savefig('%s Position %s Proportion of Amino Acids.pdf' % (gene, pos))
plt.show()
In [14]:
# def wild_domestic_proportions(counter, ranked_positions):
In [15]:
for pos, importance in pos_importance[0:10]:
wild_domestic_proportions_piecharts(counts, pos)
In [15]:
In [15]:
In [15]: