Here we describe our loci-of-interest procedure in detail, with illustrations.
Let's start by importing some modules:
In [1]:
%matplotlib inline
import os
import sys
from Bio import SeqRecord
from Bio import AlignIO
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
Then we import the codon table for the standard genetic code, with a slight modification: the STOP codon * is added as a fully-fledged member of the table:
In [17]:
from Bio.Data import CodonTable
genetic_code = CodonTable.standard_dna_table.forward_table
stop_codons = dict([ (codon,'*') for codon in CodonTable.standard_dna_table.stop_codons ])
genetic_code.update(stop_codons)
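As a quick sanity check (just a couple of illustrative lookups, not part of the pipeline itself), the modified table now resolves both sense and stop codons:
In [ ]:
# illustrative lookups: a sense codon and a stop codon
print genetic_code['ATG']   # -> M
print genetic_code['TAA']   # -> *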
We also need the Kyte-Doolittle hydrophobicity scale. The STOP codon is added here as well and assigned an artificially large KD value, simply to make every mutation to/from a STOP codon stand out.
In [18]:
from Bio.SeqUtils import ProtParamData
KD = ProtParamData.kd
KD['*'] = 25.0
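For reference, a small illustrative check (the particular residue is just an example): isoleucine sits at the hydrophobic end of the scale, while our artificial STOP entry lies far beyond it:
In [ ]:
# illustrative lookups: the most hydrophobic residue vs. the artificial STOP entry
print KD['I']   # 4.5 on the Kyte-Doolittle scale
print KD['*']   # 25.0, the artificial value assigned above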
In [19]:
# our custom module providing the influenza class (reference genome + ORF annotations)
sys.path.append("../RNA_SNP_association")
import flu_module as flu
The reference flu genome sequence is a consensus of the pandemic H1N1 (pH1N1). Let's load the reference together with the description of its ORF locations.
In [20]:
ref_fname = "./pH1N1_coding_dat/pH1N1.fa"
orf_fname = "./pH1N1_coding_dat/pH1N1_noPB1F.orf"
ph1n1 = flu.influenza(ref_fname, orf_fname)
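As a quick sanity check (a sketch relying only on the attributes used later on: segnum and the orf dictionary), we can see how many segments were loaded and which products are annotated:
In [ ]:
# number of segments and annotated products (illustrative check)
print ph1n1.segnum
print sorted(ph1n1.orf.keys())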
Our segment-wide alignments are stored in files that follow a simple naming pattern:
In [21]:
aligned_seg_fname = lambda number: "seg%d.afa"%number
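For example (purely illustrative), segment 4 would be read from:
In [ ]:
print aligned_seg_fname(4)   # -> seg4.afa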
Let's load all of the alignments using these file names and store them as a dictionary:
In [22]:
segment_aln = {}
for seg_idx in range(1, ph1n1.segnum+1):
    fname = aligned_seg_fname(seg_idx)
    segment_aln['seg%d'%seg_idx] = AlignIO.read(fname, 'fasta')
Let's output some basic characteristics of the alignments:
In [23]:
for seg in sorted(segment_aln):
    print "%s of len %d has %d sequences aligned"%(seg, segment_aln[seg].get_alignment_length(), len(segment_aln[seg]))
Next we define a helper that maps a genomic position (loci) to the ORF(s) hosting it, along with the coordinates of the corresponding codon:
In [24]:
def get_loci_orf_assoc(orf, loci):
    """Checks whether a given position falls within any of the ORFs. Returns
    a dict mapping each hosting ORF to the coordinates of the corresponding codon."""
    which_orf = []
    for orf_id in orf:
        orf_bounds = orf[orf_id]
        orf_positions = sum([range(start, stop+1) for start, stop in orf_bounds], [])
        # does the position reside in this ORF or not?
        if loci in orf_positions:
            # relative position, simply an index in the list of orf_positions ...
            relative_pos = orf_positions.index(loci)
            # the index of the codon it belongs to ...
            codon_idx = relative_pos//3
            # relative codon coordinates (indexes within the CDS):
            relative_codon_coord = [codon_idx*3+i for i in range(3)]
            # is loci the 1st, 2nd or 3rd position in the codon?
            codon_shift = relative_pos - relative_codon_coord[0]
            # absolute codon coordinates (indexes in the genome):
            codon_coord = [orf_positions[_] for _ in relative_codon_coord]
            which_orf.append( (orf_id, (codon_idx, codon_shift, codon_coord)) )
    return dict(which_orf)
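A toy example (hypothetical single-ORF layout, used only to illustrate the returned structure: codon index, position within the codon, and absolute codon coordinates):
In [ ]:
# hypothetical ORF spanning genome positions 0..8, i.e. three codons
toy_orf = {'toy_CDS': [(0, 8)]}
# position 4 is the 2nd base of the 2nd codon (0-based: codon_idx=1, codon_shift=1)
print get_loci_orf_assoc(toy_orf, 4)   # -> {'toy_CDS': (1, 1, [3, 4, 5])}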
Then we perform the analysis, exploring observed mutations and their basic features:
In [25]:
# lists to store some info ...
codon_position_mutated = []
codon_position_mutated_weight = []
dKD = []
dKD_weight = []
#
segment_var_info = {}
#
# analysis for each segment ...
for seg in sorted(segment_aln):
    # turn the alignment into an array and describe it ...
    aln = np.array([list(rec) for rec in segment_aln[seg]], np.character)
    aln = pd.DataFrame(aln)
    aln = aln.where(aln != '-', None)  # gaps to Nones
    descr = aln.describe().transpose()  # built-in per-position description
    descr['freq_ratio'] = descr['freq']/descr['count']  # freq_ratio ignores gaps ...
    descr['gaps'] = aln.isnull().sum()
    descr['variation'] = descr['freq'] < descr['count']  # variable positions ...
    # ORF and sequence of this segment ...
    seg_orf = ph1n1.orf[seg+'_pH1N1']
    seg_seq = ph1n1.genome[seg+'_pH1N1']
    # per-position statistics are accumulated in the lists initialized above ...
    # go over all variable positions ...
    for pos in descr[descr['variation']].index:
        # unpack codon information ...
        prod_codon = get_loci_orf_assoc(seg_orf, pos)
        for product in prod_codon:
            codon_idx, codon_shift, codon_coords = prod_codon[product]
            # the consensus codon, AA and KD ...
            codon_itself = ''.join(seg_seq[i] for i in codon_coords)
            aa_itself = genetic_code[codon_itself]
            KD_itself = KD[aa_itself]
            # SNPs at this position ...
            nts_present = aln.loc[:, pos].value_counts().to_dict()
            # CODON SHIFT STATISTICS GATHERING ...
            codon_position_mutated.append(codon_shift)
            # variation frequency, non-consensus % aka mutation 'weight'
            weight = 1.0 - descr['freq_ratio'][pos]
            codon_position_mutated_weight.append(weight)
            # hence, possible codons are:
            possible_codons = []
            for snp in nts_present:
                mut_codon = list(codon_itself)
                mut_codon[codon_shift] = snp
                possible_codons.append(mut_codon)
                # STATISTICS GATHERING ...
                the_alt_codon = ''.join(mut_codon)
                if the_alt_codon != codon_itself:
                    the_alt_aa = genetic_code[the_alt_codon]
                    the_alt_KD = KD[the_alt_aa]
                    weight = nts_present[snp]*1.0/sum(nts_present.values())
                    #################################
                    dKD.append(the_alt_KD - KD_itself)
                    dKD_weight.append(weight)
            # # amino acids ...
            # print "Product %s, position %d in protein (codon %s, aa %s)"%(product, codon_shift+1, codon_itself, genetic_code[codon_itself])
            # other_possible_codons = [''.join(codon) for codon in possible_codons if ''.join(codon)!=codon_itself]
            # print "outcome AAs are: %s"%str([genetic_code[ccc] for ccc in other_possible_codons]).strip('[]').replace('\'','')
            # # print str([genetic_code[ccc] for ccc in other_possible_codons]).strip('[]').replace('\'','')
            # print "dKD for AA subs: %s"%str([ KD[genetic_code[ccc]]-KD[genetic_code[codon_itself]] for ccc in other_possible_codons]).strip('[]')
For a quick look at one segment (here, the last one processed in the loop above), we can inspect the description table and plot the consensus-base frequency at every variable position:
In [ ]:
print descr.head(15)
print seg
f,ax = plt.subplots(1,1,figsize=(20,4))
ax.plot(descr[descr['variation']].index,descr[descr['variation']]['freq_ratio'],'ro')
# ax = plt.gca()
# ax.set_yscale('log')
# ax.set_ylim(0.99,1)
plt.show()
Finally, histograms of which position within the codon the observed mutations fall into: raw counts first, then counts weighted by the non-consensus frequency at each position:
In [27]:
plt.hist(codon_position_mutated, bins=3)
ax = plt.gca()
ax.set_xlabel("position within codon (0=1st, 1=2nd, 2=3rd)")
plt.show()
plt.hist(codon_position_mutated, bins=3, weights=codon_position_mutated_weight)
ax = plt.gca()
ax.set_xlabel("position within codon (0=1st, 1=2nd, 2=3rd)")
plt.show()
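In the same spirit, the collected dKD values can be inspected as well (a sketch, not part of the original figures; it reuses the dKD and dKD_weight lists gathered above):
In [ ]:
# distribution of hydrophobicity changes, weighted by SNP frequency
plt.hist(dKD, bins=20, weights=dKD_weight)
ax = plt.gca()
ax.set_xlabel("dKD of observed substitutions")
plt.show()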