Here we plot the major base frequency at different time points. We also detect positions that consistently rise in frequency through time, i.e. positions whose frequency increases monotonically with time. Positions that rise in frequency only between the first and the last time points have not been investigated.
In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.ticker as mticker
from pylab import rcParams
import seaborn as sns
from array import array
import numpy as np
from scipy.stats import ttest_ind
from scipy.stats import linregress
from scipy.stats import mannwhitneyu
%matplotlib inline
In [2]:
# Parse the mat_peptide annotations out of the GenBank record to collect
# gene coordinates (begins/ends) and gene names for later plotting.
begins = []
ends = []
names = []
with open("sequence.gb") as f:
    in_pep = False
    for l in f:
        if "mat_peptide" in l:
            # coordinate field looks like "123..456"
            begins.append(int(l.split()[1].split("..")[0]))
            ends.append(int(l.split()[1].split("..")[1]))
            in_pep = True
        elif in_pep:
            # The line right after the coordinates holds /product="name".
            # Fix: strip() removes the trailing newline that was previously
            # kept in every name (downstream code only uses name[0:3], so
            # behavior there is unchanged).
            names.append(l.split("=")[1].strip())
            in_pep = False
print(begins)
print(ends)
print(names)
In [3]:
# Positions of interest identified in the analysis.
positions = [316, 1670, 1785, 2340, 5935, 7172, 8449, 9165]

def plot_positions():
    """Draw a dotted vertical line at every position of interest."""
    for pos in positions:
        plt.axvline(x=pos, linewidth=1, linestyle=':')
def plot_genes():
    """Mark gene boundaries with short black ticks near the top of the plot.

    Each gene is labelled with the first three letters of its name; the
    label height alternates between two levels so that neighbouring labels
    do not overlap.
    """
    for idx, (start, stop) in enumerate(zip(begins, ends)):
        plt.plot([start, start], [0.99, 1.0], linewidth=2, linestyle='-', color="black")
        label = (names[idx].replace('"', ''))[0:3]
        height = 1.005 if idx % 2 == 0 else 1.015
        plt.text(start + ((stop - start) / 10), height, label, size='xx-small')
    # Close off the last gene with a final boundary tick.
    plt.plot([ends[-1], ends[-1]], [0.99, 1.0], linewidth=2, linestyle='-', color="black")
In [4]:
def synonymous(row):
    """Classify a row as "synonymous" or "non-synonymous".

    A position counts as synonymous when there is no second variant at all
    (row['null'] is truthy) or when both variants encode the same amino acid.
    """
    no_second_variant = row['null']
    same_amino_acid = row['Consensus_aa'] == row['Secondbase_aa']
    return "synonymous" if (no_second_variant or same_amino_acid) else "non-synonymous"
def add_columns(table):
    """Add derived columns to a per-position variant table, in place.

    null                      -- True where no second amino-acid variant was called
    is_synonymous             -- "synonymous"/"non-synonymous" label (see synonymous())
    1_major_variant_frequency -- combined frequency of everything but the major variant
    """
    table['null'] = table['Secondbase_aa'].isnull()
    table['is_synonymous'] = table.apply(synonymous, axis=1)
    table['1_major_variant_frequency'] = 1.0 - table['Major_variant_frequency_quality_corrected']
In [5]:
def is_increasing(minor_frequencies):
    """Return True when the values are strictly increasing.

    Works on any sequence (list, array.array, ...). The previous
    implementation raised an IndexError on an empty sequence; an empty or
    single-element sequence is now considered (vacuously) increasing.
    """
    return all(a < b for a, b in zip(minor_frequencies, minor_frequencies[1:]))
def get_variant_frequency(variant, table, i):
    """Return the frequency of `variant` (one of "A", "C", "G", "T") at row i.

    The frequency is that base's quality-corrected count divided by the total
    quality-corrected count of A, C, G, T and N at the row. Any other value
    of `variant` yields np.nan.
    """
    counts = {base: table[base + 's_quality_corrected'][i] for base in 'ACGTN'}
    total = sum(counts.values())
    if variant in ('A', 'C', 'G', 'T'):
        return counts[variant] / total
    return np.nan
def get_increasing_variants(tables):
    """Find positions whose minor-variant frequency rises strictly through time.

    tables : list of per-timepoint variant tables, ordered by time.
    Returns a dict mapping position -> [major_frequencies, minor_frequencies]
    (one frequency per table, as lists) for every qualifying position.
    """
    num_tables = len(tables)
    first = tables[0]
    last = tables[num_tables-1]
    major = ""
    minor = ""
    # Reusable per-position frequency buffers, one slot per timepoint.
    major_frequencies = array('d',[0.0]*num_tables)
    minor_frequencies = array('d',[0.0]*num_tables)
    increasingVariants = dict()
    # NOTE(review): rows are accessed as table[col][i] with i taken from the
    # Position column, so this assumes Position values coincide with the row
    # index of every table -- confirm against the CSV layout.
    for i in first["Position"]:
        major = first["Major_variant"][i]
        #print(last['Major_variant_frequency_quality_corrected'][i])
        major_frequencies[0] = first['Major_variant_frequency_quality_corrected'][i]
        # The tracked minor variant is whichever base "wins" at the last
        # timepoint: the final second variant if the major base is unchanged,
        # otherwise the new final major base.
        if major == last["Major_variant"][i]:
            minor = last["Second_variant"][i]
        else:
            minor = last["Major_variant"][i]
        minor_frequencies[0] = get_variant_frequency(minor, first, i)
        for table_id in range(1, num_tables):
            major_frequencies[table_id] = get_variant_frequency(major, tables[table_id], i)
            minor_frequencies[table_id] = get_variant_frequency(minor, tables[table_id], i)
        # Keep the position only if the minor variant rises at every step.
        if is_increasing(minor_frequencies):
            increasingVariants[i] = [major_frequencies.tolist(), minor_frequencies.tolist()]
    return increasingVariants
In [6]:
# Clone at days 12, 15, 19.  " -nan" is what the mapping pipeline writes for
# missing values, so it is parsed as NaN.
clone12 = pd.read_csv(
    "HJJ7JBCX2_ZIKV-s-and-c_18s004258-1-1_DREUX_lane1cloneD12_1_sequence.txt.assembled.fastq_mapped_AA.csv",
    na_values=" -nan")
clone15 = pd.read_csv(
    "HJJ7JBCX2_ZIKV-s-and-c_18s004258-1-1_DREUX_lane1cloneD15_1_sequence.txt.assembled.fastq_mapped_AA.csv",
    na_values=" -nan")
clone19 = pd.read_csv(
    "HJJ7JBCX2_ZIKV-s-and-c_18s004258-1-1_DREUX_lane1cloneD19_1_sequence.txt.assembled.fastq_mapped_AA.csv",
    na_values=" -nan")
for _table in (clone12, clone15, clone19):
    add_columns(_table)
In [7]:
# Control runs, replicate A.
DD18_A = pd.read_csv(
    "HJJ7JBCX2_ZIKV-s-and-c_18s004258-1-1_DREUX_lane1DD18A_1_sequence.txt.assembled.fastq_mapped_AA.csv",
    na_values=" -nan")
add_columns(DD18_A)
In [8]:
# Control runs, replicate D.
# Fix: na_values=" -nan" was missing here although this file comes from the
# same pipeline as the other samples; without it " -nan" entries stay strings
# and add_columns() computes the null / is_synonymous columns incorrectly.
DD18_D = pd.read_csv(
    "HJJ7JBCX2_ZIKV-s-and-c_18s004258-1-1_DREUX_lane1DD18D_1_sequence.txt.assembled.fastq_mapped_AA.csv",
    na_values=" -nan")
add_columns(DD18_D)
In [9]:
# Control runs, replicate E.
# Fix: na_values=" -nan" added for consistency with the clone/DD18_A reads;
# these files come from the same pipeline, and without it " -nan" entries stay
# strings and the null / is_synonymous columns are computed incorrectly.
DD3_E = pd.read_csv(
    "HJJ7JBCX2_ZIKV-s-and-c_18s004258-1-1_DREUX_lane1DD3E_1_sequence.txt.assembled.fastq_mapped_AA.csv",
    na_values=" -nan")
DD12_E = pd.read_csv(
    "HJJ7JBCX2_ZIKV-s-and-c_18s004258-1-1_DREUX_lane1DD12E_1_sequence.txt.assembled.fastq_mapped_AA.csv",
    na_values=" -nan")
DD18_E = pd.read_csv(
    "HJJ7JBCX2_ZIKV-s-and-c_18s004258-1-1_DREUX_lane1DD18E_1_sequence.txt.assembled.fastq_mapped_AA.csv",
    na_values=" -nan")
DD24_E = pd.read_csv(
    "HJJ7JBCX2_ZIKV-s-and-c_18s004258-1-1_DREUX_lane1DD24E_1_sequence.txt.assembled.fastq_mapped_AA.csv",
    na_values=" -nan")
DD24crude_E = pd.read_csv(
    "HJJ7JBCX2_ZIKV-s-and-c_18s004258-1-1_DREUX_lane1DD24Ecrude_1_sequence.txt.assembled.fastq_mapped_AA.csv",
    na_values=" -nan")
for _table in (DD3_E, DD12_E, DD18_E, DD24_E, DD24crude_E):
    add_columns(_table)
In [10]:
# TLR3 activation runs, replicate A.
# Fix: na_values=" -nan" added for consistency with the other sample reads;
# without it " -nan" entries stay strings and the null / is_synonymous
# columns are computed incorrectly.
TD18_A = pd.read_csv(
    "HJJ7JBCX2_ZIKV-s-and-c_18s004258-1-1_DREUX_lane1TD18A_1_sequence.txt.assembled.fastq_mapped_AA.csv",
    na_values=" -nan")
add_columns(TD18_A)
In [11]:
# TLR3 activation runs, replicate D.
# Fix: na_values=" -nan" added for consistency with the other sample reads;
# without it " -nan" entries stay strings and the null / is_synonymous
# columns are computed incorrectly.
TD9_D = pd.read_csv(
    "HJJ7JBCX2_ZIKV-s-and-c_18s004258-1-1_DREUX_lane1TD9D_1_sequence.txt.assembled.fastq_mapped_AA.csv",
    na_values=" -nan")
TD12_D = pd.read_csv(
    "HJJ7JBCX2_ZIKV-s-and-c_18s004258-1-1_DREUX_lane1TD12D_1_sequence.txt.assembled.fastq_mapped_AA.csv",
    na_values=" -nan")
TD18_D = pd.read_csv(
    "HJJ7JBCX2_ZIKV-s-and-c_18s004258-1-1_DREUX_lane1TD18D_1_sequence.txt.assembled.fastq_mapped_AA.csv",
    na_values=" -nan")
TD24_D = pd.read_csv(
    "HJJ7JBCX2_ZIKV-s-and-c_18s004258-1-1_DREUX_lane1TD24D_1_sequence.txt.assembled.fastq_mapped_AA.csv",
    na_values=" -nan")
for _table in (TD9_D, TD12_D, TD18_D, TD24_D):
    add_columns(_table)
In [12]:
# TLR3 activation runs, replicate E.
# Fix: na_values=" -nan" added for consistency with the other sample reads;
# without it " -nan" entries stay strings and the null / is_synonymous
# columns are computed incorrectly.
TD9_E = pd.read_csv(
    "HJJ7JBCX2_ZIKV-s-and-c_18s004258-1-1_DREUX_lane1TD9E_1_sequence.txt.assembled.fastq_mapped_AA.csv",
    na_values=" -nan")
TD12_E = pd.read_csv(
    "HJJ7JBCX2_ZIKV-s-and-c_18s004258-1-1_DREUX_lane1TD12E_1_sequence.txt.assembled.fastq_mapped_AA.csv",
    na_values=" -nan")
TD18_E = pd.read_csv(
    "HJJ7JBCX2_ZIKV-s-and-c_18s004258-1-1_DREUX_lane1TD18E_1_sequence.txt.assembled.fastq_mapped_AA.csv",
    na_values=" -nan")
TD24_E = pd.read_csv(
    "HJJ7JBCX2_ZIKV-s-and-c_18s004258-1-1_DREUX_lane1TD24E_1_sequence.txt.assembled.fastq_mapped_AA.csv",
    na_values=" -nan")
for _table in (TD9_E, TD12_E, TD18_E, TD24_E):
    add_columns(_table)
In [13]:
# Overlay the coverage of the three clones along the genome.
variable = 'Coverage'
_datasets = [(clone12, "Clone 12"), (clone15, "Clone 15"), (clone19, "Clone 19")]
sample = [label for table, label in _datasets for _ in range(len(table[variable]))]
overlay_table_concat = pd.DataFrame({
    'Position': pd.concat([table['Position'] for table, _ in _datasets]),
    variable: pd.concat([table[variable] for table, _ in _datasets]),
    'sample': sample,
})
# NOTE(review): size= is the old seaborn lmplot keyword (height= in newer
# versions); kept as-is to match the environment this notebook ran in.
sns.lmplot(x="Position", y=variable, data=overlay_table_concat, fit_reg=False,
           hue='sample', legend=False, size=7, aspect=2, lowess=True,
           scatter_kws={"s": 20})
plt.legend(loc='lower right')
plot_positions()
plot_genes()
The three clones show the same regions of low and high coverage.
In [14]:
# Overlay the major-variant frequency of the three clones along the genome.
variable = 'Major_variant_frequency'
_datasets = [(clone12, "Clone 12"), (clone15, "Clone 15"), (clone19, "Clone 19")]
sample = [label for table, label in _datasets for _ in range(len(table[variable]))]
overlay_table_concat = pd.DataFrame({
    'Position': pd.concat([table['Position'] for table, _ in _datasets]),
    variable: pd.concat([table[variable] for table, _ in _datasets]),
    'sample': sample,
})
sns.lmplot(x="Position", y=variable, data=overlay_table_concat, fit_reg=False,
           hue='sample', legend=False, size=7, aspect=2, lowess=True,
           scatter_kws={"s": 20})
plt.legend(loc='lower right')
plot_positions()
plot_genes()
In [15]:
# Report the positions whose minor variant rises strictly through time in
# the clone samples (days 12, 15, 19).
tables_clone = [clone12, clone15, clone19]
increasing_clone = get_increasing_variants(tables_clone)
print(f"There are {len(increasing_clone)} positions that rise in frequency.")
print("Those are:")
print(increasing_clone.keys())
In [16]:
# Overlay the coverage of all control samples along the genome.
variable = 'Coverage'
_datasets = [(DD18_A, "DD18_A"), (DD18_D, "DD18_D"), (DD3_E, "DD3_E"),
             (DD12_E, "DD12_E"), (DD18_E, "DD18_E"), (DD24_E, "DD24_E"),
             (DD24crude_E, "DD24crude_E")]
sample = [label for table, label in _datasets for _ in range(len(table[variable]))]
overlay_table_concat = pd.DataFrame({
    'Position': pd.concat([table['Position'] for table, _ in _datasets]),
    variable: pd.concat([table[variable] for table, _ in _datasets]),
    'sample': sample,
})
sns.lmplot(x="Position", y=variable, data=overlay_table_concat, fit_reg=False,
           hue='sample', legend=False, size=7, aspect=2, lowess=True,
           scatter_kws={"s": 20})
plt.legend(loc='lower right')
plot_positions()
plot_genes()
In [17]:
# Overlay the major-variant frequency of all control samples along the genome.
variable = 'Major_variant_frequency'
_datasets = [(DD18_A, "DD18_A"), (DD18_D, "DD18_D"), (DD3_E, "DD3_E"),
             (DD12_E, "DD12_E"), (DD18_E, "DD18_E"), (DD24_E, "DD24_E"),
             (DD24crude_E, "DD24crude_E")]
sample = [label for table, label in _datasets for _ in range(len(table[variable]))]
overlay_table_concat = pd.DataFrame({
    'Position': pd.concat([table['Position'] for table, _ in _datasets]),
    variable: pd.concat([table[variable] for table, _ in _datasets]),
    'sample': sample,
})
sns.lmplot(x="Position", y=variable, data=overlay_table_concat, fit_reg=False,
           hue='sample', legend=False, size=7, aspect=2, lowess=True,
           scatter_kws={"s": 20})
plt.legend(loc='lower right')
plot_positions()
plot_genes()
In [18]:
# Overlay the major-variant frequency of the replicate-E control time series.
variable = 'Major_variant_frequency'
_datasets = [(DD3_E, "DD3_E"), (DD12_E, "DD12_E"), (DD18_E, "DD18_E"),
             (DD24_E, "DD24_E"), (DD24crude_E, "DD24crude_E")]
sample = [label for table, label in _datasets for _ in range(len(table[variable]))]
overlay_table_concat = pd.DataFrame({
    'Position': pd.concat([table['Position'] for table, _ in _datasets]),
    variable: pd.concat([table[variable] for table, _ in _datasets]),
    'sample': sample,
})
sns.lmplot(x="Position", y=variable, data=overlay_table_concat, fit_reg=False,
           hue='sample', legend=False, size=7, aspect=2, lowess=True,
           scatter_kws={"s": 20})
plt.legend(loc='lower right')
plot_positions()
plot_genes()
In [19]:
# Report the positions whose minor variant rises strictly through time in
# the replicate-E control samples (days 3, 12, 18, 24).
tables_control = [DD3_E, DD12_E, DD18_E, DD24_E]
increasing_control = get_increasing_variants(tables_control)
print(f"There are {len(increasing_control)} positions that rise in frequency.")
print("Those are:")
print(increasing_control.keys())
In [20]:
# Overlay the coverage of all TLR3-activation samples along the genome.
variable = 'Coverage'
_datasets = [(TD18_A, "TD18_A"), (TD9_D, "TD9_D"), (TD12_D, "TD12_D"),
             (TD18_D, "TD18_D"), (TD24_D, "TD24_D"), (TD9_E, "TD9_E"),
             (TD12_E, "TD12_E"), (TD18_E, "TD18_E"), (TD24_E, "TD24_E")]
sample = [label for table, label in _datasets for _ in range(len(table[variable]))]
overlay_table_concat = pd.DataFrame({
    'Position': pd.concat([table['Position'] for table, _ in _datasets]),
    variable: pd.concat([table[variable] for table, _ in _datasets]),
    'sample': sample,
})
sns.lmplot(x="Position", y=variable, data=overlay_table_concat, fit_reg=False,
           hue='sample', legend=False, size=7, aspect=2, lowess=True,
           scatter_kws={"s": 20})
plt.legend(loc='lower right')
plot_positions()
plot_genes()