Here we plot the major base frequency for different time points. We also detect positions that consistently rise in frequency through time, i.e. positions whose frequency rises monotonically with time. Positions that rise in frequency only between the first and the last time points, without rising consistently at the intermediate time points, have not been investigated.
In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.ticker as mticker
from pylab import rcParams
import seaborn as sns
from array import array
import numpy as np
from scipy.stats import ttest_ind
from scipy.stats import linregress
from scipy.stats import mannwhitneyu
%matplotlib inline
In [2]:
# Parse mature-peptide annotations out of the GenBank file: collect the
# start/end coordinates of each "mat_peptide" feature and the qualifier
# value (gene name) found on the line that follows it.
begins = []
ends = []
names = []
with open("sequence.gb") as handle:
    inside_peptide = False
    for line in handle:
        if "mat_peptide" in line:
            coords = line.split()[1].split("..")
            begins.append(int(coords[0]))
            ends.append(int(coords[1]))
            inside_peptide = True
        elif inside_peptide:
            # The line right after a mat_peptide feature carries the
            # name qualifier, e.g. /product="capsid".
            names.append(line.split("=")[1])
            inside_peptide = False
print(begins)
print(ends)
print(names)
In [3]:
# Interesting positions
positions = [316, 1670, 1785, 2340, 5935, 7172, 8449, 9165]


def plot_positions():
    """Draw a dotted vertical line at each genomic position of interest."""
    for pos in positions:
        plt.axvline(x=pos, linewidth=1, linestyle=':')


def plot_genes():
    """Mark mature-peptide boundaries and label each with the first three
    characters of its name, alternating the label height so that
    neighbouring labels do not overlap."""
    for idx in range(len(begins)):
        plt.plot([begins[idx], begins[idx]], [0.99, 1.0],
                 linewidth=2, linestyle='-', color="black")
        label = (names[idx].replace('"', ''))[0:3]
        x_text = begins[idx] + ((ends[idx] - begins[idx]) / 10)
        y_text = 1.005 if idx % 2 == 0 else 1.015
        plt.text(x_text, y_text, label, size='xx-small')
    # Close off the last peptide with its end boundary.
    plt.plot([ends[-1], ends[-1]], [0.99, 1.0],
             linewidth=2, linestyle='-', color="black")
In [4]:
def synonymous(row):
    """Classify a table row as "synonymous" or "non-synonymous".

    A position is synonymous when there is no second base at all
    (row['null'] is True) or when the consensus and second-base amino
    acids are identical.
    """
    if row['null'] or row['Consensus_aa'] == row['Secondbase_aa']:
        return "synonymous"
    return "non-synonymous"


def add_columns(table):
    """Add derived columns to `table` in place.

    Adds:
      - 'null': True where no second base was observed (Secondbase_aa is NaN),
      - 'is_synonymous': per-row classification via synonymous(),
      - '1_major_variant_frequency': per-position diversity, i.e. one minus
        the quality-corrected major-variant frequency.
    """
    table['null'] = table['Secondbase_aa'].isnull()
    # Pass the function directly: wrapping it in `lambda row: synonymous(row)`
    # was redundant.
    table['is_synonymous'] = table.apply(synonymous, axis=1)
    table['1_major_variant_frequency'] = 1.0 - table['Major_variant_frequency_quality_corrected']
In [5]:
def is_increasing(minor_frequencies):
    """Return True if the sequence is strictly increasing.

    Equal consecutive values count as not increasing, matching the
    original strict `<` comparison. An empty or single-element sequence
    is vacuously increasing — the previous version raised IndexError on
    empty input.
    """
    # Strict pairwise comparison; NaN compares False, so any NaN entry
    # makes the sequence non-increasing, as before.
    return all(a < b for a, b in zip(minor_frequencies, minor_frequencies[1:]))
def get_variant_frequency(variant, table, i):
    """Frequency of base `variant` ("A", "C", "G" or "T") at row i.

    The denominator is the total quality-corrected count at the position,
    including Ns. Any other `variant` symbol yields np.nan.
    """
    counts = {
        "A": table["As_quality_corrected"][i],
        "C": table["Cs_quality_corrected"][i],
        "G": table["Gs_quality_corrected"][i],
        "T": table["Ts_quality_corrected"][i],
    }
    total = sum(counts.values()) + table["Ns_quality_corrected"][i]
    if variant in counts:
        return counts[variant] / total
    return np.nan
def get_increasing_variants(tables):
    """Find positions whose minor variant rises monotonically across `tables`.

    `tables` is a list of per-time-point data frames, ordered by time.
    For each position, the tracked "minor" variant is the base that is
    dominant (or second) at the LAST time point; the position is reported
    when that base's frequency strictly increases through every time point.

    Returns a dict mapping position -> [major_frequencies, minor_frequencies]
    (both plain lists, one entry per time point).
    """
    num_tables = len(tables)
    first = tables[0]
    last = tables[num_tables-1]
    major = ""
    minor = ""
    # Preallocated double arrays, reused (overwritten) for every position.
    major_frequencies = array('d',[0.0]*num_tables)
    minor_frequencies = array('d',[0.0]*num_tables)
    increasingVariants = dict()
    # NOTE(review): iterating over the 'Position' column and then using its
    # values as row labels (table[col][i]) assumes positions coincide with
    # the frame's index — confirm against the CSV layout.
    for i in first["Position"]:
        major = first["Major_variant"][i]
        #print(last['Major_variant_frequency_quality_corrected'][i])
        major_frequencies[0] = first['Major_variant_frequency_quality_corrected'][i]
        # The minor variant is whatever ends up on top (or second) at the
        # last time point.
        if major == last["Major_variant"][i]:
            minor = last["Second_variant"][i]
        else:
            minor = last["Major_variant"][i]
        minor_frequencies[0] = get_variant_frequency(minor, first, i)
        for table_id in range(1, num_tables):
            major_frequencies[table_id] = get_variant_frequency(major, tables[table_id], i)
            minor_frequencies[table_id] = get_variant_frequency(minor, tables[table_id], i)
        if is_increasing(minor_frequencies):
            # Store copies: the arrays above are reused on the next iteration.
            increasingVariants[i] = [major_frequencies.tolist(), minor_frequencies.tolist()]
    return increasingVariants
In [6]:
# Clone at days 12, 15, 19.
# na_values=" -nan": the upstream pipeline writes " -nan" for undefined
# values; treat those entries as missing (NaN) when parsing.
clone12 = pd.read_csv ("HJJ7JBCX2_ZIKV-s-and-c_18s004258-1-1_DREUX_lane1cloneD12_1_sequence.txt.assembled.fastq_mapped_AA.csv", na_values=" -nan")
clone15 = pd.read_csv ("HJJ7JBCX2_ZIKV-s-and-c_18s004258-1-1_DREUX_lane1cloneD15_1_sequence.txt.assembled.fastq_mapped_AA.csv", na_values=" -nan")
clone19 = pd.read_csv ("HJJ7JBCX2_ZIKV-s-and-c_18s004258-1-1_DREUX_lane1cloneD19_1_sequence.txt.assembled.fastq_mapped_AA.csv", na_values=" -nan")
add_columns(clone12)
add_columns(clone15)
add_columns(clone19)
In [7]:
# Control runs, replicate A
DD18_A = pd.read_csv ("HJJ7JBCX2_ZIKV-s-and-c_18s004258-1-1_DREUX_lane1DD18A_1_sequence.txt.assembled.fastq_mapped_AA.csv", na_values=" -nan")
add_columns(DD18_A)
In [8]:
# Control runs, replicate D
# NOTE(review): unlike the clone reads above, this read_csv omits
# na_values=" -nan", so any " -nan" entries stay as strings — confirm
# whether that is intentional.
DD18_D = pd.read_csv ("HJJ7JBCX2_ZIKV-s-and-c_18s004258-1-1_DREUX_lane1DD18D_1_sequence.txt.assembled.fastq_mapped_AA.csv")
add_columns(DD18_D)
In [9]:
# Control runs, replicate E
DD3_E = pd.read_csv ("HJJ7JBCX2_ZIKV-s-and-c_18s004258-1-1_DREUX_lane1DD3E_1_sequence.txt.assembled.fastq_mapped_AA.csv")
DD12_E = pd.read_csv ("HJJ7JBCX2_ZIKV-s-and-c_18s004258-1-1_DREUX_lane1DD12E_1_sequence.txt.assembled.fastq_mapped_AA.csv")
DD18_E = pd.read_csv ("HJJ7JBCX2_ZIKV-s-and-c_18s004258-1-1_DREUX_lane1DD18E_1_sequence.txt.assembled.fastq_mapped_AA.csv")
DD24_E = pd.read_csv ("HJJ7JBCX2_ZIKV-s-and-c_18s004258-1-1_DREUX_lane1DD24E_1_sequence.txt.assembled.fastq_mapped_AA.csv")
DD24crude_E = pd.read_csv ("HJJ7JBCX2_ZIKV-s-and-c_18s004258-1-1_DREUX_lane1DD24Ecrude_1_sequence.txt.assembled.fastq_mapped_AA.csv")
add_columns(DD3_E)
add_columns(DD12_E)
add_columns(DD18_E)
add_columns(DD24_E)
add_columns(DD24crude_E)
In [10]:
# TLR3 activation runs, replicate A
TD18_A = pd.read_csv ("HJJ7JBCX2_ZIKV-s-and-c_18s004258-1-1_DREUX_lane1TD18A_1_sequence.txt.assembled.fastq_mapped_AA.csv")
add_columns(TD18_A)
In [11]:
# TLR3 activation runs, replicate D
TD9_D = pd.read_csv ("HJJ7JBCX2_ZIKV-s-and-c_18s004258-1-1_DREUX_lane1TD9D_1_sequence.txt.assembled.fastq_mapped_AA.csv")
TD12_D = pd.read_csv ("HJJ7JBCX2_ZIKV-s-and-c_18s004258-1-1_DREUX_lane1TD12D_1_sequence.txt.assembled.fastq_mapped_AA.csv")
TD18_D = pd.read_csv ("HJJ7JBCX2_ZIKV-s-and-c_18s004258-1-1_DREUX_lane1TD18D_1_sequence.txt.assembled.fastq_mapped_AA.csv")
TD24_D = pd.read_csv ("HJJ7JBCX2_ZIKV-s-and-c_18s004258-1-1_DREUX_lane1TD24D_1_sequence.txt.assembled.fastq_mapped_AA.csv")
add_columns(TD9_D)
add_columns(TD12_D)
add_columns(TD18_D)
add_columns(TD24_D)
In [12]:
# TLR3 activation runs, replicate E
TD9_E = pd.read_csv ("HJJ7JBCX2_ZIKV-s-and-c_18s004258-1-1_DREUX_lane1TD9E_1_sequence.txt.assembled.fastq_mapped_AA.csv")
TD12_E = pd.read_csv ("HJJ7JBCX2_ZIKV-s-and-c_18s004258-1-1_DREUX_lane1TD12E_1_sequence.txt.assembled.fastq_mapped_AA.csv")
TD18_E = pd.read_csv ("HJJ7JBCX2_ZIKV-s-and-c_18s004258-1-1_DREUX_lane1TD18E_1_sequence.txt.assembled.fastq_mapped_AA.csv")
TD24_E = pd.read_csv ("HJJ7JBCX2_ZIKV-s-and-c_18s004258-1-1_DREUX_lane1TD24E_1_sequence.txt.assembled.fastq_mapped_AA.csv")
add_columns(TD9_E)
add_columns(TD12_E)
add_columns(TD18_E)
add_columns(TD24_E)
In [13]:
variable = 'Coverage'
sample = len(clone12 [variable])*["Clone 12"]+len(clone15 [variable])*["Clone 15"]+len(clone19 [variable])*["Clone 19"]
overlay_table_concat = pd.DataFrame ({'Position':pd.concat([clone12['Position'],clone15['Position'],clone19['Position']]), variable:pd.concat([clone12 [variable],clone15 [variable], clone19 [variable] ]), 'sample':sample})
sns.lmplot( x="Position", y=variable, data=overlay_table_concat, fit_reg=False, hue='sample', legend=False, size=7, aspect=2, lowess=True,scatter_kws={"s": 20})
plt.legend(loc='lower right')
plot_positions()
plot_genes()
Basically, we find the same areas of low and high coverage.
In [14]:
variable = 'Major_variant_frequency'
sample = len(clone12 [variable])*["Clone 12"]+len(clone15 [variable])*["Clone 15"]+len(clone19 [variable])*["Clone 19"]
overlay_table_concat = pd.DataFrame ({'Position':pd.concat([clone12['Position'],clone15['Position'],clone19['Position']]), variable:pd.concat([clone12 [variable],clone15 [variable], clone19 [variable] ]), 'sample':sample})
sns.lmplot( x="Position", y=variable, data=overlay_table_concat, fit_reg=False, hue='sample', legend=False, size=7, aspect=2, lowess=True,scatter_kws={"s": 20})
plt.legend(loc='lower right')
plot_positions()
plot_genes()
In [15]:
tables_clone = [clone12, clone15, clone19]
increasing_clone = get_increasing_variants(tables_clone)
print("There are "+str(len(increasing_clone))+" positions that rise in frequency.")
print("Those are:")
print(increasing_clone.keys())
In [16]:
variable = 'Coverage'
sample = len(DD18_A [variable])*["DD18_A"]+len(DD18_D [variable])*["DD18_D"]+len(DD3_E [variable])*["DD3_E"]+len(DD12_E [variable])*["DD12_E"]+len(DD18_E [variable])*["DD18_E"]+len(DD24_E [variable])*["DD24_E"]+len(DD24crude_E [variable])*["DD24crude_E"]
overlay_table_concat = pd.DataFrame ({'Position':pd.concat([DD18_A['Position'],DD18_D['Position'],DD3_E['Position'],DD12_E['Position'],DD18_E['Position'],DD24_E['Position'],DD24crude_E['Position']]), variable:pd.concat([DD18_A [variable], DD18_D [variable], DD3_E [variable], DD12_E [variable], DD18_E [variable], DD24_E [variable], DD24crude_E [variable] ]), 'sample':sample})
sns.lmplot( x="Position", y=variable, data=overlay_table_concat, fit_reg=False, hue='sample', legend=False, size=7, aspect=2, lowess=True,scatter_kws={"s": 20})
plt.legend(loc='lower right')
plot_positions()
plot_genes()
In [17]:
variable = 'Major_variant_frequency'
sample = len(DD18_A [variable])*["DD18_A"]+len(DD18_D [variable])*["DD18_D"]+len(DD3_E [variable])*["DD3_E"]+len(DD12_E [variable])*["DD12_E"]+len(DD18_E [variable])*["DD18_E"]+len(DD24_E [variable])*["DD24_E"]+len(DD24crude_E [variable])*["DD24crude_E"]
overlay_table_concat = pd.DataFrame ({'Position':pd.concat([DD18_A['Position'],DD18_D['Position'],DD3_E['Position'],DD12_E['Position'],DD18_E['Position'],DD24_E['Position'],DD24crude_E['Position']]), variable:pd.concat([DD18_A [variable], DD18_D [variable], DD3_E [variable], DD12_E [variable], DD18_E [variable], DD24_E [variable], DD24crude_E [variable] ]), 'sample':sample})
sns.lmplot( x="Position", y=variable, data=overlay_table_concat, fit_reg=False, hue='sample', legend=False, size=7, aspect=2, lowess=True,scatter_kws={"s": 20})
plt.legend(loc='lower right')
plot_positions()
plot_genes()
In [18]:
variable = 'Major_variant_frequency'
sample = len(DD3_E [variable])*["DD3_E"]+len(DD12_E [variable])*["DD12_E"]+len(DD18_E [variable])*["DD18_E"]+len(DD24_E [variable])*["DD24_E"]+len(DD24crude_E [variable])*["DD24crude_E"]
overlay_table_concat = pd.DataFrame ({'Position':pd.concat([DD3_E['Position'],DD12_E['Position'],DD18_E['Position'],DD24_E['Position'],DD24crude_E['Position']]), variable:pd.concat([DD3_E [variable], DD12_E [variable], DD18_E [variable], DD24_E [variable], DD24crude_E [variable] ]), 'sample':sample})
sns.lmplot( x="Position", y=variable, data=overlay_table_concat, fit_reg=False, hue='sample', legend=False, size=7, aspect=2, lowess=True,scatter_kws={"s": 20})
plt.legend(loc='lower right')
plot_positions()
plot_genes()
In [19]:
tables_control = [DD3_E, DD12_E, DD18_E, DD24_E]
increasing_control = get_increasing_variants(tables_control)
print("There are "+str(len(increasing_control))+" positions that rise in frequency.")
print("Those are:")
print(increasing_control.keys())
In [20]:
variable = 'Coverage'
sample = len(TD18_A [variable])*["TD18_A"]+len(TD9_D [variable])*["TD9_D"]+len(TD12_D [variable])*["TD12_D"]+len(TD18_D [variable])*["TD18_D"]+len(TD24_D [variable])*["TD24_D"]+len(TD9_E [variable])*["TD9_E"]+len(TD12_E [variable])*["TD12_E"]+len(TD18_E [variable])*["TD18_E"]+len(TD24_E [variable])*["TD24_E"]
overlay_table_concat = pd.DataFrame ({'Position':pd.concat([TD18_A['Position'],TD9_D['Position'],TD12_D['Position'],TD18_D['Position'],TD24_D['Position'],TD9_E['Position'],TD12_E['Position'],TD18_E['Position'],TD24_E['Position']]), variable:pd.concat([TD18_A[variable],TD9_D[variable],TD12_D[variable],TD18_D[variable],TD24_D[variable],TD9_E[variable],TD12_E[variable],TD18_E[variable],TD24_E[variable] ]), 'sample':sample})
sns.lmplot( x="Position", y=variable, data=overlay_table_concat, fit_reg=False, hue='sample', legend=False, size=7, aspect=2, lowess=True,scatter_kws={"s": 20})
plt.legend(loc='lower right')
plot_positions()
plot_genes()
In [21]:
variable = 'Major_variant_frequency'
sample = len(TD18_A [variable])*["TD18_A"]+len(TD9_D [variable])*["TD9_D"]+len(TD12_D [variable])*["TD12_D"]+len(TD18_D [variable])*["TD18_D"]+len(TD24_D [variable])*["TD24_D"]+len(TD9_E [variable])*["TD9_E"]+len(TD12_E [variable])*["TD12_E"]+len(TD18_E [variable])*["TD18_E"]+len(TD24_E [variable])*["TD24_E"]
overlay_table_concat = pd.DataFrame ({'Position':pd.concat([TD18_A['Position'],TD9_D['Position'],TD12_D['Position'],TD18_D['Position'],TD24_D['Position'],TD9_E['Position'],TD12_E['Position'],TD18_E['Position'],TD24_E['Position']]), variable:pd.concat([TD18_A[variable],TD9_D[variable],TD12_D[variable],TD18_D[variable],TD24_D[variable],TD9_E[variable],TD12_E[variable],TD18_E[variable],TD24_E[variable] ]), 'sample':sample})
sns.lmplot( x="Position", y=variable, data=overlay_table_concat, fit_reg=False, hue='sample', legend=False, size=7, aspect=2, lowess=True,scatter_kws={"s": 20})
plt.legend(loc='lower right')
plot_positions()
plot_genes()
In [22]:
tables_TD = [TD9_D, TD12_D, TD18_D, TD24_D]
increasing_TD = get_increasing_variants(tables_TD)
print("There are "+str(len(increasing_TD))+" positions that rise in frequency.")
print("Those are:")
print(increasing_TD.keys())
In [23]:
tables_TE = [TD9_E, TD12_E, TD18_E, TD24_E]
increasing_TE = get_increasing_variants(tables_TE)
print("There are "+str(len(increasing_TE))+" positions that rise in frequency.")
print("Those are:")
print(increasing_TE.keys())
In [40]:
# Collect, from every experiment, the rows at four positions of interest
# (316, 2340, 7172, 9165) and export them all to a single CSV file.
toVincent = pd.DataFrame()
allExps = [clone12,clone15,clone19,DD18_A,DD18_D,DD3_E,DD12_E,DD18_E,DD24_E,DD24crude_E,TD18_A,TD9_D,TD12_D,TD18_D,TD24_D,TD9_E,TD12_E,TD18_E,TD24_E]
print(len(allExps))
frames = []
for d in allExps:
    # print(d.loc[2340])
    # print(d.loc[7172])
    temp_316 = d[d['Position']==316]
    temp_2340 = d[d['Position']==2340]
    temp_7172 = d[d['Position']==7172]
    temp_9165 = d[d['Position']==9165]
    frames.extend([temp_316, temp_2340, temp_7172, temp_9165])
# One concat over the collected frames instead of one pd.concat per
# selection: concatenating inside the loop re-copies the accumulated frame
# every time (quadratic in the number of experiments).
toVincent = pd.concat(frames)
# Four rows (one per position) were taken from each experiment, so repeat
# each experiment name four times, in the same order as the rows above.
# (The previous version first built a flat name list and then immediately
# overwrote it — that dead assignment is removed.)
expnames = [name
            for name in ["clone12","clone15","clone19","DD18_A","DD18_D","DD3_E","DD12_E","DD18_E","DD24_E","DD24crude_E","TD18_A","TD9_D","TD12_D","TD18_D","TD24_D","TD9_E","TD12_E","TD18_E","TD24_E"]
            for _ in range(4)]
print(len(expnames))
toVincent['expName']=expnames
toVincent.to_csv( "pos_316_2340_7172_9165_2ndLibrary.csv")
# Debug leftover kept for output parity: shows the position-7172 rows of the
# last experiment only.
print(temp_7172)
In [13]:
sns.lmplot( x="Position", y="Major_variant_frequency_quality_corrected", data=cirseq, fit_reg=False, legend=False, size=7, aspect=2, lowess=True,scatter_kws={"s": 20})
plt.legend(loc='lower right')
plot_positions()
plot_genes()
In [14]:
cirseq['Position'][cirseq['Major_variant_frequency_quality_corrected']<0.95]
Out[14]:
Position 1785, which we already flagged above, appears here as well... What are its states?
In [15]:
print("Position 1785: ")
print ("Major variant: "+cirseq['Major_variant'][1785])
print ("Second variant: "+cirseq['Second_variant'][1785])
print ("Major variant, aa: "+cirseq['consensus_aa'][1785])
print ("Second variant, aa: "+cirseq['secondbase_aa'][1785])
print("Frequency of the major variant: "+str(cirseq['Major_variant_frequency_quality_corrected'][1785]))
print("Frequency of the second variant: "+str(cirseq['Second_variant_frequency_quality_corrected'][1785]))
In [16]:
cirseq['1_major_variant_frequency'] = 1.0 - cirseq['Major_variant_frequency_quality_corrected']
#f, ax = plt.subplots(figsize=(10, 7))
#ax.set(yscale="log")
lm=sns.lmplot( x="Position", y="1_major_variant_frequency", data=cirseq, fit_reg=False, legend=False, size=7, aspect=2, scatter_kws={"s": 10})
axes = lm.axes
axes[0,0].set_ylim(0.001,0.05)
axes[0,0].set(yscale="log")
plt.legend(loc='lower right')
Here the distribution of diversity seems unimodal.
In [17]:
table_for_correlations = pd.DataFrame ({'51A_Cov':DD51_A ['Coverage'],'51ANR_Cov':DD51_A_no_reamp['Coverage'], '51A_MV':DD51_A ['Major_variant_frequency_quality_corrected'], '51ANR_MV':DD51_A_no_reamp ['Major_variant_frequency_quality_corrected'] })
table_for_correlations_highCov = table_for_correlations.loc[ (table_for_correlations['51A_Cov'] > 1000) & (table_for_correlations['51ANR_Cov'] > 1000) ]
sns.pairplot(table_for_correlations_highCov, vars=['51A_Cov','51ANR_Cov','51A_MV','51ANR_MV'], kind="scatter")
Out[17]:
In [18]:
table_for_correlations_highFreq = table_for_correlations.loc[ (table_for_correlations['51A_MV'] > 0.1) & (table_for_correlations['51ANR_MV'] > 0.1) ]
lm=sns.lmplot(x='51A_MV',y='51ANR_MV', data=table_for_correlations_highFreq)
axes = lm.axes
#axes[0,0].set_ylim(0.001,0.05)
#axes[0,0].set(yscale="log", xscale="log")
res=linregress(table_for_correlations_highFreq['51A_MV'], table_for_correlations_highFreq['51ANR_MV'])
print(res)
There is a correlation between the frequencies of the major variants in the two libraries, but the correlation coefficient is not great (0.55).
In [19]:
table_for_correlations_highFreq = table_for_correlations.loc[ (table_for_correlations['51ANR_MV'] > 0.9) ]
lm=sns.lmplot(x='51A_MV',y='51ANR_MV', data=table_for_correlations_highFreq)
axes = lm.axes
#axes[0,0].set_ylim(0.001,0.05)
#axes[0,0].set(yscale="log")
res=linregress(table_for_correlations_highFreq['51A_MV'], table_for_correlations_highFreq['51ANR_MV'])
print(res)
The correlation is not great for positions where there is little variation. It is also surprising to see that lots of positions with frequencies between 0.9 and 1.0 in the non-reamplified library have a frequency of nearly 1.0 in the reamplified one. This seems to suggest that the amplification by PCR overly amplified the dominant variant in lots of cases.
In [20]:
table_for_correlations = pd.DataFrame ({'51A_Cov':DD51_A ['Coverage'],'51ANR_Cov':DD51_A_no_reamp['Coverage'], '51A_SV':DD51_A ['Second_variant_frequency_quality_corrected'], '51ANR_SV':DD51_A_no_reamp ['Second_variant_frequency_quality_corrected'] })
table_for_correlations_highCov = table_for_correlations.loc[ (table_for_correlations['51A_Cov'] > 1000) & (table_for_correlations['51ANR_Cov'] > 1000) ]
In [21]:
#table_for_correlations_highFreq = table_for_correlations.loc[ (table_for_correlations['51A_MV'] > 0.95) ]
lm=sns.lmplot(x='51A_SV',y='51ANR_SV', data=table_for_correlations_highCov)
axes = lm.axes
#axes[0,0].set_ylim(0.001,0.05)
#axes[0,0].set(yscale="log")
res=linregress(table_for_correlations_highCov['51A_SV'], table_for_correlations_highCov['51ANR_SV'])
print(res)
The correlation between the frequencies of the second most frequent variants is not too bad (correlation coefficient 0.63).
Positions that differ in the most frequent base:
In [22]:
DD51_A ['Position'][DD51_A ['Major_variant'] != DD51_A_no_reamp ['Major_variant']]
Out[22]:
Number of positions that differ in the second most frequent base:
In [23]:
len(DD51_A ['Position'][DD51_A ['Second_variant'] != DD51_A_no_reamp ['Second_variant']])
Out[23]:
In [24]:
sns.kdeplot(DD51_A ['Second_variant_frequency_quality_corrected'][DD51_A ['Second_variant'] != DD51_A_no_reamp ['Second_variant']])
Out[24]:
In [25]:
(DD51_A ['Second_variant_frequency_quality_corrected'][DD51_A ['Second_variant'] != DD51_A_no_reamp ['Second_variant']]).describe()
Out[25]:
The second most frequent variants that differ between the reamplified and the non-reamplified library usually have a low frequency; for the more frequent variants, the agreement is not too bad.
In [26]:
DD51_A_no_reamp['1_major_variant_frequency'] = 1.0 - DD51_A_no_reamp['Major_variant_frequency_quality_corrected']
#f, ax = plt.subplots(figsize=(10, 7))
#ax.set(yscale="log")
lm=sns.lmplot( x="Position", y="1_major_variant_frequency", data=DD51_A_no_reamp, fit_reg=False, legend=False, size=7, aspect=2, scatter_kws={"s": 10})
axes = lm.axes
axes[0,0].set_ylim(0.001,0.05)
axes[0,0].set(yscale="log")
plt.legend(loc='lower right')
In [27]:
DD51_A['1_major_variant_frequency'] = 1.0 - DD51_A['Major_variant_frequency_quality_corrected']
#f, ax = plt.subplots(figsize=(10, 7))
#ax.set(yscale="log")
lm=sns.lmplot( x="Position", y="1_major_variant_frequency", data=DD51_A, fit_reg=False, legend=False, size=7, aspect=2, scatter_kws={"s": 10})
axes = lm.axes
axes[0,0].set_ylim(0.001,0.05)
axes[0,0].set(yscale="log")
plt.legend(loc='lower right')
In [28]:
f, ax = plt.subplots(figsize=(15, 7))
lm=sns.kdeplot(np.log(DD51_A["1_major_variant_frequency"]), label="DD51_A")
axes = lm.axes
axes.set_xlim(-8,-4)
lm=sns.kdeplot(np.log(DD51_A_no_reamp["1_major_variant_frequency"]), label="DD51_A_no_reamp")
In [29]:
tables_A = [DD3_A, DD6_A, DD9_A, DD12_A, DD24_A, DD51_A]
increasing_A = get_increasing_variants(tables_A)
print("There are "+str(len(increasing_A))+" positions that rise in frequency.")
print("Those are:")
print(increasing_A.keys())
In [30]:
tables_D = [DD3_D, DD6_D, DD9_D, DD12_D, DD24_D]
increasing_D = get_increasing_variants(tables_D)
print("There are "+str(len(increasing_D))+" positions that rise in frequency.")
print("Those are:")
print(increasing_D.keys())
In [31]:
tables_E = [DD6_E, DD9_E]
increasing_E = get_increasing_variants(tables_E)
print("There are "+str(len(increasing_E))+" positions that rise in frequency.")
print("There are too many of them, we choose not to print them.")
In [32]:
tables_TA = [TD9_A, TD12_A, TD24_A, TD51_A]
increasing_TA = get_increasing_variants(tables_TA)
print("There are "+str(len(increasing_TA))+" positions that rise in frequency.")
print("Those are:")
print(increasing_TA.keys())
Here we plot the frequency of the major variant at day 51 in the control experiment, A. We color the dots according to whether they were consistently rising in frequency or not.
In [33]:
sns.set_palette("hls")
sns.set_context("poster")
In [34]:
# Color each position by whether its minor variant rose consistently in
# frequency (membership in increasing_A) in control replicate A.
increasing_A_keys = increasing_A.keys()
# NOTE(review): this rebinds the module-level name `is_increasing`,
# shadowing the helper function of the same name defined earlier. Safe only
# because get_increasing_variants is not called again after this cell —
# consider renaming the list.
is_increasing = []
for i in DD51_A['Position']:
    if i in increasing_A_keys:
        is_increasing.append("Increasing")
    else:
        is_increasing.append("Not increasing")
to_plot = pd.DataFrame ({'Position':DD51_A['Position'], 'Major_variant_frequency_quality_corrected':DD51_A ['Major_variant_frequency_quality_corrected'],'Is_increasing':is_increasing})
sns.lmplot( x="Position", y="Major_variant_frequency_quality_corrected", data=to_plot, fit_reg=False, hue='Is_increasing', legend=False, size=7, aspect=2)
#plt.legend(loc='lower right')
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
plot_positions()
#plot_genes()
#plot_genes()
Here we plot the increase in frequency of the second variant, over the course of the entire experiment, in the different replicates.
In [35]:
def plot_increase_per_increasing_position(increasing_list, last_table):
    """Plot, per position, the total frequency gain of the rising minor
    variant (last time point minus first); positions that did not rise
    consistently are plotted at 0.0 and colored separately."""
    rising = increasing_list.keys()
    gains = []
    # Named `status` rather than `is_increasing` to avoid shadowing the
    # module-level helper function of that name.
    status = []
    for pos in last_table['Position']:
        if pos in rising:
            minor_freqs = increasing_list[pos][1]
            gains.append(minor_freqs[-1] - minor_freqs[0])
            status.append("Increasing")
        else:
            gains.append(0.0)
            status.append("Not increasing")
    to_plot = pd.DataFrame({'Position': last_table['Position'],
                            'Increase': gains,
                            'Is_increasing': status})
    sns.lmplot(x="Position", y="Increase", data=to_plot, fit_reg=False,
               hue='Is_increasing', legend=False, size=7, aspect=2)
    plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
    plot_positions()
    return
In [36]:
plot_increase_per_increasing_position (increasing_A, DD51_A)
In [37]:
plot_increase_per_increasing_position (increasing_D, DD24_D)
In [38]:
plot_increase_per_increasing_position (increasing_E, DD9_E)
In [39]:
plot_increase_per_increasing_position (increasing_TA, TD51_A)
Here we plot the frequencies of the major variants, for the different time points of an experiment.
In [40]:
overlay_table = pd.DataFrame ({'Position':DD3_A['Position'], 'DD3_A':DD3_A ['Major_variant_frequency_quality_corrected'], 'DD6_A':DD6_A ['Major_variant_frequency_quality_corrected'],'DD9_A':DD9_A ['Major_variant_frequency_quality_corrected'],'DD12_A':DD12_A ['Major_variant_frequency_quality_corrected'], 'DD24_A':DD24_A ['Major_variant_frequency_quality_corrected'], 'DD51_A':DD51_A ['Major_variant_frequency_quality_corrected']})
siz = len(DD3_A ['Position'])
sample = siz*["DD3A"]+siz*["DD6A"]+siz*["DD9A"]+siz*["DD12A"]+siz*["DD24A"]+siz*["DD51A"]
overlay_table_concat = pd.DataFrame ({'Position':pd.concat([DD3_A['Position'],DD6_A['Position'],DD9_A['Position'],DD12_A['Position'],DD24_A['Position'],DD51_A['Position']]), 'Major_variant_frequency_quality_corrected':pd.concat([DD3_A ['Major_variant_frequency_quality_corrected'],DD6_A ['Major_variant_frequency_quality_corrected'], DD9_A ['Major_variant_frequency_quality_corrected'],DD12_A ['Major_variant_frequency_quality_corrected'], DD24_A ['Major_variant_frequency_quality_corrected'], DD51_A ['Major_variant_frequency_quality_corrected'] ]), 'sample':sample})
In [41]:
sns.lmplot( x="Position", y="Major_variant_frequency_quality_corrected", data=overlay_table_concat, fit_reg=False, hue='sample', legend=False, size=7, aspect=2)
#plt.legend(loc='lower right')
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
plt.ylabel("Frequency of major variant")
plot_positions()
#plot_genes()
Same thing, smaller dots
In [42]:
sns.lmplot( x="Position", y="Major_variant_frequency_quality_corrected", data=overlay_table_concat, fit_reg=False, hue='sample', legend=False, size=7, aspect=2, scatter_kws={"s": 10})
#plt.legend(loc='lower right')
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
plt.ylabel("Frequency of major variant")
plot_positions()
#plot_genes()
In [43]:
overlay_table_concat_no51 = overlay_table_concat.loc[overlay_table_concat['sample']!= "DD51A"]
sns.lmplot( x="Position", y="Major_variant_frequency_quality_corrected", data= overlay_table_concat_no51, fit_reg=False, hue='sample', legend=False, size=7, aspect=2)
#plt.legend(loc='lower right')
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
plt.ylabel("Frequency of major variant")
plot_positions()
#plot_genes()
Here we want to know if over all positions, the diversity has increased between day 3 and day 51.
In [44]:
overlay_table_concat['1_major_variant_frequency'] = 1.0 - overlay_table_concat['Major_variant_frequency_quality_corrected']
#f, ax = plt.subplots(figsize=(10, 7))
#ax.set(yscale="log")
overlay_table_concat_nointer = overlay_table_concat.loc[ (overlay_table_concat['sample']!= "DD24A") & (overlay_table_concat['sample']!= "DD12A") & (overlay_table_concat['sample']!= "DD9A") & (overlay_table_concat['sample']!= "DD6A")]
lm=sns.lmplot( x="Position", y="1_major_variant_frequency", data=overlay_table_concat_nointer, fit_reg=False, hue='sample', legend=False, size=7, aspect=2, scatter_kws={"s": 10})
axes = lm.axes
axes[0,0].set_ylim(0.001,0.05)
axes[0,0].set(yscale="log")
#plt.legend(loc='lower right')
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
plot_positions()
#plot_genes()
In [45]:
overlay_table_concat_nointer = overlay_table_concat.loc[ (overlay_table_concat['sample']!= "DD51A") &(overlay_table_concat['sample']!= "DD24A") & (overlay_table_concat['sample']!= "DD12A") & (overlay_table_concat['sample']!= "DD9A") & (overlay_table_concat['sample']!= "DD6A")]
lm=sns.lmplot( x="Position", y="1_major_variant_frequency", data=overlay_table_concat_nointer, fit_reg=False, hue='sample', legend=False, size=7, aspect=2, scatter_kws={"s": 10})
axes = lm.axes
axes[0,0].set_ylim(0.001,0.05)
axes[0,0].set(yscale="log")
#plt.legend(loc='lower right')
#plot_positions()
#plot_genes()
Out[45]:
In [46]:
overlay_table_concat_nointer = overlay_table_concat.loc[ (overlay_table_concat['sample']!= "DD24A") & (overlay_table_concat['sample']!= "DD12A") & (overlay_table_concat['sample']!= "DD9A") & (overlay_table_concat['sample']!= "DD6A")]
f, ax = plt.subplots(figsize=(10, 7))
ax.set(yscale="log")
lm=sns.violinplot(x='sample', y="1_major_variant_frequency", data=overlay_table_concat_nointer, hue='sample', ax=ax, bw=0.2)
axes = lm.axes
axes.set_ylim(0.001,0.05)
axes.set(yscale="log")
medians = overlay_table_concat_nointer.groupby(['sample'])['1_major_variant_frequency'].median().values
plt.axhline(y=medians[0], linewidth=1, linestyle=':')
plt.axhline(y=medians[1], linewidth=1, linestyle=':')
print(medians)
syn = overlay_table_concat_nointer['1_major_variant_frequency'][overlay_table_concat_nointer['sample']=="DD3A"]
nonsyn = overlay_table_concat_nointer['1_major_variant_frequency'][overlay_table_concat_nointer['sample']=="DD51A"]
print("T-test p-value: "+str(ttest_ind(syn, nonsyn)[1]))
print((syn).median())
print((nonsyn).median())
# giving title to the plot
plt.title("Diversity through time");
In [47]:
f, ax = plt.subplots(figsize=(15, 7))
temp = overlay_table_concat.loc[ (overlay_table_concat['sample']!= "DD51A") & (overlay_table_concat['sample']!= "DD24A") & (overlay_table_concat['sample']!= "DD12A") & (overlay_table_concat['sample']!= "DD9A") & (overlay_table_concat['sample']!= "DD6A")]
del temp['sample']
del temp['Major_variant_frequency_quality_corrected']
del temp['Position']
#print(temp.describe())
#ax = sns.kdeplot("1_major_variant_frequency", data=temp)
lm=sns.kdeplot(np.log(temp["1_major_variant_frequency"]), label="DD3A")
lm.set_ylabel('Density')
lm.set_xlabel('Logarithm of the diversity per position')
axes = lm.axes
axes.set_xlim(-8,-4)
temp = overlay_table_concat.loc[ (overlay_table_concat['sample']!= "DD51A") & (overlay_table_concat['sample']!= "DD24A") & (overlay_table_concat['sample']!= "DD12A") & (overlay_table_concat['sample']!= "DD9A") & (overlay_table_concat['sample']!= "DD3A")]
del temp['sample']
del temp['Major_variant_frequency_quality_corrected']
del temp['Position']
#print(temp.describe())
# --- Overlaid KDEs of log-diversity, one curve per time point (continued) ---
# Repeated pattern: (1) keep the rows of a single sample by excluding every
# other sample name, (2) delete the columns that are not plotted, (3) overlay
# a KDE of log(1 - major variant frequency) labelled with the sample name.
# NOTE(review): `del temp[...]` on a .loc selection can raise pandas'
# SettingWithCopyWarning — confirm the pandas version tolerates this.
#ax = sns.kdeplot("1_major_variant_frequency", data=temp)
lm=sns.kdeplot(np.log(temp["1_major_variant_frequency"]), label="DD6A")
# Keep only DD9A rows.
temp = overlay_table_concat.loc[ (overlay_table_concat['sample']!= "DD51A") & (overlay_table_concat['sample']!= "DD24A") & (overlay_table_concat['sample']!= "DD12A") & (overlay_table_concat['sample']!= "DD6A") & (overlay_table_concat['sample']!= "DD3A")]
del temp['sample']
del temp['Major_variant_frequency_quality_corrected']
del temp['Position']
#print(temp.describe())
#ax = sns.kdeplot("1_major_variant_frequency", data=temp)
lm=sns.kdeplot(np.log(temp["1_major_variant_frequency"]), label="DD9A")
# Keep only DD12A rows.
temp = overlay_table_concat.loc[ (overlay_table_concat['sample']!= "DD51A") & (overlay_table_concat['sample']!= "DD24A") & (overlay_table_concat['sample']!= "DD9A") & (overlay_table_concat['sample']!= "DD6A") & (overlay_table_concat['sample']!= "DD3A")]
del temp['sample']
del temp['Major_variant_frequency_quality_corrected']
del temp['Position']
#print(temp.describe())
#ax = sns.kdeplot("1_major_variant_frequency", data=temp)
lm=sns.kdeplot(np.log(temp["1_major_variant_frequency"]), label="DD12A")
# Keep only DD24A rows.
temp = overlay_table_concat.loc[ (overlay_table_concat['sample']!= "DD51A") & (overlay_table_concat['sample']!= "DD12A") & (overlay_table_concat['sample']!= "DD9A") & (overlay_table_concat['sample']!= "DD6A") & (overlay_table_concat['sample']!= "DD3A")]
del temp['sample']
del temp['Major_variant_frequency_quality_corrected']
del temp['Position']
#print(temp.describe())
#ax = sns.kdeplot("1_major_variant_frequency", data=temp)
lm=sns.kdeplot(np.log(temp["1_major_variant_frequency"]), label="DD24A")
# Keep only DD51A rows.
temp = overlay_table_concat.loc[ (overlay_table_concat['sample']!= "DD24A") & (overlay_table_concat['sample']!= "DD12A") & (overlay_table_concat['sample']!= "DD9A") & (overlay_table_concat['sample']!= "DD6A") & (overlay_table_concat['sample']!= "DD3A")]
del temp['sample']
del temp['Major_variant_frequency_quality_corrected']
del temp['Position']
#print(temp.describe())
#ax = sns.kdeplot("1_major_variant_frequency", data=temp)
lm=sns.kdeplot(np.log(temp["1_major_variant_frequency"]), label="DD51A")
In [48]:
# Build wide (one column per sample) and long (sample column) tables of the
# major-variant frequency for the replicate-D time series.
# NOTE(review): the `sample` labels assume all five DD*_D frames have the same
# number of rows as DD3_D — confirm upstream.
overlay_table = pd.DataFrame ({'Position':DD3_D['Position'], 'DD3_D':DD3_D ['Major_variant_frequency_quality_corrected'], 'DD6_D':DD6_D ['Major_variant_frequency_quality_corrected'],'DD9_D':DD9_D ['Major_variant_frequency_quality_corrected'],'DD12_D':DD12_D ['Major_variant_frequency_quality_corrected'], 'DD24_D':DD24_D ['Major_variant_frequency_quality_corrected']})
siz = len(DD3_D ['Position'])
sample = siz*["DD3D"]+siz*["DD6D"]+siz*["DD9D"]+siz*["DD12D"]+siz*["DD24D"]
overlay_table_concat = pd.DataFrame ({'Position':pd.concat([DD3_D['Position'],DD6_D['Position'],DD9_D['Position'],DD12_D['Position'],DD24_D['Position']]), 'Major_variant_frequency_quality_corrected':pd.concat([DD3_D ['Major_variant_frequency_quality_corrected'],DD6_D ['Major_variant_frequency_quality_corrected'], DD9_D ['Major_variant_frequency_quality_corrected'],DD12_D ['Major_variant_frequency_quality_corrected'], DD24_D ['Major_variant_frequency_quality_corrected'] ]), 'sample':sample})
overlay_table_concat["1_major_variant_frequency"] = 1-overlay_table_concat['Major_variant_frequency_quality_corrected']
In [49]:
# Same KDE-overlay pattern as above, now for the replicate-D samples.
f, ax = plt.subplots(figsize=(15, 7))
# Keep only DD3D rows.
temp = overlay_table_concat.loc[ (overlay_table_concat['sample']!= "DD24D") & (overlay_table_concat['sample']!= "DD12D") & (overlay_table_concat['sample']!= "DD9D") & (overlay_table_concat['sample']!= "DD6D")]
del temp['sample']
del temp['Major_variant_frequency_quality_corrected']
del temp['Position']
#print(temp.describe())
#ax = sns.kdeplot("1_major_variant_frequency", data=temp)
lm=sns.kdeplot(np.log(temp["1_major_variant_frequency"]), label="DD3D")
lm.set_ylabel('Density')
lm.set_xlabel('Logarithm of the diversity per position')
axes = lm.axes
axes.set_xlim(-8,-4)
# Keep only DD6D rows.
temp = overlay_table_concat.loc[ (overlay_table_concat['sample']!= "DD24D") & (overlay_table_concat['sample']!= "DD12D") & (overlay_table_concat['sample']!= "DD9D") & (overlay_table_concat['sample']!= "DD3D")]
del temp['sample']
del temp['Major_variant_frequency_quality_corrected']
del temp['Position']
#print(temp.describe())
#ax = sns.kdeplot("1_major_variant_frequency", data=temp)
lm=sns.kdeplot(np.log(temp["1_major_variant_frequency"]), label="DD6D")
# Keep only DD9D rows.
temp = overlay_table_concat.loc[ (overlay_table_concat['sample']!= "DD24D") & (overlay_table_concat['sample']!= "DD12D") & (overlay_table_concat['sample']!= "DD6D") & (overlay_table_concat['sample']!= "DD3D")]
del temp['sample']
del temp['Major_variant_frequency_quality_corrected']
del temp['Position']
#print(temp.describe())
#ax = sns.kdeplot("1_major_variant_frequency", data=temp)
lm=sns.kdeplot(np.log(temp["1_major_variant_frequency"]), label="DD9D")
# Keep only DD12D rows.
temp = overlay_table_concat.loc[ (overlay_table_concat['sample']!= "DD24D") & (overlay_table_concat['sample']!= "DD9D") & (overlay_table_concat['sample']!= "DD6D") & (overlay_table_concat['sample']!= "DD3D")]
del temp['sample']
del temp['Major_variant_frequency_quality_corrected']
del temp['Position']
#print(temp.describe())
#ax = sns.kdeplot("1_major_variant_frequency", data=temp)
lm=sns.kdeplot(np.log(temp["1_major_variant_frequency"]), label="DD12D")
# Keep only DD24D rows.
temp = overlay_table_concat.loc[ (overlay_table_concat['sample']!= "DD12D") & (overlay_table_concat['sample']!= "DD9D") & (overlay_table_concat['sample']!= "DD6D") & (overlay_table_concat['sample']!= "DD3D")]
del temp['sample']
del temp['Major_variant_frequency_quality_corrected']
del temp['Position']
#print(temp.describe())
#ax = sns.kdeplot("1_major_variant_frequency", data=temp)
lm=sns.kdeplot(np.log(temp["1_major_variant_frequency"]), label="DD24D")
In [50]:
# Same table construction for the TLR3-treated (TD) time series.
overlay_table = pd.DataFrame ({'Position':TD9_A['Position'], 'TD9':TD9_A ['Major_variant_frequency_quality_corrected'], 'TD12':TD12_A ['Major_variant_frequency_quality_corrected'],'TD24':TD24_A ['Major_variant_frequency_quality_corrected'],'TD51':TD51_A ['Major_variant_frequency_quality_corrected'] })
siz = len(TD9_A ['Position'])
sample = siz*["TD9"]+siz*["TD12"]+siz*["TD24"]+siz*["TD51"]
overlay_table_concat = pd.DataFrame ({'Position':pd.concat([TD9_A['Position'], TD12_A['Position'], TD24_A['Position'], TD51_A['Position']]), 'Major_variant_frequency_quality_corrected':pd.concat([ TD9_A ['Major_variant_frequency_quality_corrected'], TD12_A ['Major_variant_frequency_quality_corrected'], TD24_A ['Major_variant_frequency_quality_corrected'], TD51_A ['Major_variant_frequency_quality_corrected'] ]), 'sample':sample})
overlay_table_concat["1_major_variant_frequency"] = 1-overlay_table_concat['Major_variant_frequency_quality_corrected']
In [51]:
# KDE overlays for the TD time series.
f, ax = plt.subplots(figsize=(15, 7))
# Keep only TD9 rows.
temp = overlay_table_concat.loc[ (overlay_table_concat['sample']!= "TD12") & (overlay_table_concat['sample']!= "TD24") & (overlay_table_concat['sample']!= "TD51") ]
del temp['sample']
del temp['Major_variant_frequency_quality_corrected']
del temp['Position']
#print(temp.describe())
#ax = sns.kdeplot("1_major_variant_frequency", data=temp)
lm=sns.kdeplot(np.log(temp["1_major_variant_frequency"]), label="TD9")
lm.set_ylabel('Density')
lm.set_xlabel('Logarithm of the diversity per position')
axes = lm.axes
axes.set_xlim(-8,-4)
# Keep only TD12 rows.
temp = overlay_table_concat.loc[ (overlay_table_concat['sample']!= "TD9") & (overlay_table_concat['sample']!= "TD24") & (overlay_table_concat['sample']!= "TD51") ]
del temp['sample']
del temp['Major_variant_frequency_quality_corrected']
del temp['Position']
#print(temp.describe())
#ax = sns.kdeplot("1_major_variant_frequency", data=temp)
lm=sns.kdeplot(np.log(temp["1_major_variant_frequency"]), label="TD12")
# Keep only TD24 rows.
temp = overlay_table_concat.loc[ (overlay_table_concat['sample']!= "TD9") & (overlay_table_concat['sample']!= "TD12") & (overlay_table_concat['sample']!= "TD51") ]
del temp['sample']
del temp['Major_variant_frequency_quality_corrected']
del temp['Position']
#print(temp.describe())
#ax = sns.kdeplot("1_major_variant_frequency", data=temp)
lm=sns.kdeplot(np.log(temp["1_major_variant_frequency"]), label="TD24")
# Keep only TD51 rows.
temp = overlay_table_concat.loc[ (overlay_table_concat['sample']!= "TD9") & (overlay_table_concat['sample']!= "TD12") & (overlay_table_concat['sample']!= "TD24") ]
del temp['sample']
del temp['Major_variant_frequency_quality_corrected']
del temp['Position']
#print(temp.describe())
#ax = sns.kdeplot("1_major_variant_frequency", data=temp)
lm=sns.kdeplot(np.log(temp["1_major_variant_frequency"]), label="TD51")
In the analyses above, it would seem that the diversity, i.e. the amount of polymorphism, has increased a bit, as in Acevedo et al. 2014. However, presumably because we do not use Cirseq, we have a bimodal distribution of diversity, with possibly first a peak corresponding to sequencing errors, and second a peak corresponding to variants that do rise in frequency through time. In support of this hypothesis, the only CirSeq data we have, analyzed at the top of this document, does not show this bimodal distribution, and in the graphs above for the replicate A and the TLR3 treatment condition, only the variants with high frequency seem to increase in frequency through time. In the replicate D, it is not clear because the increase seems to be in both peaks.
Here we do the same type of analysis, but looking at all time points, not just 3 and 51.
In [52]:
# Violin plot of the per-position diversity for every sample currently in
# overlay_table_concat, plus printed median/mean summaries per sample.
f, ax = plt.subplots(figsize=(10, 7))
ax.set(yscale="log")
sns.violinplot(x='sample', y="1_major_variant_frequency", data=overlay_table_concat, ax=ax, bw=0.2)
# giving title to the plot
plt.title("Diversity through time");
medians = overlay_table_concat.groupby(['sample'], sort=False)['1_major_variant_frequency'].median()
print("Median values of the diversity per sample:" )
print(medians)
means = overlay_table_concat.groupby(['sample'], sort=False)['1_major_variant_frequency'].mean()
print("Mean values of the diversity per sample:" )
print(means)
The increase is no longer obvious, and certainly not monotonic. If anything, the diversity may first decrease between time points 3 and 9, and then increase. If this is true, it could be compatible with a selective sweep in the first days, such that only the haplotypes carrying the high-fitness variants increase in frequency.
In [53]:
# Rebuild the long-format table for replicate D (same as cell 48).
overlay_table = pd.DataFrame ({'Position':DD3_D['Position'], 'DD3_D':DD3_D ['Major_variant_frequency_quality_corrected'], 'DD6_D':DD6_D ['Major_variant_frequency_quality_corrected'],'DD9_D':DD9_D ['Major_variant_frequency_quality_corrected'],'DD12_D':DD12_D ['Major_variant_frequency_quality_corrected'],'DD24_D':DD24_D ['Major_variant_frequency_quality_corrected']})
siz = len(DD3_D ['Position'])
sample = siz*["DD3D"]+siz*["DD6D"]+siz*["DD9D"]+siz*["DD12D"]+siz*["DD24D"]
overlay_table_concat = pd.DataFrame ({'Position':pd.concat([DD3_D['Position'],DD6_D['Position'],DD9_D['Position'],DD12_D['Position'],DD24_D['Position']]), 'Major_variant_frequency_quality_corrected':pd.concat([DD3_D ['Major_variant_frequency_quality_corrected'],DD6_D ['Major_variant_frequency_quality_corrected'], DD9_D ['Major_variant_frequency_quality_corrected'],DD12_D ['Major_variant_frequency_quality_corrected'], DD24_D ['Major_variant_frequency_quality_corrected'] ]), 'sample':sample})
overlay_table_concat['1_major_variant_frequency'] = 1.0 - overlay_table_concat['Major_variant_frequency_quality_corrected']
In [54]:
# Scatter of major-variant frequency along the genome, coloured by sample.
sns.lmplot( x="Position", y="Major_variant_frequency_quality_corrected", data=overlay_table_concat, fit_reg=False, hue='sample', legend=False, size=7, aspect=2)
#plt.legend(loc='lower right')
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
plt.ylabel("Frequency of major variant")
plot_positions()
#plot_genes()
In [55]:
# Violin plot + per-sample summaries for replicate D.
f, ax = plt.subplots(figsize=(10, 7))
ax.set(yscale="log")
sns.violinplot(x='sample', y="1_major_variant_frequency", data=overlay_table_concat, ax=ax, bw=0.2)
# giving title to the plot
plt.title("Diversity through time");
medians = overlay_table_concat.groupby(['sample'], sort=False)['1_major_variant_frequency'].median()
print("Median values of the diversity per sample:" )
print(medians)
means = overlay_table_concat.groupby(['sample'], sort=False)['1_major_variant_frequency'].mean()
print("Mean values of the diversity per sample:" )
print(means)
In this case, it may be that the diversity increases then reaches some sort of plateau at about a median of 0.0047-0.0048.
In [56]:
# Replicate E has only two time points (days 6 and 9); build its long table.
siz = len(DD6_E ['Position'])
sample = siz*["DD6E"]+siz*["DD9E"]
overlay_table_concat = pd.DataFrame ({'Position':pd.concat([DD6_E['Position'],DD9_E['Position']]), 'Major_variant_frequency_quality_corrected':pd.concat([DD6_E ['Major_variant_frequency_quality_corrected'],DD9_E ['Major_variant_frequency_quality_corrected']]), 'sample':sample})
overlay_table_concat['1_major_variant_frequency'] = 1.0 - overlay_table_concat['Major_variant_frequency_quality_corrected']
In [57]:
# Scatter of major-variant frequency along the genome for replicate E.
sns.lmplot( x="Position", y="Major_variant_frequency_quality_corrected", data=overlay_table_concat, fit_reg=False, hue='sample', legend=False, size=7, aspect=2)
plt.ylabel("Frequency of major variant")
#plt.legend(loc='lower right')
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
plot_positions()
#plot_genes()
In [58]:
# Violin plot + per-sample summaries for replicate E.
f, ax = plt.subplots(figsize=(10, 7))
ax.set(yscale="log")
sns.violinplot(x='sample', y="1_major_variant_frequency", data=overlay_table_concat, ax=ax, bw=0.2)
# giving title to the plot
plt.title("Diversity through time");
medians = overlay_table_concat.groupby(['sample'], sort=False)['1_major_variant_frequency'].median()
print("Median values of the diversity per sample:" )
print(medians)
means = overlay_table_concat.groupby(['sample'], sort=False)['1_major_variant_frequency'].mean()
print("Mean values of the diversity per sample:" )
print(means)
Although we only have two time points, the results are consistent with an increase in the first few days.
In [59]:
# Long-format table for the TLR3-treated (TD) series, days 9/12/24/51.
overlay_table = pd.DataFrame ({'Position':TD9_A['Position'], 'TD9_A':TD9_A ['Major_variant_frequency_quality_corrected'], 'TD12_A':TD12_A ['Major_variant_frequency_quality_corrected'],'TD24_A':TD24_A ['Major_variant_frequency_quality_corrected'], 'TD51_A':TD51_A ['Major_variant_frequency_quality_corrected']})
siz = len(TD9_A ['Position'])
sample = siz*["TD9A"]+siz*["TD12A"]+siz*["TD24A"]+siz*["TD51A"]
overlay_table_concat = pd.DataFrame ({'Position':pd.concat([TD9_A['Position'],TD12_A['Position'],TD24_A['Position'],TD51_A['Position']]), 'Major_variant_frequency_quality_corrected':pd.concat([TD9_A['Major_variant_frequency_quality_corrected'],TD12_A['Major_variant_frequency_quality_corrected'],TD24_A['Major_variant_frequency_quality_corrected'],TD51_A['Major_variant_frequency_quality_corrected'] ]), 'sample':sample})
overlay_table_concat['1_major_variant_frequency'] = 1.0 - overlay_table_concat['Major_variant_frequency_quality_corrected']
In [60]:
# Scatter of major-variant frequency along the genome for the TD series.
sns.lmplot( x="Position", y="Major_variant_frequency_quality_corrected", data=overlay_table_concat, fit_reg=False, hue='sample', legend=False, size=7, aspect=2)
plt.ylabel("Frequency of major variant")
#plt.legend(loc='lower right')
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
plot_positions()
#plot_genes()
In [61]:
# Violin plot + per-sample summaries for the TD series.
f, ax = plt.subplots(figsize=(10, 7))
ax.set(yscale="log")
sns.violinplot(x='sample', y="1_major_variant_frequency", data=overlay_table_concat, ax=ax, bw=0.2)
# giving title to the plot
plt.title("Diversity through time");
medians = overlay_table_concat.groupby(['sample'], sort=False)['1_major_variant_frequency'].median()
print("Median values of the diversity per sample:" )
print(medians)
means = overlay_table_concat.groupby(['sample'], sort=False)['1_major_variant_frequency'].mean()
print("Mean values of the diversity per sample:" )
print(means)
The diversity initially increases, but then it may be decreasing between 24 and 51. Not clear with few time points and no replicate.
In [62]:
# construction of the data table
# One long-format coverage table per experiment (DDA, DDD, DDE, TD), then a
# single concatenated table with a 'sample' label column for plotting.
siz = len(DD3_A ['Coverage'])
sample = siz*["DD3A"]+siz*["DD6A"]+siz*["DD9A"]+siz*["DD12A"]+siz*["DD24A"]+siz*["DD51A"]
overlay_table_concat_DDA = pd.DataFrame ({'Position':pd.concat([DD3_A['Position'],DD6_A['Position'],DD9_A['Position'],DD12_A['Position'],DD24_A['Position'],DD51_A['Position']]), 'Coverage':pd.concat([DD3_A ['Coverage'],DD6_A ['Coverage'], DD9_A ['Coverage'],DD12_A ['Coverage'], DD24_A ['Coverage'], DD51_A ['Coverage'] ]), 'sample':sample})
siz = len(DD3_D ['Coverage'])
sample = siz*["DD3D"]+siz*["DD6D"]+siz*["DD9D"]+siz*["DD12D"]+siz*["DD24D"]
overlay_table_concat_DDD = pd.DataFrame ({'Position':pd.concat([DD3_D['Position'],DD6_D['Position'],DD9_D['Position'],DD12_D['Position'],DD24_D['Position']]), 'Coverage':pd.concat([DD3_D ['Coverage'],DD6_D ['Coverage'], DD9_D ['Coverage'],DD12_D ['Coverage'], DD24_D ['Coverage'] ]), 'sample':sample})
siz = len(DD6_E ['Coverage'])
sample = siz*["DD6E"]+siz*["DD9E"]
overlay_table_concat_DDE = pd.DataFrame ({'Position':pd.concat([DD6_E['Position'],DD9_E['Position']]), 'Coverage':pd.concat([DD6_E ['Coverage'],DD9_E ['Coverage']]), 'sample':sample})
siz = len(TD9_A ['Coverage'])
sample = siz*["TD9A"]+siz*["TD12A"]+siz*["TD24A"]+siz*["TD51A"]
overlay_table_concat_TD = pd.DataFrame ({'Position':pd.concat([TD9_A['Position'],TD12_A['Position'],TD24_A['Position'],TD51_A['Position']]), 'Coverage':pd.concat([TD9_A['Coverage'],TD12_A['Coverage'],TD24_A['Coverage'],TD51_A['Coverage'] ]), 'sample':sample})
overlay_table_concat=pd.concat([overlay_table_concat_DDA,overlay_table_concat_DDD, overlay_table_concat_DDE, overlay_table_concat_TD])
In [63]:
class MathTextSciFormatter(mticker.Formatter):
    """Tick formatter rendering values in LaTeX scientific notation.

    For example with the default format, 12345.0 becomes
    ``$1.23{\\times}10^{4}$``; values with a zero exponent are shown as
    the bare mantissa (e.g. ``$1.00$``).
    """

    def __init__(self, fmt="%1.2e"):
        # printf-style format that must produce an 'e'-notation string
        self.fmt = fmt

    def __call__(self, x, pos=None):
        """Format tick value *x*; *pos* is unused (Formatter API)."""
        mantissa, _, exp_field = (self.fmt % x).partition('e')
        mantissa = mantissa.rstrip('.')
        # '+' is dropped, '-' is kept for the exponent sign.
        exp_sign = exp_field[0].replace('+', '')
        exp_digits = exp_field[1:].lstrip('0')
        power = '10^{%s%s}' % (exp_sign, exp_digits) if exp_digits else ''
        if mantissa and power:
            body = r'%s{\times}%s' % (mantissa, power)
        else:
            body = '%s%s' % (mantissa, power)
        return "${}$".format(body)
# One marker style per sample so overlapping coverage curves stay readable.
markers = ['x','o','v','^','<', '+', 'x','o','v','^','<', '+', 'x','o','v','^','<']
# NOTE(review): lowess=True has no visible effect when fit_reg=False — confirm
# whether a smoothed fit was intended here.
sns.lmplot( x="Position", y="Coverage", data=overlay_table_concat, fit_reg=False, hue='sample', legend=False, size=7, aspect=2, lowess=True, markers=markers, scatter_kws={"s": 20})
# Format with 2 decimal places
plt.gca().yaxis.set_major_formatter(MathTextSciFormatter("%1.2e"))
#plt.legend(loc='lower right')
plt.legend(bbox_to_anchor=(0., 1.02, 1., .102), loc=3, mode="expand", borderaxespad=0., ncol=4)
plot_positions()
#plot_genes()
In [64]:
# Wide table of per-position coverage, one column per sample, for pairplots.
siz = len(DD3_A ['Coverage'])
table_for_correlations = pd.DataFrame ({'DD3A':DD3_A ['Coverage'],'DD6_A':DD6_A['Coverage'], 'DD9_A':DD9_A ['Coverage'], 'DD12_A':DD12_A ['Coverage'], 'DD24_A':DD24_A ['Coverage'], 'DD51_A':DD51_A ['Coverage'],
'DD3_D':DD3_D ['Coverage'],'DD6_D':DD6_D ['Coverage'], 'DD9_D':DD9_D ['Coverage'],'DD12_D':DD12_D ['Coverage'], 'DD24_D':DD24_D ['Coverage'],
'DD6_E':DD6_E ['Coverage'], 'DD9_E':DD9_E ['Coverage'],
'TD9_A':TD9_A['Coverage'],'TD12_A':TD12_A['Coverage'],'TD24_A':TD24_A['Coverage'],'TD51_A':TD51_A['Coverage'] })
In [65]:
# Pairwise coverage scatter for the earliest sample of each experiment.
sns.pairplot(table_for_correlations, vars=['DD3A','DD3_D','DD6_E','TD9_A'], kind="scatter")
Out[65]:
In [66]:
# Pairwise coverage scatter for the latest sample of each experiment.
sns.pairplot(table_for_correlations, vars=['DD51_A','DD24_D','DD9_E','TD51_A'], kind="scatter")
Out[66]:
Coverage values between samples seem to be quite correlated, although I have not plotted all pairwise correlations.
In [67]:
# construction of the data table
# Cells 67-76 compare replicates at a fixed day: build a two/three-sample
# long table per day, then scatter major-variant frequency along the genome.
siz = len(DD3_A ['Coverage'])
sample = len(DD3_A ['Coverage'])*["DD3A"]+len(DD3_D ['Coverage'])*["DD3D"]
overlay_table_concat_DD3 = pd.DataFrame ({'Position':pd.concat([DD3_A['Position'],DD3_D['Position']]), 'Major_variant_frequency_quality_corrected':pd.concat([DD3_A ['Major_variant_frequency_quality_corrected'],DD3_D ['Major_variant_frequency_quality_corrected']]), 'sample':sample})
In [68]:
sns.lmplot( x="Position", y="Major_variant_frequency_quality_corrected", data=overlay_table_concat_DD3, fit_reg=False, hue='sample', legend=False, size=7, aspect=2, lowess=True)
plt.legend(loc='lower right')
plot_positions()
plot_genes()
In [69]:
# construction of the data table
# Day 6: replicates A, D and E.
siz = len(DD6_A ['Coverage'])
sample = len(DD6_A ['Coverage'])*["DD6A"]+len(DD6_D ['Coverage'])*["DD6D"]+len(DD6_E ['Coverage'])*["DD6E"]
overlay_table_concat_DD6 = pd.DataFrame ({'Position':pd.concat([DD6_A['Position'],DD6_D['Position'],DD6_E['Position']]), 'Major_variant_frequency_quality_corrected':pd.concat([DD6_A ['Major_variant_frequency_quality_corrected'],DD6_D ['Major_variant_frequency_quality_corrected'],DD6_E ['Major_variant_frequency_quality_corrected']]), 'sample':sample})
In [70]:
sns.lmplot( x="Position", y="Major_variant_frequency_quality_corrected", data=overlay_table_concat_DD6, fit_reg=False, hue='sample', legend=False, size=7, aspect=2, lowess=True)
plt.legend(loc='lower right')
plot_positions()
plot_genes()
In [71]:
# construction of the data table
# Day 9: replicates A, D and E.
siz = len(DD9_A ['Coverage'])
sample = len(DD9_A ['Coverage'])*["DD9A"]+len(DD9_D ['Coverage'])*["DD9D"]+len(DD9_E ['Coverage'])*["DD9E"]
overlay_table_concat_DD9 = pd.DataFrame ({'Position':pd.concat([DD9_A['Position'],DD9_D['Position'],DD9_E['Position']]), 'Major_variant_frequency_quality_corrected':pd.concat([DD9_A ['Major_variant_frequency_quality_corrected'],DD9_D ['Major_variant_frequency_quality_corrected'],DD9_E ['Major_variant_frequency_quality_corrected']]), 'sample':sample})
In [72]:
sns.lmplot( x="Position", y="Major_variant_frequency_quality_corrected", data=overlay_table_concat_DD9, fit_reg=False, hue='sample', legend=False, size=7, aspect=2, lowess=True)
plt.legend(loc='lower right')
plot_positions()
#plot_genes()
In [73]:
# construction of the data table
# Day 12: replicates A and D.
siz = len(DD12_A ['Coverage'])
sample = len(DD12_A ['Coverage'])*["DD12A"]+len(DD12_D ['Coverage'])*["DD12D"]
overlay_table_concat_DD12 = pd.DataFrame ({'Position':pd.concat([DD12_A['Position'],DD12_D['Position']]), 'Major_variant_frequency_quality_corrected':pd.concat([DD12_A ['Major_variant_frequency_quality_corrected'],DD12_D ['Major_variant_frequency_quality_corrected']]), 'sample':sample})
In [74]:
sns.lmplot( x="Position", y="Major_variant_frequency_quality_corrected", data=overlay_table_concat_DD12, fit_reg=False, hue='sample', legend=False, size=7, aspect=2, lowess=True)
plt.legend(loc='lower right')
plot_positions()
#plot_genes()
In [75]:
# construction of the data table
# Day 24: replicates A and D.
siz = len(DD24_A ['Coverage'])
sample = len(DD24_A ['Coverage'])*["DD24A"]+len(DD24_D ['Coverage'])*["DD24D"]
overlay_table_concat_DD24 = pd.DataFrame ({'Position':pd.concat([DD24_A['Position'],DD24_D['Position']]), 'Major_variant_frequency_quality_corrected':pd.concat([DD24_A ['Major_variant_frequency_quality_corrected'],DD24_D ['Major_variant_frequency_quality_corrected']]), 'sample':sample})
In [76]:
sns.lmplot( x="Position", y="Major_variant_frequency_quality_corrected", data=overlay_table_concat_DD24, fit_reg=False, hue='sample', legend=False, size=7, aspect=2, lowess=True)
plt.legend(loc='lower right')
plot_positions()
#plot_genes()
In [77]:
# Cells 77-83: per-sample genome scans coloured by synonymous status.
sns.lmplot( x="Position", y="Major_variant_frequency_quality_corrected", data=DD3_A, fit_reg=False, hue='is_synonymous', legend=False, size=7, aspect=2) #, markers=DD3_A['synonymous'])
plt.legend(loc='lower right')
plot_positions()
#plot_genes()
# giving title to the plot
#plt.title("Synonymous and non-synonymous polymorphisms, experiment A day 3");
In [78]:
sns.lmplot( x="Position", y="Major_variant_frequency_quality_corrected", data=DD6_A, fit_reg=False, hue='is_synonymous', legend=False, size=7, aspect=2) #, markers=DD3_A['synonymous'])
plt.legend(loc='lower right')
plot_positions()
#plot_genes()
In [79]:
sns.lmplot( x="Position", y="Major_variant_frequency_quality_corrected", data=DD9_A, fit_reg=False, hue='is_synonymous', legend=False, size=7, aspect=2) #, markers=DD3_A['synonymous'])
plt.legend(loc='lower right')
plot_positions()
#plot_genes()
In [80]:
sns.lmplot( x="Position", y="Major_variant_frequency_quality_corrected", data=DD12_A, fit_reg=False, hue='is_synonymous', legend=False, size=7, aspect=2) #, markers=DD3_A['synonymous'])
plt.legend(loc='lower right')
plot_positions()
#plot_genes()
In [81]:
sns.lmplot( x="Position", y="Major_variant_frequency_quality_corrected", data=DD24_A, fit_reg=False, hue='is_synonymous', legend=False, size=7, aspect=2) #, markers=DD3_A['synonymous'])
plt.legend(loc='lower right')
plot_positions()
#plot_genes()
# giving title to the plot
#plt.title("Synonymous and non-synonymous polymorphisms, experiment A day 3");
In [82]:
sns.lmplot( x="Position", y="Major_variant_frequency_quality_corrected", data=DD51_A, fit_reg=False, hue='is_synonymous', legend=False, size=7, aspect=2) #, markers=DD3_A['synonymous'])
plt.legend(loc='lower right')
plot_positions()
#plot_genes()
# giving title to the plot
#plt.title("Synonymous and non-synonymous polymorphisms, experiment A day 3");
In [83]:
sns.lmplot( x="Position", y="Major_variant_frequency_quality_corrected", data=DD24_D, fit_reg=False, hue='is_synonymous', legend=False, size=7, aspect=2) #, markers=DD3_A['synonymous'])
plt.legend(loc='lower right')
plot_positions()
#plot_genes()
# giving title to the plot
#plt.title("Synonymous and non-synonymous polymorphisms, experiment A day 3");
In [84]:
# Second-variant frequency along the genome for day 51, log y-axis.
lm=sns.lmplot( x="Position", y="Second_variant_frequency_quality_corrected", data=DD51_A, fit_reg=False, hue='is_synonymous', legend=False, size=7, aspect=2, scatter_kws={"s": 10})
axes = lm.axes
axes[0,0].set_ylim(0.001,0.05)
axes[0,0].set(yscale="log")
plt.legend(loc='lower right')
#sns.lmplot( x="position", y="majorbase_ratio", data=DD51_A, fit_reg=False, hue='synonymous', legend=False, size=7, aspect=2) #, markers=DD3_A['synonymous'])
#plt.legend(loc='lower right')
Out[84]:
In [85]:
# Syn vs non-syn second-variant frequency at day 51: medians, t-test, violin.
print( DD51_A.groupby(['is_synonymous'])['Second_variant_frequency_quality_corrected'].median())
syn = DD51_A['Second_variant_frequency_quality_corrected'][DD51_A['is_synonymous']=="synonymous"]
nonsyn = DD51_A['Second_variant_frequency_quality_corrected'][DD51_A['is_synonymous']=="non-synonymous"]
print("T-test p-value: "+str(ttest_ind(syn, nonsyn)[1]))
f, ax = plt.subplots(figsize=(10, 7))
ax.set(yscale="log")
lm=sns.violinplot(x='is_synonymous', y="Second_variant_frequency_quality_corrected", data=DD51_A, hue='is_synonymous', ax=ax, bw=0.2)
axes = lm.axes
axes.set_ylim(0.001,0.05)
axes.set(yscale="log")
medians = DD51_A.groupby(['is_synonymous'])['Second_variant_frequency_quality_corrected'].median().values
plt.axhline(y=medians[0], linewidth=1, linestyle=':')
plt.axhline(y=medians[1], linewidth=1, linestyle=':')
# giving title to the plot
plt.title("Frequency of the second variant, synonymous vs non-synonymous, day 51")
Out[85]:
The variant frequencies do not differ significantly between synonymous and non-synonymous variants at time point 51.
In [86]:
# Syn vs non-syn second-variant frequency at day 24: medians, t-test, violin.
print( DD24_A.groupby(['is_synonymous'])['Second_variant_frequency_quality_corrected'].median())
syn = DD24_A['Second_variant_frequency_quality_corrected'][DD24_A['is_synonymous']=="synonymous"]
nonsyn = DD24_A['Second_variant_frequency_quality_corrected'][DD24_A['is_synonymous']=="non-synonymous"]
print("T-test p-value: "+str(ttest_ind(syn, nonsyn)[1]))
f, ax = plt.subplots(figsize=(10, 7))
ax.set(yscale="log")
lm=sns.violinplot(x='is_synonymous', y="Second_variant_frequency_quality_corrected", data=DD24_A, hue='is_synonymous', ax=ax, bw=0.2)
axes = lm.axes
axes.set_ylim(0.001,0.05)
axes.set(yscale="log")
medians = DD24_A.groupby(['is_synonymous'])['Second_variant_frequency_quality_corrected'].median().values
plt.axhline(y=medians[0], linewidth=1, linestyle=':')
plt.axhline(y=medians[1], linewidth=1, linestyle=':')
# giving title to the plot
plt.title("Frequency of the second variant, synonymous vs non-synonymous, day 24");
The variant frequencies do not differ significantly between synonymous and non-synonymous variants at time point 24.
In [87]:
# Same syn vs non-syn comparison at day 12.
print( DD12_A.groupby(['is_synonymous'])['Second_variant_frequency_quality_corrected'].median())
syn = DD12_A['Second_variant_frequency_quality_corrected'][DD12_A['is_synonymous']=="synonymous"]
nonsyn = DD12_A['Second_variant_frequency_quality_corrected'][DD12_A['is_synonymous']=="non-synonymous"]
print("T-test p-value: "+str(ttest_ind(syn, nonsyn)[1]))
f, ax = plt.subplots(figsize=(10, 7))
ax.set(yscale="log")
lm=sns.violinplot(x='is_synonymous', y="Second_variant_frequency_quality_corrected", data=DD12_A, hue='is_synonymous', ax=ax, bw=0.2)
axes = lm.axes
axes.set_ylim(0.001,0.05)
axes.set(yscale="log")
medians = DD12_A.groupby(['is_synonymous'])['Second_variant_frequency_quality_corrected'].median().values
plt.axhline(y=medians[0], linewidth=1, linestyle=':')
plt.axhline(y=medians[1], linewidth=1, linestyle=':')
# giving title to the plot
plt.title("Frequency of the second variant, synonymous vs non-synonymous, day 12");
The variant frequencies do not differ significantly between synonymous and non-synonymous variants at time point 12.
In [88]:
# Same syn vs non-syn comparison at day 9.
print( DD9_A.groupby(['is_synonymous'])['Second_variant_frequency_quality_corrected'].median())
syn = DD9_A['Second_variant_frequency_quality_corrected'][DD9_A['is_synonymous']=="synonymous"]
nonsyn = DD9_A['Second_variant_frequency_quality_corrected'][DD9_A['is_synonymous']=="non-synonymous"]
print("T-test p-value: "+str(ttest_ind(syn, nonsyn)[1]))
f, ax = plt.subplots(figsize=(10, 7))
ax.set(yscale="log")
lm=sns.violinplot(x='is_synonymous', y="Second_variant_frequency_quality_corrected", data=DD9_A, hue='is_synonymous', ax=ax, bw=0.2)
axes = lm.axes
axes.set_ylim(0.001,0.05)
axes.set(yscale="log")
medians = DD9_A.groupby(['is_synonymous'])['Second_variant_frequency_quality_corrected'].median().values
plt.axhline(y=medians[0], linewidth=1, linestyle=':')
plt.axhline(y=medians[1], linewidth=1, linestyle=':')
# giving title to the plot
plt.title("Frequency of the second variant, synonymous vs non-synonymous, day 9");
The variant frequencies do not differ significantly between synonymous and non-synonymous variants at time point 9.
In [89]:
# Same syn vs non-syn comparison at day 6.
print( DD6_A.groupby(['is_synonymous'])['Second_variant_frequency_quality_corrected'].median())
syn = DD6_A['Second_variant_frequency_quality_corrected'][DD6_A['is_synonymous']=="synonymous"]
nonsyn = DD6_A['Second_variant_frequency_quality_corrected'][DD6_A['is_synonymous']=="non-synonymous"]
print("T-test p-value: "+str(ttest_ind(syn, nonsyn)[1]))
f, ax = plt.subplots(figsize=(10, 7))
ax.set(yscale="log")
lm=sns.violinplot(x='is_synonymous', y="Second_variant_frequency_quality_corrected", data=DD6_A, hue='is_synonymous', ax=ax, bw=0.2)
axes = lm.axes
axes.set_ylim(0.001,0.05)
axes.set(yscale="log")
medians = DD6_A.groupby(['is_synonymous'])['Second_variant_frequency_quality_corrected'].median().values
plt.axhline(y=medians[0], linewidth=1, linestyle=':')
plt.axhline(y=medians[1], linewidth=1, linestyle=':')
# giving title to the plot
plt.title("Frequency of the second variant, synonymous vs non-synonymous, day 6");
The variant frequencies do not differ significantly between synonymous and non-synonymous variants at time point 6.
In [90]:
# Same syn vs non-syn comparison at day 3.
print( DD3_A.groupby(['is_synonymous'])['Second_variant_frequency_quality_corrected'].median())
syn = DD3_A['Second_variant_frequency_quality_corrected'][DD3_A['is_synonymous']=="synonymous"]
nonsyn = DD3_A['Second_variant_frequency_quality_corrected'][DD3_A['is_synonymous']=="non-synonymous"]
print("T-test p-value: "+str(ttest_ind(syn, nonsyn)[1]))
f, ax = plt.subplots(figsize=(10, 7))
ax.set(yscale="log")
lm=sns.violinplot(x='is_synonymous', y="Second_variant_frequency_quality_corrected", data=DD3_A, hue='is_synonymous', ax=ax, bw=0.2)
axes = lm.axes
axes.set_ylim(0.001,0.05)
axes.set(yscale="log")
medians = DD3_A.groupby(['is_synonymous'])['Second_variant_frequency_quality_corrected'].median().values
plt.axhline(y=medians[0], linewidth=1, linestyle=':')
plt.axhline(y=medians[1], linewidth=1, linestyle=':')
# giving title to the plot
plt.title("Frequency of the second variant, synonymous vs non-synonymous, day 3");
The variant frequencies do not differ significantly between synonymous and non-synonymous variants at time point 3.
In [91]:
# Same syn vs non-syn comparison for replicate D, day 24.
print( DD24_D.groupby(['is_synonymous'])['Second_variant_frequency_quality_corrected'].median())
syn = DD24_D['Second_variant_frequency_quality_corrected'][DD24_D['is_synonymous']=="synonymous"]
nonsyn = DD24_D['Second_variant_frequency_quality_corrected'][DD24_D['is_synonymous']=="non-synonymous"]
print("T-test p-value: "+str(ttest_ind(syn, nonsyn)[1]))
f, ax = plt.subplots(figsize=(10, 7))
ax.set(yscale="log")
lm=sns.violinplot(x='is_synonymous', y="Second_variant_frequency_quality_corrected", data=DD24_D, hue='is_synonymous', ax=ax, bw=0.2)
axes = lm.axes
axes.set_ylim(0.001,0.05)
axes.set(yscale="log")
medians = DD24_D.groupby(['is_synonymous'])['Second_variant_frequency_quality_corrected'].median().values
plt.axhline(y=medians[0], linewidth=1, linestyle=':')
plt.axhline(y=medians[1], linewidth=1, linestyle=':')
# giving title to the plot
plt.title("Frequency of the second variant, synonymous vs non-synonymous, replicate D, day 24");
In [92]:
sns.lmplot( x="Position", y="Second_variant_frequency_quality_corrected", data=TD9_A, fit_reg=False, hue='is_synonymous', legend=False, size=7, aspect=2) #, markers=DD3_A['synonymous'])
plt.legend(loc='lower right')
plot_positions()
#plot_genes()
# giving title to the plot
#plt.title("Synonymous and non-synonymous polymorphisms, experiment A day 3");
In [93]:
sns.lmplot( x="Position", y="Second_variant_frequency_quality_corrected", data=TD12_A, fit_reg=False, hue='is_synonymous', legend=False, size=7, aspect=2) #, markers=DD3_A['synonymous'])
plt.legend(loc='lower right')
plot_positions()
#plot_genes()
# giving title to the plot
#plt.title("Synonymous and non-synonymous polymorphisms, experiment A day 3");
In [94]:
# Same scatter as the previous cell, day 24 (TD24_A).
# NOTE(review): lmplot's `size` parameter was renamed `height` in seaborn 0.9.
sns.lmplot( x="Position", y="Second_variant_frequency_quality_corrected", data=TD24_A, fit_reg=False, hue='is_synonymous', legend=False, size=7, aspect=2) #, markers=DD3_A['synonymous'])
plt.legend(loc='lower right')
plot_positions()
#plot_genes()
# giving title to the plot
#plt.title("Synonymous and non-synonymous polymorphisms, experiment A day 3");
In [95]:
# Same scatter as the previous cell, day 51 (TD51_A).
# NOTE(review): lmplot's `size` parameter was renamed `height` in seaborn 0.9.
sns.lmplot( x="Position", y="Second_variant_frequency_quality_corrected", data=TD51_A, fit_reg=False, hue='is_synonymous', legend=False, size=7, aspect=2) #, markers=DD3_A['synonymous'])
plt.legend(loc='lower right')
plot_positions()
#plot_genes()
# giving title to the plot
#plt.title("Synonymous and non-synonymous polymorphisms, experiment A day 3");
In the density plots above (figures 47, 49, 51), we saw that variants with a log-diversity above -5.5 rose in frequency through time. Those variants presumably are under selection. We therefore expect to see a difference between synonymous (putatively nearly-neutral) and non-synonymous variants (putatively under selection).
In [96]:
# We select variants with log-diversity above -5.5: since np.exp is the natural
# exponential, this means 1 - major_variant_frequency >= exp(-5.5).
threshold = np.exp(-5.5)
def above_55 (row, threshold):
    """Return "right" if the row's minority frequency reaches `threshold`, "left" otherwise."""
    return "right" if row['1_major_variant_frequency'] >= threshold else "left"
def print_means_and_medians(table):
    """Print medians, means, group sizes and the non-synonymous fraction of the
    second-variant frequencies, split by synonymy and by left/right peak.

    Assumes all four (is_synonymous, peak) groups are present; groupby sorts
    them as (non-syn, left), (non-syn, right), (syn, left), (syn, right).
    """
    grouped = table.groupby(['is_synonymous', 'peak'])['Second_variant_frequency_quality_corrected']

    def _print_peaks(values):
        # `values` follows the sorted group order documented above
        print("\tLeft peak:")
        print("Non-synonymous: " + str(values[0]) + "; Synonymous: " + str(values[2]))
        print("\tRight peak:")
        print("Non-synonymous: " + str(values[1]) + "; Synonymous: " + str(values[3]))

    print("\n\t\tMedians:")
    _print_peaks(grouped.median().values)
    print("\n\t\tMeans:")
    _print_peaks(grouped.mean().values)
    # Use .values: integer-position indexing on a label-indexed Series
    # (nums[0]) is deprecated and slated for removal in pandas 3.0.
    # Also compute the group sizes once instead of twice.
    counts = grouped.size().values
    print("\n\t\tNumber of positions:")
    _print_peaks(counts)
    print("\n\t\tFraction of variants that are non-synonymous:")
    print ("\tLeft peak: ")
    print(counts[0]/(counts[0]+counts[2]))
    print ("\tRight peak: ")
    print(counts[1]/(counts[1]+counts[3]))
def test_distributions(table):
    """Compare second-variant frequencies of synonymous vs non-synonymous
    variants within each peak, with both a t-test and a Mann-Whitney U-test.

    Note: despite the `test_` prefix this is an analysis helper, not a unit test.
    """
    syn_left = table['Second_variant_frequency_quality_corrected'][(table['is_synonymous']=="synonymous") & (table['peak']=="left")]
    non_syn_left = table['Second_variant_frequency_quality_corrected'][(table['is_synonymous']=="non-synonymous") & (table['peak']=="left")]
    syn_right = table['Second_variant_frequency_quality_corrected'][(table['is_synonymous']=="synonymous") & (table['peak']=="right")]
    non_syn_right = table['Second_variant_frequency_quality_corrected'][(table['is_synonymous']=="non-synonymous") & (table['peak']=="right")]
    # [1] selects the p-value from the scipy (statistic, pvalue) result
    print("T-test p-value: synonymous vs non-synonymous mutations, left peak: "+str(ttest_ind(syn_left, non_syn_left)[1]))
    print("T-test p-value: synonymous vs non-synonymous mutations, right peak: "+str(ttest_ind(syn_right, non_syn_right)[1]))
    print("Mann-Whitney U-test p-value: synonymous vs non-synonymous mutations, left peak: "+str(mannwhitneyu(syn_left, non_syn_left)[1]))
    print("Mann-Whitney U-test p-value: synonymous vs non-synonymous mutations, right peak: "+str(mannwhitneyu(syn_right, non_syn_right)[1]))
def violin_plots_syn_peaks (table):
    """Violin plot of second-variant frequencies, split by synonymy (x) and by
    left/right peak (hue), on a log y-scale with the four group medians marked."""
    fig, ax = plt.subplots(figsize=(10, 7))
    ax.set(yscale="log")
    vp = sns.violinplot(x='is_synonymous', y="Second_variant_frequency_quality_corrected",
                        data=table, hue='peak', ax=ax, bw=0.2)
    vp.axes.set_ylim(0.0005, 0.5)
    vp.axes.set_ylabel("Frequency of the second variant")
    vp.axes.set(yscale="log")
    # dotted horizontal lines at the medians of the four (synonymy, peak) groups
    group_medians = table.groupby(['is_synonymous', 'peak'])['Second_variant_frequency_quality_corrected'].median().values
    for med in group_medians[:4]:
        plt.axhline(y=med, linewidth=1, linestyle=':')
    # giving title to the plot
    plt.title("Frequency of the second variant, synonymous vs non-synonymous");
In [97]:
# Classify each variant of replicate A, day 51 into the left/right diversity peak,
# then summarize and test the syn vs non-syn frequency distributions per peak.
DD51_A['peak'] = DD51_A.apply (lambda row: above_55 (row, threshold),axis=1)
print_means_and_medians(DD51_A)
test_distributions(DD51_A)
violin_plots_syn_peaks(DD51_A)
In [98]:
# Same peak classification and per-peak analysis for replicate D, day 24.
DD24_D['peak'] = DD24_D.apply (lambda row: above_55 (row, threshold),axis=1)
print_means_and_medians(DD24_D)
test_distributions(DD24_D)
violin_plots_syn_peaks(DD24_D)
In [99]:
# Same peak classification and per-peak analysis for TD51_A (day 51).
TD51_A['peak'] = TD51_A.apply (lambda row: above_55 (row, threshold),axis=1)
print_means_and_medians(TD51_A)
test_distributions(TD51_A)
violin_plots_syn_peaks(TD51_A)
In all three analyses above, we see that for both peaks there are more non-synonymous variants than synonymous variants. This is expected because there are many more ways to make a non-synonymous substitution than to make a synonymous substitution (e.g. mutations in the first 2 positions of the codons nearly always result in a non-synonymous substitution).
However, the ratio of the number of non-synonymous variants over all variants is smaller in the left peak (hypothetically variants created during the 2-day amplification step), than in the right peak (hypothetically variants created during experimental evolution). These excess non-synonymous variants in the right peak could be beneficial, and therefore have increased thanks to selection, or could be neutral or slightly deleterious, in which case they would have increased by genetic draft, i.e. because they are linked on a haplotype to positively selected variants.
Looking at the frequencies of the variants, we see that the non-synonymous variants are significantly more frequent than the synonymous variants in the left peak, while it is not the case in the right peak. The results for the first peak may be linked to mutation rate biases, e.g. a mutation towards A would increase the frequency of A variants. This has not been investigated. The fact that the non-synonymous variants are on average less frequent than the synonymous variants in the right peak hints that most non-synonymous variants may be slightly deleterious. However, there seems to be an excess of high-frequency non-synonymous variants compared to the synonymous variants, so those may have been positively selected.
In [108]:
# Counters for every observed major->second base substitution, including pairs
# involving an undetermined base "N" (and the degenerate "NN"); all start at 0.
# The comprehension yields the non-N keys in the same order as before
# (AC, AG, AT, CA, ... TG), which downstream printing/plotting relies on.
mut_dict = {ref + alt: 0 for ref in "ACGTN" for alt in "ACGTN" if ref != alt}
mut_dict["NN"] = 0
def extract_mutations_row (row, mut_dict):
    """Increment the counter for this row's major->second base substitution."""
    substitution = row['Major_variant'] + row['Second_variant']
    mut_dict[substitution] += 1
def extractMutations(table, mut_dict):
    """Tally major->second base substitutions over all rows of `table`.

    `mut_dict` is a template of zeroed counters; a fresh copy is returned and
    the template is left untouched.
    """
    counts = mut_dict.copy()
    # Explicit iteration instead of DataFrame.apply: apply is meant to build a
    # result and does not guarantee how many times the function is invoked,
    # which is unsafe when the function's only purpose is a side effect.
    for _, row in table.iterrows():
        counts[row['Major_variant'] + row['Second_variant']] += 1
    return counts
def normalize_dict(mut_dict):
    """Return a copy of `mut_dict` where each X->Y count (X, Y in ACGT) is divided
    by the total number of substitutions away from X (N targets included in the
    total). Keys involving N are left as raw counts."""
    other_dict = mut_dict.copy()
    for ref in "ACGT":
        # total substitutions away from `ref`, counting the *N key as well
        total = sum(other_dict[ref + alt] for alt in "ACGTN" if alt != ref)
        for alt in "ACGT":
            if alt != ref:
                other_dict[ref + alt] = other_dict[ref + alt] / total
    return other_dict
def print_mutation_patterns(mut_dict):
    """Print raw substitution counts, then their normalized frequencies,
    skipping every key that involves an N base."""
    print("\t\tRaw counts: ")
    for key, count in mut_dict.items():
        if "N" not in key:
            print(key + " : " + str(count))
    print ("\t\tFrequencies: ")
    for key, freq in normalize_dict(mut_dict).items():
        if "N" not in key:
            print(key + " : " + str(freq))
def plot_mutation_patterns (left_dict, right_dict):
    """Grouped bar plot comparing the normalized mutation spectra of the left
    and right peaks.

    Both inputs are raw substitution counts; they are normalized here.
    Keys involving an N base are skipped.
    """
    left_freqs = normalize_dict (left_dict)
    right_freqs = normalize_dict (right_dict)
    names = []
    left_values = []
    right_values = []
    for mutation, freq in left_freqs.items():
        if "N" not in mutation:
            names.append(mutation)
            left_values.append(freq)
            right_values.append(right_freqs[mutation])
    # (removed: an unused throwaway DataFrame and unused peak/rect locals)
    index = np.arange(len(names))
    bar_width = 0.35
    opacity = 0.8
    fig, ax = plt.subplots()
    plt.bar(index, left_values, bar_width,
            alpha=opacity, color='b', label='Left')
    plt.bar(index + bar_width, right_values, bar_width,
            alpha=opacity, color='g', label='Right')
    plt.xlabel('Mutation')
    plt.ylabel('Frequency')
    plt.title('Mutation frequency')
    plt.xticks(index + bar_width, names)
    plt.legend()
    plt.tight_layout()
    plt.show()
def plot_mutation_patterns_single_peak (all_dict):
    """Bar plot of the normalized mutation spectrum for a single set of counts
    (no left/right split). Keys involving an N base are skipped."""
    freqs = normalize_dict (all_dict)
    names = []
    all_values = []
    for mutation, freq in freqs.items():
        if "N" not in mutation:
            names.append(mutation)
            all_values.append(freq)
    # (removed: an unused throwaway DataFrame and unused peak/rect locals)
    index = np.arange(len(names))
    bar_width = 0.35
    opacity = 0.8
    fig, ax = plt.subplots()
    plt.bar(index, all_values, bar_width,
            alpha=opacity, color='b', label='all')
    plt.xlabel('Mutation')
    plt.ylabel('Frequency')
    plt.title('Mutation frequency')
    plt.xticks(index + bar_width, names)
    plt.legend()
    plt.tight_layout()
    plt.show()
def compute_and_plot_mutation_patterns(table, mut_dict):
    """Tally substitutions separately for the left and right peaks of `table`
    and plot the two normalized spectra side by side."""
    left_counts = extractMutations(table[table['peak'] == "left"], mut_dict)
    right_counts = extractMutations(table[table['peak'] == "right"], mut_dict)
    plot_mutation_patterns(left_counts, right_counts)
def compute_and_plot_mutation_patterns_single_peak(table, mut_dict):
    """Tally substitutions over the whole of `table` and plot one spectrum."""
    counts = extractMutations(table, mut_dict)
    plot_mutation_patterns_single_peak(counts)
In [101]:
# Left vs right peak mutation spectra, replicate A, day 51.
compute_and_plot_mutation_patterns(DD51_A, mut_dict)
In [102]:
# Left vs right peak mutation spectra, replicate D, day 24.
compute_and_plot_mutation_patterns(DD24_D, mut_dict)
In [103]:
# Left vs right peak mutation spectra, TD51_A (day 51).
compute_and_plot_mutation_patterns(TD51_A, mut_dict)
In [109]:
# Mutation spectrum of the initial cirseq library (single peak, no left/right split).
compute_and_plot_mutation_patterns_single_peak(cirseq, mut_dict)
In [110]:
# Mutation spectrum of the non-reamplified day-51 replicate A library.
compute_and_plot_mutation_patterns_single_peak(DD51_A_no_reamp, mut_dict)
There are consistent differences between the mutation patterns of the left and right peaks. For instance, the frequencies of the AC and TG mutations are always larger in the left peak, and those of CA and GT always lower in the left peak. The hypothesis we have for explaining the left-peak mutations is that they come from errors during PCR amplification. A quick look at a paper on PCR errors (https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5457411/) does not seem to recover this mutation pattern, however, so I am a bit puzzled. A look at the library from the start (initial cirseq) or at the non-reamplified library recovers the same type of pattern as that obtained for the right peak. This suggests that the left peak really is different...
In [ ]:
# increasing_A / increasing_D (built earlier in the notebook) map positions whose
# minor-variant frequency rises monotonically through time in replicates A and D.
a = increasing_A.keys()
b = increasing_D.keys()
intersection = list(set(a) & set(b))
print ("Positions that increase in both control replicates A and D: ")
print(intersection)
print("... and that are non-synonymous at time 51, replicate A:")
for i in intersection:
    if DD51_A['is_synonymous'][i]=="non-synonymous":
        print (i)
In [ ]:
# Narrow the previous intersection further with replicate E.
# NOTE: relies on `intersection` computed in the previous cell.
a = intersection
b = increasing_E.keys()
intersection = list(set(a) & set(b))
print ("Positions that increase in control replicates A, D and E: ")
print(intersection)
print("... and that are non-synonymous at time 51, replicate A:")
for i in intersection:
    if DD51_A['is_synonymous'][i]=="non-synonymous":
        print (i)
In [ ]:
# Further intersect with the TLR3 line (increasing_TA, built earlier).
# NOTE: relies on `intersection` computed in the previous cell.
a = intersection
b = increasing_TA.keys()
intersection = list(set(a) & set(b))
print ("Positions that increase in control replicates A, D, E and TLR3: ")
print(intersection)
In [ ]:
# Pairwise intersection: control replicate A vs the TLR3 line.
a = increasing_A.keys()
b = increasing_TA.keys()
intersection = list(set(a) & set(b))
print ("Positions that increase in control replicate A,and TLR3: ")
print(intersection)
print("... and that are non-synonymous at time 51, replicate A:")
for i in intersection:
    if DD51_A['is_synonymous'][i]=="non-synonymous":
        print (i)
In [ ]:
# Pairwise intersection: control replicate D vs the TLR3 line.
a = increasing_D.keys()
b = increasing_TA.keys()
intersection = list(set(a) & set(b))
print ("Positions that increase in control replicate D,and TLR3: ")
print(intersection)
print("... and that are non-synonymous at time 24, replicate D:")
for i in intersection:
    if DD24_D['is_synonymous'][i]=="non-synonymous":
        print (i)
In [ ]:
def get_second_largest_base (li, cons):
    """Return the base (A/C/G/T) with the highest count once the consensus base
    is excluded, or "N" when no base stands out (all remaining counts equal).

    li   -- list of four counts in A, C, G, T order
    cons -- consensus base, possibly padded with spaces
    """
    # Work on a copy: the original implementation wrote -1 into the caller's
    # list, silently corrupting the counts for any later use.
    counts = list(li)
    consensus = cons.replace(" ", "")
    base_index = {"A": 0, "C": 1, "G": 2, "T": 3}
    if consensus in base_index:
        counts[base_index[consensus]] = -1
    if counts[0] == counts[1] == counts[2] == counts[3]:
        return "N"
    # ties resolve in A, C, G, T order, as before
    top = max(counts)
    if top == counts[0]:
        return "A"
    elif top == counts[1]:
        return "C"
    elif top == counts[2]:
        return "G"
    else :
        return "T"
def print_sequence_df (df):
    """Print the nucleotide and amino-acid sequences stored row-wise in `df`,
    echoing the consensus/variant amino acids of every third row (index labels
    where (index+1) % 3 == 0, i.e. presumed codon boundaries) along the way."""
    nucl_consensus = ""
    nucl_mutant = ""
    aa_consensus = ""
    aa_mutant = ""
    for index, row in df.iterrows():
        nucl_consensus += row['consensus_sequence']
        nucl_mutant += row['mutated_sequence']
        if (index + 1) % 3 == 0:
            # codon boundary: show this row's translated residues
            print(row['consensus_sequence_aa'])
            print(row['mutated_sequence_aa'])
        aa_consensus += row['consensus_sequence_aa'] + " "
        aa_mutant += row['mutated_sequence_aa'] + " "
    print ("Nucleotidic consensus:")
    print(nucl_consensus)
    print ("Nucleotidic variant:")
    print(nucl_mutant)
    print ("Amino acid consensus:")
    print(aa_consensus)
    print ("Amino acid variant:")
    print(aa_mutant)
def extract_sequences_around_position(position, dataframe, length):
    """Print the +/- `length` nucleotide context around `position`, together
    with the sequence carrying the second variant at that position.

    Assumes `dataframe` has a default RangeIndex so that the integer slice and
    the label `position` address the same rows — TODO confirm upstream.
    NOTE(review): this uses lowercase columns ('consensus_aa', 'secondbase_aa')
    whereas other cells use capitalized ones ('Consensus_aa'); presumably a
    differently formatted table — verify against the caller.
    """
    d = pd.DataFrame()
    d['consensus_sequence'] = dataframe['Major_variant'][position-length:position+length]
    d['consensus_sequence_aa'] = dataframe['consensus_aa'][position-length:position+length]
    d['mutated_sequence'] = dataframe['Major_variant'][position-length:position+length]
    # .loc instead of chained d['col'][position] = ...: chained assignment may
    # write to a temporary (SettingWithCopyWarning) and never sticks under
    # pandas copy-on-write
    d.loc[position, 'mutated_sequence'] = dataframe['Second_variant'][position]
    d['mutated_sequence_aa'] = dataframe['consensus_aa'][position-length:position+length]
    print(d['mutated_sequence_aa'][position])
    print(dataframe['secondbase_aa'][position])
    d.loc[position, 'mutated_sequence_aa'] = dataframe['secondbase_aa'][position]
    print(d['mutated_sequence_aa'][position])
    print("\n\n\tPOSITION: "+str(position))
    print_sequence_df(d)
In [ ]:
# Sequence context (+/- 20 nt) around position 2340 in replicate A, day 12.
extract_sequences_around_position(2340, DD12_A, 20)
In [ ]:
# Dump the nucleotide / amino-acid context around every position of interest.
for i in positions:
    extract_sequences_around_position(i, DD12_A, 20)
In [ ]:
# Export the four focal positions (316, 2340, 7172, 9165) from every sample into
# one table for sharing; the expName column records which sample each row is from.
sample_names = ["DD3_A", "DD6_A", "DD9_A", "DD12_A", "DD24_A", "DD51_A", "DD3_D",
                "DD6_D", "DD6_E", "DD9_D", "DD9_E", "DD12_D", "DD24_D", "TD9_A",
                "TD12_A", "TD24_A", "TD51_A"]
sample_tables = [DD3_A, DD6_A, DD9_A, DD12_A, DD24_A, DD51_A, DD3_D, DD6_D, DD6_E,
                 DD9_D, DD9_E, DD12_D, DD24_D, TD9_A, TD12_A, TD24_A, TD51_A]
focal_positions = [316, 2340, 7172, 9165]
pieces = []
expnames = []
for name, d in zip(sample_names, sample_tables):
    for pos in focal_positions:
        temp = d[d['Unnamed: 0'] == pos]
        if pos == 7172:
            temp_7172 = temp  # kept for the final debug print, as before
        pieces.append(temp)
        # Build the labels from the actual number of selected rows: the previous
        # hard-coded 68-entry list had to be kept manually in sync with the loop
        # and raised a length mismatch if any position was missing from a sample.
        expnames.extend([name] * len(temp))
toVincent = pd.concat(pieces)
toVincent['expName'] = expnames
toVincent.to_csv( "pos_316_2340_7172_9165.csv")
print(temp_7172)
In [ ]: