Here we plot the major base frequency for different time points. We also detect positions that consistently rise in frequency through time, i.e. positions that rise monotonously in frequency with time. Positions that only rise in frequency between the first and the last time points have not been investigated.
In [3]:
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.ticker as mticker
from pylab import rcParams
import seaborn as sns
from array import array
import numpy as np
from scipy.stats import ttest_ind
from scipy.stats import linregress
from scipy.stats import mannwhitneyu
%matplotlib inline
In [4]:
begins=[]
ends=[]
names =[]
with open ("sequence.gb") as f:
in_pep = False
for l in f:
if "mat_peptide" in l:
begins.append(int(l.split()[1].split("..")[0]))
ends.append(int(l.split()[1].split("..")[1]))
in_pep = True
elif in_pep :
names.append(l.split("=")[1])
in_pep = False
print(begins)
print(ends)
print(names)
In [5]:
# Interesting positions
positions=[316,1670,1785,2340,5935,7172,8449,9165]
def plot_positions():
for x in positions:
plt.axvline(x=x, linewidth=1, linestyle=':')
def plot_genes():
for i in range(len(begins)):
plt.plot([begins[i], begins[i]], [0.99,1.0], linewidth=2, linestyle='-', color="black")
if i%2==0:
plt.text (begins[i] + ((ends[i] - begins[i])/10), 1.005, (names[i].replace('"', ''))[0:3], size='xx-small')
else:
plt.text (begins[i] + ((ends[i] - begins[i])/10), 1.015, (names[i].replace('"', ''))[0:3], size='xx-small')
plt.plot([ends[-1], ends[-1]], [0.99,1.0], linewidth=2, linestyle='-', color="black")
In [6]:
def synonymous (row):
if row['null'] or (row['consensus_aa']==row['secondbase_aa'] ):
return "synonymous"
else:
return "non-synonymous"
def add_columns(table):
table['null'] = (table['secondbase_aa']).isnull()
table['is_synonymous'] = table.apply (lambda row: synonymous (row),axis=1)
table['1_major_variant_frequency'] = 1.0 - table['Major_variant_frequency_quality_corrected']
In [7]:
def is_increasing(minor_frequencies):
#print(minor_frequencies)
previous = minor_frequencies[0]
for m in range(1,len(minor_frequencies)):
if previous < minor_frequencies[m]:
#print(str(previous) + " < " + str(minor_frequencies[m]))
previous = minor_frequencies[m]
else:
return False
return True
def get_variant_frequency(variant, table, i):
sum_of_bases = table['As_quality_corrected'][i]+table['Cs_quality_corrected'][i]+table['Gs_quality_corrected'][i]+table['Ts_quality_corrected'][i]+table['Ns_quality_corrected'][i]
if variant == "A":
return table["As_quality_corrected"][i] / sum_of_bases
elif variant == "C":
return table["Cs_quality_corrected"][i] / sum_of_bases
elif variant == "G":
return table["Gs_quality_corrected"][i] / sum_of_bases
elif variant == "T":
return table["Ts_quality_corrected"][i] / sum_of_bases
else:
return np.nan
def get_increasing_variants(tables):
num_tables = len(tables)
first = tables[0]
last = tables[num_tables-1]
major = ""
minor = ""
major_frequencies = array('d',[0.0]*num_tables)
minor_frequencies = array('d',[0.0]*num_tables)
increasingVariants = dict()
for i in first["Position"]:
major = first["Major_variant"][i]
#print(last['Major_variant_frequency_quality_corrected'][i])
major_frequencies[0] = first['Major_variant_frequency_quality_corrected'][i]
if major == last["Major_variant"][i]:
minor = last["Second_variant"][i]
else:
minor = last["Major_variant"][i]
minor_frequencies[0] = get_variant_frequency(minor, first, i)
for table_id in range(1, num_tables):
major_frequencies[table_id] = get_variant_frequency(major, tables[table_id], i)
minor_frequencies[table_id] = get_variant_frequency(minor, tables[table_id], i)
if is_increasing(minor_frequencies):
increasingVariants[i] = [major_frequencies.tolist(), minor_frequencies.tolist()]
return increasingVariants
In [8]:
# CirSeq initial sample
cirseq = pd.read_csv ("HV5GLBCXY_ZIKV_17s006139-1-1_DREUX_lane1CirseqD3_1_sequence.txt.assembled.fastq_mapped_AA.csv", na_values=" -nan")
add_columns(cirseq)
In [9]:
# Control runs, replicate A
DD3_A = pd.read_csv ("HV5GLBCXY_ZIKV_17s006139-1-1_DREUX_lane1DD3A_1_sequence.txt.assembled.fastq_mapped_AA.csv", na_values=" -nan")
add_columns(DD3_A)
DD6_A = pd.read_csv ("HV5GLBCXY_ZIKV_17s006139-1-1_DREUX_lane1DD6A_1_sequence.txt.assembled.fastq_mapped_AA.csv", na_values=" -nan")
add_columns(DD6_A)
DD9_A = pd.read_csv ("HV5GLBCXY_ZIKV_17s006139-1-1_DREUX_lane1DD9A_1_sequence.txt.assembled.fastq_mapped_AA.csv", na_values=" -nan")
add_columns(DD9_A)
DD12_A = pd.read_csv ("HV5GLBCXY_ZIKV_17s006139-1-1_DREUX_lane1DD12A_1_sequence.txt.assembled.fastq_mapped_AA.csv", na_values=" -nan")
add_columns(DD12_A)
DD24_A = pd.read_csv ("HV5GLBCXY_ZIKV_17s006139-1-1_DREUX_lane1DD24A_1_sequence.txt.assembled.fastq_mapped_AA.csv", na_values=" -nan")
add_columns(DD24_A)
DD51_A = pd.read_csv ("HV5GLBCXY_ZIKV_17s006139-1-1_DREUX_lane1DD51A_1_sequence.txt.assembled.fastq_mapped_AA.csv", na_values=" -nan")
add_columns(DD51_A)
DD51_A_no_reamp = pd.read_csv ("HV5GLBCXY_ZIKV_17s006139-1-1_DREUX_lane1DD51Anoreamplification_1_sequence.txt.assembled.fastq_mapped_AA.csv", na_values=" -nan")
add_columns(DD51_A_no_reamp)
In [10]:
# Control runs, replicate D
DD3_D = pd.read_csv ("HV5GLBCXY_ZIKV_17s006139-1-1_DREUX_lane1DD3D_1_sequence.txt.assembled.fastq_mapped_AA.csv")
add_columns(DD3_D)
DD6_D = pd.read_csv ("HV5GLBCXY_ZIKV_17s006139-1-1_DREUX_lane1DD6D_1_sequence.txt.assembled.fastq_mapped_AA.csv")
add_columns(DD6_D)
DD9_D = pd.read_csv ("HV5GLBCXY_ZIKV_17s006139-1-1_DREUX_lane1DD9D_1_sequence.txt.assembled.fastq_mapped_AA.csv")
add_columns(DD9_D)
DD12_D = pd.read_csv ("HV5GLBCXY_ZIKV_17s006139-1-1_DREUX_lane1DD12D_1_sequence.txt.assembled.fastq_mapped_AA.csv")
add_columns(DD12_D)
DD24_D = pd.read_csv ("HV5GLBCXY_ZIKV_17s006139-1-1_DREUX_lane1DD24D_1_sequence.txt.assembled.fastq_mapped_AA.csv")
add_columns(DD24_D)
In [11]:
# Control runs, replicate E
DD6_E = pd.read_csv ("HV5GLBCXY_ZIKV_17s006139-1-1_DREUX_lane1DD6E_1_sequence.txt.assembled.fastq_mapped_AA.csv")
add_columns(DD6_E)
DD9_E = pd.read_csv ("HV5GLBCXY_ZIKV_17s006139-1-1_DREUX_lane1DD9E_1_sequence.txt.assembled.fastq_mapped_AA.csv")
add_columns(DD9_E)
In [12]:
# TLR3 activation runs, replicate A
TD9_A = pd.read_csv ("HV5GLBCXY_ZIKV_17s006139-1-1_DREUX_lane1TD9A_1_sequence.txt.assembled.fastq_mapped_AA.csv")
add_columns(TD9_A)
TD12_A = pd.read_csv ("HV5GLBCXY_ZIKV_17s006139-1-1_DREUX_lane1TD12A_1_sequence.txt.assembled.fastq_mapped_AA.csv")
add_columns(TD12_A)
TD24_A = pd.read_csv ("HV5GLBCXY_ZIKV_17s006139-1-1_DREUX_lane1TD24A_1_sequence.txt.assembled.fastq_mapped_AA.csv")
add_columns(TD24_A)
TD51_A = pd.read_csv ("HV5GLBCXY_ZIKV_17s006139-1-1_DREUX_lane1TD51A_1_sequence.txt.assembled.fastq_mapped_AA.csv")
add_columns(TD51_A)
In [13]:
#DD3_A.describe(include='all')
In [14]:
sns.lmplot( x="Position", y="Coverage", data=cirseq, fit_reg=False, legend=False, size=7, aspect=2, lowess=True,scatter_kws={"s": 20})
plt.legend(loc='lower right')
plot_positions()
plot_genes()
The coverage is very low.
In [15]:
sns.lmplot( x="Position", y="Major_variant_frequency_quality_corrected", data=cirseq, fit_reg=False, legend=False, size=7, aspect=2, lowess=True,scatter_kws={"s": 20})
plt.legend(loc='lower right')
plot_positions()
plot_genes()
In [16]:
cirseq['Position'][cirseq['Major_variant_frequency_quality_corrected']<0.95]
Out[16]:
There is already position 1785... What are its states?
In [17]:
print("Position 1785: ")
print ("Major variant: "+cirseq['Major_variant'][1785])
print ("Second variant: "+cirseq['Second_variant'][1785])
print ("Major variant, aa: "+cirseq['consensus_aa'][1785])
print ("Second variant, aa: "+cirseq['secondbase_aa'][1785])
print("Frequency of the major variant: "+str(cirseq['Major_variant_frequency_quality_corrected'][1785]))
print("Frequency of the second variant: "+str(cirseq['Second_variant_frequency_quality_corrected'][1785]))
In [18]:
cirseq['1_major_variant_frequency'] = 1.0 - cirseq['Major_variant_frequency_quality_corrected']
#f, ax = plt.subplots(figsize=(10, 7))
#ax.set(yscale="log")
lm=sns.lmplot( x="Position", y="1_major_variant_frequency", data=cirseq, fit_reg=False, legend=False, size=7, aspect=2, scatter_kws={"s": 10})
axes = lm.axes
axes[0,0].set_ylim(0.001,0.05)
axes[0,0].set(yscale="log")
plt.legend(loc='lower right')
Here the distribution of diversity seems unimodal.
In [19]:
table_for_correlations = pd.DataFrame ({'51A_Cov':DD51_A ['Coverage'],'51ANR_Cov':DD51_A_no_reamp['Coverage'], '51A_MV':DD51_A ['Major_variant_frequency_quality_corrected'], '51ANR_MV':DD51_A_no_reamp ['Major_variant_frequency_quality_corrected'] })
table_for_correlations_highCov = table_for_correlations.loc[ (table_for_correlations['51A_Cov'] > 1000) & (table_for_correlations['51ANR_Cov'] > 1000) ]
sns.pairplot(table_for_correlations_highCov, vars=['51A_Cov','51ANR_Cov','51A_MV','51ANR_MV'], kind="scatter")
Out[19]:
In [20]:
table_for_correlations_highFreq = table_for_correlations.loc[ (table_for_correlations['51A_MV'] > 0.1) & (table_for_correlations['51ANR_MV'] > 0.1) ]
lm=sns.lmplot(x='51A_MV',y='51ANR_MV', data=table_for_correlations_highFreq)
axes = lm.axes
#axes[0,0].set_ylim(0.001,0.05)
#axes[0,0].set(yscale="log", xscale="log")
res=linregress(table_for_correlations_highFreq['51A_MV'], table_for_correlations_highFreq['51ANR_MV'])
print(res)
There is a correlation between the frequencies of the major variants in the two libraries, but the correlation coefficient is not great (0.55).
In [21]:
table_for_correlations_highFreq = table_for_correlations.loc[ (table_for_correlations['51ANR_MV'] > 0.9) ]
lm=sns.lmplot(x='51A_MV',y='51ANR_MV', data=table_for_correlations_highFreq)
axes = lm.axes
#axes[0,0].set_ylim(0.001,0.05)
#axes[0,0].set(yscale="log")
res=linregress(table_for_correlations_highFreq['51A_MV'], table_for_correlations_highFreq['51ANR_MV'])
print(res)
The correlation is not great for positions where there is little variation. It is also surprising to see that lots of positions with frequencies between 0.9 and 1.0 in the non-reamplified library have a frequency of nearly 1.0 in the reamplified one. This seems to suggest that the amplification by PCR overly amplified the dominant variant in lots of cases.
In [22]:
table_for_correlations = pd.DataFrame ({'51A_Cov':DD51_A ['Coverage'],'51ANR_Cov':DD51_A_no_reamp['Coverage'], '51A_SV':DD51_A ['Second_variant_frequency_quality_corrected'], '51ANR_SV':DD51_A_no_reamp ['Second_variant_frequency_quality_corrected'] })
table_for_correlations_highCov = table_for_correlations.loc[ (table_for_correlations['51A_Cov'] > 1000) & (table_for_correlations['51ANR_Cov'] > 1000) ]
In [23]:
#table_for_correlations_highFreq = table_for_correlations.loc[ (table_for_correlations['51A_MV'] > 0.95) ]
lm=sns.lmplot(x='51A_SV',y='51ANR_SV', data=table_for_correlations_highCov)
axes = lm.axes
#axes[0,0].set_ylim(0.001,0.05)
#axes[0,0].set(yscale="log")
res=linregress(table_for_correlations_highCov['51A_SV'], table_for_correlations_highCov['51ANR_SV'])
print(res)
The correlation between the frequencies of the second most frequent variants is not too bad (correlation coefficient 0.63).
Positions that differ in the most frequent base:
In [24]:
DD51_A ['Position'][DD51_A ['Major_variant'] != DD51_A_no_reamp ['Major_variant']]
Out[24]:
Number of positions that differ in the second most frequent base:
In [25]:
len(DD51_A ['Position'][DD51_A ['Second_variant'] != DD51_A_no_reamp ['Second_variant']])
Out[25]:
In [26]:
sns.kdeplot(DD51_A ['Second_variant_frequency_quality_corrected'][DD51_A ['Second_variant'] != DD51_A_no_reamp ['Second_variant']])
Out[26]:
In [27]:
(DD51_A ['Second_variant_frequency_quality_corrected'][DD51_A ['Second_variant'] != DD51_A_no_reamp ['Second_variant']]).describe()
Out[27]:
The second most frequent variants that differ between with and without reamplification usually have a low frequency; for frequent variants, the correlation is not too bad.
In [28]:
DD51_A_no_reamp['1_major_variant_frequency'] = 1.0 - DD51_A_no_reamp['Major_variant_frequency_quality_corrected']
#f, ax = plt.subplots(figsize=(10, 7))
#ax.set(yscale="log")
lm=sns.lmplot( x="Position", y="1_major_variant_frequency", data=DD51_A_no_reamp, fit_reg=False, legend=False, size=7, aspect=2, scatter_kws={"s": 10})
axes = lm.axes
axes[0,0].set_ylim(0.001,0.05)
axes[0,0].set(yscale="log")
plt.legend(loc='lower right')
In [29]:
DD51_A['1_major_variant_frequency'] = 1.0 - DD51_A['Major_variant_frequency_quality_corrected']
#f, ax = plt.subplots(figsize=(10, 7))
#ax.set(yscale="log")
lm=sns.lmplot( x="Position", y="1_major_variant_frequency", data=DD51_A, fit_reg=False, legend=False, size=7, aspect=2, scatter_kws={"s": 10})
axes = lm.axes
axes[0,0].set_ylim(0.001,0.05)
axes[0,0].set(yscale="log")
plt.legend(loc='lower right')
In [30]:
f, ax = plt.subplots(figsize=(15, 7))
lm=sns.kdeplot(np.log(DD51_A["1_major_variant_frequency"]), label="DD51_A")
axes = lm.axes
axes.set_xlim(-8,-4)
lm=sns.kdeplot(np.log(DD51_A_no_reamp["1_major_variant_frequency"]), label="DD51_A_no_reamp")
In [31]:
tables_A = [DD3_A, DD6_A, DD9_A, DD12_A, DD24_A, DD51_A]
increasing_A = get_increasing_variants(tables_A)
print("There are "+str(len(increasing_A))+" positions that rise in frequency.")
print("Those are:")
print(increasing_A.keys())
In [32]:
tables_D = [DD3_D, DD6_D, DD9_D, DD12_D, DD24_D]
increasing_D = get_increasing_variants(tables_D)
print("There are "+str(len(increasing_D))+" positions that rise in frequency.")
print("Those are:")
print(increasing_D.keys())
In [33]:
tables_E = [DD6_E, DD9_E]
increasing_E = get_increasing_variants(tables_E)
print("There are "+str(len(increasing_E))+" positions that rise in frequency.")
print("There are too many of them, we choose not to print them.")
In [34]:
tables_TA = [TD9_A, TD12_A, TD24_A, TD51_A]
increasing_TA = get_increasing_variants(tables_TA)
print("There are "+str(len(increasing_TA))+" positions that rise in frequency.")
print("Those are:")
print(increasing_TA.keys())
Here we plot the frequency of the major variant at day 51 in the control experiment, A. We color the dots according to whether they were consistently rising in frequency or not.
In [35]:
sns.set_palette("hls")
sns.set_context("poster")
In [36]:
increasing_A_keys = increasing_A.keys()
is_increasing = []
for i in DD51_A['Position']:
if i in increasing_A_keys:
is_increasing.append("Increasing")
else:
is_increasing.append("Not increasing")
to_plot = pd.DataFrame ({'Position':DD51_A['Position'], 'Major_variant_frequency_quality_corrected':DD51_A ['Major_variant_frequency_quality_corrected'],'Is_increasing':is_increasing})
sns.lmplot( x="Position", y="Major_variant_frequency_quality_corrected", data=to_plot, fit_reg=False, hue='Is_increasing', legend=False, size=7, aspect=2)
#plt.legend(loc='lower right')
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
plot_positions()
#plot_genes()
Here we plot the increase in frequency of the second variant, over the course of the entire experiment, in the different replicates.
In [37]:
def plot_increase_per_increasing_position (increasing_list, last_table):
increasing_list_keys = increasing_list.keys()
increase = []
is_increasing = []
for i in last_table['Position']:
if i in increasing_list_keys:
increase.append(increasing_list[i][1][len(increasing_list[i][1])-1] - increasing_list[i][1][0])
is_increasing.append("Increasing")
else:
increase.append(0.0)
is_increasing.append("Not increasing")
to_plot = pd.DataFrame ({'Position':last_table['Position'], 'Increase':increase,'Is_increasing':is_increasing})
sns.lmplot( x="Position", y="Increase", data=to_plot, fit_reg=False, hue='Is_increasing', legend=False, size=7, aspect=2)
# plt.legend(loc='lower right')
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
plot_positions()
#plot_genes()
return
In [38]:
plot_increase_per_increasing_position (increasing_A, DD51_A)
In [39]:
plot_increase_per_increasing_position (increasing_D, DD24_D)
In [40]:
plot_increase_per_increasing_position (increasing_E, DD9_E)
In [41]:
plot_increase_per_increasing_position (increasing_TA, TD51_A)
Here we plot the frequencies of the major variants, for the different time points of an experiment.
In [42]:
overlay_table = pd.DataFrame ({'Position':DD3_A['Position'], 'DD3_A':DD3_A ['Major_variant_frequency_quality_corrected'], 'DD6_A':DD6_A ['Major_variant_frequency_quality_corrected'],'DD9_A':DD9_A ['Major_variant_frequency_quality_corrected'],'DD12_A':DD12_A ['Major_variant_frequency_quality_corrected'], 'DD24_A':DD24_A ['Major_variant_frequency_quality_corrected'], 'DD51_A':DD51_A ['Major_variant_frequency_quality_corrected']})
siz = len(DD3_A ['Position'])
sample = siz*["DD3A"]+siz*["DD6A"]+siz*["DD9A"]+siz*["DD12A"]+siz*["DD24A"]+siz*["DD51A"]
overlay_table_concat = pd.DataFrame ({'Position':pd.concat([DD3_A['Position'],DD6_A['Position'],DD9_A['Position'],DD12_A['Position'],DD24_A['Position'],DD51_A['Position']]), 'Major_variant_frequency_quality_corrected':pd.concat([DD3_A ['Major_variant_frequency_quality_corrected'],DD6_A ['Major_variant_frequency_quality_corrected'], DD9_A ['Major_variant_frequency_quality_corrected'],DD12_A ['Major_variant_frequency_quality_corrected'], DD24_A ['Major_variant_frequency_quality_corrected'], DD51_A ['Major_variant_frequency_quality_corrected'] ]), 'sample':sample})
In [43]:
sns.lmplot( x="Position", y="Major_variant_frequency_quality_corrected", data=overlay_table_concat, fit_reg=False, hue='sample', legend=False, size=7, aspect=2)
#plt.legend(loc='lower right')
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
plt.ylabel("Frequency of major variant")
plot_positions()
#plot_genes()
Same thing, smaller dots
In [44]:
sns.lmplot( x="Position", y="Major_variant_frequency_quality_corrected", data=overlay_table_concat, fit_reg=False, hue='sample', legend=False, size=7, aspect=2, scatter_kws={"s": 10})
#plt.legend(loc='lower right')
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
plt.ylabel("Frequency of major variant")
plot_positions()
#plot_genes()
In [45]:
overlay_table_concat_no51 = overlay_table_concat.loc[overlay_table_concat['sample']!= "DD51A"]
sns.lmplot( x="Position", y="Major_variant_frequency_quality_corrected", data= overlay_table_concat_no51, fit_reg=False, hue='sample', legend=False, size=7, aspect=2)
#plt.legend(loc='lower right')
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
plt.ylabel("Frequency of major variant")
plot_positions()
#plot_genes()
Here we want to know if over all positions, the diversity has increased between day 3 and day 51.
In [46]:
overlay_table_concat['1_major_variant_frequency'] = 1.0 - overlay_table_concat['Major_variant_frequency_quality_corrected']
#f, ax = plt.subplots(figsize=(10, 7))
#ax.set(yscale="log")
overlay_table_concat_nointer = overlay_table_concat.loc[ (overlay_table_concat['sample']!= "DD24A") & (overlay_table_concat['sample']!= "DD12A") & (overlay_table_concat['sample']!= "DD9A") & (overlay_table_concat['sample']!= "DD6A")]
lm=sns.lmplot( x="Position", y="1_major_variant_frequency", data=overlay_table_concat_nointer, fit_reg=False, hue='sample', legend=False, size=7, aspect=2, scatter_kws={"s": 10})
axes = lm.axes
axes[0,0].set_ylim(0.001,0.05)
axes[0,0].set(yscale="log")
#plt.legend(loc='lower right')
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
plot_positions()
#plot_genes()
In [47]:
overlay_table_concat_nointer = overlay_table_concat.loc[ (overlay_table_concat['sample']!= "DD51A") &(overlay_table_concat['sample']!= "DD24A") & (overlay_table_concat['sample']!= "DD12A") & (overlay_table_concat['sample']!= "DD9A") & (overlay_table_concat['sample']!= "DD6A")]
lm=sns.lmplot( x="Position", y="1_major_variant_frequency", data=overlay_table_concat_nointer, fit_reg=False, hue='sample', legend=False, size=7, aspect=2, scatter_kws={"s": 10})
axes = lm.axes
axes[0,0].set_ylim(0.001,0.05)
axes[0,0].set(yscale="log")
#plt.legend(loc='lower right')
#plot_positions()
#plot_genes()
Out[47]:
In [48]:
overlay_table_concat_nointer = overlay_table_concat.loc[ (overlay_table_concat['sample']!= "DD24A") & (overlay_table_concat['sample']!= "DD12A") & (overlay_table_concat['sample']!= "DD9A") & (overlay_table_concat['sample']!= "DD6A")]
f, ax = plt.subplots(figsize=(10, 7))
ax.set(yscale="log")
lm=sns.violinplot(x='sample', y="1_major_variant_frequency", data=overlay_table_concat_nointer, hue='sample', ax=ax, bw=0.2)
axes = lm.axes
axes.set_ylim(0.001,0.05)
axes.set(yscale="log")
medians = overlay_table_concat_nointer.groupby(['sample'])['1_major_variant_frequency'].median().values
plt.axhline(y=medians[0], linewidth=1, linestyle=':')
plt.axhline(y=medians[1], linewidth=1, linestyle=':')
print(medians)
syn = overlay_table_concat_nointer['1_major_variant_frequency'][overlay_table_concat_nointer['sample']=="DD3A"]
nonsyn = overlay_table_concat_nointer['1_major_variant_frequency'][overlay_table_concat_nointer['sample']=="DD51A"]
print("T-test p-value: "+str(ttest_ind(syn, nonsyn)[1]))
print((syn).median())
print((nonsyn).median())
# giving title to the plot
plt.title("Diversity through time");
In [49]:
f, ax = plt.subplots(figsize=(15, 7))
temp = overlay_table_concat.loc[ (overlay_table_concat['sample']!= "DD51A") & (overlay_table_concat['sample']!= "DD24A") & (overlay_table_concat['sample']!= "DD12A") & (overlay_table_concat['sample']!= "DD9A") & (overlay_table_concat['sample']!= "DD6A")]
del temp['sample']
del temp['Major_variant_frequency_quality_corrected']
del temp['Position']
#print(temp.describe())
#ax = sns.kdeplot("1_major_variant_frequency", data=temp)
lm=sns.kdeplot(np.log(temp["1_major_variant_frequency"]), label="DD3A")
lm.set_ylabel('Density')
lm.set_xlabel('Logarithm of the diversity per position')
axes = lm.axes
axes.set_xlim(-8,-4)
temp = overlay_table_concat.loc[ (overlay_table_concat['sample']!= "DD51A") & (overlay_table_concat['sample']!= "DD24A") & (overlay_table_concat['sample']!= "DD12A") & (overlay_table_concat['sample']!= "DD9A") & (overlay_table_concat['sample']!= "DD3A")]
del temp['sample']
del temp['Major_variant_frequency_quality_corrected']
del temp['Position']
#print(temp.describe())
#ax = sns.kdeplot("1_major_variant_frequency", data=temp)
lm=sns.kdeplot(np.log(temp["1_major_variant_frequency"]), label="DD6A")
temp = overlay_table_concat.loc[ (overlay_table_concat['sample']!= "DD51A") & (overlay_table_concat['sample']!= "DD24A") & (overlay_table_concat['sample']!= "DD12A") & (overlay_table_concat['sample']!= "DD6A") & (overlay_table_concat['sample']!= "DD3A")]
del temp['sample']
del temp['Major_variant_frequency_quality_corrected']
del temp['Position']
#print(temp.describe())
#ax = sns.kdeplot("1_major_variant_frequency", data=temp)
lm=sns.kdeplot(np.log(temp["1_major_variant_frequency"]), label="DD9A")
temp = overlay_table_concat.loc[ (overlay_table_concat['sample']!= "DD51A") & (overlay_table_concat['sample']!= "DD24A") & (overlay_table_concat['sample']!= "DD9A") & (overlay_table_concat['sample']!= "DD6A") & (overlay_table_concat['sample']!= "DD3A")]
del temp['sample']
del temp['Major_variant_frequency_quality_corrected']
del temp['Position']
#print(temp.describe())
#ax = sns.kdeplot("1_major_variant_frequency", data=temp)
lm=sns.kdeplot(np.log(temp["1_major_variant_frequency"]), label="DD12A")
temp = overlay_table_concat.loc[ (overlay_table_concat['sample']!= "DD51A") & (overlay_table_concat['sample']!= "DD12A") & (overlay_table_concat['sample']!= "DD9A") & (overlay_table_concat['sample']!= "DD6A") & (overlay_table_concat['sample']!= "DD3A")]
del temp['sample']
del temp['Major_variant_frequency_quality_corrected']
del temp['Position']
#print(temp.describe())
#ax = sns.kdeplot("1_major_variant_frequency", data=temp)
lm=sns.kdeplot(np.log(temp["1_major_variant_frequency"]), label="DD24A")
temp = overlay_table_concat.loc[ (overlay_table_concat['sample']!= "DD24A") & (overlay_table_concat['sample']!= "DD12A") & (overlay_table_concat['sample']!= "DD9A") & (overlay_table_concat['sample']!= "DD6A") & (overlay_table_concat['sample']!= "DD3A")]
del temp['sample']
del temp['Major_variant_frequency_quality_corrected']
del temp['Position']
#print(temp.describe())
#ax = sns.kdeplot("1_major_variant_frequency", data=temp)
lm=sns.kdeplot(np.log(temp["1_major_variant_frequency"]), label="DD51A")
In [50]:
overlay_table = pd.DataFrame ({'Position':DD3_D['Position'], 'DD3_D':DD3_D ['Major_variant_frequency_quality_corrected'], 'DD6_D':DD6_D ['Major_variant_frequency_quality_corrected'],'DD9_D':DD9_D ['Major_variant_frequency_quality_corrected'],'DD12_D':DD12_D ['Major_variant_frequency_quality_corrected'], 'DD24_D':DD24_D ['Major_variant_frequency_quality_corrected']})
siz = len(DD3_D ['Position'])
sample = siz*["DD3D"]+siz*["DD6D"]+siz*["DD9D"]+siz*["DD12D"]+siz*["DD24D"]
overlay_table_concat = pd.DataFrame ({'Position':pd.concat([DD3_D['Position'],DD6_D['Position'],DD9_D['Position'],DD12_D['Position'],DD24_D['Position']]), 'Major_variant_frequency_quality_corrected':pd.concat([DD3_D ['Major_variant_frequency_quality_corrected'],DD6_D ['Major_variant_frequency_quality_corrected'], DD9_D ['Major_variant_frequency_quality_corrected'],DD12_D ['Major_variant_frequency_quality_corrected'], DD24_D ['Major_variant_frequency_quality_corrected'] ]), 'sample':sample})
overlay_table_concat["1_major_variant_frequency"] = 1-overlay_table_concat['Major_variant_frequency_quality_corrected']
In [51]:
f, ax = plt.subplots(figsize=(15, 7))
temp = overlay_table_concat.loc[ (overlay_table_concat['sample']!= "DD24D") & (overlay_table_concat['sample']!= "DD12D") & (overlay_table_concat['sample']!= "DD9D") & (overlay_table_concat['sample']!= "DD6D")]
del temp['sample']
del temp['Major_variant_frequency_quality_corrected']
del temp['Position']
#print(temp.describe())
#ax = sns.kdeplot("1_major_variant_frequency", data=temp)
lm=sns.kdeplot(np.log(temp["1_major_variant_frequency"]), label="DD3D")
lm.set_ylabel('Density')
lm.set_xlabel('Logarithm of the diversity per position')
axes = lm.axes
axes.set_xlim(-8,-4)
temp = overlay_table_concat.loc[ (overlay_table_concat['sample']!= "DD24D") & (overlay_table_concat['sample']!= "DD12D") & (overlay_table_concat['sample']!= "DD9D") & (overlay_table_concat['sample']!= "DD3D")]
del temp['sample']
del temp['Major_variant_frequency_quality_corrected']
del temp['Position']
#print(temp.describe())
#ax = sns.kdeplot("1_major_variant_frequency", data=temp)
lm=sns.kdeplot(np.log(temp["1_major_variant_frequency"]), label="DD6D")
temp = overlay_table_concat.loc[ (overlay_table_concat['sample']!= "DD24D") & (overlay_table_concat['sample']!= "DD12D") & (overlay_table_concat['sample']!= "DD6D") & (overlay_table_concat['sample']!= "DD3D")]
del temp['sample']
del temp['Major_variant_frequency_quality_corrected']
del temp['Position']
#print(temp.describe())
#ax = sns.kdeplot("1_major_variant_frequency", data=temp)
lm=sns.kdeplot(np.log(temp["1_major_variant_frequency"]), label="DD9D")
temp = overlay_table_concat.loc[ (overlay_table_concat['sample']!= "DD24D") & (overlay_table_concat['sample']!= "DD9D") & (overlay_table_concat['sample']!= "DD6D") & (overlay_table_concat['sample']!= "DD3D")]
del temp['sample']
del temp['Major_variant_frequency_quality_corrected']
del temp['Position']
#print(temp.describe())
#ax = sns.kdeplot("1_major_variant_frequency", data=temp)
lm=sns.kdeplot(np.log(temp["1_major_variant_frequency"]), label="DD12D")
temp = overlay_table_concat.loc[ (overlay_table_concat['sample']!= "DD12D") & (overlay_table_concat['sample']!= "DD9D") & (overlay_table_concat['sample']!= "DD6D") & (overlay_table_concat['sample']!= "DD3D")]
del temp['sample']
del temp['Major_variant_frequency_quality_corrected']
del temp['Position']
#print(temp.describe())
#ax = sns.kdeplot("1_major_variant_frequency", data=temp)
lm=sns.kdeplot(np.log(temp["1_major_variant_frequency"]), label="DD24D")
In [52]:
overlay_table = pd.DataFrame ({'Position':TD9_A['Position'], 'TD9':TD9_A ['Major_variant_frequency_quality_corrected'], 'TD12':TD12_A ['Major_variant_frequency_quality_corrected'],'TD24':TD24_A ['Major_variant_frequency_quality_corrected'],'TD51':TD51_A ['Major_variant_frequency_quality_corrected'] })
siz = len(TD9_A ['Position'])
sample = siz*["TD9"]+siz*["TD12"]+siz*["TD24"]+siz*["TD51"]
overlay_table_concat = pd.DataFrame ({'Position':pd.concat([TD9_A['Position'], TD12_A['Position'], TD24_A['Position'], TD51_A['Position']]), 'Major_variant_frequency_quality_corrected':pd.concat([ TD9_A ['Major_variant_frequency_quality_corrected'], TD12_A ['Major_variant_frequency_quality_corrected'], TD24_A ['Major_variant_frequency_quality_corrected'], TD51_A ['Major_variant_frequency_quality_corrected'] ]), 'sample':sample})
overlay_table_concat["1_major_variant_frequency"] = 1-overlay_table_concat['Major_variant_frequency_quality_corrected']
In [53]:
f, ax = plt.subplots(figsize=(15, 7))
temp = overlay_table_concat.loc[ (overlay_table_concat['sample']!= "TD12") & (overlay_table_concat['sample']!= "TD24") & (overlay_table_concat['sample']!= "TD51") ]
del temp['sample']
del temp['Major_variant_frequency_quality_corrected']
del temp['Position']
#print(temp.describe())
#ax = sns.kdeplot("1_major_variant_frequency", data=temp)
lm=sns.kdeplot(np.log(temp["1_major_variant_frequency"]), label="TD9")
lm.set_ylabel('Density')
lm.set_xlabel('Logarithm of the diversity per position')
axes = lm.axes
axes.set_xlim(-8,-4)
temp = overlay_table_concat.loc[ (overlay_table_concat['sample']!= "TD9") & (overlay_table_concat['sample']!= "TD24") & (overlay_table_concat['sample']!= "TD51") ]
del temp['sample']
del temp['Major_variant_frequency_quality_corrected']
del temp['Position']
#print(temp.describe())
#ax = sns.kdeplot("1_major_variant_frequency", data=temp)
lm=sns.kdeplot(np.log(temp["1_major_variant_frequency"]), label="TD12")
temp = overlay_table_concat.loc[ (overlay_table_concat['sample']!= "TD9") & (overlay_table_concat['sample']!= "TD12") & (overlay_table_concat['sample']!= "TD51") ]
del temp['sample']
del temp['Major_variant_frequency_quality_corrected']
del temp['Position']
#print(temp.describe())
#ax = sns.kdeplot("1_major_variant_frequency", data=temp)
lm=sns.kdeplot(np.log(temp["1_major_variant_frequency"]), label="TD24")
temp = overlay_table_concat.loc[ (overlay_table_concat['sample']!= "TD9") & (overlay_table_concat['sample']!= "TD12") & (overlay_table_concat['sample']!= "TD24") ]
del temp['sample']
del temp['Major_variant_frequency_quality_corrected']
del temp['Position']
#print(temp.describe())
#ax = sns.kdeplot("1_major_variant_frequency", data=temp)
lm=sns.kdeplot(np.log(temp["1_major_variant_frequency"]), label="TD51")
In the analyses above, it would seem that the diversity, i.e. the amount of polymorphism, has increased a bit, as in Acevedo et al. 2014. However, presumably because we do not use Cirseq, we have a bimodal distribution of diversity, with possibly first a peak corresponding to sequencing errors, and second a peak corresponding to variants that do rise in frequency through time. In support of this hypothesis, the only CirSeq data we have, analyzed at the top of this document, does not show this bimodal distribution, and in the graphs above for the replicate A and the TLR3 treatment condition, only the variants with high frequency seem to increase in frequency through time. In the replicate D, it is not clear because the increase seems to be in both peaks.
Here we do the same type of analysis, but looking at all time points, not just 3 and 51.
In [54]:
f, ax = plt.subplots(figsize=(10, 7))
ax.set(yscale="log")
sns.violinplot(x='sample', y="1_major_variant_frequency", data=overlay_table_concat, ax=ax, bw=0.2)
# giving title to the plot
plt.title("Diversity through time");
medians = overlay_table_concat.groupby(['sample'], sort=False)['1_major_variant_frequency'].median()
print("Median values of the diversity per sample:" )
print(medians)
means = overlay_table_concat.groupby(['sample'], sort=False)['1_major_variant_frequency'].mean()
print("Mean values of the diversity per sample:" )
print(means)
The increase is no longer obvious, certainly not monotonous. If anything, what might be happening is that first diversity decreases from time points 3 to 9, then increases. If this is true, this could be compatible with a selective sweep in the first days, so that only some haplotypes carrying the variants with the high fitness increase in frequency.
In [55]:
overlay_table = pd.DataFrame ({'Position':DD3_D['Position'], 'DD3_D':DD3_D ['Major_variant_frequency_quality_corrected'], 'DD6_D':DD6_D ['Major_variant_frequency_quality_corrected'],'DD9_D':DD9_D ['Major_variant_frequency_quality_corrected'],'DD12_D':DD12_D ['Major_variant_frequency_quality_corrected'],'DD24_D':DD24_D ['Major_variant_frequency_quality_corrected']})
siz = len(DD3_D ['Position'])
sample = siz*["DD3D"]+siz*["DD6D"]+siz*["DD9D"]+siz*["DD12D"]+siz*["DD24D"]
overlay_table_concat = pd.DataFrame ({'Position':pd.concat([DD3_D['Position'],DD6_D['Position'],DD9_D['Position'],DD12_D['Position'],DD24_D['Position']]), 'Major_variant_frequency_quality_corrected':pd.concat([DD3_D ['Major_variant_frequency_quality_corrected'],DD6_D ['Major_variant_frequency_quality_corrected'], DD9_D ['Major_variant_frequency_quality_corrected'],DD12_D ['Major_variant_frequency_quality_corrected'], DD24_D ['Major_variant_frequency_quality_corrected'] ]), 'sample':sample})
overlay_table_concat['1_major_variant_frequency'] = 1.0 - overlay_table_concat['Major_variant_frequency_quality_corrected']
In [56]:
sns.lmplot( x="Position", y="Major_variant_frequency_quality_corrected", data=overlay_table_concat, fit_reg=False, hue='sample', legend=False, size=7, aspect=2)
#plt.legend(loc='lower right')
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
plt.ylabel("Frequency of major variant")
plot_positions()
#plot_genes()
In [57]:
f, ax = plt.subplots(figsize=(10, 7))
ax.set(yscale="log")
sns.violinplot(x='sample', y="1_major_variant_frequency", data=overlay_table_concat, ax=ax, bw=0.2)
# giving title to the plot
plt.title("Diversity through time");
medians = overlay_table_concat.groupby(['sample'], sort=False)['1_major_variant_frequency'].median()
print("Median values of the diversity per sample:" )
print(medians)
means = overlay_table_concat.groupby(['sample'], sort=False)['1_major_variant_frequency'].mean()
print("Mean values of the diversity per sample:" )
print(means)
In this case, it may be that the diversity increases then reaches some sort of plateau at about a median of 0.0047-0.0048.
In [58]:
siz = len(DD6_E ['Position'])
sample = siz*["DD6E"]+siz*["DD9E"]
overlay_table_concat = pd.DataFrame ({'Position':pd.concat([DD6_E['Position'],DD9_E['Position']]), 'Major_variant_frequency_quality_corrected':pd.concat([DD6_E ['Major_variant_frequency_quality_corrected'],DD9_E ['Major_variant_frequency_quality_corrected']]), 'sample':sample})
overlay_table_concat['1_major_variant_frequency'] = 1.0 - overlay_table_concat['Major_variant_frequency_quality_corrected']
In [59]:
sns.lmplot( x="Position", y="Major_variant_frequency_quality_corrected", data=overlay_table_concat, fit_reg=False, hue='sample', legend=False, size=7, aspect=2)
plt.ylabel("Frequency of major variant")
#plt.legend(loc='lower right')
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
plot_positions()
#plot_genes()
In [60]:
f, ax = plt.subplots(figsize=(10, 7))
ax.set(yscale="log")
sns.violinplot(x='sample', y="1_major_variant_frequency", data=overlay_table_concat, ax=ax, bw=0.2)
# giving title to the plot
plt.title("Diversity through time");
medians = overlay_table_concat.groupby(['sample'], sort=False)['1_major_variant_frequency'].median()
print("Median values of the diversity per sample:" )
print(medians)
means = overlay_table_concat.groupby(['sample'], sort=False)['1_major_variant_frequency'].mean()
print("Mean values of the diversity per sample:" )
print(means)
Although we only have two time points, the results are consistent with an increase in the first few days.
In [61]:
overlay_table = pd.DataFrame ({'Position':TD9_A['Position'], 'TD9_A':TD9_A ['Major_variant_frequency_quality_corrected'], 'TD12_A':TD12_A ['Major_variant_frequency_quality_corrected'],'TD24_A':TD24_A ['Major_variant_frequency_quality_corrected'], 'TD51_A':TD51_A ['Major_variant_frequency_quality_corrected']})
siz = len(TD9_A ['Position'])
sample = siz*["TD9A"]+siz*["TD12A"]+siz*["TD24A"]+siz*["TD51A"]
overlay_table_concat = pd.DataFrame ({'Position':pd.concat([TD9_A['Position'],TD12_A['Position'],TD24_A['Position'],TD51_A['Position']]), 'Major_variant_frequency_quality_corrected':pd.concat([TD9_A['Major_variant_frequency_quality_corrected'],TD12_A['Major_variant_frequency_quality_corrected'],TD24_A['Major_variant_frequency_quality_corrected'],TD51_A['Major_variant_frequency_quality_corrected'] ]), 'sample':sample})
overlay_table_concat['1_major_variant_frequency'] = 1.0 - overlay_table_concat['Major_variant_frequency_quality_corrected']
In [62]:
sns.lmplot( x="Position", y="Major_variant_frequency_quality_corrected", data=overlay_table_concat, fit_reg=False, hue='sample', legend=False, size=7, aspect=2)
plt.ylabel("Frequency of major variant")
#plt.legend(loc='lower right')
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
plot_positions()
#plot_genes()
In [63]:
f, ax = plt.subplots(figsize=(10, 7))
ax.set(yscale="log")
sns.violinplot(x='sample', y="1_major_variant_frequency", data=overlay_table_concat, ax=ax, bw=0.2)
# giving title to the plot
plt.title("Diversity through time");
medians = overlay_table_concat.groupby(['sample'], sort=False)['1_major_variant_frequency'].median()
print("Median values of the diversity per sample:" )
print(medians)
means = overlay_table_concat.groupby(['sample'], sort=False)['1_major_variant_frequency'].mean()
print("Mean values of the diversity per sample:" )
print(means)
The diversity initially increases, but then it may be decreasing between 24 and 51. Not clear with few time points and no replicate.
In [64]:
# construction of the data table
siz = len(DD3_A ['Coverage'])
sample = siz*["DD3A"]+siz*["DD6A"]+siz*["DD9A"]+siz*["DD12A"]+siz*["DD24A"]+siz*["DD51A"]
overlay_table_concat_DDA = pd.DataFrame ({'Position':pd.concat([DD3_A['Position'],DD6_A['Position'],DD9_A['Position'],DD12_A['Position'],DD24_A['Position'],DD51_A['Position']]), 'Coverage':pd.concat([DD3_A ['Coverage'],DD6_A ['Coverage'], DD9_A ['Coverage'],DD12_A ['Coverage'], DD24_A ['Coverage'], DD51_A ['Coverage'] ]), 'sample':sample})
siz = len(DD3_D ['Coverage'])
sample = siz*["DD3D"]+siz*["DD6D"]+siz*["DD9D"]+siz*["DD12D"]+siz*["DD24D"]
overlay_table_concat_DDD = pd.DataFrame ({'Position':pd.concat([DD3_D['Position'],DD6_D['Position'],DD9_D['Position'],DD12_D['Position'],DD24_D['Position']]), 'Coverage':pd.concat([DD3_D ['Coverage'],DD6_D ['Coverage'], DD9_D ['Coverage'],DD12_D ['Coverage'], DD24_D ['Coverage'] ]), 'sample':sample})
siz = len(DD6_E ['Coverage'])
sample = siz*["DD6E"]+siz*["DD9E"]
overlay_table_concat_DDE = pd.DataFrame ({'Position':pd.concat([DD6_E['Position'],DD9_E['Position']]), 'Coverage':pd.concat([DD6_E ['Coverage'],DD9_E ['Coverage']]), 'sample':sample})
siz = len(TD9_A ['Coverage'])
sample = siz*["TD9A"]+siz*["TD12A"]+siz*["TD24A"]+siz*["TD51A"]
overlay_table_concat_TD = pd.DataFrame ({'Position':pd.concat([TD9_A['Position'],TD12_A['Position'],TD24_A['Position'],TD51_A['Position']]), 'Coverage':pd.concat([TD9_A['Coverage'],TD12_A['Coverage'],TD24_A['Coverage'],TD51_A['Coverage'] ]), 'sample':sample})
overlay_table_concat=pd.concat([overlay_table_concat_DDA,overlay_table_concat_DDD, overlay_table_concat_DDE, overlay_table_concat_TD])
In [65]:
class MathTextSciFormatter(mticker.Formatter):
def __init__(self, fmt="%1.2e"):
self.fmt = fmt
def __call__(self, x, pos=None):
s = self.fmt % x
decimal_point = '.'
positive_sign = '+'
tup = s.split('e')
significand = tup[0].rstrip(decimal_point)
sign = tup[1][0].replace(positive_sign, '')
exponent = tup[1][1:].lstrip('0')
if exponent:
exponent = '10^{%s%s}' % (sign, exponent)
if significand and exponent:
s = r'%s{\times}%s' % (significand, exponent)
else:
s = r'%s%s' % (significand, exponent)
return "${}$".format(s)
markers = ['x','o','v','^','<', '+', 'x','o','v','^','<', '+', 'x','o','v','^','<']
sns.lmplot( x="Position", y="Coverage", data=overlay_table_concat, fit_reg=False, hue='sample', legend=False, size=7, aspect=2, lowess=True, markers=markers, scatter_kws={"s": 20})
# Format with 2 decimal places
plt.gca().yaxis.set_major_formatter(MathTextSciFormatter("%1.2e"))
#plt.legend(loc='lower right')
plt.legend(bbox_to_anchor=(0., 1.02, 1., .102), loc=3, mode="expand", borderaxespad=0., ncol=4)
plot_positions()
#plot_genes()
In [66]:
siz = len(DD3_A ['Coverage'])
table_for_correlations = pd.DataFrame ({'DD3A':DD3_A ['Coverage'],'DD6_A':DD6_A['Coverage'], 'DD9_A':DD9_A ['Coverage'], 'DD12_A':DD12_A ['Coverage'], 'DD24_A':DD24_A ['Coverage'], 'DD51_A':DD51_A ['Coverage'],
'DD3_D':DD3_D ['Coverage'],'DD6_D':DD6_D ['Coverage'], 'DD9_D':DD9_D ['Coverage'],'DD12_D':DD12_D ['Coverage'], 'DD24_D':DD24_D ['Coverage'],
'DD6_E':DD6_E ['Coverage'], 'DD9_E':DD9_E ['Coverage'],
'TD9_A':TD9_A['Coverage'],'TD12_A':TD12_A['Coverage'],'TD24_A':TD24_A['Coverage'],'TD51_A':TD51_A['Coverage'] })
In [67]:
sns.pairplot(table_for_correlations, vars=['DD3A','DD3_D','DD6_E','TD9_A'], kind="scatter")
Out[67]:
In [68]:
sns.pairplot(table_for_correlations, vars=['DD51_A','DD24_D','DD9_E','TD51_A'], kind="scatter")
Out[68]:
Coverage values between samples seem to be quite correlated, although I have not plotted all pairwise correlations.
In [69]:
# construction of the data table
siz = len(DD3_A ['Coverage'])
sample = len(DD3_A ['Coverage'])*["DD3A"]+len(DD3_D ['Coverage'])*["DD3D"]
overlay_table_concat_DD3 = pd.DataFrame ({'Position':pd.concat([DD3_A['Position'],DD3_D['Position']]), 'Major_variant_frequency_quality_corrected':pd.concat([DD3_A ['Major_variant_frequency_quality_corrected'],DD3_D ['Major_variant_frequency_quality_corrected']]), 'sample':sample})
In [70]:
sns.lmplot( x="Position", y="Major_variant_frequency_quality_corrected", data=overlay_table_concat_DD3, fit_reg=False, hue='sample', legend=False, size=7, aspect=2, lowess=True)
plt.legend(loc='lower right')
plot_positions()
plot_genes()
In [71]:
# construction of the data table
siz = len(DD6_A ['Coverage'])
sample = len(DD6_A ['Coverage'])*["DD6A"]+len(DD6_D ['Coverage'])*["DD6D"]+len(DD6_E ['Coverage'])*["DD6E"]
overlay_table_concat_DD6 = pd.DataFrame ({'Position':pd.concat([DD6_A['Position'],DD6_D['Position'],DD6_E['Position']]), 'Major_variant_frequency_quality_corrected':pd.concat([DD6_A ['Major_variant_frequency_quality_corrected'],DD6_D ['Major_variant_frequency_quality_corrected'],DD6_E ['Major_variant_frequency_quality_corrected']]), 'sample':sample})
In [72]:
sns.lmplot( x="Position", y="Major_variant_frequency_quality_corrected", data=overlay_table_concat_DD6, fit_reg=False, hue='sample', legend=False, size=7, aspect=2, lowess=True)
plt.legend(loc='lower right')
plot_positions()
plot_genes()
In [73]:
# construction of the data table
siz = len(DD9_A ['Coverage'])
sample = len(DD9_A ['Coverage'])*["DD9A"]+len(DD9_D ['Coverage'])*["DD9D"]+len(DD9_E ['Coverage'])*["DD9E"]
overlay_table_concat_DD9 = pd.DataFrame ({'Position':pd.concat([DD9_A['Position'],DD9_D['Position'],DD9_E['Position']]), 'Major_variant_frequency_quality_corrected':pd.concat([DD9_A ['Major_variant_frequency_quality_corrected'],DD9_D ['Major_variant_frequency_quality_corrected'],DD9_E ['Major_variant_frequency_quality_corrected']]), 'sample':sample})
In [74]:
sns.lmplot( x="Position", y="Major_variant_frequency_quality_corrected", data=overlay_table_concat_DD9, fit_reg=False, hue='sample', legend=False, size=7, aspect=2, lowess=True)
plt.legend(loc='lower right')
plot_positions()
#plot_genes()
In [75]:
# construction of the data table
siz = len(DD12_A ['Coverage'])
sample = len(DD12_A ['Coverage'])*["DD12A"]+len(DD12_D ['Coverage'])*["DD12D"]
overlay_table_concat_DD12 = pd.DataFrame ({'Position':pd.concat([DD12_A['Position'],DD12_D['Position']]), 'Major_variant_frequency_quality_corrected':pd.concat([DD12_A ['Major_variant_frequency_quality_corrected'],DD12_D ['Major_variant_frequency_quality_corrected']]), 'sample':sample})
In [76]:
sns.lmplot( x="Position", y="Major_variant_frequency_quality_corrected", data=overlay_table_concat_DD12, fit_reg=False, hue='sample', legend=False, size=7, aspect=2, lowess=True)
plt.legend(loc='lower right')
plot_positions()
#plot_genes()
In [77]:
# construction of the data table
siz = len(DD24_A ['Coverage'])
sample = len(DD24_A ['Coverage'])*["DD24A"]+len(DD24_D ['Coverage'])*["DD24D"]
overlay_table_concat_DD24 = pd.DataFrame ({'Position':pd.concat([DD24_A['Position'],DD24_D['Position']]), 'Major_variant_frequency_quality_corrected':pd.concat([DD24_A ['Major_variant_frequency_quality_corrected'],DD24_D ['Major_variant_frequency_quality_corrected']]), 'sample':sample})
In [78]:
sns.lmplot( x="Position", y="Major_variant_frequency_quality_corrected", data=overlay_table_concat_DD24, fit_reg=False, hue='sample', legend=False, size=7, aspect=2, lowess=True)
plt.legend(loc='lower right')
plot_positions()
#plot_genes()
In [79]:
sns.lmplot( x="Position", y="Major_variant_frequency_quality_corrected", data=DD3_A, fit_reg=False, hue='is_synonymous', legend=False, size=7, aspect=2) #, markers=DD3_A['synonymous'])
plt.legend(loc='lower right')
plot_positions()
#plot_genes()
# giving title to the plot
#plt.title("Synonymous and non-synonymous polymorphisms, experiment A day 3");
In [80]:
sns.lmplot( x="Position", y="Major_variant_frequency_quality_corrected", data=DD6_A, fit_reg=False, hue='is_synonymous', legend=False, size=7, aspect=2) #, markers=DD3_A['synonymous'])
plt.legend(loc='lower right')
plot_positions()
#plot_genes()
In [81]:
sns.lmplot( x="Position", y="Major_variant_frequency_quality_corrected", data=DD9_A, fit_reg=False, hue='is_synonymous', legend=False, size=7, aspect=2) #, markers=DD3_A['synonymous'])
plt.legend(loc='lower right')
plot_positions()
#plot_genes()
In [82]:
sns.lmplot( x="Position", y="Major_variant_frequency_quality_corrected", data=DD12_A, fit_reg=False, hue='is_synonymous', legend=False, size=7, aspect=2) #, markers=DD3_A['synonymous'])
plt.legend(loc='lower right')
plot_positions()
#plot_genes()
In [83]:
sns.lmplot( x="Position", y="Major_variant_frequency_quality_corrected", data=DD24_A, fit_reg=False, hue='is_synonymous', legend=False, size=7, aspect=2) #, markers=DD3_A['synonymous'])
plt.legend(loc='lower right')
plot_positions()
#plot_genes()
# giving title to the plot
#plt.title("Synonymous and non-synonymous polymorphisms, experiment A day 3");
In [84]:
sns.lmplot( x="Position", y="Major_variant_frequency_quality_corrected", data=DD51_A, fit_reg=False, hue='is_synonymous', legend=False, size=7, aspect=2) #, markers=DD3_A['synonymous'])
plt.legend(loc='lower right')
plot_positions()
#plot_genes()
# giving title to the plot
#plt.title("Synonymous and non-synonymous polymorphisms, experiment A day 3");
In [85]:
sns.lmplot( x="Position", y="Major_variant_frequency_quality_corrected", data=DD24_D, fit_reg=False, hue='is_synonymous', legend=False, size=7, aspect=2) #, markers=DD3_A['synonymous'])
plt.legend(loc='lower right')
plot_positions()
#plot_genes()
# giving title to the plot
#plt.title("Synonymous and non-synonymous polymorphisms, experiment A day 3");
In [86]:
lm=sns.lmplot( x="Position", y="Second_variant_frequency_quality_corrected", data=DD51_A, fit_reg=False, hue='is_synonymous', legend=False, size=7, aspect=2, scatter_kws={"s": 10})
axes = lm.axes
axes[0,0].set_ylim(0.001,0.05)
axes[0,0].set(yscale="log")
plt.legend(loc='lower right')
#sns.lmplot( x="position", y="majorbase_ratio", data=DD51_A, fit_reg=False, hue='synonymous', legend=False, size=7, aspect=2) #, markers=DD3_A['synonymous'])
#plt.legend(loc='lower right')
Out[86]:
In [87]:
print( DD51_A.groupby(['is_synonymous'])['Second_variant_frequency_quality_corrected'].median())
syn = DD51_A['Second_variant_frequency_quality_corrected'][DD51_A['is_synonymous']=="synonymous"]
nonsyn = DD51_A['Second_variant_frequency_quality_corrected'][DD51_A['is_synonymous']=="non-synonymous"]
print("T-test p-value: "+str(ttest_ind(syn, nonsyn)[1]))
f, ax = plt.subplots(figsize=(10, 7))
ax.set(yscale="log")
lm=sns.violinplot(x='is_synonymous', y="Second_variant_frequency_quality_corrected", data=DD51_A, hue='is_synonymous', ax=ax, bw=0.2)
axes = lm.axes
axes.set_ylim(0.001,0.05)
axes.set(yscale="log")
medians = DD51_A.groupby(['is_synonymous'])['Second_variant_frequency_quality_corrected'].median().values
plt.axhline(y=medians[0], linewidth=1, linestyle=':')
plt.axhline(y=medians[1], linewidth=1, linestyle=':')
# giving title to the plot
plt.title("Frequency of the second variant, synonymous vs non-synonymous, day 51")
Out[87]:
The difference in frequencies of the variants do not differ significantly between synonymous and non-synonymous variants at time point 51.
In [88]:
print( DD24_A.groupby(['is_synonymous'])['Second_variant_frequency_quality_corrected'].median())
syn = DD24_A['Second_variant_frequency_quality_corrected'][DD24_A['is_synonymous']=="synonymous"]
nonsyn = DD24_A['Second_variant_frequency_quality_corrected'][DD24_A['is_synonymous']=="non-synonymous"]
print("T-test p-value: "+str(ttest_ind(syn, nonsyn)[1]))
f, ax = plt.subplots(figsize=(10, 7))
ax.set(yscale="log")
lm=sns.violinplot(x='is_synonymous', y="Second_variant_frequency_quality_corrected", data=DD24_A, hue='is_synonymous', ax=ax, bw=0.2)
axes = lm.axes
axes.set_ylim(0.001,0.05)
axes.set(yscale="log")
medians = DD24_A.groupby(['is_synonymous'])['Second_variant_frequency_quality_corrected'].median().values
plt.axhline(y=medians[0], linewidth=1, linestyle=':')
plt.axhline(y=medians[1], linewidth=1, linestyle=':')
# giving title to the plot
plt.title("Frequency of the second variant, synonymous vs non-synonymous, day 24");
The difference in frequencies of the variants do not differ significantly between synonymous and non-synonymous variants at time point 24.
In [89]:
print( DD12_A.groupby(['is_synonymous'])['Second_variant_frequency_quality_corrected'].median())
syn = DD12_A['Second_variant_frequency_quality_corrected'][DD12_A['is_synonymous']=="synonymous"]
nonsyn = DD12_A['Second_variant_frequency_quality_corrected'][DD12_A['is_synonymous']=="non-synonymous"]
print("T-test p-value: "+str(ttest_ind(syn, nonsyn)[1]))
f, ax = plt.subplots(figsize=(10, 7))
ax.set(yscale="log")
lm=sns.violinplot(x='is_synonymous', y="Second_variant_frequency_quality_corrected", data=DD12_A, hue='is_synonymous', ax=ax, bw=0.2)
axes = lm.axes
axes.set_ylim(0.001,0.05)
axes.set(yscale="log")
medians = DD12_A.groupby(['is_synonymous'])['Second_variant_frequency_quality_corrected'].median().values
plt.axhline(y=medians[0], linewidth=1, linestyle=':')
plt.axhline(y=medians[1], linewidth=1, linestyle=':')
# giving title to the plot
plt.title("Frequency of the second variant, synonymous vs non-synonymous, day 12");
The difference in frequencies of the variants do not differ significantly between synonymous and non-synonymous variants at time point 12.
In [90]:
print( DD9_A.groupby(['is_synonymous'])['Second_variant_frequency_quality_corrected'].median())
syn = DD9_A['Second_variant_frequency_quality_corrected'][DD9_A['is_synonymous']=="synonymous"]
nonsyn = DD9_A['Second_variant_frequency_quality_corrected'][DD9_A['is_synonymous']=="non-synonymous"]
print("T-test p-value: "+str(ttest_ind(syn, nonsyn)[1]))
f, ax = plt.subplots(figsize=(10, 7))
ax.set(yscale="log")
lm=sns.violinplot(x='is_synonymous', y="Second_variant_frequency_quality_corrected", data=DD9_A, hue='is_synonymous', ax=ax, bw=0.2)
axes = lm.axes
axes.set_ylim(0.001,0.05)
axes.set(yscale="log")
medians = DD9_A.groupby(['is_synonymous'])['Second_variant_frequency_quality_corrected'].median().values
plt.axhline(y=medians[0], linewidth=1, linestyle=':')
plt.axhline(y=medians[1], linewidth=1, linestyle=':')
# giving title to the plot
plt.title("Frequency of the second variant, synonymous vs non-synonymous, day 9");
The difference in frequencies of the variants do not differ significantly between synonymous and non-synonymous variants at time point 9.
In [91]:
print( DD6_A.groupby(['is_synonymous'])['Second_variant_frequency_quality_corrected'].median())
syn = DD6_A['Second_variant_frequency_quality_corrected'][DD6_A['is_synonymous']=="synonymous"]
nonsyn = DD6_A['Second_variant_frequency_quality_corrected'][DD6_A['is_synonymous']=="non-synonymous"]
print("T-test p-value: "+str(ttest_ind(syn, nonsyn)[1]))
f, ax = plt.subplots(figsize=(10, 7))
ax.set(yscale="log")
lm=sns.violinplot(x='is_synonymous', y="Second_variant_frequency_quality_corrected", data=DD6_A, hue='is_synonymous', ax=ax, bw=0.2)
axes = lm.axes
axes.set_ylim(0.001,0.05)
axes.set(yscale="log")
medians = DD6_A.groupby(['is_synonymous'])['Second_variant_frequency_quality_corrected'].median().values
plt.axhline(y=medians[0], linewidth=1, linestyle=':')
plt.axhline(y=medians[1], linewidth=1, linestyle=':')
# giving title to the plot
plt.title("Frequency of the second variant, synonymous vs non-synonymous, day 6");
The difference in frequencies of the variants do not differ significantly between synonymous and non-synonymous variants at time point 6.
In [92]:
print( DD3_A.groupby(['is_synonymous'])['Second_variant_frequency_quality_corrected'].median())
syn = DD3_A['Second_variant_frequency_quality_corrected'][DD3_A['is_synonymous']=="synonymous"]
nonsyn = DD3_A['Second_variant_frequency_quality_corrected'][DD3_A['is_synonymous']=="non-synonymous"]
print("T-test p-value: "+str(ttest_ind(syn, nonsyn)[1]))
f, ax = plt.subplots(figsize=(10, 7))
ax.set(yscale="log")
lm=sns.violinplot(x='is_synonymous', y="Second_variant_frequency_quality_corrected", data=DD3_A, hue='is_synonymous', ax=ax, bw=0.2)
axes = lm.axes
axes.set_ylim(0.001,0.05)
axes.set(yscale="log")
medians = DD3_A.groupby(['is_synonymous'])['Second_variant_frequency_quality_corrected'].median().values
plt.axhline(y=medians[0], linewidth=1, linestyle=':')
plt.axhline(y=medians[1], linewidth=1, linestyle=':')
# giving title to the plot
plt.title("Frequency of the second variant, synonymous vs non-synonymous, day 3");