Analysis of the libraries

Here we plot the major base frequency at the different time points. We also detect positions that consistently rise in frequency through time, i.e. positions whose variant frequency increases monotonically with time. Positions that only rise in frequency between the first and the last time points have not been investigated.
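
As a minimal sketch of this criterion (with made-up frequency values), a position qualifies only when its frequency increases strictly at every consecutive time point:

import numpy as np

# Hypothetical per-time-point frequencies for one position
freqs = [0.01, 0.02, 0.05, 0.11]

# Strictly increasing at every step, not just between the first and last time points
print(bool(np.all(np.diff(freqs) > 0)))  # True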


In [3]:
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.ticker as mticker
from pylab import rcParams
import seaborn as sns
from array import array
import numpy as np
from scipy.stats import ttest_ind
from scipy.stats import linregress
from scipy.stats import mannwhitneyu
%matplotlib inline

Obtaining the sequence annotation


In [4]:
# Parse the GenBank annotation: collect the start and end coordinates
# and the product names of the mature peptides (mat_peptide features)
begins = []
ends = []
names = []
with open("sequence.gb") as f:
    in_pep = False
    for l in f:
        if "mat_peptide" in l:
            # The coordinate field looks like "108..473"
            begins.append(int(l.split()[1].split("..")[0]))
            ends.append(int(l.split()[1].split("..")[1]))
            in_pep = True
        elif in_pep:
            # The line following the coordinates carries the product name
            names.append(l.split("=")[1])
            in_pep = False
            
print(begins)
print(ends)
print(names)


[108, 474, 753, 978, 2490, 3546, 4224, 4614, 6465, 6846, 6915, 7668]
[473, 752, 977, 2489, 3545, 4223, 4613, 6464, 6845, 6914, 7667, 10376]
['"capsid"\n', '"propeptide"\n', '"membrane"\n', '"envelope"\n', '"NS1"\n', '"NS2A"\n', '"NS2B"\n', '"NS3"\n', '"NS4A"\n', '"2K"\n', '"NS4B"\n', '"NS5"\n']

Functions to plot interesting positions and gene boundaries


In [5]:
# Positions of interest, highlighted on the plots below
positions = [316, 1670, 1785, 2340, 5935, 7172, 8449, 9165]

def plot_positions():
    # Draw a dotted vertical line at each position of interest
    for x in positions:
        plt.axvline(x=x, linewidth=1, linestyle=':')

def plot_genes():
    # Mark each gene boundary and label it with the first three letters of the
    # product name, alternating the label height to limit overlaps
    for i in range(len(begins)):
        plt.plot([begins[i], begins[i]], [0.99, 1.0], linewidth=2, linestyle='-', color="black")
        y = 1.005 if i % 2 == 0 else 1.015
        plt.text(begins[i] + (ends[i] - begins[i]) / 10, y, names[i].replace('"', '')[0:3], size='xx-small')
    plt.plot([ends[-1], ends[-1]], [0.99, 1.0], linewidth=2, linestyle='-', color="black")

Functions to add useful columns to the tables


In [6]:
def synonymous(row):
    # A position is labelled synonymous when there is no second variant,
    # or when the second variant encodes the same amino acid as the consensus
    if row['null'] or (row['consensus_aa'] == row['secondbase_aa']):
        return "synonymous"
    else:
        return "non-synonymous"

def add_columns(table):
    # Flag positions without a second variant, annotate synonymy,
    # and add 1 - major variant frequency as a simple measure of diversity
    table['null'] = table['secondbase_aa'].isnull()
    table['is_synonymous'] = table.apply(synonymous, axis=1)
    table['1_major_variant_frequency'] = 1.0 - table['Major_variant_frequency_quality_corrected']

Functions to detect variants increasing in frequency


In [7]:
def is_increasing(minor_frequencies):
    # True if the frequency rises strictly at every consecutive time point
    previous = minor_frequencies[0]
    for m in range(1, len(minor_frequencies)):
        if previous < minor_frequencies[m]:
            previous = minor_frequencies[m]
        else:
            return False
    return True

def get_variant_frequency(variant, table, i):
    # Frequency of a given base at position i, out of all quality-corrected base counts
    sum_of_bases = table['As_quality_corrected'][i]+table['Cs_quality_corrected'][i]+table['Gs_quality_corrected'][i]+table['Ts_quality_corrected'][i]+table['Ns_quality_corrected'][i]
    if variant == "A":
        return table["As_quality_corrected"][i] / sum_of_bases
    elif variant == "C":
        return table["Cs_quality_corrected"][i] / sum_of_bases
    elif variant == "G":
        return table["Gs_quality_corrected"][i] / sum_of_bases
    elif variant == "T":
        return table["Ts_quality_corrected"][i] / sum_of_bases
    else:
        return np.nan


def get_increasing_variants(tables):
    # For each position, follow the frequency of the variant that ends up major
    # (or second) at the last time point, and keep the position when that
    # frequency increases monotonically across all time points
    num_tables = len(tables)
    first = tables[0]
    last = tables[num_tables-1]
    major = ""
    minor = ""
    major_frequencies = array('d', [0.0]*num_tables)
    minor_frequencies = array('d', [0.0]*num_tables)
    increasingVariants = dict()
    for i in first["Position"]:
        major = first["Major_variant"][i]
        major_frequencies[0] = first['Major_variant_frequency_quality_corrected'][i]
        if major == last["Major_variant"][i]:
            # The major base did not change: track the second variant of the last time point
            minor = last["Second_variant"][i]
        else:
            # The major base changed: track the base that became major
            minor = last["Major_variant"][i]
        minor_frequencies[0] = get_variant_frequency(minor, first, i)
        for table_id in range(1, num_tables):
            major_frequencies[table_id] = get_variant_frequency(major, tables[table_id], i)
            minor_frequencies[table_id] = get_variant_frequency(minor, tables[table_id], i)
        if is_increasing(minor_frequencies):
            increasingVariants[i] = [major_frequencies.tolist(), minor_frequencies.tolist()]
    return increasingVariants
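
A quick illustration of how these helpers behave (the frequency values are made up, and the call on real tables is only sketched since the data are loaded below):

# is_increasing requires a strict rise at every consecutive time point
print(is_increasing([0.01, 0.02, 0.05]))   # True
print(is_increasing([0.01, 0.05, 0.03]))   # False

# Typical use once the per-time-point tables are loaded, e.g. for replicate A:
# increasing = get_increasing_variants([DD3_A, DD6_A, DD9_A, DD12_A, DD24_A, DD51_A])
# for position, (major_freqs, minor_freqs) in increasing.items():
#     print(position, minor_freqs)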

Reading all data


In [8]:
# CirSeq initial sample
cirseq = pd.read_csv ("HV5GLBCXY_ZIKV_17s006139-1-1_DREUX_lane1CirseqD3_1_sequence.txt.assembled.fastq_mapped_AA.csv", na_values=" -nan")
add_columns(cirseq)

In [9]:
# Control runs, replicate A
DD3_A = pd.read_csv ("HV5GLBCXY_ZIKV_17s006139-1-1_DREUX_lane1DD3A_1_sequence.txt.assembled.fastq_mapped_AA.csv", na_values=" -nan")
add_columns(DD3_A)
DD6_A = pd.read_csv ("HV5GLBCXY_ZIKV_17s006139-1-1_DREUX_lane1DD6A_1_sequence.txt.assembled.fastq_mapped_AA.csv", na_values=" -nan")
add_columns(DD6_A)
DD9_A = pd.read_csv ("HV5GLBCXY_ZIKV_17s006139-1-1_DREUX_lane1DD9A_1_sequence.txt.assembled.fastq_mapped_AA.csv", na_values=" -nan")
add_columns(DD9_A)
DD12_A = pd.read_csv ("HV5GLBCXY_ZIKV_17s006139-1-1_DREUX_lane1DD12A_1_sequence.txt.assembled.fastq_mapped_AA.csv", na_values=" -nan")
add_columns(DD12_A)
DD24_A = pd.read_csv ("HV5GLBCXY_ZIKV_17s006139-1-1_DREUX_lane1DD24A_1_sequence.txt.assembled.fastq_mapped_AA.csv", na_values=" -nan")
add_columns(DD24_A)
DD51_A = pd.read_csv ("HV5GLBCXY_ZIKV_17s006139-1-1_DREUX_lane1DD51A_1_sequence.txt.assembled.fastq_mapped_AA.csv", na_values=" -nan")
add_columns(DD51_A)
DD51_A_no_reamp = pd.read_csv ("HV5GLBCXY_ZIKV_17s006139-1-1_DREUX_lane1DD51Anoreamplification_1_sequence.txt.assembled.fastq_mapped_AA.csv", na_values=" -nan")
add_columns(DD51_A_no_reamp)

In [10]:
# Control runs, replicate D
DD3_D = pd.read_csv ("HV5GLBCXY_ZIKV_17s006139-1-1_DREUX_lane1DD3D_1_sequence.txt.assembled.fastq_mapped_AA.csv")
add_columns(DD3_D)
DD6_D = pd.read_csv ("HV5GLBCXY_ZIKV_17s006139-1-1_DREUX_lane1DD6D_1_sequence.txt.assembled.fastq_mapped_AA.csv")
add_columns(DD6_D)
DD9_D = pd.read_csv ("HV5GLBCXY_ZIKV_17s006139-1-1_DREUX_lane1DD9D_1_sequence.txt.assembled.fastq_mapped_AA.csv")
add_columns(DD9_D)
DD12_D = pd.read_csv ("HV5GLBCXY_ZIKV_17s006139-1-1_DREUX_lane1DD12D_1_sequence.txt.assembled.fastq_mapped_AA.csv")
add_columns(DD12_D)
DD24_D = pd.read_csv ("HV5GLBCXY_ZIKV_17s006139-1-1_DREUX_lane1DD24D_1_sequence.txt.assembled.fastq_mapped_AA.csv")
add_columns(DD24_D)

In [11]:
# Control runs, replicate E
DD6_E = pd.read_csv ("HV5GLBCXY_ZIKV_17s006139-1-1_DREUX_lane1DD6E_1_sequence.txt.assembled.fastq_mapped_AA.csv")
add_columns(DD6_E)
DD9_E = pd.read_csv ("HV5GLBCXY_ZIKV_17s006139-1-1_DREUX_lane1DD9E_1_sequence.txt.assembled.fastq_mapped_AA.csv")
add_columns(DD9_E)

In [12]:
# TLR3 activation runs, replicate A
TD9_A = pd.read_csv ("HV5GLBCXY_ZIKV_17s006139-1-1_DREUX_lane1TD9A_1_sequence.txt.assembled.fastq_mapped_AA.csv")
add_columns(TD9_A)
TD12_A = pd.read_csv ("HV5GLBCXY_ZIKV_17s006139-1-1_DREUX_lane1TD12A_1_sequence.txt.assembled.fastq_mapped_AA.csv")
add_columns(TD12_A)
TD24_A = pd.read_csv ("HV5GLBCXY_ZIKV_17s006139-1-1_DREUX_lane1TD24A_1_sequence.txt.assembled.fastq_mapped_AA.csv")
add_columns(TD24_A)
TD51_A = pd.read_csv ("HV5GLBCXY_ZIKV_17s006139-1-1_DREUX_lane1TD51A_1_sequence.txt.assembled.fastq_mapped_AA.csv")
add_columns(TD51_A)

In [13]:
#DD3_A.describe(include='all')

Analysis of the initial sample obtained with CirSeq

Coverage


In [14]:
sns.lmplot( x="Position", y="Coverage", data=cirseq, fit_reg=False, legend=False, size=7, aspect=2, lowess=True,scatter_kws={"s": 20})
plt.legend(loc='lower right')
plot_positions()
plot_genes()


The coverage is very low.
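
To put a number on this impression, one could summarize the coverage column directly (a sketch; the 1000x threshold is arbitrary):

# Summary statistics of the per-position coverage in the CirSeq sample
print(cirseq['Coverage'].describe())

# Fraction of positions below 1000x
print((cirseq['Coverage'] < 1000).mean())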

Frequency of the major base


In [15]:
sns.lmplot( x="Position", y="Major_variant_frequency_quality_corrected", data=cirseq, fit_reg=False, legend=False, size=7, aspect=2, lowess=True,scatter_kws={"s": 20})
plt.legend(loc='lower right')
plot_positions()
plot_genes()


Positions with frequency of the major variant below 0.95:


In [16]:
cirseq['Position'][cirseq['Major_variant_frequency_quality_corrected']<0.95]


Out[16]:
0            0
1            1
2            2
910        910
926        926
1785      1785
10776    10776
10779    10779
10801    10801
10802    10802
10803    10803
10804    10804
10805    10805
10806    10806
Name: Position, dtype: int64

Position 1785 already shows up here. What are its major and second bases?


In [17]:
print("Position 1785: ")
print ("Major variant: "+cirseq['Major_variant'][1785])
print ("Second variant: "+cirseq['Second_variant'][1785])
print ("Major variant, aa: "+cirseq['consensus_aa'][1785])
print ("Second variant, aa: "+cirseq['secondbase_aa'][1785])

print("Frequency of the major variant: "+str(cirseq['Major_variant_frequency_quality_corrected'][1785]))
print("Frequency of the second variant: "+str(cirseq['Second_variant_frequency_quality_corrected'][1785]))


Position 1785: 
Major variant: C
Second variant: T
Major variant, aa: A
Second variant, aa: V
Frequency of the major variant: 0.870546
Frequency of the second variant: 0.125279

Plot of the diversity in the CirSeq experiment


In [18]:
cirseq['1_major_variant_frequency'] = 1.0 - cirseq['Major_variant_frequency_quality_corrected']

#f, ax = plt.subplots(figsize=(10, 7))
#ax.set(yscale="log")

lm=sns.lmplot( x="Position", y="1_major_variant_frequency", data=cirseq, fit_reg=False, legend=False, size=7, aspect=2, scatter_kws={"s": 10})
axes = lm.axes
axes[0,0].set_ylim(0.001,0.05)
axes[0,0].set(yscale="log")

plt.legend(loc='lower right')


Here the distribution of diversity seems unimodal.
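
One way to check this impression beyond the scatter plot is to look at the distribution of 1 - major variant frequency directly (a sketch, restricted to positions with non-zero diversity so that the log transform is defined):

nonzero = cirseq['1_major_variant_frequency'][cirseq['1_major_variant_frequency'] > 0]
sns.kdeplot(np.log10(nonzero))
plt.xlabel("log10(1 - major variant frequency)")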

Comparison with and without reamplification


In [19]:
table_for_correlations = pd.DataFrame ({'51A_Cov':DD51_A ['Coverage'],'51ANR_Cov':DD51_A_no_reamp['Coverage'], '51A_MV':DD51_A ['Major_variant_frequency_quality_corrected'], '51ANR_MV':DD51_A_no_reamp ['Major_variant_frequency_quality_corrected'] })
table_for_correlations_highCov = table_for_correlations.loc[ (table_for_correlations['51A_Cov'] > 1000) & (table_for_correlations['51ANR_Cov'] > 1000) ]

sns.pairplot(table_for_correlations_highCov, vars=['51A_Cov','51ANR_Cov','51A_MV','51ANR_MV'], kind="scatter")


Out[19]:
<seaborn.axisgrid.PairGrid at 0x7f2e6a6b4be0>

Focus on the correlation between major variant frequencies


In [20]:
table_for_correlations_highFreq = table_for_correlations.loc[ (table_for_correlations['51A_MV'] > 0.1) & (table_for_correlations['51ANR_MV'] > 0.1) ]


lm=sns.lmplot(x='51A_MV',y='51ANR_MV', data=table_for_correlations_highFreq)
axes = lm.axes
#axes[0,0].set_ylim(0.001,0.05)

#axes[0,0].set(yscale="log", xscale="log")
res=linregress(table_for_correlations_highFreq['51A_MV'], table_for_correlations_highFreq['51ANR_MV'])
print(res)


LinregressResult(slope=0.92080042757889113, intercept=0.078620806030560431, rvalue=0.55404367334663751, pvalue=0.0, stderr=0.013313968229028231)

There is a correlation between the frequencies of the major variants in the two libraries, but the correlation coefficient is not great (0.55).
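
Since the major variant frequencies are strongly skewed towards 1.0, a rank-based correlation is a useful complement to the linear regression (a sketch using scipy's spearmanr):

from scipy.stats import spearmanr

rho, pval = spearmanr(table_for_correlations_highFreq['51A_MV'],
                      table_for_correlations_highFreq['51ANR_MV'])
print(rho, pval)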

Focus on positions with major variant frequency >0.9 in the sample without reamplification


In [21]:
table_for_correlations_highFreq = table_for_correlations.loc[ (table_for_correlations['51ANR_MV'] > 0.9)  ]

lm=sns.lmplot(x='51A_MV',y='51ANR_MV', data=table_for_correlations_highFreq)
axes = lm.axes
#axes[0,0].set_ylim(0.001,0.05)

#axes[0,0].set(yscale="log")
res=linregress(table_for_correlations_highFreq['51A_MV'], table_for_correlations_highFreq['51ANR_MV'])
print(res)


LinregressResult(slope=0.77600804175089222, intercept=0.22278502849188975, rvalue=0.48967318178798563, pvalue=0.0, stderr=0.013302675626933793)

The correlation is not great for positions where there is little variation. It is also surprising that many positions with frequencies between 0.9 and 1.0 in the non-reamplified library have a frequency of nearly 1.0 in the reamplified one, which suggests that the PCR reamplification over-amplified the dominant variant at many positions.
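
One way to quantify this impression is to count, among the positions retained above (major variant frequency above 0.9 in the non-reamplified library), how many move from below 0.99 without reamplification to above 0.99 with it (a sketch; the 0.99 cutoff is arbitrary):

band = table_for_correlations_highFreq  # filtered on 51ANR_MV > 0.9 above
below_one = band.loc[band['51ANR_MV'] < 0.99]
pushed_to_one = below_one.loc[below_one['51A_MV'] > 0.99]
print(len(pushed_to_one), "of", len(below_one), "positions")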

Correlation between second most frequent variants


In [22]:
table_for_correlations = pd.DataFrame ({'51A_Cov':DD51_A ['Coverage'],'51ANR_Cov':DD51_A_no_reamp['Coverage'], '51A_SV':DD51_A ['Second_variant_frequency_quality_corrected'], '51ANR_SV':DD51_A_no_reamp ['Second_variant_frequency_quality_corrected'] })
table_for_correlations_highCov = table_for_correlations.loc[ (table_for_correlations['51A_Cov'] > 1000) & (table_for_correlations['51ANR_Cov'] > 1000) ]

In [23]:
#table_for_correlations_highFreq = table_for_correlations.loc[ (table_for_correlations['51A_MV'] > 0.95)  ]

lm=sns.lmplot(x='51A_SV',y='51ANR_SV', data=table_for_correlations_highCov)
axes = lm.axes
#axes[0,0].set_ylim(0.001,0.05)

#axes[0,0].set(yscale="log")
res=linregress(table_for_correlations_highCov['51A_SV'], table_for_correlations_highCov['51ANR_SV'])
print(res)


LinregressResult(slope=1.0931256196492429, intercept=-0.00037973675091814828, rvalue=0.6325437149737615, pvalue=0.0, stderr=0.012948740114485434)

The correlation between the frequencies of the second most frequent variants is not too bad (correlation coefficient 0.63).

Positions that differ in their most frequent base, and in their second most frequent base

Positions that differ in the most frequent base:


In [24]:
DD51_A ['Position'][DD51_A ['Major_variant'] != DD51_A_no_reamp ['Major_variant']]


Out[24]:
3          3
4          4
5          5
2340    2340
Name: Position, dtype: int64

Number of positions that differ in the second most frequent base:


In [25]:
len(DD51_A ['Position'][DD51_A ['Second_variant'] != DD51_A_no_reamp ['Second_variant']])


Out[25]:
1114

Frequencies of the variants that change between the two library preparations


In [26]:
sns.kdeplot(DD51_A ['Second_variant_frequency_quality_corrected'][DD51_A ['Second_variant'] != DD51_A_no_reamp ['Second_variant']])


Out[26]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f2e503022e8>

In [27]:
(DD51_A ['Second_variant_frequency_quality_corrected'][DD51_A ['Second_variant'] != DD51_A_no_reamp ['Second_variant']]).describe()


Out[27]:
count    1114.000000
mean        0.002228
std         0.002332
min         0.000239
25%         0.001260
50%         0.001690
75%         0.002630
max         0.046251
Name: Second_variant_frequency_quality_corrected, dtype: float64

The second most frequent variants that differ between the reamplified and non-reamplified libraries usually have a low frequency; for the more frequent variants, the correlation is not too bad.
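
To back this up, one could compare the second-variant frequencies at discordant positions (different second base in the two libraries) with those at concordant positions, for instance with the Mann-Whitney U test imported above (a sketch):

discordant = DD51_A['Second_variant_frequency_quality_corrected'][DD51_A['Second_variant'] != DD51_A_no_reamp['Second_variant']]
concordant = DD51_A['Second_variant_frequency_quality_corrected'][DD51_A['Second_variant'] == DD51_A_no_reamp['Second_variant']]
print(mannwhitneyu(discordant.dropna(), concordant.dropna(), alternative='less'))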

Analysis of the diversity at time point 51 with or without reamplification


In [28]:
DD51_A_no_reamp['1_major_variant_frequency'] = 1.0 - DD51_A_no_reamp['Major_variant_frequency_quality_corrected']

#f, ax = plt.subplots(figsize=(10, 7))
#ax.set(yscale="log")

lm=sns.lmplot( x="Position", y="1_major_variant_frequency", data=DD51_A_no_reamp, fit_reg=False, legend=False, size=7, aspect=2, scatter_kws={"s": 10})
axes = lm.axes
axes[0,0].set_ylim(0.001,0.05)
axes[0,0].set(yscale="log")

plt.legend(loc='lower right')