In [2]:
from collections import Counter
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.ticker as mticker
from pylab import rcParams
import seaborn as sns
from array import array
import math
import numpy as np
from scipy.stats import ttest_ind
from scipy.stats import linregress
from scipy.stats import mannwhitneyu
import statistics
%matplotlib inline
In [3]:
# Parse mature-peptide (mat_peptide) coordinates and names from the GenBank annotation
begins = []
ends = []
names = []
with open("CommonData/sequence.gb") as f:
    in_pep = False
    for l in f:
        if "mat_peptide" in l:
            begins.append(int(l.split()[1].split("..")[0]))
            ends.append(int(l.split()[1].split("..")[1]))
            in_pep = True
        elif in_pep:
            names.append(l.split("=")[1])
            in_pep = False
print(begins)
print(ends)
print(names)
In [4]:
# Interesting positions
positions = [316, 1670, 1785, 2340, 5935, 7172, 8449, 9165]

def plot_positions():
    for x in positions:
        plt.axvline(x=x, linewidth=1, linestyle=':')

def plot_genes():
    for i in range(len(begins)):
        plt.plot([begins[i], begins[i]], [0.99, 1.0], linewidth=2, linestyle='-', color="black")
        if i % 2 == 0:
            plt.text(begins[i] + (ends[i] - begins[i]) / 10, 1.005, names[i].replace('"', '')[0:3], size='xx-small')
        else:
            plt.text(begins[i] + (ends[i] - begins[i]) / 10, 1.015, names[i].replace('"', '')[0:3], size='xx-small')
    plt.plot([ends[-1], ends[-1]], [0.99, 1.0], linewidth=2, linestyle='-', color="black")
In [5]:
def synonymous(row):
    if row['null'] or (row['Consensus_aa'] == row['Secondbase_aa']):
        return "synonymous"
    else:
        return "non-synonymous"

def add_columns(table):
    table['null'] = table['Secondbase_aa'].isnull()
    table['is_synonymous'] = table.apply(lambda row: synonymous(row), axis=1)
    table['1_major_variant_frequency'] = 1.0 - table['Major_variant_frequency_quality_corrected']
In [6]:
def is_increasing(minor_frequencies):
    tolerance = 0.01
    minimum_increase = 0.1
    previous = minor_frequencies[0]
    # Require an overall gain of at least minimum_increase between first and last sample
    if minor_frequencies[-1] - minor_frequencies[0] < minimum_increase:
        return False
    for m in range(1, len(minor_frequencies)):
        # Accept small dips of at most `tolerance` between consecutive samples
        if previous < minor_frequencies[m] + tolerance:
            previous = minor_frequencies[m]
        else:
            return False
    return True
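# Toy illustration (frequencies invented for the example, not from the data):
# is_increasing([0.01, 0.05, 0.045, 0.15]) -> True   (small dip within tolerance)
# is_increasing([0.01, 0.02, 0.03, 0.05])  -> False  (overall gain below minimum_increase)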
# Strict definition of an increasing position
def is_strictly_increasing(minor_frequencies):
    previous = minor_frequencies[0]
    for m in range(1, len(minor_frequencies)):
        if previous < minor_frequencies[m]:
            previous = minor_frequencies[m]
        else:
            return False
    return True

def get_variant_frequency(variant, table, i):
    sum_of_bases = (table['As_quality_corrected'][i] + table['Cs_quality_corrected'][i]
                    + table['Gs_quality_corrected'][i] + table['Ts_quality_corrected'][i]
                    + table['Ns_quality_corrected'][i])
    if variant == "A":
        return table["As_quality_corrected"][i] / sum_of_bases
    elif variant == "C":
        return table["Cs_quality_corrected"][i] / sum_of_bases
    elif variant == "G":
        return table["Gs_quality_corrected"][i] / sum_of_bases
    elif variant == "T":
        return table["Ts_quality_corrected"][i] / sum_of_bases
    else:
        return np.nan

def get_increasing_variants(tables):
    num_tables = len(tables)
    first = tables[0]
    last = tables[num_tables - 1]
    major_frequencies = array('d', [0.0] * num_tables)
    minor_frequencies = array('d', [0.0] * num_tables)
    increasingVariants = dict()
    for i in first["Position"]:
        major = first["Major_variant"][i]
        major_frequencies[0] = first['Major_variant_frequency_quality_corrected'][i]
        # The candidate minor variant is whichever variant dominates, or comes second,
        # in the last sample, as long as it differs from the initial major variant.
        if major == last["Major_variant"][i]:
            minor = last["Second_variant"][i]
        else:
            minor = last["Major_variant"][i]
        minor_frequencies[0] = get_variant_frequency(minor, first, i)
        for table_id in range(1, num_tables):
            major_frequencies[table_id] = get_variant_frequency(major, tables[table_id], i)
            minor_frequencies[table_id] = get_variant_frequency(minor, tables[table_id], i)
        if is_increasing(minor_frequencies):
            increasingVariants[i] = [major_frequencies.tolist(), minor_frequencies.tolist()]
    return increasingVariants

def printMajorFrequencyThroughSamples(tables, numPos):
    major = tables[0]['Major_variant'][numPos]
    print("Position " + str(numPos) + ", Major variant in first sample: " + major)
    print("Position " + str(numPos) + ", Frequencies of " + major + " through the samples: ")
    for i in range(len(tables)):
        print("\t" + str(get_variant_frequency(major, tables[i], numPos)))
    print("Position " + str(numPos) + ", Major variant in last sample: " + tables[-1]['Major_variant'][numPos])

def printMajorFrequencyThroughSamples_2340_7172(tables):
    printMajorFrequencyThroughSamples(tables, 2340)
    printMajorFrequencyThroughSamples(tables, 7172)
In [7]:
# Functions to think in terms of standard deviation of the frequency per site
def get_varying_variants(tables):
    sd_threshold = 0.1
    num_tables = len(tables)
    first = tables[0]
    major_frequencies = array('d', [0.0] * num_tables)
    varyingVariants = dict()
    for i in first["Position"]:
        major = first["Major_variant"][i]
        major_frequencies[0] = first['Major_variant_frequency_quality_corrected'][i]
        for table_id in range(1, num_tables):
            major_frequencies[table_id] = get_variant_frequency(major, tables[table_id], i)
        sd_value = statistics.pstdev(major_frequencies)
        if sd_value > sd_threshold:
            varyingVariants[i] = sd_value
    print("There are " + str(len(varyingVariants)) + " positions whose major variant varies a lot in frequency.")
    print("Those are:")
    print(varyingVariants.keys())
    return varyingVariants
In [8]:
# Control runs, replicate A
DD3_A = pd.read_csv ("PKG-DREUX_HV5GLBCXY/CutAdaptPearViroMapperMapped/HV5GLBCXY_ZIKV_17s006139-1-1_DREUX_lane1DD3A_1_sequence.txt.assembled.fastq_mapped_AA.csv", na_values=" -nan")
add_columns(DD3_A)
DD6_A = pd.read_csv ("PKG-DREUX_HV5GLBCXY/CutAdaptPearViroMapperMapped/HV5GLBCXY_ZIKV_17s006139-1-1_DREUX_lane1DD6A_1_sequence.txt.assembled.fastq_mapped_AA.csv", na_values=" -nan")
add_columns(DD6_A)
DD9_A = pd.read_csv ("PKG-DREUX_HV5GLBCXY/CutAdaptPearViroMapperMapped/HV5GLBCXY_ZIKV_17s006139-1-1_DREUX_lane1DD9A_1_sequence.txt.assembled.fastq_mapped_AA.csv", na_values=" -nan")
add_columns(DD9_A)
DD12_A = pd.read_csv ("PKG-DREUX_HV5GLBCXY/CutAdaptPearViroMapperMapped/HV5GLBCXY_ZIKV_17s006139-1-1_DREUX_lane1DD12A_1_sequence.txt.assembled.fastq_mapped_AA.csv", na_values=" -nan")
add_columns(DD12_A)
DD18_A = pd.read_csv ("HJJ7JBCX2_ZIKV/Mapped_Reads/HJJ7JBCX2_ZIKV-s-and-c_18s004258-1-1_DREUX_lane1DD18A_1_sequence.txt.assembled.fastq_mapped_AA.csv", na_values=" -nan")
add_columns(DD18_A)
DD24_A = pd.read_csv ("PKG-DREUX_HV5GLBCXY/CutAdaptPearViroMapperMapped/HV5GLBCXY_ZIKV_17s006139-1-1_DREUX_lane1DD24A_1_sequence.txt.assembled.fastq_mapped_AA.csv", na_values=" -nan")
add_columns(DD24_A)
DD51_A = pd.read_csv ("PKG-DREUX_HV5GLBCXY/CutAdaptPearViroMapperMapped/HV5GLBCXY_ZIKV_17s006139-1-1_DREUX_lane1DD51A_1_sequence.txt.assembled.fastq_mapped_AA.csv", na_values=" -nan")
add_columns(DD51_A)
DD51_A_no_reamp = pd.read_csv ("PKG-DREUX_HV5GLBCXY/CutAdaptPearViroMapperMapped/HV5GLBCXY_ZIKV_17s006139-1-1_DREUX_lane1DD51Anoreamplification_1_sequence.txt.assembled.fastq_mapped_AA.csv", na_values=" -nan")
add_columns(DD51_A_no_reamp)
tables_A = [DD3_A, DD6_A, DD9_A, DD12_A, DD18_A, DD24_A, DD51_A]
In [9]:
# Control runs, replicate D
DD3_D = pd.read_csv ("PKG-DREUX_HV5GLBCXY/CutAdaptPearViroMapperMapped/HV5GLBCXY_ZIKV_17s006139-1-1_DREUX_lane1DD3D_1_sequence.txt.assembled.fastq_mapped_AA.csv")
add_columns(DD3_D)
DD6_D = pd.read_csv ("PKG-DREUX_HV5GLBCXY/CutAdaptPearViroMapperMapped/HV5GLBCXY_ZIKV_17s006139-1-1_DREUX_lane1DD6D_1_sequence.txt.assembled.fastq_mapped_AA.csv")
add_columns(DD6_D)
DD9_D = pd.read_csv ("PKG-DREUX_HV5GLBCXY/CutAdaptPearViroMapperMapped/HV5GLBCXY_ZIKV_17s006139-1-1_DREUX_lane1DD9D_1_sequence.txt.assembled.fastq_mapped_AA.csv")
add_columns(DD9_D)
DD12_D = pd.read_csv ("PKG-DREUX_HV5GLBCXY/CutAdaptPearViroMapperMapped/HV5GLBCXY_ZIKV_17s006139-1-1_DREUX_lane1DD12D_1_sequence.txt.assembled.fastq_mapped_AA.csv")
add_columns(DD12_D)
DD18_D = pd.read_csv ("HJJ7JBCX2_ZIKV/Mapped_Reads/HJJ7JBCX2_ZIKV-s-and-c_18s004258-1-1_DREUX_lane1DD18D_1_sequence.txt.assembled.fastq_mapped_AA.csv")
add_columns(DD18_D)
DD24_D = pd.read_csv ("PKG-DREUX_HV5GLBCXY/CutAdaptPearViroMapperMapped/HV5GLBCXY_ZIKV_17s006139-1-1_DREUX_lane1DD24D_1_sequence.txt.assembled.fastq_mapped_AA.csv")
add_columns(DD24_D)
tables_D = [DD3_D, DD6_D, DD9_D, DD12_D, DD18_D, DD24_D]
In [10]:
# Control runs, replicate E
DD3_E = pd.read_csv ("HJJ7JBCX2_ZIKV/Mapped_Reads/HJJ7JBCX2_ZIKV-s-and-c_18s004258-1-1_DREUX_lane1DD3E_1_sequence.txt.assembled.fastq_mapped_AA.csv")
DD6_E = pd.read_csv ("PKG-DREUX_HV5GLBCXY/CutAdaptPearViroMapperMapped/HV5GLBCXY_ZIKV_17s006139-1-1_DREUX_lane1DD6E_1_sequence.txt.assembled.fastq_mapped_AA.csv")
DD9_E = pd.read_csv ("PKG-DREUX_HV5GLBCXY/CutAdaptPearViroMapperMapped/HV5GLBCXY_ZIKV_17s006139-1-1_DREUX_lane1DD9E_1_sequence.txt.assembled.fastq_mapped_AA.csv")
DD12_E = pd.read_csv ("HJJ7JBCX2_ZIKV/Mapped_Reads/HJJ7JBCX2_ZIKV-s-and-c_18s004258-1-1_DREUX_lane1DD12E_1_sequence.txt.assembled.fastq_mapped_AA.csv")
DD18_E = pd.read_csv ("HJJ7JBCX2_ZIKV/Mapped_Reads/HJJ7JBCX2_ZIKV-s-and-c_18s004258-1-1_DREUX_lane1DD18E_1_sequence.txt.assembled.fastq_mapped_AA.csv")
DD24_E = pd.read_csv ("HJJ7JBCX2_ZIKV/Mapped_Reads/HJJ7JBCX2_ZIKV-s-and-c_18s004258-1-1_DREUX_lane1DD24E_1_sequence.txt.assembled.fastq_mapped_AA.csv")
DD24crude_E = pd.read_csv ("HJJ7JBCX2_ZIKV/Mapped_Reads/HJJ7JBCX2_ZIKV-s-and-c_18s004258-1-1_DREUX_lane1DD24Ecrude_1_sequence.txt.assembled.fastq_mapped_AA.csv")
add_columns(DD3_E)
add_columns(DD6_E)
add_columns(DD9_E)
add_columns(DD12_E)
add_columns(DD18_E)
add_columns(DD24_E)
add_columns(DD24crude_E)
tables_E = [DD3_E, DD6_E, DD9_E, DD12_E, DD18_E, DD24_E]
In [11]:
# TLR3 activation runs, replicate A
TD9_A = pd.read_csv ("PKG-DREUX_HV5GLBCXY/CutAdaptPearViroMapperMapped/HV5GLBCXY_ZIKV_17s006139-1-1_DREUX_lane1TD9A_1_sequence.txt.assembled.fastq_mapped_AA.csv")
add_columns(TD9_A)
TD12_A = pd.read_csv ("PKG-DREUX_HV5GLBCXY/CutAdaptPearViroMapperMapped/HV5GLBCXY_ZIKV_17s006139-1-1_DREUX_lane1TD12A_1_sequence.txt.assembled.fastq_mapped_AA.csv")
add_columns(TD12_A)
TD18_A = pd.read_csv ("HJJ7JBCX2_ZIKV/Mapped_Reads/HJJ7JBCX2_ZIKV-s-and-c_18s004258-1-1_DREUX_lane1TD18A_1_sequence.txt.assembled.fastq_mapped_AA.csv")
add_columns(TD18_A)
TD24_A = pd.read_csv ("PKG-DREUX_HV5GLBCXY/CutAdaptPearViroMapperMapped/HV5GLBCXY_ZIKV_17s006139-1-1_DREUX_lane1TD24A_1_sequence.txt.assembled.fastq_mapped_AA.csv")
add_columns(TD24_A)
TD51_A = pd.read_csv ("PKG-DREUX_HV5GLBCXY/CutAdaptPearViroMapperMapped/HV5GLBCXY_ZIKV_17s006139-1-1_DREUX_lane1TD51A_1_sequence.txt.assembled.fastq_mapped_AA.csv")
add_columns(TD51_A)
tables_TA = [TD9_A, TD12_A, TD18_A, TD24_A, TD51_A]
In [12]:
# TLR3 activation runs, replicate D
TD9_D = pd.read_csv ("HJJ7JBCX2_ZIKV/Mapped_Reads/HJJ7JBCX2_ZIKV-s-and-c_18s004258-1-1_DREUX_lane1TD9D_1_sequence.txt.assembled.fastq_mapped_AA.csv")
TD12_D = pd.read_csv ("HJJ7JBCX2_ZIKV/Mapped_Reads/HJJ7JBCX2_ZIKV-s-and-c_18s004258-1-1_DREUX_lane1TD12D_1_sequence.txt.assembled.fastq_mapped_AA.csv")
TD18_D = pd.read_csv ("HJJ7JBCX2_ZIKV/Mapped_Reads/HJJ7JBCX2_ZIKV-s-and-c_18s004258-1-1_DREUX_lane1TD18D_1_sequence.txt.assembled.fastq_mapped_AA.csv")
TD24_D = pd.read_csv ("HJJ7JBCX2_ZIKV/Mapped_Reads/HJJ7JBCX2_ZIKV-s-and-c_18s004258-1-1_DREUX_lane1TD24D_1_sequence.txt.assembled.fastq_mapped_AA.csv")
add_columns(TD9_D)
add_columns(TD12_D)
add_columns(TD18_D)
add_columns(TD24_D)
tables_TD = [TD9_D, TD12_D, TD18_D, TD24_D]
In [13]:
# TLR3 activation runs, replicate E
TD9_E = pd.read_csv ("HJJ7JBCX2_ZIKV/Mapped_Reads/HJJ7JBCX2_ZIKV-s-and-c_18s004258-1-1_DREUX_lane1TD9E_1_sequence.txt.assembled.fastq_mapped_AA.csv")
TD12_E = pd.read_csv ("HJJ7JBCX2_ZIKV/Mapped_Reads/HJJ7JBCX2_ZIKV-s-and-c_18s004258-1-1_DREUX_lane1TD12E_1_sequence.txt.assembled.fastq_mapped_AA.csv")
TD18_E = pd.read_csv ("HJJ7JBCX2_ZIKV/Mapped_Reads/HJJ7JBCX2_ZIKV-s-and-c_18s004258-1-1_DREUX_lane1TD18E_1_sequence.txt.assembled.fastq_mapped_AA.csv")
TD24_E = pd.read_csv ("HJJ7JBCX2_ZIKV/Mapped_Reads/HJJ7JBCX2_ZIKV-s-and-c_18s004258-1-1_DREUX_lane1TD24E_1_sequence.txt.assembled.fastq_mapped_AA.csv")
add_columns(TD9_E)
add_columns(TD12_E)
add_columns(TD18_E)
add_columns(TD24_E)
tables_TE = [TD9_E, TD12_E, TD18_E, TD24_E]
In [14]:
# All tables
all_tables = tables_A+tables_D+tables_E+tables_TA+tables_TD+tables_TE
all_table_names = ["DD3_A", "DD6_A", "DD9_A", "DD12_A", "DD18_A", "DD24_A", "DD51_A", "DD3_D", "DD6_D", "DD9_D", "DD12_D", "DD18_D", "DD24_D", "DD3_E", "DD6_E", "DD9_E", "DD12_E", "DD18_E", "DD24_E", "TD9_A", "TD12_A", "TD18_A", "TD24_A", "TD51_A", "TD9_D", "TD12_D", "TD18_D", "TD24_D", "TD9_E", "TD12_E", "TD18_E", "TD24_E"]
In [15]:
increasing_A = get_increasing_variants(tables_A)
print("There are "+str(len(increasing_A))+" positions that rise in frequency.")
print("Those are:")
print(increasing_A.keys())
In [16]:
def plot_increasing_positions(increasing_pos, last_time_point):
    sns.set_palette("hls")
    sns.set_context("poster")
    increasing_pos_keys = increasing_pos.keys()
    Is_increasing = []
    for i in last_time_point['Position']:
        if i in increasing_pos_keys:
            Is_increasing.append("Increasing")
        else:
            Is_increasing.append("Not increasing")
    to_plot = pd.DataFrame({'Position': last_time_point['Position'],
                            'Major_variant_frequency_quality_corrected': last_time_point['Major_variant_frequency_quality_corrected'],
                            'Is_increasing': Is_increasing,
                            'is_synonymous': last_time_point['is_synonymous']})
    ax = sns.lmplot(x="Position", y="Major_variant_frequency_quality_corrected", data=to_plot,
                    fit_reg=False, hue='Is_increasing', row='is_synonymous', legend=False, size=7, aspect=2)
    ax.set(xlabel='Position along the genome', ylabel='Major variant frequency, last time point')
    plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
    plot_positions()
In [17]:
plot_increasing_positions(increasing_A, DD51_A)
In [18]:
def write_increasing_positions(increasing_pos, tables, exp_names, filename):
    increasing_pos_table = pd.DataFrame()
    for d in tables:
        increasing_pos_table = pd.concat([increasing_pos_table, d[d['Position'].isin(increasing_pos)]])
    num = len(increasing_pos)
    expnames_iterated = []
    for i in exp_names:
        expnames_iterated.append([i] * num)
    expnames = [item for sublist in expnames_iterated for item in sublist]
    print(len(expnames))
    increasing_pos_table['expName'] = expnames
    increasing_pos_table.to_csv(filename)
In [19]:
write_increasing_positions(increasing_A, tables_A, ["DD3_A", "DD6_A", "DD9_A", "DD12_A", "DD18_A", "DD24_A", "DD51_A"], "increasing_A.csv")
In [20]:
printMajorFrequencyThroughSamples_2340_7172(tables_A)
In [21]:
increasing_D = get_increasing_variants(tables_D)
print("There are "+str(len(increasing_D))+" positions that rise in frequency.")
print("Those are:")
print(increasing_D.keys())
In [22]:
plot_increasing_positions(increasing_D, DD24_D)
In [23]:
write_increasing_positions(increasing_D, tables_D, ["DD3_D", "DD6_D", "DD9_D", "DD12_D", "DD18_D", "DD24_D"], "increasing_D.csv")
In [24]:
printMajorFrequencyThroughSamples_2340_7172(tables_D)
In [25]:
increasing_E = get_increasing_variants(tables_E)
print("There are "+str(len(increasing_E))+" positions that rise in frequency.")
print("Those are:")
print(increasing_E.keys())
In [26]:
plot_increasing_positions(increasing_E, DD24_E)
In [27]:
write_increasing_positions(increasing_E, tables_E, ["DD3_E", "DD6_E", "DD9_E", "DD12_E", "DD18_E", "DD24_E"], "increasing_E.csv")
In [28]:
printMajorFrequencyThroughSamples_2340_7172(tables_E)
In [29]:
increasing_TA = get_increasing_variants(tables_TA)
print("There are "+str(len(increasing_TA))+" positions that rise in frequency.")
print("Those are:")
print(increasing_TA.keys())
In [30]:
plot_increasing_positions(increasing_TA, TD51_A)
In [31]:
write_increasing_positions(increasing_TA, tables_TA, [ "TD9_A", "TD12_A", "TD18_A", "TD24_A", "TD51_A"], "increasing_TA.csv")
In [32]:
printMajorFrequencyThroughSamples_2340_7172(tables_TA)
In [33]:
increasing_TD = get_increasing_variants(tables_TD)
print("There are "+str(len(increasing_TD))+" positions that rise in frequency.")
print("Those are:")
print(increasing_TD.keys())
In [34]:
plot_increasing_positions(increasing_TD, TD24_D)
In [35]:
printMajorFrequencyThroughSamples_2340_7172(tables_TD)
In [36]:
write_increasing_positions(increasing_TD, tables_TD, [ "TD9_D", "TD12_D", "TD18_D", "TD24_D"], "increasing_TD.csv")
In [37]:
increasing_TE = get_increasing_variants(tables_TE)
print("There are "+str(len(increasing_TE))+" positions that rise in frequency.")
print("Those are:")
print(increasing_TE.keys())
In [38]:
plot_increasing_positions(increasing_TE, TD24_E)
In [39]:
write_increasing_positions(increasing_TE, tables_TE, [ "TD9_E", "TD12_E", "TD18_E", "TD24_E"], "increasing_TE.csv")
In [40]:
printMajorFrequencyThroughSamples_2340_7172(tables_TE)
In [41]:
varying_A = get_varying_variants(tables_A)
In [42]:
varying_D = get_varying_variants(tables_D)
In [43]:
varying_E = get_varying_variants(tables_E)
In [44]:
varying_TA = get_varying_variants(tables_TA)
In [45]:
varying_TD = get_varying_variants(tables_TD)
In [46]:
varying_TE = get_varying_variants(tables_TE)
In [47]:
all_varying = [varying_A, varying_D, varying_E, varying_TA, varying_TD, varying_TE]

def build_table_for_plotting_varying_pos(all_varying):
    all_keys = list()
    for di in all_varying:
        all_keys = all_keys + list(di.keys())
    return Counter(all_keys)

counts = build_table_for_plotting_varying_pos(all_varying)
print(counts)
In [48]:
counts_sorted = sorted(counts.items())
labels, values = zip(*counts_sorted)
indexes = np.arange(len(labels))
width = 1
sns.set_palette("hls")
sns.set_context("poster")
a4_dims = (11.7, 8.27)
to_plot = pd.DataFrame ({'Position':indexes, 'standard_deviation':values})
fig, ax = plt.subplots(figsize=a4_dims)
sns.barplot(y="Position", x="standard_deviation", data=to_plot, orient="h")
ax.set(xlabel='Number of samples where standard deviation is above 0.1', ylabel='Position in the genome')
plt.yticks(indexes + width * 0.5, labels)
plt.show()
In [49]:
# Output positions of interest based on this analysis
varying_pos = [1670,1785,2193,2340,5662,7172,10006]
write_increasing_positions(varying_pos, all_tables, all_table_names, "varying_positions_all_samples.csv")
We want to select all positions at which the minor variant reaches a frequency above 0.1 in at least two time points, in at least two different replicates or experiments. To do so, within each experiment we take the major variant at the first time point, identify the corresponding minor variant from the last time point, and count the time points (including the first) at which that minor variant's frequency exceeds 0.1. We collect all sites with at least two such time points for that experiment. We repeat this for every experiment, and finally keep the sites that appear in at least two experiments.
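For illustration, a toy version of the per-experiment rule (the frequencies and the helper name below are invented for the example, not part of the analysis):

def passes_rule(minor_freqs, threshold=0.1, min_timepoints=2):
    # Keep a site if its minor-variant frequency exceeds the threshold at >= min_timepoints time points
    return sum(f > threshold for f in minor_freqs) >= min_timepoints

assert passes_rule([0.02, 0.15, 0.08, 0.30])        # two time points above 0.1
assert not passes_rule([0.02, 0.15, 0.08, 0.09])    # only one time point above 0.1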
In [50]:
def getInterestingVariants(tables):
    num_tables = len(tables)
    first = tables[0]
    interestingVariants = list()
    for i in first["Position"]:
        minor_frequencies = array('d', [0.0] * num_tables)
        major = first["Major_variant"][i]
        minor = tables[-1]["Second_variant"][i]
        if minor == major:
            minor = tables[-1]["Major_variant"][i]
        counter = 0
        for j in range(num_tables):
            minor_frequencies[j] = get_variant_frequency(minor, tables[j], i)
            if minor_frequencies[j] > 0.1:
                counter += 1
        if counter >= 2:
            interestingVariants.append(i)
    return interestingVariants
In [51]:
interesting_A = getInterestingVariants (tables_A)
interesting_D = getInterestingVariants (tables_D)
interesting_E = getInterestingVariants (tables_E)
interesting_TA = getInterestingVariants (tables_TA)
interesting_TD = getInterestingVariants (tables_TD)
interesting_TE = getInterestingVariants (tables_TE)
In [52]:
all_interesting = list()
all_interesting = all_interesting + interesting_A + interesting_D + interesting_E + interesting_TA + interesting_TD + interesting_TE
all_interesting
Out[52]:
In [53]:
Counter(all_interesting)
Out[53]:
Therefore, the positions of interest are: 0, 1, 2, 3, 1670, 1785, 2340, 5662, 7172, 10006, 10806.
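As a sketch of how this list can be read off programmatically (the helper below is ours; in the notebook the positions are read off the Counter output by hand):

def positions_in_at_least_two(counts, min_experiments=2):
    # Keep positions flagged in at least `min_experiments` experiments
    return sorted(pos for pos, n in counts.items() if n >= min_experiments)

# e.g. positions_in_at_least_two(Counter(all_interesting))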
In [54]:
for_output = [0,1,2,3,1670,1785,2340,5662,7172,10006, 10806]
write_increasing_positions(for_output, all_tables, all_table_names, "positions_morethan01_all_samples.csv")
In [55]:
all_interesting_D = list()
all_interesting_D = all_interesting_D + interesting_A + interesting_D + interesting_E
all_interesting_D
Out[55]:
In [56]:
Counter(all_interesting_D)
Out[56]:
In [57]:
all_interesting_TD = list()
all_interesting_TD = all_interesting_TD + interesting_TA + interesting_TD + interesting_TE
all_interesting_TD
Out[57]:
In [58]:
Counter(all_interesting_TD)
Out[58]:
In [59]:
def test1670_2193(tables):
    num_tables = len(tables)
    first = tables[0]
    for i in [1670, 2193]:
        minor_frequencies = array('d', [0.0] * num_tables)
        major = first["Major_variant"][i]
        minor = tables[-1]["Second_variant"][i]
        if minor == major:
            minor = tables[-1]["Major_variant"][i]
        counter = 0
        print(str(i) + " : ")
        for j in range(num_tables):
            minor_frequencies[j] = get_variant_frequency(minor, tables[j], i)
            print("\texp " + str(j) + ", variant " + minor + " : " + str(minor_frequencies[j]))
            if minor_frequencies[j] > 0.1:
                counter += 1
        print("Counter of times > 0.1: " + str(counter))
    return
In [60]:
test1670_2193 (tables_A)
In [61]:
test1670_2193 (tables_D)
In [62]:
test1670_2193 (tables_E)
In [63]:
test1670_2193 (tables_TA)
In [64]:
test1670_2193 (tables_TD)
In [65]:
test1670_2193 (tables_TE)
In [66]:
def get_delta(tables, j, i):
    # L1 distance between the base-frequency vectors of samples j-1 and j at position i
    sum_of_bases_j = (tables[j]['As_quality_corrected'][i] + tables[j]['Cs_quality_corrected'][i]
                      + tables[j]['Gs_quality_corrected'][i] + tables[j]['Ts_quality_corrected'][i]
                      + tables[j]['Ns_quality_corrected'][i])
    sum_of_bases_j_1 = (tables[j-1]['As_quality_corrected'][i] + tables[j-1]['Cs_quality_corrected'][i]
                        + tables[j-1]['Gs_quality_corrected'][i] + tables[j-1]['Ts_quality_corrected'][i]
                        + tables[j-1]['Ns_quality_corrected'][i])
    sum_of_abs = (abs(tables[j]["As_quality_corrected"][i] / sum_of_bases_j - tables[j-1]["As_quality_corrected"][i] / sum_of_bases_j_1)
                  + abs(tables[j]["Cs_quality_corrected"][i] / sum_of_bases_j - tables[j-1]["Cs_quality_corrected"][i] / sum_of_bases_j_1)
                  + abs(tables[j]["Gs_quality_corrected"][i] / sum_of_bases_j - tables[j-1]["Gs_quality_corrected"][i] / sum_of_bases_j_1)
                  + abs(tables[j]["Ts_quality_corrected"][i] / sum_of_bases_j - tables[j-1]["Ts_quality_corrected"][i] / sum_of_bases_j_1))
    return sum_of_abs

def getDeltas(tables):
    num_tables = len(tables)
    first = tables[0]
    deltas = list()
    for i in range(num_tables - 1):
        deltas.append(array('d', [0.0] * len(first["Position"])))
    for i in first["Position"]:
        for j in range(num_tables - 1):
            # Delta j is the change in base frequencies between consecutive samples j and j+1
            # (calling get_delta with j+1 avoids comparing the first sample with the last one).
            deltas[j][i] = get_delta(tables, j + 1, i)
    names = list()
    d = list()
    ld = list()
    for j in range(num_tables - 1):
        for i in first["Position"]:
            names.append(str(j))
            d.append(deltas[j][i])
            ld.append(math.log(deltas[j][i] + 1e-9))
    d2 = {'names': names, 'deltas': d}
    ld2 = {'names': names, 'deltas': ld}
    df = pd.DataFrame(d2)
    ldf = pd.DataFrame(ld2)
    return df, ldf

def plotDeltas(d):
    sns.set_style("whitegrid")
    ax = sns.violinplot(x=d["names"], y=d["deltas"], inner=None)
    ax.set(xlabel='Between-sample deltas', ylabel='Delta values')

def plotLogDeltas(d):
    sns.set_style("whitegrid")
    ax = sns.boxplot(x=d["names"], y=d["deltas"])
    ax.set(xlabel='Between-sample deltas', ylabel='Delta values')
    ax.set_yscale('log')
In [67]:
d_A, ld_A = getDeltas (tables_A)
In [68]:
%matplotlib inline
plotLogDeltas(d_A)
In [69]:
d_D, ld_D = getDeltas (tables_D)
In [70]:
%matplotlib inline
plotLogDeltas(d_D)
In [71]:
d_E, ld_E = getDeltas (tables_E)
In [72]:
%matplotlib inline
plotLogDeltas(d_E)
In [73]:
d_TA, ld_TA = getDeltas (tables_TA)
In [74]:
%matplotlib inline
plotLogDeltas(d_TA)
In [75]:
d_TD, ld_TD = getDeltas (tables_TD)
In [76]:
%matplotlib inline
plotLogDeltas(d_TD)
In [77]:
d_TE, ld_TE = getDeltas (tables_TE)
In [78]:
%matplotlib inline
plotLogDeltas(d_TE)
In [79]:
deltas = list()
deltas = deltas + d_TE['deltas'].tolist() + d_TD['deltas'].tolist() + d_TA['deltas'].tolist() + d_E['deltas'].tolist() + d_D['deltas'].tolist() + d_A['deltas'].tolist()
print(len(deltas))
In [80]:
log_deltas = list()
print(type( ld_TE['deltas']))
log_deltas = log_deltas + ld_TE['deltas'].tolist() + ld_TD['deltas'].tolist() + ld_TA['deltas'].tolist() + ld_E['deltas'].tolist() + ld_D['deltas'].tolist() + ld_A['deltas'].tolist()
print(len(log_deltas))
In [81]:
a = np.array(deltas)
p = np.nanpercentile(a, 95) # return 95th percentile.
print(p)
a = np.array(log_deltas)
logp = np.nanpercentile(a, 95) # return 95th percentile.
print(logp)
In [82]:
sns.kdeplot(log_deltas)
plt.plot([logp, logp], [0, 1], linewidth=2)
Out[82]:
In [83]:
sns.kdeplot(log_deltas)
plt.plot([-5, -5], [0, 1], linewidth=2)
Out[83]:
In [84]:
def plotDeltasThreshold(d, threshold):
    sns.set_style("whitegrid")
    # Keep only the deltas greater than the threshold
    good = d['deltas'] > threshold
    df = d[good]
    ax = sns.violinplot(x=df["names"], y=df["deltas"], inner=None)
    ax.set(xlabel='Between-sample deltas', ylabel='Delta values')

def plotLogDeltasThreshold(d, threshold):
    sns.set_style("whitegrid")
    # The threshold is given on the log scale, so raw deltas are compared to exp(threshold)
    good = d['deltas'] > math.exp(threshold)
    df = d[good]
    ax = sns.boxplot(x=df["names"], y=df["deltas"])
    ax.set(xlabel='Between-sample deltas', ylabel='Delta values')
    ax.set_yscale('log')
In [85]:
plotLogDeltasThreshold(d_A, -5)
In [86]:
plotLogDeltasThreshold(d_D, -5)
In [87]:
plotLogDeltasThreshold(d_E, -5)