In [1]:
from collections import Counter
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.ticker as mticker
from pylab import rcParams
import seaborn as sns
from array import array
import math
import numpy as np
from scipy.stats import ttest_ind
from scipy.stats import linregress
from scipy.stats import mannwhitneyu
import statistics
%matplotlib inline
In [2]:
# Parse mature-peptide (gene) coordinates and names from the GenBank reference.
# assumes each "mat_peptide" feature line has the form "mat_peptide  begin..end"
# and is immediately followed by a qualifier line like /product="name" — TODO confirm
begins = []
ends = []
names = []
with open("CommonData/sequence.gb") as f:
    in_pep = False
    for l in f:
        if "mat_peptide" in l:
            # second whitespace-separated field is "begin..end"
            begins.append(int(l.split()[1].split("..")[0]))
            ends.append(int(l.split()[1].split("..")[1]))
            in_pep = True
        elif in_pep:
            # line right after the coordinates: text after '=' is the name
            # (keeps the surrounding quotes and trailing newline)
            names.append(l.split("=")[1])
            in_pep = False
print(begins)
print(ends)
print(names)
In [3]:
# Interesting positions
#positions=[316,1670,1785,2340,5935,7172,8449,9165]
# Curated genome positions highlighted as dotted vertical lines on every plot.
# NOTE: cell In [49] rebinds this name; cell In [53] restores it.
positions = [1670, 2340, 5662, 7172, 10006, 2193, 1785]
def plot_positions():
    """Draw a dotted vertical line at each position of interest (global `positions`)."""
    for pos in positions:
        plt.axvline(x=pos, linewidth=1, linestyle=':', color="black")
def plot_genes(with_text=True):
    """Mark gene (mature-peptide) boundaries on the current axes.

    Draws a short vertical tick at the start of each gene plus one at the end
    of the last gene. When `with_text` is True, writes the first three letters
    of each gene name, alternating between two heights so neighbouring labels
    do not overlap. Uses the module-level `begins`, `ends` and `names`.
    """
    for i, (begin, end) in enumerate(zip(begins, ends)):
        plt.plot([begin, begin], [0.99, 1.0], linewidth=2, linestyle='-', color="black")
        if with_text:
            label = (names[i].replace('"', ''))[0:3]
            y_text = 1.005 if i % 2 == 0 else 1.015
            plt.text(begin + ((end - begin) / 10), y_text, label, size='xx-small')
    plt.plot([ends[-1], ends[-1]], [0.99, 1.0], linewidth=2, linestyle='-', color="black")
In [4]:
def synonymous(row):
    """Classify a variant row as "synonymous" or "non-synonymous".

    A row counts as synonymous when there is no second base at all
    (row['null'] is truthy) or when consensus and second-base amino acids
    are identical; short-circuits on the null flag like the original.
    """
    return ("synonymous"
            if row['null'] or row['Consensus_aa'] == row['Secondbase_aa']
            else "non-synonymous")
def add_columns(table):
    """Augment a per-position variant table, in place, with derived columns:

    * 'null' — True where no second-base amino acid was called;
    * 'is_synonymous' — "synonymous"/"non-synonymous" per `synonymous`;
    * '1_major_variant_frequency' — complement of the major-variant frequency.
    """
    table['null'] = table['Secondbase_aa'].isnull()
    table['is_synonymous'] = table.apply(synonymous, axis=1)
    table['1_major_variant_frequency'] = 1.0 - table['Major_variant_frequency_quality_corrected']
In [5]:
def is_increasing(minor_frequencies):
    """Return True when the frequency series rises overall and never drops
    by more than a small tolerance between consecutive samples.

    Two criteria must hold:
      * overall gain: the last value exceeds the first by at least
        `minimum_increase`;
      * near-monotonicity: each step may dip below the running value by
        less than `tolerance`, but no more.

    Returns False for an empty series (the original raised IndexError).
    """
    if len(minor_frequencies) == 0:
        return False
    tolerance = 0.01          # allowed per-step backward wiggle
    minimum_increase = 0.1    # required overall gain, first -> last
    if minor_frequencies[-1] - minor_frequencies[0] < minimum_increase:
        return False
    previous = minor_frequencies[0]
    for current in minor_frequencies[1:]:
        # The original tested `previous < current or previous < current + tolerance`;
        # the first disjunct is implied by the second (tolerance > 0), so a
        # single comparison is equivalent.
        if previous < current + tolerance:
            previous = current
        else:
            return False
    return True
# Strict definition of an increasing position
def is_strictly_increasing(minor_frequencies):
    """Return True iff every value is strictly greater than the one before it.

    Raises IndexError on an empty sequence, matching the original behaviour.
    """
    previous = minor_frequencies[0]
    for current in minor_frequencies[1:]:
        if not (previous < current):
            return False
        previous = current
    return True
def get_variant_frequency(variant, table, i):
    """Frequency of base `variant` ("A", "C", "G" or "T") at position `i`,
    relative to the total quality-corrected A+C+G+T+N counts at that
    position. Returns np.nan for any other `variant` value.
    """
    column_for = {
        "A": "As_quality_corrected",
        "C": "Cs_quality_corrected",
        "G": "Gs_quality_corrected",
        "T": "Ts_quality_corrected",
    }
    sum_of_bases = (table['As_quality_corrected'][i]
                    + table['Cs_quality_corrected'][i]
                    + table['Gs_quality_corrected'][i]
                    + table['Ts_quality_corrected'][i]
                    + table['Ns_quality_corrected'][i])
    column = column_for.get(variant)
    if column is None:
        return np.nan
    return table[column][i] / sum_of_bases
def get_increasing_variants(tables):
    """For every genome position, test whether the eventual minor variant's
    frequency rises (per `is_increasing`) across the time-ordered `tables`.

    Returns a dict mapping position -> [major_frequencies, minor_frequencies]
    (plain lists, one entry per table) for the positions that qualify.
    """
    num_tables = len(tables)
    first = tables[0]
    last = tables[num_tables-1]
    major = ""
    minor = ""
    # Reused scratch buffers, one slot per sample/table.
    major_frequencies = array('d',[0.0]*num_tables)
    minor_frequencies = array('d',[0.0]*num_tables)
    increasingVariants = dict()
    # NOTE(review): position values are used directly as row labels, so this
    # assumes the "Position" column matches the table index — TODO confirm.
    for i in first["Position"]:
        major = first["Major_variant"][i]
        #print(last['Major_variant_frequency_quality_corrected'][i])
        major_frequencies[0] = first['Major_variant_frequency_quality_corrected'][i]
        # The tracked "minor" is whichever base took over in the last sample
        # if the major changed, else the last sample's runner-up base.
        if major == last["Major_variant"][i]:
            minor = last["Second_variant"][i]
        else:
            minor = last["Major_variant"][i]
        minor_frequencies[0] = get_variant_frequency(minor, first, i)
        for table_id in range(1, num_tables):
            major_frequencies[table_id] = get_variant_frequency(major, tables[table_id], i)
            minor_frequencies[table_id] = get_variant_frequency(minor, tables[table_id], i)
        if is_increasing(minor_frequencies):
            # Store copies: the scratch arrays are overwritten each iteration.
            increasingVariants[i] = [major_frequencies.tolist(), minor_frequencies.tolist()]
    return increasingVariants
def printMajorFrequencyThroughSamples(tables, numPos):
    """Report how the first sample's major variant at position `numPos`
    changes in frequency through every sample, then name the last sample's
    major variant at that position.

    Fix: removed the unused local `last_major`, which was assigned but
    never read (the final print reads tables[-1] directly).
    """
    major = tables[0]['Major_variant'][numPos]
    print("Position "+ str(numPos) +", Major variant in first sample: " + major)
    print("Position "+ str(numPos) +", Frequencies of "+major+" through the samples: ")
    for i in range(len(tables)):
        print("\t"+str(get_variant_frequency(major, tables[i], numPos)))
    print("Position "+ str(numPos) +", Major variant in last sample: " + tables[-1]['Major_variant'][numPos])
def printMajorFrequencyThroughSamples_2340_7172(tables):
    """Convenience wrapper: report the two positions of special interest."""
    for position in (2340, 7172):
        printMajorFrequencyThroughSamples(tables, position)
In [6]:
## Function to get colors by time points
def color_from_name(name, colors):
    """Map a sample name to a palette entry based on the passage/round
    marker embedded in the name ("3_", "6_", ..., "51_"), checked in that
    order. Prints a warning and returns None when no marker is found.
    """
    round_markers = ("3_", "6_", "9_", "12_", "18_", "24_", "51_")
    for index, marker in enumerate(round_markers):
        if marker in name:
            return colors[index]
    print("PROBLEM: did not find " + name)
def colors_from_names(names):
    """Translate sample names into hex colors drawn from a 7-color seaborn
    palette, one color per passage round (see `color_from_name`)."""
    hexa_cols = sns.color_palette(n_colors=7).as_hex()
    return [color_from_name(name, hexa_cols) for name in names]
In [7]:
# Control runs, replicate A
# Per-position variant tables from the mapping pipeline; " -nan" is the
# pipeline's missing-value sentinel and is parsed as NaN.
DD3_A = pd.read_csv ("PKG-DREUX_HV5GLBCXY/CutAdaptPearViroMapperMapped/HV5GLBCXY_ZIKV_17s006139-1-1_DREUX_lane1DD3A_1_sequence.txt.assembled.fastq_mapped_AA.csv", na_values=" -nan")
add_columns(DD3_A)
DD6_A = pd.read_csv ("PKG-DREUX_HV5GLBCXY/CutAdaptPearViroMapperMapped/HV5GLBCXY_ZIKV_17s006139-1-1_DREUX_lane1DD6A_1_sequence.txt.assembled.fastq_mapped_AA.csv", na_values=" -nan")
add_columns(DD6_A)
DD9_A = pd.read_csv ("PKG-DREUX_HV5GLBCXY/CutAdaptPearViroMapperMapped/HV5GLBCXY_ZIKV_17s006139-1-1_DREUX_lane1DD9A_1_sequence.txt.assembled.fastq_mapped_AA.csv", na_values=" -nan")
add_columns(DD9_A)
DD12_A = pd.read_csv ("PKG-DREUX_HV5GLBCXY/CutAdaptPearViroMapperMapped/HV5GLBCXY_ZIKV_17s006139-1-1_DREUX_lane1DD12A_1_sequence.txt.assembled.fastq_mapped_AA.csv", na_values=" -nan")
add_columns(DD12_A)
# The DD18 sample comes from a different sequencing run/directory.
DD18_A = pd.read_csv ("HJJ7JBCX2_ZIKV/Mapped_Reads/HJJ7JBCX2_ZIKV-s-and-c_18s004258-1-1_DREUX_lane1DD18A_1_sequence.txt.assembled.fastq_mapped_AA.csv", na_values=" -nan")
add_columns(DD18_A)
DD24_A = pd.read_csv ("PKG-DREUX_HV5GLBCXY/CutAdaptPearViroMapperMapped/HV5GLBCXY_ZIKV_17s006139-1-1_DREUX_lane1DD24A_1_sequence.txt.assembled.fastq_mapped_AA.csv", na_values=" -nan")
add_columns(DD24_A)
DD51_A = pd.read_csv ("PKG-DREUX_HV5GLBCXY/CutAdaptPearViroMapperMapped/HV5GLBCXY_ZIKV_17s006139-1-1_DREUX_lane1DD51A_1_sequence.txt.assembled.fastq_mapped_AA.csv", na_values=" -nan")
add_columns(DD51_A)
# Loaded for reference but deliberately left out of tables_A below.
DD51_A_no_reamp = pd.read_csv ("PKG-DREUX_HV5GLBCXY/CutAdaptPearViroMapperMapped/HV5GLBCXY_ZIKV_17s006139-1-1_DREUX_lane1DD51Anoreamplification_1_sequence.txt.assembled.fastq_mapped_AA.csv", na_values=" -nan")
add_columns(DD51_A_no_reamp)
# Time-ordered samples; names_A must stay aligned with tables_A.
tables_A = [DD3_A, DD6_A, DD9_A, DD12_A, DD18_A, DD24_A, DD51_A]
names_A = ["DD3_A", "DD6_A", "DD9_A", "DD12_A", "DD18_A", "DD24_A", "DD51_A"]
colors_A = colors_from_names(names_A)
In [8]:
# Control runs, replicate D
# NOTE(review): unlike replicate A, these reads omit na_values=" -nan" —
# confirm the D files never contain that sentinel, or add it for consistency.
DD3_D = pd.read_csv ("PKG-DREUX_HV5GLBCXY/CutAdaptPearViroMapperMapped/HV5GLBCXY_ZIKV_17s006139-1-1_DREUX_lane1DD3D_1_sequence.txt.assembled.fastq_mapped_AA.csv")
add_columns(DD3_D)
DD6_D = pd.read_csv ("PKG-DREUX_HV5GLBCXY/CutAdaptPearViroMapperMapped/HV5GLBCXY_ZIKV_17s006139-1-1_DREUX_lane1DD6D_1_sequence.txt.assembled.fastq_mapped_AA.csv")
add_columns(DD6_D)
DD9_D = pd.read_csv ("PKG-DREUX_HV5GLBCXY/CutAdaptPearViroMapperMapped/HV5GLBCXY_ZIKV_17s006139-1-1_DREUX_lane1DD9D_1_sequence.txt.assembled.fastq_mapped_AA.csv")
add_columns(DD9_D)
DD12_D = pd.read_csv ("PKG-DREUX_HV5GLBCXY/CutAdaptPearViroMapperMapped/HV5GLBCXY_ZIKV_17s006139-1-1_DREUX_lane1DD12D_1_sequence.txt.assembled.fastq_mapped_AA.csv")
add_columns(DD12_D)
DD18_D = pd.read_csv ("HJJ7JBCX2_ZIKV/Mapped_Reads/HJJ7JBCX2_ZIKV-s-and-c_18s004258-1-1_DREUX_lane1DD18D_1_sequence.txt.assembled.fastq_mapped_AA.csv")
add_columns(DD18_D)
DD24_D = pd.read_csv ("PKG-DREUX_HV5GLBCXY/CutAdaptPearViroMapperMapped/HV5GLBCXY_ZIKV_17s006139-1-1_DREUX_lane1DD24D_1_sequence.txt.assembled.fastq_mapped_AA.csv")
add_columns(DD24_D)
# No round-51 sample exists for replicate D.
tables_D = [DD3_D, DD6_D, DD9_D, DD12_D, DD18_D, DD24_D]
names_D = ["DD3_D", "DD6_D", "DD9_D", "DD12_D", "DD18_D", "DD24_D"]
colors_D = colors_from_names(names_D)
In [9]:
# Control runs, replicate E
DD3_E = pd.read_csv ("HJJ7JBCX2_ZIKV/Mapped_Reads/HJJ7JBCX2_ZIKV-s-and-c_18s004258-1-1_DREUX_lane1DD3E_1_sequence.txt.assembled.fastq_mapped_AA.csv")
DD6_E = pd.read_csv ("PKG-DREUX_HV5GLBCXY/CutAdaptPearViroMapperMapped/HV5GLBCXY_ZIKV_17s006139-1-1_DREUX_lane1DD6E_1_sequence.txt.assembled.fastq_mapped_AA.csv")
DD9_E = pd.read_csv ("PKG-DREUX_HV5GLBCXY/CutAdaptPearViroMapperMapped/HV5GLBCXY_ZIKV_17s006139-1-1_DREUX_lane1DD9E_1_sequence.txt.assembled.fastq_mapped_AA.csv")
DD12_E = pd.read_csv ("HJJ7JBCX2_ZIKV/Mapped_Reads/HJJ7JBCX2_ZIKV-s-and-c_18s004258-1-1_DREUX_lane1DD12E_1_sequence.txt.assembled.fastq_mapped_AA.csv")
DD18_E = pd.read_csv ("HJJ7JBCX2_ZIKV/Mapped_Reads/HJJ7JBCX2_ZIKV-s-and-c_18s004258-1-1_DREUX_lane1DD18E_1_sequence.txt.assembled.fastq_mapped_AA.csv")
DD24_E = pd.read_csv ("HJJ7JBCX2_ZIKV/Mapped_Reads/HJJ7JBCX2_ZIKV-s-and-c_18s004258-1-1_DREUX_lane1DD24E_1_sequence.txt.assembled.fastq_mapped_AA.csv")
DD24crude_E = pd.read_csv ("HJJ7JBCX2_ZIKV/Mapped_Reads/HJJ7JBCX2_ZIKV-s-and-c_18s004258-1-1_DREUX_lane1DD24Ecrude_1_sequence.txt.assembled.fastq_mapped_AA.csv")
add_columns(DD3_E)
add_columns(DD6_E)
add_columns(DD9_E)
add_columns(DD12_E)
add_columns(DD18_E)
add_columns(DD24_E)
add_columns(DD24crude_E)
# DD24crude_E is loaded and annotated but deliberately excluded from tables_E.
tables_E = [DD3_E, DD6_E, DD9_E, DD12_E, DD18_E, DD24_E]
names_E = ["DD3_E", "DD6_E", "DD9_E", "DD12_E", "DD18_E", "DD24_E"]
colors_E = colors_from_names(names_E)
In [10]:
# TLR3 activation runs, replicate A
# TLR3 series starts at round 9 (no TD3/TD6 samples).
TD9_A = pd.read_csv ("PKG-DREUX_HV5GLBCXY/CutAdaptPearViroMapperMapped/HV5GLBCXY_ZIKV_17s006139-1-1_DREUX_lane1TD9A_1_sequence.txt.assembled.fastq_mapped_AA.csv")
add_columns(TD9_A)
TD12_A = pd.read_csv ("PKG-DREUX_HV5GLBCXY/CutAdaptPearViroMapperMapped/HV5GLBCXY_ZIKV_17s006139-1-1_DREUX_lane1TD12A_1_sequence.txt.assembled.fastq_mapped_AA.csv")
add_columns(TD12_A)
TD18_A = pd.read_csv ("HJJ7JBCX2_ZIKV/Mapped_Reads/HJJ7JBCX2_ZIKV-s-and-c_18s004258-1-1_DREUX_lane1TD18A_1_sequence.txt.assembled.fastq_mapped_AA.csv")
add_columns(TD18_A)
TD24_A = pd.read_csv ("PKG-DREUX_HV5GLBCXY/CutAdaptPearViroMapperMapped/HV5GLBCXY_ZIKV_17s006139-1-1_DREUX_lane1TD24A_1_sequence.txt.assembled.fastq_mapped_AA.csv")
add_columns(TD24_A)
TD51_A = pd.read_csv ("PKG-DREUX_HV5GLBCXY/CutAdaptPearViroMapperMapped/HV5GLBCXY_ZIKV_17s006139-1-1_DREUX_lane1TD51A_1_sequence.txt.assembled.fastq_mapped_AA.csv")
add_columns(TD51_A)
tables_TA = [TD9_A, TD12_A, TD18_A, TD24_A, TD51_A]
names_TA = ["TD9_A", "TD12_A", "TD18_A", "TD24_A", "TD51_A"]
colors_TA = colors_from_names(names_TA)
In [11]:
# TLR3 activation runs, replicate D
TD9_D = pd.read_csv ("HJJ7JBCX2_ZIKV/Mapped_Reads/HJJ7JBCX2_ZIKV-s-and-c_18s004258-1-1_DREUX_lane1TD9D_1_sequence.txt.assembled.fastq_mapped_AA.csv")
TD12_D = pd.read_csv ("HJJ7JBCX2_ZIKV/Mapped_Reads/HJJ7JBCX2_ZIKV-s-and-c_18s004258-1-1_DREUX_lane1TD12D_1_sequence.txt.assembled.fastq_mapped_AA.csv")
TD18_D = pd.read_csv ("HJJ7JBCX2_ZIKV/Mapped_Reads/HJJ7JBCX2_ZIKV-s-and-c_18s004258-1-1_DREUX_lane1TD18D_1_sequence.txt.assembled.fastq_mapped_AA.csv")
TD24_D = pd.read_csv ("HJJ7JBCX2_ZIKV/Mapped_Reads/HJJ7JBCX2_ZIKV-s-and-c_18s004258-1-1_DREUX_lane1TD24D_1_sequence.txt.assembled.fastq_mapped_AA.csv")
add_columns(TD9_D)
add_columns(TD12_D)
add_columns(TD18_D)
add_columns(TD24_D)
tables_TD = [TD9_D, TD12_D, TD18_D, TD24_D]
names_TD = ["TD9_D", "TD12_D", "TD18_D", "TD24_D"]
colors_TD = colors_from_names(names_TD)
In [12]:
# TLR3 activation runs, replicate E
TD9_E = pd.read_csv ("HJJ7JBCX2_ZIKV/Mapped_Reads/HJJ7JBCX2_ZIKV-s-and-c_18s004258-1-1_DREUX_lane1TD9E_1_sequence.txt.assembled.fastq_mapped_AA.csv")
TD12_E = pd.read_csv ("HJJ7JBCX2_ZIKV/Mapped_Reads/HJJ7JBCX2_ZIKV-s-and-c_18s004258-1-1_DREUX_lane1TD12E_1_sequence.txt.assembled.fastq_mapped_AA.csv")
TD18_E = pd.read_csv ("HJJ7JBCX2_ZIKV/Mapped_Reads/HJJ7JBCX2_ZIKV-s-and-c_18s004258-1-1_DREUX_lane1TD18E_1_sequence.txt.assembled.fastq_mapped_AA.csv")
TD24_E = pd.read_csv ("HJJ7JBCX2_ZIKV/Mapped_Reads/HJJ7JBCX2_ZIKV-s-and-c_18s004258-1-1_DREUX_lane1TD24E_1_sequence.txt.assembled.fastq_mapped_AA.csv")
add_columns(TD9_E)
add_columns(TD12_E)
add_columns(TD18_E)
add_columns(TD24_E)
tables_TE = [TD9_E, TD12_E, TD18_E, TD24_E]
names_TE= [ "TD9_E", "TD12_E", "TD18_E", "TD24_E"]
colors_TE = colors_from_names(names_TE)
In [13]:
# All tables
# NOTE: all_table_names must stay aligned, element by element, with
# all_tables (same concatenation order: A, D, E controls then TA, TD, TE).
all_tables = tables_A+tables_D+tables_E+tables_TA+tables_TD+tables_TE
all_table_names = ["DD3_A", "DD6_A", "DD9_A", "DD12_A", "DD18_A", "DD24_A", "DD51_A", "DD3_D", "DD6_D", "DD9_D", "DD12_D", "DD18_D", "DD24_D", "DD3_E", "DD6_E", "DD9_E", "DD12_E", "DD18_E", "DD24_E", "TD9_A", "TD12_A", "TD18_A", "TD24_A", "TD51_A", "TD9_D", "TD12_D", "TD18_D", "TD24_D", "TD9_E", "TD12_E", "TD18_E", "TD24_E"]
all_experiments = [tables_A,tables_D,tables_E,tables_TA,tables_TD,tables_TE]
all_experiment_names = ["tables_A","tables_D","tables_E","tables_TA","tables_TD","tables_TE"]
In [14]:
## Function to get colors by experiment
def color_from_name_by_experiment(name, colors):
    """Map a sample name to a palette entry by experiment group.

    DD (control) replicates A/D/E -> colors[0..2];
    TD (TLR3-activated) replicates A/D/E -> colors[3..5].
    Prints a warning (and returns None) for names in neither group;
    a DD/TD name with no recognised replicate suffix silently yields None,
    matching the original control flow.
    """
    if "DD" in name:
        base = 0
    elif "TD" in name:
        base = 3
    else:
        print("PROBLEM: did not find " + name)
        return None
    for suffix, offset in (("_A", 0), ("_D", 1), ("_E", 2)):
        if suffix in name:
            return colors[base + offset]
def colors_from_names_by_experiment(names):
    """Translate sample names into a fixed six-color hex palette, one color
    per experiment/replicate combination (DD_A, DD_D, DD_E, TD_A, TD_D, TD_E).
    """
    hexa_cols = ["#9b59b6", "#3498db", "#95a5a6", "#e74c3c", "#34495e", "#2ecc71"]
    return [color_from_name_by_experiment(name, hexa_cols) for name in names]
In [15]:
# Tables per round
# Regroup the same samples by passage round (3..51) across replicates and
# conditions, so rounds can be compared side by side. Rounds 3 and 6 have
# no TLR3 samples; round 51 exists only for replicate A.
tables_3 = [DD3_A, DD3_D, DD3_E]
table_names_3 = ["DD3_A", "DD3_D", "DD3_E"]
tables_6 = [DD6_A, DD6_D, DD6_E]
table_names_6 = ["DD6_A", "DD6_D", "DD6_E"]
tables_9 = [DD9_A, DD9_D, DD9_E, TD9_A, TD9_D, TD9_E]
table_names_9 = ["DD9_A", "DD9_D", "DD9_E", "TD9_A", "TD9_D", "TD9_E"]
tables_12 = [DD12_A, DD12_D, DD12_E, TD12_A, TD12_D, TD12_E]
table_names_12 = ["DD12_A", "DD12_D", "DD12_E", "TD12_A", "TD12_D", "TD12_E"]
tables_18 = [DD18_A, DD18_D, DD18_E, TD18_A, TD18_D, TD18_E]
table_names_18 = ["DD18_A", "DD18_D", "DD18_E", "TD18_A", "TD18_D", "TD18_E"]
tables_24 = [DD24_A, DD24_D, DD24_E, TD24_A, TD24_D, TD24_E]
table_names_24 = ["DD24_A", "DD24_D", "DD24_E", "TD24_A", "TD24_D", "TD24_E"]
tables_51 = [DD51_A, TD51_A]
table_names_51 = ["DD51_A", "TD51_A"]
# colors per round
colors_3 = colors_from_names_by_experiment(table_names_3)
colors_6 = colors_from_names_by_experiment(table_names_6)
colors_9 = colors_from_names_by_experiment(table_names_9)
colors_12 = colors_from_names_by_experiment(table_names_12)
colors_18 = colors_from_names_by_experiment(table_names_18)
colors_24 = colors_from_names_by_experiment(table_names_24)
colors_51 = colors_from_names_by_experiment(table_names_51)
In [16]:
def plotCoverage(tables, names):
    """Overlay per-position sequencing coverage for a set of samples.

    Produces three versions of the same scatter figure:
      1. full plot with legend, gene names and position markers,
      2. the same without text (for figure assembly),
      3. a flat (aspect=4) text-free variant.
    Relies on the seaborn palette set by the caller and on the globals
    used by plot_positions()/plot_genes().
    """
    variable = 'Coverage'
    sample = list()
    posList = list()
    variableList = list()
    # Concatenate all samples into one long-format table for lmplot.
    for i in range(len(names)):
        sample = sample + len(tables[i][variable]) * [names[i]]
        posList.append(tables[i]['Position'])
        variableList.append(tables[i][variable])
    positions = pd.concat(posList)   # local name; does not touch the global `positions`
    variableValues = pd.concat(variableList)
    overlay_table_concat = pd.DataFrame ({'Position':positions, variable:variableValues, 'sample':sample})
    # NOTE(review): lmplot's `size=` was renamed `height=` in seaborn >= 0.9 —
    # confirm the pinned seaborn version before upgrading.
    sns.lmplot( x="Position", y=variable, data=overlay_table_concat, fit_reg=False, hue='sample', legend=False, size=7, aspect=2, lowess=True,scatter_kws={"s": 5})
    plt.legend(loc='lower right')
    plt.ylim(0, 300000)
    plot_positions()
    plot_genes()
    # Now a plot without text
    g=sns.lmplot( x="Position", y=variable, data=overlay_table_concat, fit_reg=False, hue='sample', legend=False, size=7, aspect=2, lowess=True,scatter_kws={"s": 5})
    #plt.legend(loc='lower right')
    plt.ylim(0, 300000)
    plot_positions()
    plot_genes(False)
    g.set_ylabels("")
    g.set_xlabels("")
    g.set(xticks=[],yticks=[0,50000,100000,150000, 200000, 250000, 300000])
    # Now a plot without text and flat
    g=sns.lmplot( x="Position", y=variable, data=overlay_table_concat, fit_reg=False, hue='sample', legend=False, size=7, aspect=4, lowess=True,scatter_kws={"s": 5})
    #plt.legend(loc='lower right')
    plt.ylim(0, 300000)
    plot_positions()
    plot_genes(False)
    g.set_ylabels("")
    g.set_xlabels("")
    g.set(xticks=[],yticks=[0,50000,100000,150000, 200000, 250000, 300000])
In [17]:
# Coverage plots: one cell per replicate/condition, then one per passage
# round. Each cell sets the palette first because plotCoverage relies on the
# current seaborn palette for its hue colors.
sns.set_palette(colors_A)
sns.color_palette(colors_A)
plotCoverage(tables_A, names_A)
In [18]:
sns.set_palette(colors_D)
sns.color_palette(colors_D)
plotCoverage(tables_D, names_D)
In [19]:
# Sanity check: names and their assigned colors, side by side.
print(names_A)
print(colors_A)
print(names_D)
print(colors_D)
print(names_TA)
print(colors_TA)
In [20]:
sns.set_palette(colors_E)
sns.color_palette(colors_E)
plotCoverage(tables_E, names_E)
In [21]:
sns.set_palette(colors_TA)
#sns.color_palette(colors_TA)
plotCoverage(tables_TA, names_TA)
In [22]:
sns.set_palette(colors_TD)
plotCoverage(tables_TD, names_TD)
In [23]:
sns.set_palette(colors_TE)
plotCoverage(tables_TE, names_TE)
In [24]:
# Per-round views (colors keyed by experiment/replicate, not time point).
sns.set_palette(colors_3)
plotCoverage(tables_3, table_names_3)
In [25]:
sns.set_palette(colors_6)
plotCoverage(tables_6, table_names_6)
In [26]:
sns.set_palette(colors_9)
plotCoverage(tables_9, table_names_9)
In [27]:
sns.set_palette(colors_12)
plotCoverage(tables_12, table_names_12)
In [28]:
sns.set_palette(colors_18)
plotCoverage(tables_18, table_names_18)
In [29]:
sns.set_palette(colors_24)
plotCoverage(tables_24, table_names_24)
In [30]:
sns.set_palette(colors_51)
plotCoverage(tables_51, table_names_51)
In [31]:
# Functions to think in terms of standard deviation of the frequency per site
def compute_sds(tables):
    """Per-position population standard deviation of the frequency of the
    first sample's major variant, tracked across all `tables`.

    Returns a list of std-dev values, one per entry of the first table's
    "Position" column, in that order.
    """
    first = tables[0]
    sds = list()
    for pos in first["Position"]:
        major = first["Major_variant"][pos]
        frequencies = [first['Major_variant_frequency_quality_corrected'][pos]]
        for other in tables[1:]:
            frequencies.append(get_variant_frequency(major, other, pos))
        sds.append(statistics.pstdev(frequencies))
    return sds
def get_varying_variants_sd(tables, sd_threshold):
    """Find positions whose major-variant frequency fluctuates strongly.

    For every position, computes the population standard deviation of the
    first sample's major-variant frequency across `tables`, keeps the
    positions above `sd_threshold`, prints a short report, and returns a
    dict mapping position -> standard deviation.
    """
    first = tables[0]
    varyingVariants = dict()
    for pos in first["Position"]:
        major = first["Major_variant"][pos]
        frequencies = [first['Major_variant_frequency_quality_corrected'][pos]]
        for other in tables[1:]:
            frequencies.append(get_variant_frequency(major, other, pos))
        sd_value = statistics.pstdev(frequencies)
        if sd_value > sd_threshold:
            varyingVariants[pos] = sd_value
    print("There are "+str(len(varyingVariants))+" positions whose major variant varies a lot in frequency.")
    print("Those are:")
    print(varyingVariants.keys())
    return varyingVariants
In [32]:
# Compute all standard deviations for all sites for all experiments
# (one list of per-position std devs per replicate/condition).
sds_A = compute_sds(tables_A)
sds_D = compute_sds(tables_D)
sds_E = compute_sds(tables_E)
sds_TA = compute_sds(tables_TA)
sds_TD = compute_sds(tables_TD)
sds_TE = compute_sds(tables_TE)
In [33]:
# Pool every position*experiment std dev into one flat list.
sds = list()
sds = sds + sds_A + sds_D + sds_E + sds_TA + sds_TD + sds_TE
print(len(sds))
In [34]:
# Density of the pooled standard deviations, with a vertical line marking
# the cutoff reused by all the downstream "varying position" analyses.
a4_dims = (11.7, 8.27)
fig, ax = plt.subplots(figsize=a4_dims)
sns.kdeplot(sds)
sds_threshold = 0.015   # cutoff shared by the cells below
plt.plot([sds_threshold, sds_threshold], [0, 1], linewidth=2)
ax.set(xlabel='Standard deviation of base frequencies', ylabel='Density')
Out[34]:
In [35]:
# Count position*experiment pairs whose std dev exceeds the cutoff.
num = sum(1 for sd in sds if sd > sds_threshold)
print("There are "+str(num) + " pairs position*experiments that have a standard deviation larger than the threshold "+str(sds_threshold))
In [36]:
# Flag, separately for each experiment, the positions whose major-variant
# frequency std dev exceeds the shared cutoff.
varying_A = get_varying_variants_sd(tables_A, sds_threshold)
varying_D = get_varying_variants_sd(tables_D, sds_threshold)
varying_E = get_varying_variants_sd(tables_E, sds_threshold)
varying_TA = get_varying_variants_sd(tables_TA, sds_threshold)
varying_TD = get_varying_variants_sd(tables_TD, sds_threshold)
varying_TE = get_varying_variants_sd(tables_TE, sds_threshold)
In [37]:
# Per-experiment dicts of high-variance positions, in a fixed order.
all_varying = [varying_A,varying_D,varying_E,varying_TA,varying_TD,varying_TE]
def build_table_for_plotting_varying_pos(all_varying):
    """Count, for each position, in how many experiments it was flagged as
    highly varying. Returns a Counter mapping position -> experiment count."""
    flagged_positions = []
    for experiment in all_varying:
        flagged_positions.extend(experiment.keys())
    return Counter(flagged_positions)
# Tally how many of the six experiments flag each position.
counts = build_table_for_plotting_varying_pos(all_varying)
print (counts)
In [38]:
# Horizontal bar chart: per position, the number of experiments (0-6) in
# which its std dev exceeded the cutoff.
counts_sorted = sorted(counts.items())
labels, values = zip(*counts_sorted)
indexes = np.arange(len(labels))
width = 1
sns.set_palette("hls")
sns.set_context("poster")
a4_dims = (14.7, 14.27)
# NOTE(review): the 'standard_deviation' column actually holds the *count*
# of experiments above threshold, not a std dev; the axis label set below
# is the accurate description.
to_plot = pd.DataFrame ({'Position':indexes, 'standard_deviation':values})
fig, ax = plt.subplots(figsize=a4_dims)
sns.barplot(y="Position", x="standard_deviation", data=to_plot, orient="h")
ax.set(xlabel='Number of samples where standard deviation is above ' + str(sds_threshold), ylabel='Position in the genome')
plt.yticks(indexes + width * 0.5, labels, size=10)
plt.show()
Positions 2340, 5662, 7172 and 10006 vary a lot in all 6 experiments.
In [39]:
def plotSecondVariant(tables, names):
    """Overlay the second-variant frequency along the genome for a set of
    samples, in three variants: with legend and labels, without text, and a
    flat (aspect=4) text-free version.

    Relies on the seaborn palette set by the caller and on the global
    `positions` via plot_positions().
    """
    variable = 'Second_variant_frequency_quality_corrected'
    sample = list()
    posList = list()
    variableList = list()
    # Long-format concatenation of all samples for lmplot.
    for i in range(len(names)):
        sample = sample + len(tables[i][variable]) * [names[i]]
        posList.append(tables[i]['Position'])
        variableList.append(tables[i][variable])
    positions = pd.concat(posList)   # local name; the global `positions` is untouched
    variableValues = pd.concat(variableList)
    overlay_table_concat = pd.DataFrame ({'Position':positions, variable:variableValues, 'sample':sample})
    sns.lmplot( x="Position", y=variable, data=overlay_table_concat, fit_reg=False, hue='sample', legend=False, size=7, aspect=2)
    # plt.legend(loc='lower right')
    plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
    plot_positions()
    axes = plt.gca()
    axes.set_ylim([0,1.0])
    # Now a plot without text
    g=sns.lmplot( x="Position", y=variable, data=overlay_table_concat, fit_reg=False, hue='sample', legend=False, size=7, aspect=2)
    plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
    plot_positions()
    g.set_ylabels("")
    g.set_xlabels("")
    g.set(xticks=[],yticks=[0.0, 0.2, 0.4, 0.6, 0.8, 1.0])
    axes = plt.gca()
    axes.set_ylim([0,1.0])
    # Now a plot without text and flat
    g=sns.lmplot( x="Position", y=variable, data=overlay_table_concat, fit_reg=False, hue='sample', legend=False, size=7, aspect=4)
    plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
    plot_positions()
    g.set_ylabels("")
    g.set_xlabels("")
    g.set(xticks=[],yticks=[0.0, 0.2, 0.4, 0.6, 0.8, 1.0])
    axes = plt.gca()
    axes.set_ylim([0,1.0])
    #plot_genes()
In [40]:
# Second-variant frequency plots, one cell per replicate/condition;
# the palette must be set before each call (plotSecondVariant uses it).
sns.set_palette(colors_A)
plotSecondVariant(tables_A, names_A)
In [41]:
sns.set_palette(colors_D)
plotSecondVariant(tables_D, names_D)
In [42]:
sns.set_palette(colors_E)
plotSecondVariant(tables_E, names_E)
In [43]:
sns.set_palette(colors_TA)
plotSecondVariant(tables_TA, names_TA)
In [44]:
sns.set_palette(colors_TD)
plotSecondVariant(tables_TD, names_TD)
In [45]:
sns.set_palette(colors_TE)
plotSecondVariant(tables_TE, names_TE)
In [46]:
# Mean sequencing coverage per sample, across every loaded table.
for table, table_name in zip(all_tables, all_table_names):
    avg = np.mean(table['Coverage'])
    print("Experiment "+ table_name + " : " + str(avg))
In [47]:
def listAllSecondVariantsAboveThreshold(tables, names, threshold):
    """Print every (sample, position) whose second-variant frequency exceeds
    `threshold`.

    Parameters
    ----------
    tables : per-sample variant tables, aligned element-wise with `names`
    names : sample labels used in the printed report
    threshold : minimum Second_variant_frequency_quality_corrected to report

    Fix: removed the `sample`/`posList`/`variableList` accumulators, which
    were built but never used (dead code copied from the plotting helpers).
    """
    variable = 'Second_variant_frequency_quality_corrected'
    for i in range(len(names)):
        for j in range(len(tables[i][variable])):
            if tables[i][variable][j] > threshold:
                print("Sample " + names[i] + " ; position " + str(j) + "; Second_variant_frequency_quality_corrected : " + str(tables[i][variable][j]))
In [48]:
# Report second variants above 20% frequency in control replicate A.
listAllSecondVariantsAboveThreshold(tables_A, names_A, 0.2)
In [49]:
# list of all highly varying variants in at least one experiment: all_varying
def get_all_varying_pos(all_varying):
    """Union of the flagged positions across all experiments, deduplicated
    and sorted ascending."""
    merged = []
    for experiment in all_varying:
        merged.extend(experiment.keys())
    unique_positions = list(dict.fromkeys(merged))
    unique_positions.sort()
    return unique_positions
# Sorted union of positions flagged in at least one experiment.
all_varying_keys = get_all_varying_pos(all_varying)
def get_sd_for_select_variants(variants, tables):
    """Population standard deviation of the first sample's major-variant
    frequency across `tables`, computed only for the given `variants`
    (genome positions). Returns one value per position, in input order."""
    first = tables[0]
    sd_values = list()
    for pos in variants:
        major = first["Major_variant"][pos]
        frequencies = [first['Major_variant_frequency_quality_corrected'][pos]]
        for other in tables[1:]:
            frequencies.append(get_variant_frequency(major, other, pos))
        sd_values.append(statistics.pstdev(frequencies))
    return sd_values
# Build a long-format table of std devs for every flagged position in every
# experiment, then strip-plot them against the threshold.
sample = list()
variableList = list()
pos = list()
for i in range(len(all_experiments)):
    sample = sample + len(all_varying_keys) * [all_experiment_names[i]]
    variableList = variableList + get_sd_for_select_variants(all_varying_keys, all_experiments[i])
    pos = pos + all_varying_keys
# NOTE(review): this rebinds the module-level `positions` (the curated list
# used by plot_positions) to the flagged-position list; cell In [53] has to
# restore it before plotting again. Consider using a different name.
positions = pos
variableValues = variableList
overlay_table_concat = pd.DataFrame ({'Position':positions, 'StdDev':variableValues, 'sample':sample})
a4_dims = (14.7, 7.27)
fig, ax = plt.subplots(figsize=a4_dims)
g = sns.stripplot( x="Position", y='StdDev', data=overlay_table_concat, hue='sample', size=7)
# Horizontal line for the threshold sds_threshold
plt.axhline(y=sds_threshold, linewidth=1, linestyle=':', color="black")
#g.set_xlabels(positions, rotation=30)
#g.set(xticks=[],yticks=[0,50000,100000,150000, 200000, 250000, 300000])
g.set_xticklabels(positions, rotation=60, fontdict={'fontsize':10})
plt.legend(bbox_to_anchor=(1.05, 1), loc=1, borderaxespad=0.)
axes = plt.gca()
axes.set_ylim([0,0.5])
Out[49]:
In [50]:
# Same strip plot as above, without the legend and with a descriptive y label.
a4_dims = (14.7, 7.27)
fig, ax = plt.subplots(figsize=a4_dims)
g = sns.stripplot( x="Position", y='StdDev', data=overlay_table_concat, hue='sample', size=7)
g.legend_.remove()
g.set(ylabel='Standard deviation of base frequencies')
# Horizontal line for the threshold sds_threshold
plt.axhline(y=sds_threshold, linewidth=1, linestyle=':', color="black")
#g.set_xlabels(positions, rotation=30)
#g.set(xticks=[],yticks=[0,50000,100000,150000, 200000, 250000, 300000])
g.set_xticklabels(positions, rotation=60, fontdict={'fontsize':10})
axes = plt.gca()
axes.set_ylim([0,0.5])
Out[50]:
In [51]:
# Same strip plot again, fully stripped of legend and y label (for assembly).
a4_dims = (14.7, 7.27)
fig, ax = plt.subplots(figsize=a4_dims)
g = sns.stripplot( x="Position", y='StdDev', data=overlay_table_concat, hue='sample', size=7)
g.legend_.remove()
g.set(ylabel='')
# Horizontal line for the threshold sds_threshold
plt.axhline(y=sds_threshold, linewidth=1, linestyle=':', color="black")
#g.set_xlabels(positions, rotation=30)
#g.set(xticks=[],yticks=[0,50000,100000,150000, 200000, 250000, 300000])
g.set_xticklabels(positions, rotation=60, fontdict={'fontsize':10})
axes = plt.gca()
axes.set_ylim([0,0.5])
Out[51]:
In [53]:
# NOTE(review): the execution count jumps from 51 to 53, so a cell was run
# out of order or deleted — re-run the notebook top to bottom before sharing.
sns.set_palette(colors_18)
# Restore the curated positions list that the In [49] cell rebound above.
positions = [1670, 2340, 5662, 7172, 10006, 2193, 1785]
plotSecondVariant(tables_18, table_names_18)
#plotCoverage(tables_18, table_names_18)
#plotCoverage(tables_18, table_names_18)
In [ ]: