Here we analyse the libraries with respect to the clone sequence in which mutations will be introduced. We print positions that differ from the clone and rise in frequency between the first and the last time points.
In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from pylab import rcParams
import seaborn as sns
from array import array
import numpy as np
from scipy.stats import ttest_ind
from scipy.stats import linregress
%matplotlib inline
In [2]:
begins=[]
ends=[]
names =[]
with open ("sequence.gb") as f:
in_pep = False
for l in f:
if "mat_peptide" in l:
begins.append(int(l.split()[1].split("..")[0]))
ends.append(int(l.split()[1].split("..")[1]))
in_pep = True
elif in_pep :
names.append(l.split("=")[1])
in_pep = False
print(begins)
print(ends)
print(names)
In [3]:
file = "cloneSequence/SP6-ZIKV_seq_only.txt"
clone = ""
with open (file) as f:
for l in f:
if ">" in l:
pass
else:
clone +=l.strip()
In [4]:
# Interesting positions
positions=[316,1670,1785,2340,5935,7172,8449,9165]
def plot_positions():
for x in positions:
plt.axvline(x=x, linewidth=1, linestyle=':')
def plot_genes():
for i in range(len(begins)):
plt.plot([begins[i], begins[i]], [0.99,1.0], linewidth=2, linestyle='-', color="black")
if i%2==0:
plt.text (begins[i] + ((ends[i] - begins[i])/10), 1.005, (names[i].replace('"', ''))[0:3], size='xx-small')
else:
plt.text (begins[i] + ((ends[i] - begins[i])/10), 1.015, (names[i].replace('"', ''))[0:3], size='xx-small')
plt.plot([ends[-1], ends[-1]], [0.99,1.0], linewidth=2, linestyle='-', color="black")
In [5]:
def is_increasing(minor_frequencies):
#print(minor_frequencies)
previous = minor_frequencies[0]
for m in range(1,len(minor_frequencies)):
if previous < minor_frequencies[m]:
#print(str(previous) + " < " + str(minor_frequencies[m]))
previous = minor_frequencies[m]
else:
return False
return True
def get_variant_frequency(variant, table, i):
sum_of_bases = table['As_quality_corrected'][i]+table['Cs_quality_corrected'][i]+table['Gs_quality_corrected'][i]+table['Ts_quality_corrected'][i]+table['Ns_quality_corrected'][i]
if variant == "A":
return table["As_quality_corrected"][i] / sum_of_bases
elif variant == "C":
return table["Cs_quality_corrected"][i] / sum_of_bases
elif variant == "G":
return table["Gs_quality_corrected"][i] / sum_of_bases
elif variant == "T":
return table["Ts_quality_corrected"][i] / sum_of_bases
else:
return np.nan
def get_increasing_variants(tables, clone):
num_tables = len(tables)
first = tables[0]
last = tables[num_tables-1]
major = ""
minor = ""
major_frequencies = array('d',[0.0]*num_tables)
minor_frequencies = array('d',[0.0]*num_tables)
increasingVariants = dict()
for i in first["Position"]:
major = clone[i] #first["Major_variant"][i]
#print(last['Major_variant_frequency_quality_corrected'][i])
major_frequencies[0] = get_variant_frequency(major, first, i)
if major == last["Major_variant"][i]:
minor = last["Second_variant"][i]
else:
minor = last["Major_variant"][i]
minor_frequencies[0] = get_variant_frequency(minor, first, i)
for table_id in range(1, num_tables):
major_frequencies[table_id] = get_variant_frequency(major, tables[table_id], i)
minor_frequencies[table_id] = get_variant_frequency(minor, tables[table_id], i)
if is_increasing(minor_frequencies):
increasingVariants[i] = [major, minor, major_frequencies.tolist(), minor_frequencies.tolist()]
return increasingVariants
def print_variants(dict_variants):
print("Position\tclone base\tincreasing variant\tFinal frequency")
for k in dict_variants.keys():
print(str(k)+"\t"+dict_variants[k][0]+"\t"+dict_variants[k][1]+"\t"+str(dict_variants[k][3][-1]))
In [6]:
# CirSeq initial sample
cirseq = pd.read_csv ("HV5GLBCXY_ZIKV_17s006139-1-1_DREUX_lane1CirseqD3_1_sequence.txt.assembled.fastq_mapped_AA.csv", na_values=" -nan")
In [7]:
# Control runs, replicate A
DD3_A = pd.read_csv ("HV5GLBCXY_ZIKV_17s006139-1-1_DREUX_lane1DD3A_1_sequence.txt.assembled.fastq_mapped_AA.csv", na_values=" -nan")
DD6_A = pd.read_csv ("HV5GLBCXY_ZIKV_17s006139-1-1_DREUX_lane1DD6A_1_sequence.txt.assembled.fastq_mapped_AA.csv", na_values=" -nan")
DD9_A = pd.read_csv ("HV5GLBCXY_ZIKV_17s006139-1-1_DREUX_lane1DD9A_1_sequence.txt.assembled.fastq_mapped_AA.csv", na_values=" -nan")
DD12_A = pd.read_csv ("HV5GLBCXY_ZIKV_17s006139-1-1_DREUX_lane1DD12A_1_sequence.txt.assembled.fastq_mapped_AA.csv", na_values=" -nan")
DD24_A = pd.read_csv ("HV5GLBCXY_ZIKV_17s006139-1-1_DREUX_lane1DD24A_1_sequence.txt.assembled.fastq_mapped_AA.csv", na_values=" -nan")
DD51_A = pd.read_csv ("HV5GLBCXY_ZIKV_17s006139-1-1_DREUX_lane1DD51A_1_sequence.txt.assembled.fastq_mapped_AA.csv", na_values=" -nan")
DD51_A_no_reamp = pd.read_csv ("HV5GLBCXY_ZIKV_17s006139-1-1_DREUX_lane1DD51Anoreamplification_1_sequence.txt.assembled.fastq_mapped_AA.csv", na_values=" -nan")
In [8]:
# Control runs, replicate D
DD3_D = pd.read_csv ("HV5GLBCXY_ZIKV_17s006139-1-1_DREUX_lane1DD3D_1_sequence.txt.assembled.fastq_mapped_AA.csv")
DD6_D = pd.read_csv ("HV5GLBCXY_ZIKV_17s006139-1-1_DREUX_lane1DD6D_1_sequence.txt.assembled.fastq_mapped_AA.csv")
DD9_D = pd.read_csv ("HV5GLBCXY_ZIKV_17s006139-1-1_DREUX_lane1DD9D_1_sequence.txt.assembled.fastq_mapped_AA.csv")
DD12_D = pd.read_csv ("HV5GLBCXY_ZIKV_17s006139-1-1_DREUX_lane1DD12D_1_sequence.txt.assembled.fastq_mapped_AA.csv")
DD24_D = pd.read_csv ("HV5GLBCXY_ZIKV_17s006139-1-1_DREUX_lane1DD24D_1_sequence.txt.assembled.fastq_mapped_AA.csv")
In [9]:
# Control runs, replicate E
DD6_E = pd.read_csv ("HV5GLBCXY_ZIKV_17s006139-1-1_DREUX_lane1DD6E_1_sequence.txt.assembled.fastq_mapped_AA.csv")
DD9_E = pd.read_csv ("HV5GLBCXY_ZIKV_17s006139-1-1_DREUX_lane1DD9E_1_sequence.txt.assembled.fastq_mapped_AA.csv")
In [10]:
# TLR3 activation runs, replicate A
TD9_A = pd.read_csv ("HV5GLBCXY_ZIKV_17s006139-1-1_DREUX_lane1TD9A_1_sequence.txt.assembled.fastq_mapped_AA.csv")
TD12_A = pd.read_csv ("HV5GLBCXY_ZIKV_17s006139-1-1_DREUX_lane1TD12A_1_sequence.txt.assembled.fastq_mapped_AA.csv")
TD24_A = pd.read_csv ("HV5GLBCXY_ZIKV_17s006139-1-1_DREUX_lane1TD24A_1_sequence.txt.assembled.fastq_mapped_AA.csv")
TD51_A = pd.read_csv ("HV5GLBCXY_ZIKV_17s006139-1-1_DREUX_lane1TD51A_1_sequence.txt.assembled.fastq_mapped_AA.csv")
In [11]:
#DD3_A.describe(include='all')
In [12]:
tables_A = [DD3_A, DD6_A, DD9_A, DD12_A, DD24_A, DD51_A]
increasing_A = get_increasing_variants(tables_A, clone)
print("There are "+str(len(increasing_A))+" positions that rise in frequency.")
print("Those are:")
print_variants(increasing_A)
In [13]:
tables_D = [DD3_D, DD6_D, DD9_D, DD12_D, DD24_D]
increasing_D = get_increasing_variants(tables_D, clone)
print("There are "+str(len(increasing_D))+" positions that rise in frequency.")
print("Those are:")
print_variants(increasing_D)
In [14]:
tables_E = [DD6_E, DD9_E]
increasing_E = get_increasing_variants(tables_E, clone)
print("There are "+str(len(increasing_E))+" positions that rise in frequency.")
print("There are too many of them, we choose not to print them.")
In [15]:
tables_TA = [TD9_A, TD12_A, TD24_A, TD51_A]
increasing_TA = get_increasing_variants(tables_TA, clone)
print("There are "+str(len(increasing_TA))+" positions that rise in frequency.")
print("Those are:")
print_variants(increasing_TA)
In [ ]: