Create classes to handle Sample during VCF parsing
In [1]:
class Sample:
def __init__ (self, id, min_var_freq, ref_seq):
self.id = id
self.min_var_freq = min_var_freq
self.position_list = []
# iterate other bases of the reference sequence
for base in ref_seq:
# Create a dict for all 4 DNA base where the reference base is identified
self.position_list.append( {
"A": {"ref_base": True, "freq" : 1} if base == "A" else {"ref_base": False, "freq" : 0},
"C": {"ref_base": True, "freq" : 1} if base == "C" else {"ref_base": False, "freq" : 0},
"G": {"ref_base": True, "freq" : 1} if base == "G" else {"ref_base": False, "freq" : 0},
"T": {"ref_base": True, "freq" : 1} if base == "T" else {"ref_base": False, "freq" : 0}})
def add_variant (self, pos, base, freq):
if freq > self.min_var_freq:
self.position_list[pos-1][base]["freq"] = freq
Test Sample and Position Classes
In [2]:
a = Sample("test", 0.01, "ATATCGATC")
a.add_variant(3, 'T', 0.1)
a.add_variant(3, 'C', 0.1)
a.add_variant(3, 'A', 0.8)
a.__dict__
Out[2]:
Parse the reference sequence
In [3]:
from Bio import SeqIO
ref_seq = str(SeqIO.read(open("./Cassette-AAV-CMV-GFP-hTK.fa"), "fasta").seq)
ref_seq
Out[3]:
Define the min freq
In [4]:
min_freq = float(1)/1000
print min_freq
Define the Sample in VCF file
In [5]:
sample_dict = {
'C1_AAV': Sample('C1_AAV', min_freq, ref_seq),
'RUN1_S7_AAV': Sample('RUN1_S7_AAV', min_freq, ref_seq),
'RUN2_S8_AAV': Sample('RUN2_S8_AAV', min_freq, ref_seq),
'RUN1_S2_AAV': Sample('RUN1_S2_AAV', min_freq, ref_seq),
'RUN2_S1_AAV': Sample('RUN2_S1_AAV', min_freq, ref_seq),
'RUN1_S4_AAV': Sample('RUN1_S4_AAV', min_freq, ref_seq),
'RUN2_S2_AAV': Sample('RUN2_S2_AAV', min_freq, ref_seq),
'RUN1_S6_AAV': Sample('RUN1_S6_AAV', min_freq, ref_seq),
'RUN2_S3_AAV': Sample('RUN2_S3_AAV', min_freq, ref_seq),
'RUN1_S8_AAV': Sample('RUN1_S8_AAV', min_freq, ref_seq),
'RUN2_S4_AAV': Sample('RUN2_S4_AAV', min_freq, ref_seq),
'RUN1_S3_AAV': Sample('RUN1_S3_AAV', min_freq, ref_seq),
'RUN2_S5_AAV': Sample('RUN2_S5_AAV', min_freq, ref_seq),
'RUN1_S5_AAV': Sample('RUN1_S5_AAV', min_freq, ref_seq),
'RUN2_S6_AAV': Sample('RUN2_S6_AAV', min_freq, ref_seq)}
# 'RUN2_S7_AAV': "Negative Control 2", # Negative control have not interest for this analysis
# 'RUN1_S1_AAV': "Negative Control 1", # Negative control have not interest for this analysis
# 'RUN2_S9_AAV': Sample('', "Internal Normalizer 3", min_freq, ref_seq),
Open vcf file and parse general metadata
In [6]:
import HTSeq
vcfr = HTSeq.VCF_Reader( "./NoIndel.vcf" )
vcfr.parse_meta()
vcfr.make_info_dict()
Parse VCF variants and add to Sample objects if an alternative base if found
In [7]:
for variant in vcfr:
# Extract information for samples in sample dict
for name, sample in sample_dict.items():
# Extract genotypes from the INFO field for the current sample
GT = [int(i) for i in variant.samples[name]['GT'].split('/')]
#print variant.samples[name]
# Extract more informations only if needed
if len(GT) > 1:
DPG = [int(i) for i in variant.samples[name]['DPG'].split(',')]
DP = int(variant.samples[name]['DP'])
base = [variant.ref]+variant.alt
# Modifify sample for each alternative genotype
for i in range (len (GT)):
sample.add_variant(
pos = variant.pos.pos,
base = base[GT[i]],
freq = float(DPG[i])/DP)
In [8]:
sample_group = {
"Internal Normalizer" : ['RUN1_S7_AAV', 'RUN2_S8_AAV'],}
In [9]:
sample_group = {
"IEX" : ['RUN1_S3_AAV', 'RUN2_S5_AAV', 'RUN1_S5_AAV', 'RUN2_S6_AAV']}
In [13]:
sample_group = {
"Internal Normalizer" : ['RUN1_S7_AAV', 'RUN2_S8_AAV'],
"CsCl-" : ['RUN1_S2_AAV', 'RUN2_S1_AAV'],
"CsCl+" : ['RUN1_S4_AAV', 'RUN2_S2_AAV'],
"AVB-" : ['RUN1_S6_AAV', 'RUN2_S3_AAV'],
"AVB+" : ['RUN1_S8_AAV', 'RUN2_S4_AAV'],
"IEX-" : ['RUN1_S3_AAV', 'RUN2_S5_AAV'],
"IEX+" : ['RUN1_S5_AAV', 'RUN2_S6_AAV'],
"CsCl" : ['RUN1_S2_AAV', 'RUN2_S1_AAV', 'RUN1_S4_AAV', 'RUN2_S2_AAV'],
"AVB" : ['RUN1_S6_AAV', 'RUN2_S3_AAV', 'RUN1_S8_AAV', 'RUN2_S4_AAV'],
"IEX" : ['RUN1_S3_AAV', 'RUN2_S5_AAV', 'RUN1_S5_AAV', 'RUN2_S6_AAV'],
"ALL_AAV" : ['RUN1_S2_AAV', 'RUN2_S1_AAV', 'RUN1_S4_AAV', 'RUN2_S2_AAV','RUN1_S6_AAV', 'RUN2_S3_AAV', 'RUN1_S8_AAV', 'RUN2_S4_AAV', 'RUN1_S3_AAV', 'RUN2_S5_AAV', 'RUN1_S5_AAV', 'RUN2_S6_AAV']}
In [14]:
for name, group in sample_group.items():
sample = Sample('name', min_freq, ref_seq)
# For each position along the length of reference the sequence
for i in range (len(ref_seq)):
for base in ["A", "C", "G", "T"]:
# Not interested by the frequency of the reference base
if sample.position_list[i][base]['ref_base']:
sample.position_list[i][base]["freq"] = 0.0
# Extract the list of frequency for this base in all samples
else:
var_freq_list = [sample_dict[var_name].position_list[i][base]["freq"] for var_name in group]
#print (var_freq_list)
# At least half of the samples in the list must have the variant with a frequency > 0 else it is considered null
if len([j for j in var_freq_list if j != 0]) >= len(var_freq_list)/2:
sample.position_list[i][base]["freq"] = sum(var_freq_list)/len(var_freq_list)
#print ("TRUE")
else:
sample.position_list[i][base]["freq"] = 0.0
# Write 1 report per sample
with open (name+"_report.csv", "wb") as report:
report.write("Pos\tAlt_A\tAlt_C\tAlt_G\tAlt_T\tsum\n")
for n, p in enumerate(sample.position_list):
report.write ("{}\t{}\t{}\t{}\t{}\t{}\n".format(
n, p["A"]['freq'], p["C"]['freq'], p["G"]['freq'], p["T"]['freq'],
p["A"]['freq'] + p["C"]['freq'] + p["G"]['freq'] + p["T"]['freq']))