Analysis of VCF file containing variants along rAAV genome from datasets generated by SSV-Seq

Create classes to handle Sample during VCF parsing


In [1]:
class Sample:

    def __init__ (self, id, min_var_freq, ref_seq):
        self.id = id
        self.min_var_freq = min_var_freq
        self.position_list = []
        
        # iterate other bases of the reference sequence
        for base in ref_seq:
            
            # Create a dict for all 4 DNA base where the reference base is identified        
            self.position_list.append( {
                "A": {"ref_base": True, "freq" : 1} if base == "A" else {"ref_base": False, "freq" : 0},
                "C": {"ref_base": True, "freq" : 1} if base == "C" else {"ref_base": False, "freq" : 0},
                "G": {"ref_base": True, "freq" : 1} if base == "G" else {"ref_base": False, "freq" : 0},
                "T": {"ref_base": True, "freq" : 1} if base == "T" else {"ref_base": False, "freq" : 0}})
            
    def add_variant (self, pos, base, freq):
        if freq > self.min_var_freq:   
            self.position_list[pos-1][base]["freq"] = freq

Test Sample and Position Classes


In [2]:
a = Sample("test", 0.01, "ATATCGATC")
a.add_variant(3, 'T', 0.1)
a.add_variant(3, 'C', 0.1)
a.add_variant(3, 'A', 0.8)
a.__dict__


Out[2]:
{'id': 'test',
 'min_var_freq': 0.01,
 'position_list': [{'A': {'freq': 1, 'ref_base': True},
   'C': {'freq': 0, 'ref_base': False},
   'G': {'freq': 0, 'ref_base': False},
   'T': {'freq': 0, 'ref_base': False}},
  {'A': {'freq': 0, 'ref_base': False},
   'C': {'freq': 0, 'ref_base': False},
   'G': {'freq': 0, 'ref_base': False},
   'T': {'freq': 1, 'ref_base': True}},
  {'A': {'freq': 0.8, 'ref_base': True},
   'C': {'freq': 0.1, 'ref_base': False},
   'G': {'freq': 0, 'ref_base': False},
   'T': {'freq': 0.1, 'ref_base': False}},
  {'A': {'freq': 0, 'ref_base': False},
   'C': {'freq': 0, 'ref_base': False},
   'G': {'freq': 0, 'ref_base': False},
   'T': {'freq': 1, 'ref_base': True}},
  {'A': {'freq': 0, 'ref_base': False},
   'C': {'freq': 1, 'ref_base': True},
   'G': {'freq': 0, 'ref_base': False},
   'T': {'freq': 0, 'ref_base': False}},
  {'A': {'freq': 0, 'ref_base': False},
   'C': {'freq': 0, 'ref_base': False},
   'G': {'freq': 1, 'ref_base': True},
   'T': {'freq': 0, 'ref_base': False}},
  {'A': {'freq': 1, 'ref_base': True},
   'C': {'freq': 0, 'ref_base': False},
   'G': {'freq': 0, 'ref_base': False},
   'T': {'freq': 0, 'ref_base': False}},
  {'A': {'freq': 0, 'ref_base': False},
   'C': {'freq': 0, 'ref_base': False},
   'G': {'freq': 0, 'ref_base': False},
   'T': {'freq': 1, 'ref_base': True}},
  {'A': {'freq': 0, 'ref_base': False},
   'C': {'freq': 1, 'ref_base': True},
   'G': {'freq': 0, 'ref_base': False},
   'T': {'freq': 0, 'ref_base': False}}]}

Parse the reference sequence


In [3]:
from Bio import SeqIO
ref_seq = str(SeqIO.read(open("./Cassette-AAV-CMV-GFP-hTK.fa"), "fasta").seq)
ref_seq


Out[3]:
'CTGCGCGCTCGCTCGCTCACTGAGGCCGCCCGGGCAAAGCCCGGGCGTCGGGCGACCTTTGGTCGCCCGGCCTCAGTGAGCGAGCGAGCGCGCAGAGAGGGAGTGGCCAACTCCATCACTAGGGGTTCCTTGTAGTTAATGATTAACCCGCCATGCTACTTATCTACGTAGCCATGCTCTAGGAAGATCTCGACGCGTTGACATTGATTATTGACTAGTTATTAATAGTAATCAATTACGGGGTCATTAGTTCATAGCCCATATATGGAGTTCCGCGTTACATAACTTACGGTAAATGGCCCGCCTGGCTGACCGCCCAACGACCCCCGCCCATTGACGTCAATAATGACGTATGTTCCCATAGTAACGCCAATAGGGACTTTCCATTGACGTCAATGGGTGGAGTATTTACGGTAAACTGCCCACTTGGCAGTACATCAAGTGTATCATATGCCAAGTACGCCCCCTATTGACGTCAATGACGGTAAATGGCCCGCCTGGCATTATGCCCAGTACATGACCTTATGGGACTTTCCTACTTGGCAGTACATCTACGTATTAGTCATCGCTATTACCATGGTGATGCGGTTTTGGCAGTACATCAATGGGCGTGGATAGCGGTTTGACTCACGGGGATTTCCAAGTCTCCACCCCATTGACGTCAATGGGAGTTTGTTTTGGCACCAAAATCAACGGGACTTTCCAAAATGTCGTAACAACTCCGCCCCATTGACGCAAATGGGCGGTAGGCGTGTACGGTGGGAGGTCTATATAAGCAGAGCTCTCTGGCTAACTAGAGAACCCACTGCTTACTGGCTTATCGAAATTAATACGACTCACTATAGGGAGACCCAAGCTGGCTAGCGACCATGGTGAGCAAGGGCGAGGAGCTGTTCACCGGGGTGGTGCCCATCCTGGTCGAGCTGGACGGCGACGTAAACGGCCACAAGTTCAGCGTGTCCGGCGAGGGCGAGGGCGATGCCACCTACGGCAAGCTGACCCTGAAGTTCATCTGCACCACCGGCAAGCTGCCCGTGCCCTGGCCCACCCTCGTGACCACCCTGACCTACGGCGTGCAGTGCTTCAGCCGCTACCCCGACCACATGAAGCAGCACGACTTCTTCAAGTCCGCCATGCCCGAAGGCTACGTCCAGGAGCGCACCATCTTCTTCAAGGACGACGGCAACTACAAGACCCGCGCCGAGGTGAAGTTCGAGGGCGACACCCTGGTGAACCGCATCGAGCTGAAGGGCATCGACTTCAAGGAGGACGGCAACATCCTGGGGCACAAGCTGGAGTACAACTACAACAGCCACAACGTCTATATCATGGCCGACAAGCAGAAGAACGGCATCAAGGTGAACTTCAAGATCCGCCACAACATCGAGGACGGCAGCGTGCAGCTCGCCGACCACTACCAGCAGAACACCCCCATCGGCGACGGCCCCGTGCTGCTGCCCGACAACCACTACCTGAGCACCCAGTCCGCCCTGAGCAAAGACCCCAACGAGAAGCGCGATCACATGGTCCTGCTGGAGTTCGTGACCGCCGCCGGGATCACTCTCGGCATGGACGAGCTGTACAAGTAAGTCGACCCCGCGAAATTAATACGACTCACTATAGGGAGACCACAACGGTTTCCCTCTAGCGGGATCAATTCCGCCCCCCCCCTAACGTTACTGGCCGAAGCCGCTTGGAATAAGGCCGGTGTGCGTTTGTCTATATGTTATTTTCCACCATATTGCCGTCTTTTGGCAATGTGAGGGCCCGGAAACCTGGCCCTGTCTTCTTGACGAGCATTCCTAGGGGTCTTTCCCCTCTCGCCAAAGGAATGCAAGGTCTGTTGAATGTCGTGAAGGAAGCAGTTCCTCTGGAAGCTTCTTGAAGACAAACAACGTCTGTAGCGACCCTTTGCAGGCAGCGGAACCCCCCACCTGGCGACAGGTGCCTCTGCGGCCAAAAGCCACGTGTATAAGATACACCTGCAAAGGCGGCACAACCCCAGTGCCACGTTGTGAGTTGGATAGTTGTGGAAAGAGTCAAATGGCTCTCCTCAAGCGTATTCAACAAGGGGCTGAAGGATGCCCAGAAGGTACCCCATTGTATGGGATCTGATCTGGGGCCTCGGTGCACATGCTTTACATGTGTTTAGTCGAGGTTAAAAAAACGTCTAGGCCCCCCGAACCACGGGGACGTGGTTTTCCTTTGAAAAACACGATGATATCATGAAAAAGCCTGAACTCACCGCGACGTCTGTCGAGAAGTTTCTGATCGAAAAGTTCGACAGCGTCTCCGACCTGATGCAGCTCTCGGAGGGCGAAGAATCTCGTGCTTTCAGCTTCGATGTAGGAGGGCGTGGATATGTCCTGCGGGTAAATAGCTGCGCCGATGGTTTCTACAAAGATCGTTATGTTTATCGGCACTTTGCATCGGCCGCGCTCCCGATTCCGGAAGTGCTTGACATTGGGGAATTCAGCGAGAGCCTGACCTATTGCATCTCCCGCCGTGCACGGGGTGTCACGTTGCAAGACCTGCCTGAAACCGAACTGCCCGCTGTTCTGCAGCCGGTCGCGGAGGCCATGGATGCGATCGCTGCGGCCGATCTTAGCCAGACGAGCGGGTTCGGCCCATTCGGACCGCAAGGAATCGGTCAATACACTACATGGCGTGATTTCATATGCGCGATTGCTGATCCCCATGTGTATCACTGGCAAACTGTGATGGACGACACCGTCAGTGCGTCCGTCGCGCAGGCTCTCGATGAGCTGATGCTTTGGGCCGAGGACTGCCCCGAAGTCCGGCACCTCGTGCACGCGGATTTCGGCTCCAACAATGTCCTGACGGACAATGGCCGCATAACAGCGGTCATTGACTGGAGCGAGGCGATGTTCGGGGATTCCCAATACGAGGTCGCCAACATCTTCTTCTGGAGGCCGTGGTTGGCTTGTATGGAGCAGCAGACGCGCTACTTCGAGCGGAGGCATCCGGAGCTTGCAGGATCGCCGCGGCTCCGGGCGTATATGCTCCGCATTGGTCTTGACCAACTCTATCAGAGCTTGGTTGACGGCAATTTCGATGATGCAGCTTGGGCGCAGGGTCGATGCGACGCAATCGTCCGATCCGGAGCCGGGACTGTCGGGCGTACACAAATCGCCCGCAGAAGCGCGGCCGTCTGGACCGATGGCTGTGTAGAAGTCGCGTCTGCGTTCGACCAGGCTGCGCGTTCTCGCGGCCATAGCAACCGACGTACGGCGTTGCGCCCTCGCCGGCAGCAAGAAGCCACGGAAGTCCGCCCGGAGCAGAAAATGCCCACGCTACTGCGGGTTTATATAGACGGTCCCCACGGGATGGGGAAAACCACCACCACGCAACTGCTGGTGGCCCTGGGTTCGCGCGACGATATCGTCTACGTACCCGAGCCGATGACTTACTGGCGGGTGCTGGGGGCTTCCGAGACAATCGCGAACATCTACACCACACAACACCGCCTCGACCAGGGTGAGATATCGGCCGGGGACGCGGCGGTGGTAATGACAAGCGCCCAGATAACAATGGGCATGCCTTATGCCGTGACCGACGCCGTTCTGGCTCCTCATATCGGGGGGGAGGCTGGGAGCTCACATGCCCCGCCCCCGGCCCTCACCCTCATCTTCGACCGCCATCCCATCGCCGCCCTCCTGTGCTACCCGGCCGCGCGGTACCTTATGGGCAGCATGACCCCCCAGGCCGTGCTGGCGTTCGTGGCCCTCATCCCGCCGACCTTGCCCGGCACCAACATCGTGCTTGGGGCCCTTCCGGAGGACAGACACATCGACCGCCTGGCCAAACGCCAGCGCCCCGGCGAGCGGCTGGACCTGGCTATGCTGGCTGCGATTCGCCGCGTTTACGGGCTACTTGCCAATACGGTGCGGTATCTGCAGTGCGGCGGGTCGTGGCGGGAGGACTGGGGACAGCTTTCGGGGACGGCCGTGCCGCCCCAGGGTGCCGAGCCCCAGAGCAACGCGGGCCCACGACCCCATATCGGGGACACGTTATTTACCCTGTTTCGGGCCCCCGAGTTGCTGGCCCCCAACGGCGACCTGTATAACGTGTTTGCCTGGGCCTTGGACGTCTTGGCCAAACGCCTCCGTTCCATGCACGTCTTTATCCTGGATTACGACCAATCGCCCGCCGGCTGCCGGGACGCCCTGCTGCAACTTACCTCCGGGATGGTCCAGACCCACGTCACCACCCCCGGCTCCATACCGACGATATGCGACCTGGCGCGCACGTTTGCCCGGGAGATGGGGGAGGCTAACTGAAAATCGATGGATCCACTAGTTCTAGAGGGCCCTATTCTATAGTGTCACCTAAATGCTAGAGCTCGCTGATCAGCCTCGACTGTGCCTTCTAGTTGCCAGCCATCTGTTGTTTGCCCCTCCCCCGTGCCTTCCTTGACCCTGGAAGGTGCCACTCCCACTGTCCTTTCCTAATAAAATGAGGAAATTGCATCGCATTGTCTGAGTAGGTGTCATTCTATTCTGGGGGGTGGGGTGGGGCAGGACAGCAAGGGGGAGGATTGGGAAGACAATAGCAGGCATGCTGGGGATGCGGTGGGCTCTATGGCTTCTGAGGCGGAAAGAACCAGGTAGATAAGTAGCATGGCGGGTTAATCATTAACTACAAGGAACCCCTAGTGATGGAGTTGGCCACTCCCTCTCTGCGCGCTCGCTCGCTCACTGAGGCCGGGCGACCAAAGGTCGCCCGACGCCCGGGCTTTGCCCGGGCGGCCTCAGTGAGCGAGCGAGCGCGCAG'

Define the min freq


In [4]:
min_freq = float(1)/1000
print min_freq


0.001

Define the Sample in VCF file


In [5]:
sample_dict = {
 'C1_AAV': Sample('C1_AAV', min_freq, ref_seq),
 'RUN1_S7_AAV': Sample('RUN1_S7_AAV', min_freq, ref_seq),
 'RUN2_S8_AAV': Sample('RUN2_S8_AAV', min_freq, ref_seq),
 'RUN1_S2_AAV': Sample('RUN1_S2_AAV', min_freq, ref_seq),
 'RUN2_S1_AAV': Sample('RUN2_S1_AAV', min_freq, ref_seq),
 'RUN1_S4_AAV': Sample('RUN1_S4_AAV', min_freq, ref_seq),
 'RUN2_S2_AAV': Sample('RUN2_S2_AAV', min_freq, ref_seq),
 'RUN1_S6_AAV': Sample('RUN1_S6_AAV', min_freq, ref_seq),
 'RUN2_S3_AAV': Sample('RUN2_S3_AAV', min_freq, ref_seq),
 'RUN1_S8_AAV': Sample('RUN1_S8_AAV', min_freq, ref_seq),
 'RUN2_S4_AAV': Sample('RUN2_S4_AAV', min_freq, ref_seq),
 'RUN1_S3_AAV': Sample('RUN1_S3_AAV', min_freq, ref_seq),
 'RUN2_S5_AAV': Sample('RUN2_S5_AAV', min_freq, ref_seq),
 'RUN1_S5_AAV': Sample('RUN1_S5_AAV', min_freq, ref_seq),
 'RUN2_S6_AAV': Sample('RUN2_S6_AAV', min_freq, ref_seq)}
 
# 'RUN2_S7_AAV': "Negative Control 2", # Negative control have not interest for this analysis
# 'RUN1_S1_AAV': "Negative Control 1", # Negative control have not interest for this analysis
# 'RUN2_S9_AAV': Sample('', "Internal Normalizer 3", min_freq, ref_seq),

Open vcf file and parse general metadata


In [6]:
import HTSeq
vcfr = HTSeq.VCF_Reader( "./NoIndel.vcf" ) 
vcfr.parse_meta() 
vcfr.make_info_dict()

Parse VCF variants and add to Sample objects if an alternative base if found


In [7]:
for variant in vcfr:
    # Extract information for samples in sample dict
    for name, sample in sample_dict.items():
        
        # Extract genotypes from the INFO field for the current sample
        GT = [int(i) for i in variant.samples[name]['GT'].split('/')]
        #print variant.samples[name]
        # Extract more informations only if needed
        if len(GT) > 1:
            DPG = [int(i) for i in variant.samples[name]['DPG'].split(',')]
            DP = int(variant.samples[name]['DP'])
            base = [variant.ref]+variant.alt
            
            # Modifify sample for each alternative genotype
            for i in range (len (GT)):
                sample.add_variant(
                    pos = variant.pos.pos,
                    base = base[GT[i]],
                    freq = float(DPG[i])/DP)

In [8]:
sample_group = {
    "Internal Normalizer" : ['RUN1_S7_AAV', 'RUN2_S8_AAV'],}

In [9]:
sample_group = {
    "IEX" : ['RUN1_S3_AAV', 'RUN2_S5_AAV', 'RUN1_S5_AAV', 'RUN2_S6_AAV']}

In [13]:
sample_group = {
    "Internal Normalizer" : ['RUN1_S7_AAV', 'RUN2_S8_AAV'],
    "CsCl-" : ['RUN1_S2_AAV', 'RUN2_S1_AAV'],
    "CsCl+" : ['RUN1_S4_AAV', 'RUN2_S2_AAV'],
    "AVB-" : ['RUN1_S6_AAV', 'RUN2_S3_AAV'],
    "AVB+" : ['RUN1_S8_AAV', 'RUN2_S4_AAV'],
    "IEX-" : ['RUN1_S3_AAV', 'RUN2_S5_AAV'],
    "IEX+" : ['RUN1_S5_AAV', 'RUN2_S6_AAV'],
    "CsCl" : ['RUN1_S2_AAV', 'RUN2_S1_AAV', 'RUN1_S4_AAV', 'RUN2_S2_AAV'],
    "AVB" : ['RUN1_S6_AAV', 'RUN2_S3_AAV', 'RUN1_S8_AAV', 'RUN2_S4_AAV'],
    "IEX" : ['RUN1_S3_AAV', 'RUN2_S5_AAV', 'RUN1_S5_AAV', 'RUN2_S6_AAV'],
    "ALL_AAV" : ['RUN1_S2_AAV', 'RUN2_S1_AAV', 'RUN1_S4_AAV', 'RUN2_S2_AAV','RUN1_S6_AAV', 'RUN2_S3_AAV', 'RUN1_S8_AAV', 'RUN2_S4_AAV', 'RUN1_S3_AAV', 'RUN2_S5_AAV', 'RUN1_S5_AAV', 'RUN2_S6_AAV']}

In [14]:
for name, group in sample_group.items():
    sample = Sample('name', min_freq, ref_seq)
    
    # For each position along the length of reference the sequence
    for i in range (len(ref_seq)):
        for base in ["A", "C", "G", "T"]:
                        
            # Not interested by the frequency of the reference base
            if sample.position_list[i][base]['ref_base']:
                sample.position_list[i][base]["freq"] = 0.0
                
            # Extract the list of frequency for this base in all samples
            else: 
                var_freq_list = [sample_dict[var_name].position_list[i][base]["freq"] for var_name in group]
                #print (var_freq_list)
                
                # At least half of the samples in the list must have the variant with a frequency > 0 else it is considered null
                if len([j for j in var_freq_list if j != 0]) >= len(var_freq_list)/2:
                    sample.position_list[i][base]["freq"] = sum(var_freq_list)/len(var_freq_list)
                    #print ("TRUE")
                else:
                    sample.position_list[i][base]["freq"] = 0.0
                
    # Write 1 report per sample
    with open (name+"_report.csv", "wb") as report:
        report.write("Pos\tAlt_A\tAlt_C\tAlt_G\tAlt_T\tsum\n")
        for n, p in enumerate(sample.position_list):
            report.write ("{}\t{}\t{}\t{}\t{}\t{}\n".format(
                n, p["A"]['freq'], p["C"]['freq'], p["G"]['freq'], p["T"]['freq'],
                p["A"]['freq'] + p["C"]['freq'] + p["G"]['freq'] + p["T"]['freq']))