In [1]:
import vcf

In [2]:
# Constants: index of each strain in the per-file lists used below
# (parent 1 = Dm28, parent 2 = Esmeraldo, offspring = X10825)
PARENT_1 = 0
PARENT_2 = 1
OFFSPRING = 2

In [3]:
#Input files
vcf_filenames = ("Dm28-NESM.GATK-HC.SNP-FinalSet.EFF.vcf", 
                "Esmeraldo-NESM.GATK-HC.SNP-FinalSet.EFF.vcf",
                "X10825-NESM.GATK-HC.SNP-FinalSet.EFF.vcf"
                )
vcf_files = [ vcf.Reader(open(filename)) for filename in vcf_filenames ]

In [4]:
#Out files
out_filenames = ("p1_specific.out",
                 "p2_specific.out",
                 "offsp_specific.out"
                 )

out_files = [ open(filename,"w") for filename in out_filenames ]

In [5]:
#Print header
column_headers = ["CHROM","POS","Dm28_ALT","Esmeraldo_ALT","X10825_ALT","X10825_GT","GENE_NAME"]
for f in out_files:
    f.write("\t".join(column_headers)+"\n")

#Format one output line in the same column order as the header; "-" marks files with no record at this position
def formatOutputLine(records):
    current_chr,current_pos = extract_coord(records)
    fields = (current_chr,
              current_pos,
              records[PARENT_1].ALT[0] if records[PARENT_1] else "-", 
              records[PARENT_2].ALT[0] if records[PARENT_2] else "-",
              records[OFFSPRING].ALT[0] if records[OFFSPRING] else "-",
              records[OFFSPRING].samples[0]["GT"] if records[OFFSPRING] else "-",
              extractProteinFromEffect(records) 
              )
    return "\t".join([str(x) for x in fields])+"\n"

In [6]:
def get_next_record(vcf_reader):
    # Return the next record from a PyVCF reader, or False once the file is exhausted
    record = False
    try:
        record = vcf_reader.next()
    except StopIteration:
        pass
    return record
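
As a quick illustration (a plain Python 2 iterator standing in for a vcf.Reader), an exhausted stream yields the False sentinel:

print get_next_record(iter([]))
# expected output: False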

In [7]:
def extractProteinFromEffect(records):
    # Return field 8 of the first SnpEff EFF annotation found among the records
    # (used as the last output column); fall back to "non-coding"
    protein = "non-coding"
    for record in records:
        if record and "EFF" in record.INFO:
            protein = record.INFO["EFF"][0].split("|")[8]
            break
    return protein if protein else "non-coding"

def extract_coord(records):
    # All selected records share the same coordinate, so take (CHROM, POS)
    # from any file that has a record at this position
    chrom = None
    pos = None
    for r in records:
        if r:
            chrom = r.CHROM
            pos = r.POS
    return (chrom,pos)

In [8]:
#Ordering of chromosomes in the files is:
# 11, ... ,19,1,20,...,29,2,30,...,90,...,9,10
#Solution:
# num > 10 : e.g. 30 -> 300, 39 -> 390, 40 -> 400
# num < 10 : e.g. 3 -> 3*100 + 95 = 395
# num == 10 : 999999 (force to be last)
def chrom_order_value(chrom):
    chrom = int(chrom)  # PyVCF exposes CHROM as a string, e.g. '10'
    if chrom > 10:
        cmp_value = chrom * 10
    elif chrom < 10:
        cmp_value = (chrom * 100) + 95
    else:  # chrom == 10 is forced to sort last
        cmp_value = 999999
    return cmp_value

def cmp_records(record1, record2):
    # Compare two records: same chromosome -> by position; otherwise by the
    # chromosome order of the input files (see chrom_order_value above)
    cmp_result = 0
    
    if record1.CHROM == record2.CHROM:
        if record1.POS < record2.POS:
            cmp_result = -1
        elif record1.POS > record2.POS:
            cmp_result = 1
    elif chrom_order_value(record1.CHROM) < chrom_order_value(record2.CHROM):
        cmp_result = -1
    else:
        cmp_result = 1
        
    return cmp_result
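
As a sanity check of the ordering logic (hypothetical labels, not taken from the VCFs), sorting a few chromosome strings with chrom_order_value as the key should reproduce the file ordering described in the comment above:

labels = ["1", "2", "9", "10", "11", "19", "20", "29", "30"]
print sorted(labels, key=chrom_order_value)
# expected output: ['11', '19', '1', '20', '29', '2', '30', '9', '10']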

In [9]:
def get_records_to_process(records):
    # Of the current records (one per file), keep only those at the lowest
    # genomic coordinate; the remaining slots are returned as False
    min_indices = None
    min_record = None
    
    for idx,current_record in enumerate(records):
        if min_indices is None and current_record:
            min_indices = [idx]
            min_record = current_record
        
        elif current_record:
            record_comparison = cmp_records(current_record, min_record)
            #If records belong to the same position
            if record_comparison == 0:
                min_indices.append(idx)
            #If this record has a lower chromosomal position, process it instead
            elif record_comparison == -1:
                min_indices = [idx]
                min_record = current_record
    
    return [(records[x] if x in min_indices else False) for x in (PARENT_1,PARENT_2,OFFSPRING)]
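
A minimal sketch of this selection step, using stand-in namedtuples (only CHROM and POS are needed by cmp_records) rather than real PyVCF records: only the file sitting at the lowest coordinate is kept, the other slots come back as False.

from collections import namedtuple
MockRecord = namedtuple("MockRecord", ["CHROM", "POS"])
recs = [MockRecord("11", 500), MockRecord("11", 300), False]
print get_records_to_process(recs)
# expected output: [False, MockRecord(CHROM='11', POS=300), False]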

In [10]:
def is_homozygous(record):
    # A missing record counts as homozygous (reference); otherwise require a homozygous call
    return not record or record.samples[0].data.GT in ("0/0","0|0","1/1","1|1")

def is_parent1_specific(records):
    # Parent 1 and the offspring carry the same allele while parent 2 differs:
    # either only parent 2 has a variant call here, or parent 1 and the offspring
    # share the same ALT allele and parent 2 does not carry it
    test1 = not records[PARENT_1] and not records[OFFSPRING] and records[PARENT_2]
    
    test2 = records[PARENT_1] and records[OFFSPRING]
    test2 = test2 and records[PARENT_1].ALT[0].sequence == records[OFFSPRING].ALT[0].sequence 
    test2 = test2 and (not records[PARENT_2] or records[PARENT_1].ALT[0].sequence != records[PARENT_2].ALT[0].sequence)
    return test1 or test2

def is_parent2_specific(records):
    # Symmetric to is_parent1_specific, with the parents swapped
    test1 = not records[PARENT_2] and not records[OFFSPRING] and records[PARENT_1]
    
    test2 = records[PARENT_2] and records[OFFSPRING]
    test2 = test2 and records[PARENT_2].ALT[0].sequence == records[OFFSPRING].ALT[0].sequence 
    test2 = test2 and (not records[PARENT_1] or records[PARENT_1].ALT[0].sequence != records[PARENT_2].ALT[0].sequence)
    
    return test1 or test2
    
def is_offspring_specific(records):
    # The offspring's allele matches neither parent: either both parents carry a
    # variant while the offspring is reference, or the offspring's ALT allele is
    # absent from (or different in) both parents
    test1 = not records[OFFSPRING] and records[PARENT_1] and records[PARENT_2]
    
    test2 = records[OFFSPRING] and (not records[PARENT_1] or records[PARENT_1].ALT[0].sequence != records[OFFSPRING].ALT[0].sequence ) 
    test2 = test2 and (not records[PARENT_2] or records[PARENT_2].ALT[0].sequence != records[OFFSPRING].ALT[0].sequence )   
    return test1 or test2
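
To illustrate the predicates with stand-in objects (hypothetical namedtuples carrying only the ALT field that these tests actually read): when parent 1 and the offspring share the same ALT allele and parent 2 has no call, the site is parent-1 specific, not offspring specific.

from collections import namedtuple
Alt = namedtuple("Alt", ["sequence"])
Rec = namedtuple("Rec", ["ALT"])
p1  = Rec([Alt("A")])
off = Rec([Alt("A")])
print is_parent1_specific([p1, False, off])    # expected: True
print is_offspring_specific([p1, False, off])  # expected: False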

In [11]:
def process_records(records,out_files):
    #If all records are homozygous
    if all(is_homozygous(r) for r in records):
        if is_parent1_specific(records):
            out_files[PARENT_1].write( formatOutputLine(records) )
        elif is_parent2_specific(records):
            out_files[PARENT_2].write( formatOutputLine(records) )
        elif is_offspring_specific(records):
            out_files[OFFSPRING].write( formatOutputLine(records) )

In [12]:
def update_iteration(loaded_records,processed_records, vcf_files):
    # Advance only the files whose current record was just processed;
    # exhausted files stay False, unprocessed records are kept for the next round
    new_records = []
    for loaded,processed,vcf_file in zip(loaded_records,processed_records,vcf_files):
        if not loaded:
            new_records.append(False)
        elif not processed:
            new_records.append(loaded)
        else:
            new_records.append( get_next_record(vcf_file) )
    return new_records
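
One merge step with placeholder values (strings instead of records, a plain Python 2 iterator instead of a vcf.Reader): the file whose record was just processed advances, an exhausted file stays False, and a file whose record has not been reached yet keeps its pending record.

p1_stream = iter(["next_p1_record"])
loaded    = ["current_p1_record", False, "current_off_record"]
processed = ["current_p1_record", False, False]
print update_iteration(loaded, processed, [p1_stream, None, None])
# expected output: ['next_p1_record', False, 'current_off_record']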

In [13]:
#Initialize the merge with the first record of each VCF file
loaded_records = [get_next_record( vcf_file ) for vcf_file in vcf_files]

In [14]:
def debug_printer(records):
    # Compact (CHROM, POS) view of the current records, for debugging
    rec_str = []
    for rec in records:
        if rec:
            rec_str.append( (rec.CHROM,rec.POS) )
        else:
            rec_str.append(False)
    return str(rec_str)

In [15]:
counter = 0
while any(loaded_records):
    #print debug_printer(loaded_records)
    records_to_process = get_records_to_process(loaded_records)
    #print debug_printer(records_to_process)
    process_records( records_to_process ,out_files)
    loaded_records = update_iteration(loaded_records,records_to_process, vcf_files)
    counter +=1
    if counter % 2000 == 0:
        print "Positions processed: "+str(counter)


Positions processed: 2000
Positions processed: 4000
Positions processed: 6000
Positions processed: 8000
Positions processed: 10000
Positions processed: 12000
Positions processed: 14000
Positions processed: 16000
Positions processed: 18000
Positions processed: 20000
Positions processed: 22000
Positions processed: 24000
Positions processed: 26000
Positions processed: 28000
Positions processed: 30000
Positions processed: 32000
Positions processed: 34000
Positions processed: 36000
Positions processed: 38000
Positions processed: 40000
Positions processed: 42000
Positions processed: 44000
Positions processed: 46000
Positions processed: 48000
Positions processed: 50000
Positions processed: 52000
Positions processed: 54000
Positions processed: 56000
Positions processed: 58000
Positions processed: 60000
Positions processed: 62000
Positions processed: 64000
Positions processed: 66000
Positions processed: 68000
Positions processed: 70000
Positions processed: 72000
Positions processed: 74000
Positions processed: 76000
Positions processed: 78000
Positions processed: 80000
Positions processed: 82000
Positions processed: 84000
Positions processed: 86000
Positions processed: 88000
Positions processed: 90000
Positions processed: 92000
Positions processed: 94000
Positions processed: 96000
Positions processed: 98000
Positions processed: 100000
Positions processed: 102000
Positions processed: 104000
Positions processed: 106000
Positions processed: 108000
Positions processed: 110000
Positions processed: 112000
Positions processed: 114000
Positions processed: 116000
Positions processed: 118000
Positions processed: 120000
Positions processed: 122000
Positions processed: 124000
Positions processed: 126000
Positions processed: 128000
Positions processed: 130000
Positions processed: 132000
Positions processed: 134000
Positions processed: 136000
Positions processed: 138000
Positions processed: 140000
Positions processed: 142000
Positions processed: 144000
Positions processed: 146000
Positions processed: 148000
Positions processed: 150000
Positions processed: 152000
Positions processed: 154000
Positions processed: 156000
Positions processed: 158000
Positions processed: 160000
Positions processed: 162000
Positions processed: 164000
Positions processed: 166000
Positions processed: 168000
Positions processed: 170000
Positions processed: 172000
Positions processed: 174000
Positions processed: 176000
Positions processed: 178000
Positions processed: 180000
Positions processed: 182000
Positions processed: 184000
Positions processed: 186000
Positions processed: 188000
Positions processed: 190000
Positions processed: 192000
Positions processed: 194000
Positions processed: 196000
Positions processed: 198000
Positions processed: 200000
Positions processed: 202000
Positions processed: 204000
Positions processed: 206000
Positions processed: 208000
Positions processed: 210000
Positions processed: 212000
Positions processed: 214000
Positions processed: 216000
Positions processed: 218000
Positions processed: 220000
Positions processed: 222000
Positions processed: 224000
Positions processed: 226000
Positions processed: 228000
Positions processed: 230000
Positions processed: 232000
Positions processed: 234000
Positions processed: 236000
Positions processed: 238000
Positions processed: 240000
Positions processed: 242000
Positions processed: 244000
Positions processed: 246000
Positions processed: 248000
Positions processed: 250000
Positions processed: 252000
Positions processed: 254000
Positions processed: 256000
Positions processed: 258000
Positions processed: 260000
Positions processed: 262000
Positions processed: 264000
Positions processed: 266000
Positions processed: 268000
Positions processed: 270000
Positions processed: 272000
Positions processed: 274000
Positions processed: 276000
Positions processed: 278000
Positions processed: 280000
Positions processed: 282000
Positions processed: 284000
Positions processed: 286000
Positions processed: 288000
Positions processed: 290000
Positions processed: 292000
Positions processed: 294000
Positions processed: 296000
Positions processed: 298000
Positions processed: 300000
Positions processed: 302000
Positions processed: 304000
Positions processed: 306000
Positions processed: 308000
Positions processed: 310000
Positions processed: 312000
Positions processed: 314000
Positions processed: 316000
Positions processed: 318000
Positions processed: 320000
Positions processed: 322000
Positions processed: 324000
Positions processed: 326000
Positions processed: 328000
Positions processed: 330000
Positions processed: 332000
Positions processed: 334000
Positions processed: 336000
Positions processed: 338000
Positions processed: 340000
Positions processed: 342000
Positions processed: 344000
Positions processed: 346000
Positions processed: 348000
Positions processed: 350000
Positions processed: 352000
Positions processed: 354000
Positions processed: 356000
Positions processed: 358000
Positions processed: 360000
Positions processed: 362000
Positions processed: 364000
Positions processed: 366000
Positions processed: 368000
Positions processed: 370000
Positions processed: 372000
Positions processed: 374000
Positions processed: 376000

In [16]:
for out_file in out_files:
    out_file.close()

In [17]:
print debug_printer(records_to_process)


[('10', 518366), False, False]
