In [1]:
import vcf
In [2]:
#CONSTANTS
PARENT_1 = 0
PARENT_2 = 1
OFFSPRING = 2
In [3]:
#Input files
vcf_filenames= ("Dm28-NESM.GATK-HC.SNP-FinalSet.EFF.vcf",
"Esmeraldo-NESM.GATK-HC.SNP-FinalSet.EFF.vcf",
"X10825-NESM.GATK-HC.SNP-FinalSet.EFF.vcf"
)
vcf_files = [ vcf.Reader(open(filename)) for filename in vcf_filenames ]
In [4]:
#Out files
out_filenames = ("p1_specific.out",
"p2_specific.out",
"offsp_specific.out"
)
out_files = [ open(filename,"w") for filename in out_filenames ]
In [5]:
#Print header
column_headers=["CHROM","POS","Dm28_ALT","Esmeraldo_ALT","X10825_ALT","X10825_GT","GENE_NAME"]
for f in out_files:
f.write("\t".join(column_headers)+"\n")
#Define output format, with corresponding header
def formatOutputLine(records):
current_chr,current_pos = extract_coord(records)
fields = (current_chr,
current_pos,
records[PARENT_1].ALT[0] if records[PARENT_1] else "-",
records[PARENT_2].ALT[0] if records[PARENT_2] else "-",
records[OFFSPRING].ALT[0] if records[OFFSPRING] else "-",
records[OFFSPRING].samples[0]["GT"] if records[OFFSPRING] else "-",
extractProteinFromEffect(records)
)
return "\t".join([str(x) for x in fields])+"\n"
In [6]:
def get_next_record(vcf):
record = False
try:
record = vcf.next()
except StopIteration:
pass
return record
In [7]:
def extractProteinFromEffect(records):
protein = "non-coding"
for record in records:
if record and "EFF" in record.INFO:
protein = record.INFO["EFF"][0].split("|")[8]
break
return protein if protein else "non-coding"
def extract_coord(records):
chrom = None
pos = None
for r in records:
if r:
chrom = r.CHROM
pos = r.POS
return (chrom,pos)
In [8]:
#Ordering of chromosomes in file is:
# 11, ... ,19,1,20,...,29,2,30,...,90,...,9,10
#Solution:
# num > 10 : e.g 30 -> 300, 39 -> 390, 40-> 400
#num < 100: e.g 3 -> 3*100 + 95 = 395
# num == 10 : 999999 (force to be last)
def chrom_order_value(chrom):
cmp_value = 0
if chrom > 10:
cmp_value = chrom * 10
if chrom < 10:
cmp_value = (chrom * 100) + 95
if chrom == 10:
cmp_value = 999999
return cmp_value
def cmp_records(record1, record2):
cmp_result = 0
if record1.CHROM == record2.CHROM:
if record1.POS < record2.POS:
cmp_result = -1
elif record1.POS > record2.POS:
cmp_result = 1
elif chrom_order_value(record1.CHROM) < chrom_order_value(record2.CHROM):
cmp_result = -1
else:
cmp_result = 1
return cmp_result
In [9]:
def get_records_to_process(records):
min_pos = None
min_record = None
for idx,current_record in enumerate(records):
if min_pos is None and current_record:
min_pos = [idx]
min_record = current_record
elif current_record:
record_comparison = cmp_records(current_record, min_record)
#If records belong to the same position
if record_comparison == 0:
min_pos.append(idx)
#If one of the records has a lower chromosomal position, process that one
elif record_comparison == -1:
min_pos = [idx]
min_record = current_record
return [(records[x] if x in min_pos else False) for x in (PARENT_1,PARENT_2,OFFSPRING)]
In [10]:
def is_homozygous(record):
#return len(record.REF)
return not record or record.samples[0].data.GT in ("0/0","0|0","1/1","1|1")
def is_parent1_specific(records):
test1 = not records[PARENT_1] and not records[OFFSPRING] and records[PARENT_2]
test2 = records[PARENT_1] and records[OFFSPRING]
test2 = test2 and records[PARENT_1].ALT[0].sequence == records[OFFSPRING].ALT[0].sequence
test2 = test2 and (not records[PARENT_2] or records[PARENT_1].ALT[0].sequence != records[PARENT_2].ALT[0].sequence)
return test1 or test2
def is_parent2_specific(records):
test1 = not records[PARENT_2] and not records[OFFSPRING] and records[PARENT_1]
test2 = records[PARENT_2] and records[OFFSPRING]
test2 = test2 and records[PARENT_2].ALT[0].sequence == records[OFFSPRING].ALT[0].sequence
test2 = test2 and (not records[PARENT_1] or records[PARENT_1].ALT[0].sequence != records[PARENT_2].ALT[0].sequence)
return test1 or test2
def is_offspring_specific(records):
test1 = not records[OFFSPRING] and records[PARENT_1] and records[PARENT_2]
test2 = records[OFFSPRING] and (not records[PARENT_1] or records[PARENT_1].ALT[0].sequence != records[OFFSPRING].ALT[0].sequence )
test2 = test2 and (not records[PARENT_2] or records[PARENT_2].ALT[0].sequence != records[OFFSPRING].ALT[0].sequence )
return test1 or test2
In [11]:
def process_records(records,out_files):
#If all records are homozygous
if reduce(lambda x,y:x and y,(is_homozygous(r) for r in records )):
if is_parent1_specific(records):
out_files[PARENT_1].write( formatOutputLine(records) )
elif is_parent2_specific(records):
out_files[PARENT_2].write( formatOutputLine(records) )
elif is_offspring_specific(records):
out_files[OFFSPRING].write( formatOutputLine(records) )
In [12]:
def update_iteration(loaded_records,processed_records, vcf_files):
new_records = []
for loaded,processed,vcf_file in zip(loaded_records,processed_records,vcf_files):
if not loaded:
new_records.append(False)
if loaded and not processed:
new_records.append(loaded)
if loaded and processed:
new_records.append( get_next_record(vcf_file) )
return new_records
In [13]:
#Initialize records with 1st record
loaded_records = [get_next_record( vcf_file ) for vcf_file in vcf_files]
In [14]:
def debug_printer(records):
rec_str = []
for rec in records:
if rec:
rec_str.append( (rec.CHROM,rec.POS) )
else:
rec_str.append(False)
return str(rec_str)
In [15]:
counter = 1
while reduce( lambda x,y: x or y, loaded_records):
#print debug_printer(loaded_records)
records_to_process = get_records_to_process(loaded_records)
#print debug_printer(records_to_process)
process_records( records_to_process ,out_files)
loaded_records = update_iteration(loaded_records,records_to_process, vcf_files)
counter +=1
if counter % 10000 == 0:
print "Positions processed: "+str(counter)
In [16]:
for out_file in out_files:
out_file.close()
In [17]:
print debug_printer(records_to_process)
In [18]: