In [1]:
# url = 'ftp://ftp.broadinstitute.org/pub/ExAC_release/release0.3/ExAC.r0.3.sites.vep.vcf.gz'
# ! wget --timestamping --no-verbose --directory-prefix download {url}
In [2]:
import re
import gzip
import numpy
import pandas
import vcf
In [4]:
def get_allele_frequency(x):
"""x is list or array"""
x = numpy.array(x)
return x[(numpy.abs(x - 0.5)).argmin()]
In [5]:
# Options
path = 'download/platforms/raw/ExAC.r0.3.sites.vep.vcf.gz'
maj_af_min = 0.05
In [6]:
bed_rows = []
for r in vcf.Reader(filename=path, prepend_chr=True):
# Quality control
if r.FILTER:
continue
# Exclude non-SNPs
if not r.is_snp:
continue
# Major allele frequency check
allele_freq = get_allele_frequency(r.INFO['AF'])
if not ((allele_freq >= maj_af_min) and
(allele_freq <= 1 - maj_af_min)):
continue
# Add to bed rows
bed_rows.append((r.CHROM, r.POS - 1, r.POS))
bed_df = pandas.DataFrame(bed_rows, columns=['chrom', 'chromStart', 'chromEnd'])
In [8]:
bed_df.head(3)
Out[8]:
In [10]:
len(bed_df)
Out[10]:
In [12]:
with gzip.open('download/platforms/bed/exac.bed.gz', 'wt') as write_file:
bed_df.to_csv(write_file, index=False, header=False, sep='\t')