Common SNPs per gene in ExAC


In [1]:
# url = 'ftp://ftp.broadinstitute.org/pub/ExAC_release/release0.3/ExAC.r0.3.sites.vep.vcf.gz'
# ! wget --timestamping --no-verbose --directory-prefix download {url}

In [2]:
import re
import gzip
import numpy
import pandas
import vcf

Helper functions


In [4]:
def get_allele_frequency(x):
    """x is list or array"""
    x = numpy.array(x)
    return x[(numpy.abs(x - 0.5)).argmin()]

Parse


In [5]:
# Options
path = 'download/platforms/raw/ExAC.r0.3.sites.vep.vcf.gz'
maj_af_min = 0.05

In [6]:
bed_rows = []
for r in vcf.Reader(filename=path, prepend_chr=True):
    
    # Quality control
    if r.FILTER:
        continue
    
    # Exclude non-SNPs
    if not r.is_snp:
        continue
    
    # Major allele frequency check
    allele_freq = get_allele_frequency(r.INFO['AF'])
    if not ((allele_freq >= maj_af_min) and
            (allele_freq <= 1 - maj_af_min)):
        continue

    # Add to bed rows
    bed_rows.append((r.CHROM, r.POS - 1, r.POS))

bed_df = pandas.DataFrame(bed_rows, columns=['chrom', 'chromStart', 'chromEnd'])

In [8]:
bed_df.head(3)


Out[8]:
chrom chromStart chromEnd
0 chr1 69269 69270
1 chr1 69510 69511
2 chr1 69760 69761

In [10]:
len(bed_df)


Out[10]:
82728

In [12]:
with gzip.open('download/platforms/bed/exac.bed.gz', 'wt') as write_file:
    bed_df.to_csv(write_file, index=False, header=False, sep='\t')