Retreive the SNPs for each platform and aggregate by gene.


In [8]:
import gzip
import pandas

Create BAM Files for SNPs for genotyping chips


In [6]:
# Download genotyping platform SNPs
adf_files = [
    'A-GEOD-8882/A-GEOD-8882.adf.txt',
    'A-GEOD-6434/A-GEOD-6434.adf.txt',
    'A-AFFY-107/A-AFFY-107.adf.txt',
    'A-AFFY-72/A-AFFY-72.adf.txt',
]
base_url = 'http://www.ebi.ac.uk/arrayexpress/files'
for adf_file in adf_files:
    ! wget --no-verbose --timestamping --directory-prefix download/chips/adf {base_url}/{adf_file}
    ! gzip --force download/chips/{adf_file.rsplit('/', 1)[-1]}

In [13]:
def parse_adf(path):
    """Parse array description files from ArrayExpress."""
    with gzip.open(path, 'rt') as read_file:
        for i, line in enumerate(read_file):
            if line.strip() == '[main]':
                skip = i + 1
                break
    return pandas.read_table(path, skiprows=skip)

In [23]:
#Illumina HumanHap550 Platform: https://www.ebi.ac.uk/arrayexpress/arrays/A-GEOD-6434/
df = parse_adf('download/chips/A-GEOD-6434.adf.txt.gz')
rsids = df['Reporter Database Entry [dbsnp]']
rsids = rsids[rsids.str.startswith('rs').fillna(False)]
with open("download/chips/hh550-snp-for-bed.txt", 'w') as write_file:
    write_file.write('\n'.join(rsids))
'{} SNPs with rs#s on HumanHap550'.format(len(rsids))


Out[23]:
'561303 SNPs with rs#s on HumanHap550'

In [24]:
#Illumina HumanOmni1-Quad: https://www.ebi.ac.uk/arrayexpress/arrays/A-GEOD-8882/
df = parse_adf('download/chips/A-GEOD-8882.adf.txt.gz')
rsids = df['Reporter Database Entry [dbsnp]']
rsids = rsids[rsids.str.startswith('rs').fillna(False)]
with open('download/chips/ho1-snp-for-bed.txt', 'w') as write_file:
    write_file.write('\n'.join(rsids))
'{} SNPs with rs#s on HumanOmni1'.format(len(rsids))


Out[24]:
'970342 SNPs with rs#s on HumanOmni1'

In [30]:
#Affy 500k Array set is made up of 2 arrays (Sty and Nsp)

#Sty Array: https://www.ebi.ac.uk/arrayexpress/arrays/A-AFFY-72/
df = parse_adf('download/chips/A-AFFY-72.adf.txt.gz')
rsids = df['Composite Element Database Entry[ncbi_dbsnp:126:126]']
rsids_0 = rsids[rsids.str.startswith('rs').fillna(False)]

#Nsp Array: https://www.ebi.ac.uk/arrayexpress/arrays/A-AFFY-107/
df = parse_adf('download/chips/A-AFFY-107.adf.txt.gz')
rsids = df['Composite Element Database Entry[dbsnp:v128]']
rsids_1 = rsids[rsids.str.startswith('rs').fillna(False)]

rsids = sorted(set(rsids_0) | set(rsids_1))
with open('download/chips/affy500-snp-for-bed.txt', 'w') as write_file:
    write_file.write('\n'.join(rsids))
'{} SNPs with rs#s on Affy500k'.format(len(rsids))


Out[30]:
'497531 SNPs with rs#s on Affy500k'

Manual step: Follow instructions at http://www.gettinggeneticsdone.com/2011/06/mapping-snps-to-genes-for-gwas.html to get a BED file. Use GRCh37/hg19 and dbSNP Common SNPs(142). Name the downloaded files hh550.bed, ho1.bed, and affy500k.bed. Move these files to download/bed and gzip.

Download 1000 Genomes Phase 3 common SNPs


In [34]:
url = 'https://raw.githubusercontent.com/dhimmel/kg/de9c303442f01acde89aa956acb2f53888295169/data/common-SNPs/common-snps.bed.gz'
! wget --no-verbose --output-document download/platforms/bed/kg3.bed.gz {url}


2015-08-24 17:59:10 URL:https://raw.githubusercontent.com/dhimmel/kg/de9c303442f01acde89aa956acb2f53888295169/data/common-SNPs/common-snps.bed.gz [40372216/40372216] -> "download/platforms/bed/kg3.bed.gz" [1]

Download Entrez Gene locations


In [69]:
# Download Entrez mappings from http://dx.doi.org/10.6084/m9.figshare.103113
url = 'http://files.figshare.com/230645/entrez.txt'
entrez_df = pandas.read_table(url, names=['chrom', 'chromStart', 'chromEnd', 'name'])
entrez_df['chrom'] = 'chr' + entrez_df['chrom'].astype(str)
entrez_df = entrez_df.replace({'chrom': {'chr23': 'chrX', 'chr24': 'chrY'}})
entrez_df.to_csv('download/entrez-genes.bed', index=False, header=False, sep='\t')
entrez_df.tail(2)


Out[69]:
chrom chromStart chromEnd name
17683 chrY 23673264 23711212 5940
17684 chrY 26909216 26959626 57054

Compute SNPs per gene using bedtools


In [72]:
platforms = 'hh550', 'ho1', 'affy500', 'exac', 'kg3'

# Base pairs added upstream and downstream of each entry
window = 10000

# Count the overlaps for genotyping platforms
for platform in platforms:
    ! bedtools window -w {window} -c -a download/entrez-genes.bed \
        -b download/platforms/bed/{platform}.bed.gz > data/platforms/{platform}-entrez.bed

Combine to a single file


In [73]:
# Read SNPs per Gene
columns = ['chromosome', 'chromosome_start', 'chromosome_end', 'entrez_gene_id', 'snps']

dfs = list()
for platform in platforms:
    path = 'data/platforms/{}-entrez.bed'.format(platform)
    columns[-1] = 'snps_{}'.format(platform)
    df = pandas.read_table(path, names=columns)
    dfs.append(df)

In [74]:
count_df = dfs[0]
for df in dfs[1:]:
    count_df = count_df.merge(df)

count_df.to_csv('data/platforms/combined.tsv', index=False, sep='\t')
count_df.head(2)


Out[74]:
chromosome chromosome_start chromosome_end entrez_gene_id snps_hh550 snps_ho1 snps_affy500 snps_exac snps_kg3
0 chr1 69055 70108 79501 0 0 0 4 10
1 chr1 860260 879955 148398 0 13 0 9 159