Retreive the SNPs for each platform and aggregate by gene.



In [8]:

    
import gzip
import pandas

Create BAM Files for SNPs for genotyping chips



In [6]:

    
# Download genotyping platform SNPs
adf_files = [
    'A-GEOD-8882/A-GEOD-8882.adf.txt',
    'A-GEOD-6434/A-GEOD-6434.adf.txt',
    'A-AFFY-107/A-AFFY-107.adf.txt',
    'A-AFFY-72/A-AFFY-72.adf.txt',
]
base_url = 'http://www.ebi.ac.uk/arrayexpress/files'
for adf_file in adf_files:
    ! wget --no-verbose --timestamping --directory-prefix download/chips/adf {base_url}/{adf_file}
    ! gzip --force download/chips/{adf_file.rsplit('/', 1)[-1]}



In [13]:

    
def parse_adf(path):
    """Parse array description files from ArrayExpress."""
    with gzip.open(path, 'rt') as read_file:
        for i, line in enumerate(read_file):
            if line.strip() == '[main]':
                skip = i + 1
                break
    return pandas.read_table(path, skiprows=skip)



In [23]:

    
#Illumina HumanHap550 Platform: https://www.ebi.ac.uk/arrayexpress/arrays/A-GEOD-6434/
df = parse_adf('download/chips/A-GEOD-6434.adf.txt.gz')
rsids = df['Reporter Database Entry [dbsnp]']
rsids = rsids[rsids.str.startswith('rs').fillna(False)]
with open("download/chips/hh550-snp-for-bed.txt", 'w') as write_file:
    write_file.write('\n'.join(rsids))
'{} SNPs with rs#s on HumanHap550'.format(len(rsids))









    Out[23]:





'561303 SNPs with rs#s on HumanHap550'



In [24]:

    
#Illumina HumanOmni1-Quad: https://www.ebi.ac.uk/arrayexpress/arrays/A-GEOD-8882/
df = parse_adf('download/chips/A-GEOD-8882.adf.txt.gz')
rsids = df['Reporter Database Entry [dbsnp]']
rsids = rsids[rsids.str.startswith('rs').fillna(False)]
with open('download/chips/ho1-snp-for-bed.txt', 'w') as write_file:
    write_file.write('\n'.join(rsids))
'{} SNPs with rs#s on HumanOmni1'.format(len(rsids))









    Out[24]:





'970342 SNPs with rs#s on HumanOmni1'



In [30]:

    
#Affy 500k Array set is made up of 2 arrays (Sty and Nsp)

#Sty Array: https://www.ebi.ac.uk/arrayexpress/arrays/A-AFFY-72/
df = parse_adf('download/chips/A-AFFY-72.adf.txt.gz')
rsids = df['Composite Element Database Entry[ncbi_dbsnp:126:126]']
rsids_0 = rsids[rsids.str.startswith('rs').fillna(False)]

#Nsp Array: https://www.ebi.ac.uk/arrayexpress/arrays/A-AFFY-107/
df = parse_adf('download/chips/A-AFFY-107.adf.txt.gz')
rsids = df['Composite Element Database Entry[dbsnp:v128]']
rsids_1 = rsids[rsids.str.startswith('rs').fillna(False)]

rsids = sorted(set(rsids_0) | set(rsids_1))
with open('download/chips/affy500-snp-for-bed.txt', 'w') as write_file:
    write_file.write('\n'.join(rsids))
'{} SNPs with rs#s on Affy500k'.format(len(rsids))









    Out[30]:





'497531 SNPs with rs#s on Affy500k'

Manual step: Follow instructions at http://www.gettinggeneticsdone.com/2011/06/mapping-snps-to-genes-for-gwas.html to get a BED file. Use GRCh37/hg19 and dbSNP Common SNPs(142). Name the downloaded files hh550.bed, ho1.bed, and affy500k.bed. Move these files to download/bed and gzip.

Download 1000 Genomes Phase 3 common SNPs



In [34]:

    
url = 'https://raw.githubusercontent.com/dhimmel/kg/de9c303442f01acde89aa956acb2f53888295169/data/common-SNPs/common-snps.bed.gz'
! wget --no-verbose --output-document download/platforms/bed/kg3.bed.gz {url}









    



2015-08-24 17:59:10 URL:https://raw.githubusercontent.com/dhimmel/kg/de9c303442f01acde89aa956acb2f53888295169/data/common-SNPs/common-snps.bed.gz [40372216/40372216] -> "download/platforms/bed/kg3.bed.gz" [1]

Download Entrez Gene locations



In [69]:

    
# Download Entrez mappings from http://dx.doi.org/10.6084/m9.figshare.103113
url = 'http://files.figshare.com/230645/entrez.txt'
entrez_df = pandas.read_table(url, names=['chrom', 'chromStart', 'chromEnd', 'name'])
entrez_df['chrom'] = 'chr' + entrez_df['chrom'].astype(str)
entrez_df = entrez_df.replace({'chrom': {'chr23': 'chrX', 'chr24': 'chrY'}})
entrez_df.to_csv('download/entrez-genes.bed', index=False, header=False, sep='\t')
entrez_df.tail(2)

Compute SNPs per gene using `bedtools`



In [72]:

    
platforms = 'hh550', 'ho1', 'affy500', 'exac', 'kg3'

# Base pairs added upstream and downstream of each entry
window = 10000

# Count the overlaps for genotyping platforms
for platform in platforms:
    ! bedtools window -w {window} -c -a download/entrez-genes.bed \
        -b download/platforms/bed/{platform}.bed.gz > data/platforms/{platform}-entrez.bed

Combine to a single file



In [73]:

    
# Read SNPs per Gene
columns = ['chromosome', 'chromosome_start', 'chromosome_end', 'entrez_gene_id', 'snps']

dfs = list()
for platform in platforms:
    path = 'data/platforms/{}-entrez.bed'.format(platform)
    columns[-1] = 'snps_{}'.format(platform)
    df = pandas.read_table(path, names=columns)
    dfs.append(df)



In [74]:

    
count_df = dfs[0]
for df in dfs[1:]:
    count_df = count_df.merge(df)

count_df.to_csv('data/platforms/combined.tsv', index=False, sep='\t')
count_df.head(2)









    Out[74]:






  
    
      
      chromosome
      chromosome_start
      chromosome_end
      entrez_gene_id
      snps_hh550
      snps_ho1
      snps_affy500
      snps_exac
      snps_kg3
    
  
  
    
      0
      chr1
      69055
      70108
      79501
      0
      0
      0
      4
      10
    
    
      1
      chr1
      860260
      879955
      148398
      0
      13
      0
      9
      159