In [8]:
import gzip
import pandas
In [6]:
# Download genotyping platform SNPs
adf_files = [
'A-GEOD-8882/A-GEOD-8882.adf.txt',
'A-GEOD-6434/A-GEOD-6434.adf.txt',
'A-AFFY-107/A-AFFY-107.adf.txt',
'A-AFFY-72/A-AFFY-72.adf.txt',
]
base_url = 'http://www.ebi.ac.uk/arrayexpress/files'
for adf_file in adf_files:
! wget --no-verbose --timestamping --directory-prefix download/chips/adf {base_url}/{adf_file}
! gzip --force download/chips/{adf_file.rsplit('/', 1)[-1]}
In [13]:
def parse_adf(path):
"""Parse array description files from ArrayExpress."""
with gzip.open(path, 'rt') as read_file:
for i, line in enumerate(read_file):
if line.strip() == '[main]':
skip = i + 1
break
return pandas.read_table(path, skiprows=skip)
In [23]:
#Illumina HumanHap550 Platform: https://www.ebi.ac.uk/arrayexpress/arrays/A-GEOD-6434/
df = parse_adf('download/chips/A-GEOD-6434.adf.txt.gz')
rsids = df['Reporter Database Entry [dbsnp]']
rsids = rsids[rsids.str.startswith('rs').fillna(False)]
with open("download/chips/hh550-snp-for-bed.txt", 'w') as write_file:
write_file.write('\n'.join(rsids))
'{} SNPs with rs#s on HumanHap550'.format(len(rsids))
Out[23]:
In [24]:
#Illumina HumanOmni1-Quad: https://www.ebi.ac.uk/arrayexpress/arrays/A-GEOD-8882/
df = parse_adf('download/chips/A-GEOD-8882.adf.txt.gz')
rsids = df['Reporter Database Entry [dbsnp]']
rsids = rsids[rsids.str.startswith('rs').fillna(False)]
with open('download/chips/ho1-snp-for-bed.txt', 'w') as write_file:
write_file.write('\n'.join(rsids))
'{} SNPs with rs#s on HumanOmni1'.format(len(rsids))
Out[24]:
In [30]:
#Affy 500k Array set is made up of 2 arrays (Sty and Nsp)
#Sty Array: https://www.ebi.ac.uk/arrayexpress/arrays/A-AFFY-72/
df = parse_adf('download/chips/A-AFFY-72.adf.txt.gz')
rsids = df['Composite Element Database Entry[ncbi_dbsnp:126:126]']
rsids_0 = rsids[rsids.str.startswith('rs').fillna(False)]
#Nsp Array: https://www.ebi.ac.uk/arrayexpress/arrays/A-AFFY-107/
df = parse_adf('download/chips/A-AFFY-107.adf.txt.gz')
rsids = df['Composite Element Database Entry[dbsnp:v128]']
rsids_1 = rsids[rsids.str.startswith('rs').fillna(False)]
rsids = sorted(set(rsids_0) | set(rsids_1))
with open('download/chips/affy500-snp-for-bed.txt', 'w') as write_file:
write_file.write('\n'.join(rsids))
'{} SNPs with rs#s on Affy500k'.format(len(rsids))
Out[30]:
Manual step: Follow instructions at http://www.gettinggeneticsdone.com/2011/06/mapping-snps-to-genes-for-gwas.html to get a BED file. Use GRCh37/hg19 and dbSNP Common SNPs(142). Name the downloaded files hh550.bed, ho1.bed, and affy500k.bed. Move these files to download/bed and gzip.
In [34]:
url = 'https://raw.githubusercontent.com/dhimmel/kg/de9c303442f01acde89aa956acb2f53888295169/data/common-SNPs/common-snps.bed.gz'
! wget --no-verbose --output-document download/platforms/bed/kg3.bed.gz {url}
In [69]:
# Download Entrez mappings from http://dx.doi.org/10.6084/m9.figshare.103113
url = 'http://files.figshare.com/230645/entrez.txt'
entrez_df = pandas.read_table(url, names=['chrom', 'chromStart', 'chromEnd', 'name'])
entrez_df['chrom'] = 'chr' + entrez_df['chrom'].astype(str)
entrez_df = entrez_df.replace({'chrom': {'chr23': 'chrX', 'chr24': 'chrY'}})
entrez_df.to_csv('download/entrez-genes.bed', index=False, header=False, sep='\t')
entrez_df.tail(2)
Out[69]:
In [72]:
platforms = 'hh550', 'ho1', 'affy500', 'exac', 'kg3'
# Base pairs added upstream and downstream of each entry
window = 10000
# Count the overlaps for genotyping platforms
for platform in platforms:
! bedtools window -w {window} -c -a download/entrez-genes.bed \
-b download/platforms/bed/{platform}.bed.gz > data/platforms/{platform}-entrez.bed
In [73]:
# Read SNPs per Gene
columns = ['chromosome', 'chromosome_start', 'chromosome_end', 'entrez_gene_id', 'snps']
dfs = list()
for platform in platforms:
path = 'data/platforms/{}-entrez.bed'.format(platform)
columns[-1] = 'snps_{}'.format(platform)
df = pandas.read_table(path, names=columns)
dfs.append(df)
In [74]:
count_df = dfs[0]
for df in dfs[1:]:
count_df = count_df.merge(df)
count_df.to_csv('data/platforms/combined.tsv', index=False, sep='\t')
count_df.head(2)
Out[74]: