In [1]:
from cruzdb import Genome
local_url = "sqlite:///hg19.db"
#Genome('hg19').mirror(('refGene',), local_url)
#Genome('hg19').mirror(('gwasCatalog', 'lincRNAsTranscripts'), local_url)
In [2]:
hg19 = Genome(local_url)
In [6]:
cols = ["chrom", "chromStart", "chromEnd", "name", "trait", "genes", "orOrBeta", "pValue"]
hg19.dataframe('gwasCatalog')[cols].to_csv("gwasCatalog.txt", sep="\t", index=False)
In [4]:
with open('supplement/Additional-File-8_gwasCatalog.anno.txt', 'w') as fh:
hg19.annotate("gwasCatalog.txt", ("lincRNAsTranscripts", "refGene"), feature_strand=True, out=fh, parallel=True)
In [9]:
from toolshed import reader
from matplotlib import pyplot as plt
import numpy as np
from collections import defaultdict
linc_traits = {'closer': defaultdict(list), '10k': defaultdict(list)}
traits = defaultdict(int)
#linc_traits = defaultdict(list)
#dlincs_near = []
#dlincs_10K = []
#dlincs = []
for d in reader('supplement/Additional-File-8_gwasCatalog.anno.txt'):
if not ('refGene_distance') in d: continue
dlinc = list(set(d['lincRNAsTranscripts_distance'].split(";")))
dref = list(set(d['refGene_distance'].split(";")))
traits[d['trait']] += 1
assert len(dlinc) == len(dref) == 1
dlinc, dref = abs(int(dlinc[0])), abs(int(dref[0]))
# HEY: just switch the comment in these 2 lines to get either
# lincs where closest gene is > 10KB away or just those where linc is closer
#if dlinc == 0 and dref > 10000:
if dlinc < dref:
linc_traits['closer'][d['trait']].append(d)
if dlinc == 0 and dref > 10000:
linc_traits['10k'][d['trait']].append(d)
In [19]:
from operator import itemgetter
ratios = {}
for s in ('closer', '10k'):
ratios[s] = {}
traits6 = [k for k, count in traits.items() if count > 5 and len(linc_traits[s][k]) > 4]
print s, sum(len(linc_traits[s][k]) for k in linc_traits[s].keys())
for trait in traits6:
ratios[s][trait] = (len(linc_traits[s][trait]) / float(traits[trait]), linc_traits[s][trait], traits[trait])
In [20]:
for k, s in enumerate(('closer', '10k'), 9):
outb = open('supplement/Additional-File-%i-traits-%s.txt' % (k, s), 'w')
for trait, (freq_linc, linc_list, count) in sorted(ratios[s].items(), key=lambda f: f[1][0], reverse=True):
print >>outb, "\t".join(map(str, (trait, freq_linc, count, len(linc_list))))
for lincs in linc_list:
for linc in lincs["lincRNAsTranscripts_name"].split(";"):
pass
#res = scrape_ncbi(trait, linc)
#if res != []:
# print trait, linc, []
outb.close()
In [ ]: