In [1]:
from cruzdb import Genome
local_url = "sqlite:///hg19.db"

#Genome('hg19').mirror(('refGene',), local_url)
#Genome('hg19').mirror(('gwasCatalog', 'lincRNAsTranscripts'), local_url)

In [2]:
hg19 = Genome(local_url)

In [6]:
cols = ["chrom", "chromStart", "chromEnd", "name", "trait", "genes", "orOrBeta", "pValue"]
hg19.dataframe('gwasCatalog')[cols].to_csv("gwasCatalog.txt", sep="\t", index=False)

In [4]:
with open('supplement/Additional-File-8_gwasCatalog.anno.txt', 'w') as fh:
    hg19.annotate("gwasCatalog.txt", ("lincRNAsTranscripts", "refGene"), feature_strand=True, out=fh, parallel=True)

In [9]:
from toolshed import reader
from matplotlib import pyplot as plt
import numpy as np
from collections import defaultdict


linc_traits = {'closer': defaultdict(list), '10k': defaultdict(list)}
traits = defaultdict(int)
#linc_traits = defaultdict(list)
#dlincs_near = []
#dlincs_10K = []
#dlincs = []

for d in reader('supplement/Additional-File-8_gwasCatalog.anno.txt'):
    if not ('refGene_distance') in d: continue
    dlinc = list(set(d['lincRNAsTranscripts_distance'].split(";")))
    dref = list(set(d['refGene_distance'].split(";")))
    traits[d['trait']] += 1
    assert len(dlinc) == len(dref) == 1
    dlinc, dref = abs(int(dlinc[0])), abs(int(dref[0]))
    # HEY: just switch the comment in these 2 lines to get either
    # lincs where closest gene is > 10KB away or just those where linc is closer
    #if dlinc == 0 and dref > 10000:
    if dlinc < dref:
        linc_traits['closer'][d['trait']].append(d)
    if dlinc == 0 and dref > 10000:
        linc_traits['10k'][d['trait']].append(d)

In [19]:
from operator import itemgetter

ratios = {}
for s in ('closer', '10k'):
    ratios[s] = {}
    traits6 = [k for k, count in traits.items() if count > 5 and len(linc_traits[s][k]) > 4]
    print s, sum(len(linc_traits[s][k]) for k in linc_traits[s].keys())

    
    for trait in traits6:
        ratios[s][trait] = (len(linc_traits[s][trait]) / float(traits[trait]), linc_traits[s][trait], traits[trait])


closer 2153
10k 388

In [20]:
for k, s in enumerate(('closer', '10k'), 9):
    outb = open('supplement/Additional-File-%i-traits-%s.txt' % (k, s), 'w')
    for trait, (freq_linc, linc_list, count) in sorted(ratios[s].items(), key=lambda f: f[1][0], reverse=True):
        print >>outb, "\t".join(map(str, (trait, freq_linc, count, len(linc_list))))
        for lincs in linc_list:
            for linc in lincs["lincRNAsTranscripts_name"].split(";"):
                pass
                #res = scrape_ncbi(trait, linc)
                #if res != []:
                #    print trait, linc, []
    outb.close()

In [ ]: