In [1]:
import copy
import cPickle
import os
import subprocess
import cdpybio as cpb
import matplotlib as mpl
import matplotlib.gridspec as gridspec
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
pd.options.mode.chained_assignment = None # default='warn'
import pybedtools as pbt
import scipy.stats as stats
import seaborn as sns
import ciepy
import cardipspy as cpy
%matplotlib inline
dy_name = 'table_eqtl_results'
outdir = os.path.join(ciepy.root, 'output', dy_name)
cpy.makedir(outdir)
private_outdir = os.path.join(ciepy.root, 'private_output', dy_name)
cpy.makedir(private_outdir)
import socket
if socket.gethostname() == 'fl-hn1' or socket.gethostname() == 'fl-hn2':
dy = os.path.join(ciepy.root, 'sandbox', 'tmp', dy_name)
cpy.makedir(dy)
pbt.set_tempdir(dy)
In [2]:
gene_info = pd.read_table(cpy.gencode_gene_info, index_col=0)
In [3]:
fn = os.path.join(ciepy.root, 'misc', 'stem_cell_population_maintenance.tsv')
go_scpm = pd.read_table(fn, header=None)
go_genes = set(go_scpm[2]) & set(gene_info.gene_name)
In [4]:
def get_gene_id(x):
return gene_info[gene_info.gene_name == x].index[0]
In [5]:
# Note: sometimes this cell fails. If I run several times eventually it works. Not sure
# why Nature's URL is weird.
url = 'http://www.nature.com/nbt/journal/v33/n11/extref/nbt.3387-S5.xlsx'
scorecard = pd.read_excel(url)
scorecard = scorecard.drop(scorecard.columns[2:], axis=1)
scorecard = scorecard[scorecard.gene.apply(lambda x: x in gene_info.gene_name.values)]
scorecard.index = [get_gene_id(x) for x in scorecard.gene]
scorecard = scorecard.drop(['gene'], axis=1)
scorecard.columns = ['tsankov_class']
In [6]:
def annotate_eqtls(fn):
df = pd.read_table(fn, index_col=0)
if 'perm_sig' in df.columns:
df = df[df.perm_sig]
td = [x for x in df.columns if 'AF' in x]
df = df.drop(td, axis=1)
df = df.merge(scorecard, left_on='gene_id', right_index=True, how='left')
tdf = pd.DataFrame(True, index=go_genes, columns=['go_stem_cell_population_maintenance'])
df = df.merge(tdf, left_on='gene_name', right_index=True, how='left')
df.ix[df.go_stem_cell_population_maintenance.isnull(), 'go_stem_cell_population_maintenance'] = False
return df
In [7]:
writer = pd.ExcelWriter(os.path.join(outdir, 'table_eqtl_results.xlsx'))
In [8]:
for i in [1, 2, 3]:
fn = os.path.join(ciepy.root, 'output', 'eqtl_processing', 'eqtls0{}'.format(i), 'lead_variants.tsv')
leads = annotate_eqtls(fn)
leads.drop('genocnt', axis=1).to_excel(writer, 'leads0{}'.format(i))
fn = os.path.join(ciepy.root, 'output', 'eqtl_processing', 'eqtls0{}'.format(i), 'gene_variant_pairs.tsv')
all_eqtls = annotate_eqtls(fn)
all_eqtls.drop('genocnt', axis=1).to_excel(writer, 'all0{}'.format(i))
In [9]:
writer.save()