3D Structure


In [3]:
import glob
import os
import re
import subprocess
import urllib2

import cdpybio as cpb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
pd.options.mode.chained_assignment = None
import pybedtools as pbt
import seaborn as sns
import vcf as pyvcf

import cardipspy as cpy
import ciepy

%matplotlib inline

dy_name = '3d_structure'

import socket
if socket.gethostname() == 'fl-hn1' or socket.gethostname() == 'fl-hn2':
    dy = os.path.join(ciepy.root, 'sandbox', 'tmp', dy_name)
    cpy.makedir(dy)
    pbt.set_tempdir(dy)
    
outdir = os.path.join(ciepy.root, 'output', dy_name)
cpy.makedir(outdir)

private_outdir = os.path.join(ciepy.root, 'private_output', dy_name)
cpy.makedir(private_outdir)

In [ ]:
tg = pd.read_table(cpy.gencode_transcript_gene, index_col=0, 
                   header=None, squeeze=True)
gene_info = pd.read_table(cpy.gencode_gene_info, index_col=0)
fn = os.path.join(ciepy.root, 'output', 'eqtl_input', 
                  'tpm_log_filtered_phe_std_norm_peer_resid.tsv')
exp = pd.read_table(fn, index_col=0)

genes = pbt.BedTool(cpy.gencode_gene_bed)

t_to_g = pd.read_table(cpy.gencode_transcript_gene, index_col=0, header=None, squeeze=True)

fn = os.path.join(os.path.split(cpy.roadmap_15_state_annotation)[0], 'EIDlegend.txt')
roadmap_ids = pd.read_table(fn, squeeze=True, index_col=0, header=None)

TADs

I'd like to look for enrichment of eQTLs in TADs. As in Grubert et al. 2015, I can mirror the position of each variant across its gene and check whether the mirrored positions fall within the same TAD as the gene as often as the real variants do. Grubert et al. also shuffled the TAD locations and compared the number of distal (> 50 kb) QTLs falling within the same TAD as their gene against the shuffled data. I can do the same analysis using sets of null variants as well. A rough sketch of the mirroring check appears after the TAD liftover below.


In [ ]:
# Download TADs from Dixon et al. 2012. These coordinates are in hg18 so we'll download the
# liftOver chain files and convert the bed file.

dy = os.path.join(private_outdir, 'hESC')
if not os.path.exists(dy):
    out = os.path.join(private_outdir, 'hESC.domain.tar.gz')
    !curl http://132.239.201.216/mouse/hi-c/hESC.domain.tar.gz > {out}
    !tar -C {private_outdir} -xvf {out}
    !rm {out}
    
fn = os.path.join(private_outdir, 'hg18ToHg19.over.chain')
if not os.path.exists(fn):
    url = ('http://hgdownload.cse.ucsc.edu/goldenPath/hg18/liftOver/hg18ToHg19.over.chain.gz')
    !curl {url} | zcat > {fn}

In [ ]:
# Read in TADs, convert to hg19.
tads = pbt.BedTool(os.path.join(private_outdir, 'hESC', 'combined', 'total.combined.domain'))
n = len(tads)
mapped = os.path.join(outdir, 'hg19_hESC_tads_mapped.bed')
unmapped = os.path.join(outdir, 'hg19_hESC_tads_unmapped.txt')
tads = cpb.analysis.liftover_bed(tads, os.path.join(private_outdir, 'hg18ToHg19.over.chain'),
                                 mapped, unmapped)
print('Converted {} of {} TADs to hg19.'.format(len(tads), n))
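
As mentioned above, here is a rough sketch of the mirroring check. This is illustrative only: it assumes a dataframe qtls with columns chrom, pos (variant position), and tss (the gene's TSS), which is not defined in this notebook, and it mirrors each variant across the TSS as an approximation of mirroring over the gene.


In [ ]:
# Illustrative sketch only: `qtls` (columns chrom, pos, tss) is an assumed
# input, not an object defined in this notebook. For each variant, mirror its
# position across the gene's TSS and record whether the variant and its
# mirrored position fall within the gene's TAD.
def mirror_in_same_tad(qtls, tads):
    tad_df = tads.to_dataframe()
    out = []
    for chrom, pos, tss in qtls[['chrom', 'pos', 'tss']].values:
        mirrored = 2 * tss - pos
        t = tad_df[(tad_df.chrom == chrom) & (tad_df.start <= tss) &
                   (tad_df.end > tss)]
        if t.shape[0] == 0:
            # The gene's TSS doesn't fall in any TAD.
            out.append((False, False))
            continue
        start, end = t.iloc[0].start, t.iloc[0].end
        out.append((start <= pos < end, start <= mirrored < end))
    return pd.DataFrame(out, columns=['variant_in_tad', 'mirrored_in_tad'],
                        index=qtls.index)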

In [ ]:
annot_beds = dict()
# H3K27ac
fn = os.path.join(ciepy.root, 'output', 'ji_et_al_2015_processing', 'h3k27ac_peaks.bed')
annot_beds['h3k27ac'] = pbt.BedTool(fn)
# Genes
annot_beds['gene'] = genes
# Promoters
promoters = pbt.BedTool('/publicdata/gencode_v19_20151104/promoters_by_gene.bed')
df = promoters.to_dataframe()
df.name = df.name.apply(lambda x: x.split('_')[0])
s = '\n'.join(df.astype(str).apply(lambda x: '\t'.join(x), axis=1)) + '\n'
promoters = pbt.BedTool(s, from_string=True)
annot_beds['promoter'] = promoters
# TSS
tss = pbt.BedTool(cpy.gencode_tss_bed)
df = tss.to_dataframe()
df.name = t_to_g[df.name.apply(lambda x: x.split('_')[0])].values
df = df.drop_duplicates()
s = '\n'.join(df.astype(str).apply(lambda x: '\t'.join(x), axis=1)) + '\n'
tss = pbt.BedTool(s, from_string=True)
annot_beds['tss'] = tss
# Exons
exons = pbt.BedTool(cpy.gencode_exon_bed)
df = exons.to_dataframe()
df.name = t_to_g[df.name].values
df = df.drop_duplicates()
s = '\n'.join(df.astype(str).apply(lambda x: '\t'.join(x), axis=1)) + '\n'
exons = pbt.BedTool(s, from_string=True)
annot_beds['exon'] = exons

In [ ]:
def parse_tang(url):
    s = cpb.general.read_gzipped_text_url(url)
    lines = [x.split() for x in s.strip().split('\n')]
    df = pd.DataFrame(lines, columns=['chrom1', 'start1', 'end1', 
                                      'chrom2', 'start2', 'end2', 'freq'])
    df.index = (df.chrom1 + ':' + df.start1 + '-' + df.end1 + '==' +
                df.chrom2 + ':' + df.start2 + '-' + df.end2)
    for c in ['start1', 'start2', 'end1', 'end2', 'freq']:
        df[c] = df[c].astype(int)
    return df

class AnnotatedBed:
    def __init__(
        self,
        bed,
        annot_beds,
        completely_contains=None,
    ):
        """
        Initialize AnnotatedBed object.

        Parameters
        ----------
        bed : str or pybedtools.BedTool
            Bed file to annotate.

        annot_beds : dict
            Dict whose keys are names (like 'gene', 'promoter', etc.) and
            whose values are bed files to annotate the input bed file with.

        completely_contains : list
            List of keys from annot_beds. For these beds, we will check
            whether their features are entirely contained by the features in
            `bed`.

        """
        if completely_contains is None:
            completely_contains = []
        self._initialize_bt(bed)
        self._num_cols = len(self.bt[0].fields)
        self._has_name_col = self._num_cols > 3
        self._initialize_dfs()
        self._initialize_annot_beds(annot_beds)
        for k in annot_beds.keys():
            self.annotate_bed(self.bt, k, k)
        for k in completely_contains:
            self.annotate_bed(self.bt, k, k, complete=True)
        self._bt_path = None

    def load_saved_bts(self):
        """If the AnnotatedBed object was saved to a pickle and reloaded,
        this method remakes the BedTool object."""
        if self._bt_path:
            self.bt = pbt.BedTool(self._bt_path)

    def save(
        self,
        path,
        name,
    ):
        """
        Save AnnotatedBed object and bed file. The object is stored in a
        pickle and the bed file is saved as a separate bed file. The object
        can be reloaded by reading the pickle using cPickle.

        Parameters
        ----------
        path : str
            Path to save files to. Path should include a basename for the
            files. For instance, path='~/abc' will create files like
            ~/abc.pickle, ~/abc.bed, etc.

        name : str
            Descriptive name used for bed file trackline.

        """
        t = 'track type=bed name="{}"'.format(name)
        self.bt.saveas(path + '.bed', trackline=t)
        self._bt_path = path + '.bed'
        import cPickle
        cPickle.dump(self, open(path + '.pickle', 'w'))

    def _initialize_annot_beds(
        self,
        annot_beds,
    ):
        import pybedtools as pbt
        self.annot_beds = dict()
        for k in annot_beds.keys():
            if type(annot_beds[k]) == str:
                self.annot_beds[k] = pbt.BedTool(annot_beds[k])
            else:
                self.annot_beds[k] = annot_beds[k]

    def _initialize_dfs(
        self,
    ):
        self.df = self.bt.to_dataframe()
        if self._has_name_col:
            if len(set(self.df.name)) != self.df.shape[0]:
                self._has_name_col = False
        if self._has_name_col:
            self.df.index = self.df.name
        else:
            self.df.index = (self.df.chrom.astype(str) + ':' +
                             self.df.start.astype(str) + '-' +
                             self.df.end.astype(str))
        self.feature_to_df = pd.DataFrame(index=self.df.index)

    def _initialize_bt(
        self,
        bed,
    ):
        import pybedtools as pbt
        if type(bed) == str:
            self.bt = pbt.BedTool(bed)
        else:
            self.bt = bed
        self.bt = self.bt.sort()

    def bt_from_df(self):
        """Make a BedTool object for the input bed file."""
        import pybedtools as pbt
        s = ('\n'.join(self.df.astype(str).apply(lambda x: '\t'.join(x),
                                                 axis=1)) + '\n')
        self.bt = pbt.BedTool(s, from_string=True)

    def annotate_bed(
        self,
        bt,
        name,
        col_name,
        complete=False,
    ):
        """
        Annotate the input bed file using one of the annotation beds.

        Parameters
        ----------
        name : str
            The key for the annotation bed file in annot_beds.

        col_name : str
            Used to name the columns that will be made.

        complete : bool
            If True, this method will check whether the features in the
            annotation bed are completely contained by the features in the
            input bed.

        """
        import numpy as np
        import pandas as pd
        has_name_col = len(self.annot_beds[name][0].fields) > 3
        if complete:
            res = bt.intersect(self.annot_beds[name], sorted=True, wo=True,
                               F=1)
            col_name = col_name + '_complete'
        else:
            res = bt.intersect(self.annot_beds[name], sorted=True, wo=True)
        df = res.to_dataframe(names=range(len(res[0].fields)))
        if self._has_name_col:
            ind = df[3].values
        else:
            ind = list(df[0].astype(str) + ':' + df[1].astype(str) + '-' +
                       df[2].astype(str))
        if has_name_col:
            vals = df[self._num_cols + 3].values
        else:
            vals = list(df[self._num_cols + 0].astype(str) + ':' +
                        df[self._num_cols + 1].astype(str) + '-' +
                        df[self._num_cols + 2].astype(str))
        self.df[col_name] = False
        self.df.ix[set(ind), col_name] = True
        se = pd.Series(vals, index=ind)
        vc = pd.Series(se.index).value_counts()
        self.feature_to_df[col_name] = np.nan
        self.feature_to_df.ix[list(vc[vc == 1].index), col_name] = \
            se[list(vc[vc == 1].index)].apply(lambda x: set([x]))
        m = list(set(vc[vc > 1].index))
        v = []
        for i in m:
            v.append(set(se[i].values))
        self.feature_to_df.ix[m, col_name] = v


class AnnotatedInteractions:
    def __init__(
        self,
        df,
        annot_beds,
        completely_contains=None,
    ):
        """
        Initialize AnnotatedInteractions object.

        Parameters
        ----------
        df : pandas.DataFrame
            Dataframe with interactions. Must contain columns chrom1, start1,
            end1, chrom2, start2, and end2. Other columns will not be removed
            but may be overwritten if they clash with column names created
            here. Interactions must be unique.

        annot_beds : dict
            Dict whose keys are names (like 'gene', 'promoter', etc.) and
            whose values are bed files to annotate the input bed file with.

        """
        if completely_contains is None:
            completely_contains = []
        self.df = df.copy(deep=True)
        self.df.index = (self.df.chrom1.astype(str) + ':' +
                         self.df.start1.astype(str) + '-' +
                         self.df.end1.astype(str) + '==' +
                         self.df.chrom2.astype(str) + ':' +
                         self.df.start2.astype(str) + '-' +
                         self.df.end2.astype(str))
        assert len(set(self.df.index)) == self.df.shape[0]
        self.df['name'] = self.df.index
        self.feature_to_df = pd.DataFrame(index=self.df.index)
        self.annotate_interactions()
        self.bts_from_df()
        self._initialize_annot_beds(annot_beds)
        for k in annot_beds.keys():
            self.annotate_bed(bt=self.bt1, name=k, col_name='{}1'.format(k),
                              df_col='anchor1')
            if k in completely_contains:
                self.annotate_bed(bt=self.bt1, name=k,
                                  col_name='{}1_complete'.format(k),
                                  df_col='anchor1', complete=True)
        for k in annot_beds.keys():
            self.annotate_bed(bt=self.bt2, name=k, col_name='{}2'.format(k),
                              df_col='anchor2')
            if k in completely_contains:
                self.annotate_bed(bt=self.bt2, name=k,
                                  col_name='{}2_complete'.format(k),
                                  df_col='anchor2', complete=True)
        for k in annot_beds.keys():
            self.annotate_bed(bt=self.bt_loop, name=k,
                              col_name='{}_loop'.format(k), df_col='loop')
            if k in completely_contains:
                self.annotate_bed(bt=self.bt_loop, name=k,
                                  col_name='{}_loop_complete'.format(k),
                                  df_col='loop', complete=True)
        for k in annot_beds.keys():
            self.annotate_bed(bt=self.bt_loop_inner, name=k,
                              col_name='{}_loop_inner'.format(k),
                              df_col='loop_inner')
            if k in completely_contains:
                self.annotate_bed(bt=self.bt_loop_inner, name=k,
                                  col_name='{}_loop_inner_complete'.format(k),
                                  df_col='loop_inner', complete=True)
        self._bt1_path = None
        self._bt2_path = None
        self._bt_loop_path = None
        self._bt_loop_inner_path = None

    def _initialize_annot_beds(
        self,
        annot_beds,
    ):
        import pybedtools as pbt
        self.annot_beds = dict()
        for k in annot_beds.keys():
            if type(annot_beds[k]) == str:
                self.annot_beds[k] = pbt.BedTool(annot_beds[k])
            else:
                self.annot_beds[k] = annot_beds[k]

    def load_saved_bts(self):
        """If the AnnotatedInteractions object was saved to a pickle and
        reloaded, this method remakes the BedTool objects."""
        if self._bt1_path:
            self.bt1 = pbt.BedTool(self._bt1_path)
        if self._bt2_path:
            self.bt2 = pbt.BedTool(self._bt2_path)
        if self._bt_loop_path:
            self.bt_loop = pbt.BedTool(self._bt_loop_path)
        if self._bt_loop_inner_path:
            self.bt_loop_inner = pbt.BedTool(self._bt_loop_inner_path)

    def save(
        self,
        path,
        name,
    ):
        """
        Save AnnotatedInteractions object and bed files. The object is stored
        in a pickle and the bed files are saved as separate bed files. The
        object can be reloaded by reading the pickle using cPickle and the
        BedTool objects can be recreated using .load_saved_bts().

        Parameters
        ----------
        path : str
            Path to save files to. Path should include a basename for the
            files. For instance, path='~/abc' will create files like
            ~/abc.pickle, ~/abc_anchor1.bed, etc.

        name : str
            Descriptive name used for bed file trackline.

        """
        t = 'track type=bed name="{}_anchor1"'.format(name)
        self.bt1.saveas(path + '_anchor1.bed', trackline=t)
        self._bt1_path = path + '_anchor1.bed'
        t = 'track type=bed name="{}_anchor2"'.format(name)
        self.bt2.saveas(path + '_anchor2.bed', trackline=t)
        self._bt2_path = path + '_anchor2.bed'
        t = 'track type=bed name="{}_loop"'.format(name)
        self.bt_loop.saveas(path + '_loop.bed', trackline=t)
        self._bt_loop_path = path + '_loop.bed'
        t = 'track type=bed name="{}_loop_inner"'.format(name)
        self.bt_loop_inner.saveas(path + '_loop_inner.bed', trackline=t)
        self._bt_loop_inner_path = path + '_loop_inner.bed'
        import cPickle
        cPickle.dump(self, open(path + '.pickle', 'w'))

    def annotate_bed(
        self,
        bt,
        name,
        col_name,
        complete=None,
        df_col=None,
    ):
        """
        Annotate the input bed file using one of the annotation beds.

        Parameters
        ----------
        bt : pybedtools.BedTool
            BedTool for either one of the anchors, the loops, or the loop
            inners.

        name : str
            The key for the annotation bed file in annot_beds.

        col_name : str
            Used to name the columns that will be made.

        complete : bool
            If True, this method will check whether the features in the
            annotation bed are completely contained by the features in the
            input bed.

        df_col : str
            If the name for bt isn't the index of self.df, this specifies
            which column of self.df contains the names for bt. For instance,
            if bt is the anchor1 BedTool, then df_col='anchor1'.

        """
        import numpy as np
        import pandas as pd
        has_name_col = len(self.annot_beds[name][0].fields) > 3
        if complete:
            res = bt.intersect(self.annot_beds[name], sorted=True, wo=True,
                               F=1)
        else:
            res = bt.intersect(self.annot_beds[name], sorted=True, wo=True)
        try:
            df = res.to_dataframe(names=range(len(res[0].fields)))
            ind = df[3].values
            if df_col is None:
                self.df[col_name] = False
                self.df.ix[set(ind), col_name] = True
            else:
                tdf = pd.DataFrame(True, index=ind, columns=[col_name])
                self.df = self.df.merge(tdf, left_on=df_col, right_index=True,
                                        how='outer')
                self.df[col_name] = self.df[col_name].fillna(False)
            if has_name_col:
                vals = df[7].values
            else:
                vals = list(df[4].astype(str) + ':' + df[5].astype(str) +
                            '-' + df[6].astype(str))
            df.index = vals
            gb = df.groupby(3)
            t = pd.Series(gb.groups)
            t = pd.DataFrame(t.apply(lambda x: set(x)))
            t.columns = ['{}_features'.format(col_name)]
            self.df = self.df.merge(t, left_on=df_col, right_index=True,
                                    how='outer')
        except IndexError:
            pass

    def annotate_interactions(self):
        import numpy as np
        self.df['anchor1'] = (self.df.chrom1.astype(str) + ':' +
                              self.df.start1.astype(str) + '-' +
                              self.df.end1.astype(str))
        self.df['anchor2'] = (self.df.chrom2.astype(str) + ':' +
                              self.df.start2.astype(str) + '-' +
                              self.df.end2.astype(str))
        self.df['intra'] = True
        self.df.ix[self.df.chrom1 != self.df.chrom2, 'intra'] = False
        ind = self.df[self.df.intra].index
        self.df['loop'] = np.nan
        self.df.ix[ind, 'loop'] = (
            self.df.ix[ind, 'chrom1'] + ':' +
            self.df.ix[ind, ['start1', 'start2']].min(axis=1).astype(str) +
            '-' +
            self.df.ix[ind, ['end1', 'end2']].max(axis=1).astype(str))
        self.df['loop_length'] = (self.df[['end1', 'end2']].max(axis=1) -
                                  self.df[['start1', 'start2']].min(axis=1))
        ind = ind[(self.df.ix[ind, ['start1', 'start2']].max(axis=1) >
                   self.df.ix[ind, ['end1', 'end2']].min(axis=1))]
        self.df['loop_inner'] = np.nan
        self.df.ix[ind, 'loop_inner'] = (
            self.df.ix[ind, 'chrom1'] + ':' +
            self.df.ix[ind, ['end1', 'end2']].min(axis=1).astype(str) + '-' +
            self.df.ix[ind, ['start1', 'start2']].max(axis=1).astype(str))
        self.df['loop_inner_length'] = (
            self.df[['start1', 'start2']].max(axis=1) -
            self.df[['end1', 'end2']].min(axis=1))

    def bts_from_df(self):
        import pybedtools as pbt
        s = '\n'.join(list(set(
            self.df.chrom1.astype(str) + '\t' + self.df.start1.astype(str) +
            '\t' + self.df.end1.astype(str) + '\t' +
            self.df.chrom1.astype(str) + ':' + self.df.start1.astype(str) +
            '-' + self.df.end1.astype(str)))) + '\n'
        self.bt1 = pbt.BedTool(s, from_string=True).sort()
        s = '\n'.join(list(set(
            self.df.chrom2.astype(str) + '\t' + self.df.start2.astype(str) +
            '\t' + self.df.end2.astype(str) + '\t' +
            self.df.chrom2.astype(str) + ':' + self.df.start2.astype(str) +
            '-' + self.df.end2.astype(str)))) + '\n'
        self.bt2 = pbt.BedTool(s, from_string=True).sort()
        ind = self.df[self.df.intra].index
        s = '\n'.join(
            self.df.ix[ind, 'chrom1'].astype(str) + '\t' +
            self.df.ix[ind, ['start1', 'start2']].min(axis=1).astype(str) +
            '\t' +
            self.df.ix[ind, ['end1', 'end2']].max(axis=1).astype(str) + '\t' +
            self.df.ix[ind, 'name']) + '\n'
        self.bt_loop = pbt.BedTool(s, from_string=True).sort()
        ind = ind[(self.df.ix[ind, ['start1', 'start2']].max(axis=1) >
                   self.df.ix[ind, ['end1', 'end2']].min(axis=1))]
        s = '\n'.join(
            self.df.ix[ind, 'chrom1'].astype(str) + '\t' +
            self.df.ix[ind, ['end1', 'end2']].min(axis=1).astype(str) + '\t' +
            self.df.ix[ind, ['start1', 'start2']].max(axis=1).astype(str) +
            '\t' + self.df.ix[ind, 'name']) + '\n'
        self.bt_loop_inner = pbt.BedTool(s, from_string=True).sort()
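
For reference, here is a minimal sketch of how the AnnotatedBed class defined above could be used with the annot_beds dict from earlier. This is illustrative only; the analyses below use the cdpybio (cpb.bedtools) implementations of these classes rather than the local definitions.


In [ ]:
# Illustrative sketch: annotate the H3K27ac peaks with the other annotation
# beds using the AnnotatedBed class defined above. The annotation beds are
# sorted first because the intersections are run with sorted=True.
example_annots = {k: v.sort() for k, v in annot_beds.items()
                  if k != 'h3k27ac'}
peaks_a = AnnotatedBed(annot_beds['h3k27ac'], example_annots,
                       completely_contains=['gene'])
peaks_a.df.head()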

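
Before downloading the ChIA-PET clusters, here is an optional sanity check one could run on a parsed Tang et al. 2015 PET cluster dataframe. This is illustrative only; the PET-count threshold is an arbitrary assumption for the example, not a cutoff used in this analysis.


In [ ]:
# Illustrative only: summarize a parsed PET cluster dataframe, keeping
# intra-chromosomal clusters with at least `min_pets` PETs. The threshold is
# an arbitrary assumption for the example.
def summarize_clusters(df, min_pets=2):
    intra = df[df.chrom1 == df.chrom2]
    keep = intra[intra.freq >= min_pets]
    span = (keep[['end1', 'end2']].max(axis=1) -
            keep[['start1', 'start2']].min(axis=1))
    print('{:,} of {:,} clusters are intra-chromosomal.'.format(
        intra.shape[0], df.shape[0]))
    print('{:,} intra-chromosomal clusters have at least {} PETs.'.format(
        keep.shape[0], min_pets))
    print('Median anchor-to-anchor span: {:,.0f} bp.'.format(span.median()))
    return keep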
In [ ]:
# GM12878_RNAPII
# http://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSM1872887
url = ('http://www.ncbi.nlm.nih.gov/geo/download/?acc=GSM1872887&format=file&file='
       'GSM1872887%5FGM12878%5FRNAPII%5FPET%5Fclusters%2Etxt%2Egz')
gm_rnap = parse_tang(url)
gm_rnap_a = cpb.bedtools.AnnotatedInteractions(gm_rnap, annot_beds, completely_contains=['gene'])

In [ ]:
# GM12878_CTCF
# http://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSM1872886
url = ('http://www.ncbi.nlm.nih.gov/geo/download/?acc=GSM1872886&format=file&file='
       'GSM1872886%5FGM12878%5FCTCF%5FPET%5Fclusters%2Etxt%2Egz')
gm_ctcf = parse_tang(url)
gm_ctcf_a = cpb.bedtools.AnnotatedInteractions(gm_ctcf, annot_beds, completely_contains=['gene'])

In [ ]:
# HeLa_CTCF
# http://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSM1872888
url = ('http://www.ncbi.nlm.nih.gov/geo/download/?acc=GSM1872888&format=file&file='
       'GSM1872888%5FHeLa%5FCTCF%5FPET%5Fclusters%2Etxt%2Egz')
hela_ctcf = parse_tang(url)
hela_ctcf_a = cpb.bedtools.AnnotatedInteractions(hela_ctcf, annot_beds, completely_contains=['gene'])

In [ ]:
# HeLa_RNAPII
# http://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSM1872889
url = ('http://www.ncbi.nlm.nih.gov/geo/download/?acc=GSM1872889&format=file&file='
       'GSM1872889%5FHeLa%5FRNAPII%5FPET%5Fclusters%2Etxt%2Egz')
hela_rnap = parse_tang(url)
hela_rnap_a = cpb.bedtools.AnnotatedInteractions(hela_rnap, annot_beds, completely_contains=['gene'])