CNV Processing

In this notebook, I'm going to parse, filter, and annotate the GenomeStrip and Lumpy results. I need to figure out which CNVs I can use for the eQTL analysis and for an mCNV analysis.


In [1]:
import cPickle
import datetime
import glob
import os
import random
import re
import subprocess
import urllib2

import cdpybio as cpb
import matplotlib.gridspec as gridspec
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn'
import pybedtools as pbt
import scipy.stats as stats
import seaborn as sns
import statsmodels.api as sm
import statsmodels.formula.api as smf
import vcf as pyvcf

import cardipspy as cpy
import ciepy

%matplotlib inline

dy_name = 'cnv_processing'

import socket
if socket.gethostname() == 'fl-hn1' or socket.gethostname() == 'fl-hn2':
    dy = os.path.join(ciepy.root, 'sandbox', 'tmp', dy_name)
    cpy.makedir(dy)
    pbt.set_tempdir(dy)
    
outdir = os.path.join(ciepy.root, 'output', dy_name)
cpy.makedir(outdir)

private_outdir = os.path.join(ciepy.root, 'private_output', dy_name)
cpy.makedir(private_outdir)

In [2]:
fn = os.path.join(ciepy.root, 'output', 'input_data', 'rsem_tpm.tsv')
tpm = pd.read_table(fn, index_col=0)
fn = os.path.join(ciepy.root, 'output', 'input_data', 'rnaseq_metadata.tsv')
rna_meta = pd.read_table(fn, index_col=0)
fn = os.path.join(ciepy.root, 'output', 'eqtl_input', 'unrelateds.tsv')
unr_rna_meta = pd.read_table(fn, index_col=0)
fn = os.path.join(ciepy.root, 'output', 'input_data', 'subject_metadata.tsv')
subject_meta = pd.read_table(fn, index_col=0)
fn = os.path.join(ciepy.root, 'output', 'input_data', 'wgs_metadata.tsv')
wgs_meta = pd.read_table(fn, index_col=0)

rna_meta = rna_meta.merge(subject_meta, left_on='subject_id', right_index=True)

gene_info = pd.read_table(cpy.gencode_gene_info, index_col=0)
gene_info['ensembl_id'] = [x.split('.')[0] for x in gene_info.index]

genes = pbt.BedTool(cpy.gencode_gene_bed)
exons = pbt.BedTool(cpy.gencode_exon_bed)
tss_bed = '/publicdata/gencode_v19_20151104/tss_merged.bed'
tss_bt = pbt.BedTool(tss_bed)

exp = pd.read_table(os.path.join(ciepy.root, 'output', 'eqtl_input', 
                                 'tpm_log_filtered_phe_std_norm_peer_resid.tsv'), index_col=0)

In [3]:
fn = os.path.join(os.path.split(cpy.roadmap_15_state_annotation)[0], 'EIDlegend.txt')
roadmap_ids = pd.read_table(fn, squeeze=True, index_col=0, header=None)

In [4]:
fn = '/projects/CARDIPS/pipeline/WGS/BF_GS_Discovery/eval/CopyNumberClass.report.dat'
report = pd.read_table(fn, index_col=0)

In [5]:
mhc_bt = pbt.BedTool('chr6\t{}\t{}'.format(int(29.6e6), int(33.1e6)), from_string=True)

GenomeStrip CNVs


In [6]:
report.NVARIANT.hist(bins=range(275))
plt.ylabel('Number of CNVs')
plt.xlabel('Number of variant samples');



In [7]:
def get_genomestrip_vcf_copy_number_states(vcf):
    """Parse a genomestrip VCF file and get the diploid copy number states
    for each sample."""
    f = open(vcf)
    # Skip the '##' meta lines; the '#CHROM' line holds the sample names.
    line = f.readline()
    while line[0:2] == '##':
        line = f.readline()
    header = line[1:].strip().split('\t')

    ind = []
    copy_numbers = []
    line = f.readline().strip()
    while line != '':
        t = line.split('\t')
        ind.append(t[2])  # variant ID
        # The copy number (CN) is the second colon-delimited field of each
        # sample's genotype entry.
        copy_numbers.append([int(x.split(':')[1]) for x in t[9:]])
        line = f.readline().strip()
    cns = pd.DataFrame(copy_numbers, index=ind, columns=header[9:])
    return cns

def annotate_genomestrip_cnvs(
    report, 
    cns,
    transcript_to_gene=cpy.gencode_transcript_gene,
    add_chr=True,
):
    report = pd.read_table(report, index_col=0)
    cnv_info = pd.DataFrame([x.split('_')[1:] for x in report.index], 
                            index=cns.index, columns=['chrom', 'start', 'end'])
    if add_chr:
        cnv_info['chrom'] = 'chr' + cnv_info.chrom
    cnv_info['start'] = cnv_info.start.astype(int)
    cnv_info['end'] = cnv_info.end.astype(int)
    cnv_info['length'] = cnv_info.end - cnv_info.start
    cnv_info['name'] = cnv_info.index

    s = '\n'.join(cnv_info.chrom + '\t' + cnv_info.start.astype(str) + 
                  '\t' + cnv_info.end.astype(str) + '\t' + cnv_info.name) + '\n'
    cnv_bt = pbt.BedTool(s, from_string=True)
    cnv_bt = cnv_bt.sort()
    
    # Find genes that the CNV overlaps.
    res = cnv_bt.intersect(genes, sorted=True, wo=True)
    df = res.to_dataframe()
    df['gene'] = df.thickEnd
    gb = df[['name', 'gene']].groupby('name')
    se = pd.Series(dict(list(gb['gene'])))
    cnv_info['overlaps_gene'] = se.apply(lambda x: set(x))

    # Find genes that the CNV contains completely.
    df = df[df.blockSizes == df.thickStart - df.strand]
    gb = df[['name', 'gene']].groupby('name')
    se = pd.Series(dict(list(gb['gene'])))
    cnv_info['contains_gene'] = se.apply(lambda x: set(x))  

    # Annotate with genes where the CNV overlaps exonic regions.
    tg = pd.read_table(transcript_to_gene, index_col=0, header=None, squeeze=True)
    res = cnv_bt.intersect(exons, sorted=True, wo=True)
    df = res.to_dataframe()
    df['gene'] = df.thickEnd.apply(lambda x: tg[x])
    gb = df[['name', 'gene']].groupby('name')
    se = pd.Series(dict(list(gb['gene'])))
    cnv_info['overlaps_gene_exon'] = se.apply(lambda x: set(x))    

    # Distance to nearest TSS.
    res = cnv_bt.closest(tss_bt, D='b')
    df = res.to_dataframe()
    cnv_info.ix[df.name, 'nearest_tss_dist'] = df.thickEnd.values
    
    cnv_info = cnv_info.join(report)
    mode = cns.mode(axis=1)[0]
    cnv_info['cn_mode'] = mode
    nvariant = [sum(cns.ix[i] != cnv_info.ix[i, 'cn_mode']) for i in cnv_info.index]
    cnv_info['diff_from_mode'] = nvariant
    cnv_info['percent_diff_from_mode'] = cnv_info.diff_from_mode / cns.shape[1]
    
    t = cns[unr_rna_meta.wgs_id]
    nvariant = [sum(t.ix[i] != cnv_info.ix[i, 'cn_mode']) for i in cnv_info.index]
    cnv_info['unrelated_diff_from_mode'] = nvariant
    cnv_info['unrelated_percent_diff_from_mode'] = cnv_info.unrelated_diff_from_mode / unr_rna_meta.shape[0]
    rna_meta_eqtl = rna_meta[rna_meta.in_eqtl]
    vc = rna_meta_eqtl.family_id.value_counts()
    vc = vc[vc > 4]
    for x in vc.index:
        t = cns[rna_meta_eqtl.ix[rna_meta_eqtl.family_id == x, 'wgs_id']]
        pvariant = [sum(t.ix[i] != cnv_info.ix[i, 'cn_mode']) for i in cnv_info.index]
        cnv_info['{}_{}_diff_from_mode'.format(x, vc[x])] = np.array(pvariant)
        cnv_info['{}_{}_percent_diff_from_mode'.format(x, vc[x])] = np.array(pvariant) / float(vc[x])
    family_cols = ['{}_{}_percent_diff_from_mode'.format(x, vc[x]) for x in vc.index]    
    return cnv_info, cnv_bt

In [8]:
out = os.path.join(private_outdir, 'gs_genotypes.tsv')
if not os.path.exists(out):
    fn = '/projects/CARDIPS/pipeline/WGS/BF_GS_Discovery/gs_cnv.genotypes.vcf'
    gs_cns = get_genomestrip_vcf_copy_number_states(fn)
    gs_cns.to_csv(out, sep='\t')
else:
    gs_cns = pd.read_table(out, index_col=0)

In [9]:
out = os.path.join(outdir, 'gs_info.pickle')
if not os.path.exists(out):
    fn = '/projects/CARDIPS/pipeline/WGS/BF_GS_Discovery/eval/CopyNumberClass.report.dat'
    gs_info, gs_bt = annotate_genomestrip_cnvs(fn, gs_cns)
    gs_bt.saveas(os.path.join(outdir, 'gs_cnvs.bed'), trackline="track type=bed name='GenomeStrip CNVs'")
    if not os.path.exists('/home/cdeboever/public_html/gs_cnvs.bed'):
        os.symlink(os.path.join(outdir, 'gs_cnvs.bed'), '/home/cdeboever/public_html/gs_cnvs.bed')
    cPickle.dump(gs_info, open(out, 'w'))
else:
    gs_info = cPickle.load(open(out))
    gs_bt = pbt.BedTool(os.path.join(outdir, 'gs_cnvs.bed'))

In [10]:
gs_info.unrelated_diff_from_mode.hist(bins=range(118))
plt.xlabel('Number of unrelateds different from mode')
plt.ylabel('Number of CNVs');


plt.scatter(gs_info.unrelated_percent_diff_from_mode, gs_info[family_cols[0]])
plt.xlabel('Percent unrelated samples different from mode')
plt.ylabel('Percent family {}\nsamples different from mode'.format(family_cols[0].split('_')[0]));

In [11]:
p1 = gs_info.percent_diff_from_mode
p2 = gs_info.unrelated_percent_diff_from_mode
diff = p1 - p2
diff.sort_values(inplace=True)
plt.scatter(p1, p2)
plt.xlabel('Percent samples different from mode')
plt.ylabel('Percent unrelated samples different from mode');



In [12]:
gs_info.length.hist(log=True)
plt.ylabel('$\log_{10}$ number of CNVs')
plt.xlabel('CNV length');


sns.set_context('talk', font_scale=1.5)

In [13]:
(np.log10(gs_info.length)).hist()
plt.ylabel('Number of CNVs')
plt.xlabel('$\log_{10}$ CNV length');



In [14]:
gs_info.length.describe()


Out[14]:
count     14500.000000
mean       6615.005931
std       10552.781125
min         999.000000
25%        1660.000000
50%        3049.000000
75%        6574.250000
max      208946.000000
Name: length, dtype: float64

In [15]:
cutoffs = np.array([2, 5, 10, 50, 100, 200]) * 1000
for c in cutoffs:
    print('{:.1f}% of CNVs are less than {:,} bp'.format(
            100 * sum(gs_info.length < c) / float(gs_info.shape[0]), c))


32.7% of CNVs are less than 2,000 bp
67.2% of CNVs are less than 5,000 bp
84.1% of CNVs are less than 10,000 bp
99.0% of CNVs are less than 50,000 bp
99.9% of CNVs are less than 100,000 bp
100.0% of CNVs are less than 200,000 bp

In [16]:
gs_info.length.median()


Out[16]:
3049.0

In [17]:
gs_info.CNCATEGORY.value_counts()


Out[17]:
DEL      9791
DUP      2645
MIXED    2064
Name: CNCATEGORY, dtype: int64

In [18]:
print(gs_info[gs_info.CNCATEGORY == 'DEL'].length.median())
print(gs_info[gs_info.CNCATEGORY == 'DUP'].length.median())
print(gs_info[gs_info.CNCATEGORY == 'MIXED'].length.median())


2620.0
7574.0
2250.5

Merge CNVs

I want to merge adjacent CNVs that have highly correlated copy number estimates, since these are likely single CNVs that GS has split into multiple calls.

For instance, the heatmap below shows the copy number correlation for the first 100 CNVs on chr1. We can see some blocks of red. These are adjacent CNVs that have highly correlated copy numbers. I think these should be merged into single larger CNVs.


In [19]:
# Add order so we can check whether CNVs are adjacent.
gs_info.sort_values(by=['chrom', 'start', 'end'], inplace=True)
gs_info['order'] = range(gs_info.shape[0])

In [20]:
t = gs_info[gs_info.chrom == 'chr1'].head(100)
corr = gs_cns.ix[t.index].T.corr()
sns.heatmap(corr, xticklabels=[], yticklabels=[]);


I will merge chromosome by chromosome as follows:

  • Calculate the genotype correlation matrix for all CNVs on the chromosome.
  • Create a graph where nodes are CNVs and an edge exists between two CNVs if their copy number estimates are correlated > 0.9.
  • For each connected component with more than one CNV, sort the CNVs by position and decide, for each pair of adjacent CNVs, whether the two should be merged. Merge if the two CNVs are adjacent among all calls (not just within the connected component) and the average difference in their copy number estimates is less than 0.5.
  • Once we know which pairs should be merged, we merge all CNVs at once. For instance, let's say we have five CNVs $[c_1, c_2, c_3, c_4, c_5]$. We will perform 4 pairwise comparisons. We compare $c_1$ and $c_2$ and decide they should be combined. We similarly decide $c_2$ and $c_3$ should be merged. However, $c_3$ and $c_4$ should not be merged, but $c_4$ and $c_5$ should. After we've done the comparisons, we merge all of $c_1$, $c_2$, and $c_3$ at once, then we separately merge $c_4$ and $c_5$.
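To make the last step concrete, here is a toy sketch (hypothetical CNV names and merge decisions) showing how the pairwise comparisons are turned into merge groups:

# Toy example: five CNVs and the four adjacent-pair merge decisions.
cnvs = ['c1', 'c2', 'c3', 'c4', 'c5']
combine = [True, True, False, True]  # (c1,c2), (c2,c3), (c3,c4), (c4,c5)
groups = [[cnvs[0]]]
for flag, cnv in zip(combine, cnvs[1:]):
    if flag:
        groups[-1].append(cnv)  # extend the current run of mergeable CNVs
    else:
        groups.append([cnv])    # start a new run
print([g for g in groups if len(g) > 1])  # [['c1', 'c2', 'c3'], ['c4', 'c5']]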

In [21]:
def combine_connected_component(cnvs, info, cns):
    """
    If appropriate, combine CNVs from the list cnvs. The CNVs in cnvs should
    have highly correlated genotypes (for instance, cnvs might be a list of connected
    nodes from a graph built from the genotype correlations).
    
    Parameters
    ----------
    cnvs : list
        List of CNVs of the form CNV_2_400_800 (aka CNV_{chrom}_{start}_{end}).
        
    info : pandas.DataFrame
        Data frame whose index contains the values in cnvs. This dataframe should at least
        have chrom, start, and end columns.
        
    cns : pandas.DataFrame
        Data frame whose index contains the values in cnvs. This data frame should contain
        the diploid copy number estimates.
    
    """
    cnvs = sorted(cnvs)
    out = dict()
    combine = []
    for i in range(len(cnvs) - 1):
        combine.append((abs(info.ix[cnvs[i], 'order'] - info.ix[cnvs[i + 1], 'order']) == 1) and
                       ((cns.ix[cnvs[i]] - cns.ix[cnvs[i + 1]]).abs().mean() < 0.5))
    i = 0
    to_combine = [cnvs[0]]
    while i < len(cnvs) - 1:
        if combine[i]:
            to_combine.append(cnvs[i + 1])
        else:
            if len(to_combine) > 1:
                out[combine_cnvs(to_combine, info)] = to_combine
            to_combine = [cnvs[i + 1]]
        i += 1
    if len(to_combine) > 1:
        out[combine_cnvs(to_combine, info)] = to_combine
    return out

def combine_cnvs(cnvs, info):
    """Combine the list of CNVs cnvs into a single CNV of form CNV_{chrom}_{start}_{end}."""
    return 'CNV_{}_{}_{}'.format(info.ix[cnvs[0], 'chrom'][3:], 
                                 info.ix[cnvs, 'start'].min(), 
                                 info.ix[cnvs, 'end'].max())

def merge_cnvs(a, b, info, cns):
    """Return boolean indicating whether CNVs a and b should be merged."""
    return ((abs(info.ix[a, 'order'] - info.ix[b, 'order']) == 1) and
            ((cns.ix[a] - cns.ix[b]).abs().mean() < 0.5))

In [22]:
combined = dict()
# We'll go through one chromosome at a time. 
for chrom in set(gs_info.chrom):
    t = gs_info[gs_info.chrom == chrom]
    corr = gs_cns.ix[t.index].T.corr()
    g = corr > 0.9
    edges = []
    for i in g.columns:
        se = g[i]
        se = se[se]
        for j in se.index:
            edges.append((i, j))
    g = nx.Graph(edges)
    cc = nx.connected_components(g)
    while True:
        try:
            c = list(cc.next())
            if len(c) > 1:
                d = combine_connected_component(c, gs_info, gs_cns)
                for k in d.keys():
                    combined[k] = d[k]
        except StopIteration:
            break

In [23]:
mapping_from = []
mapping_to = []
for k in combined.keys():
    for x in combined[k]:
        mapping_from.append(x)
        mapping_to.append(k)
mapping = pd.Series(mapping_to, index=mapping_from)
mapping.to_csv(os.path.join(outdir, 'combined_mapping.tsv'), sep='\t')

In [24]:
to_remove = []
new_genotypes = []
for k in combined.keys():
    to_remove += combined[k]
    new_genotypes.append(gs_cns.ix[combined[k]].mean())
print('{} CNVs combined into {} CNVs.'.format(len(to_remove), len(combined)))


4533 CNVs combined into 920 CNVs.

In [25]:
new_genotypes = pd.DataFrame(new_genotypes, index=combined.keys())
gs_combined_cns = pd.concat([gs_cns.drop(to_remove), new_genotypes])
fn = os.path.join(private_outdir, 'gs_combined_genotypes.tsv')
gs_combined_cns.to_csv(fn, sep='\t')

In [26]:
def annotate_merged_genomestrip_cnvs(
    cnv_info, 
    cns,
    transcript_to_gene=cpy.gencode_transcript_gene,
    add_chr=True,
):
    """Like annotate_genomestrip_cnvs, but takes a pre-built cnv_info data
    frame (here, the merged CNVs) rather than a GenomeStrip report file."""
    if add_chr:
        cnv_info['chrom'] = 'chr' + cnv_info.chrom
    cnv_info['start'] = cnv_info.start.astype(int)
    cnv_info['end'] = cnv_info.end.astype(int)
    cnv_info['length'] = cnv_info.end - cnv_info.start
    cnv_info['name'] = cnv_info.index

    s = '\n'.join(cnv_info.chrom + '\t' + cnv_info.start.astype(str) + 
                  '\t' + cnv_info.end.astype(str) + '\t' + cnv_info.name) + '\n'
    cnv_bt = pbt.BedTool(s, from_string=True)
    cnv_bt = cnv_bt.sort()
    
    # Find genes that the CNV overlaps.
    res = cnv_bt.intersect(genes, sorted=True, wo=True)
    df = res.to_dataframe()
    df['gene'] = df.thickEnd
    gb = df[['name', 'gene']].groupby('name')
    se = pd.Series(dict(list(gb['gene'])))
    cnv_info['overlaps_gene'] = se.apply(lambda x: set(x))

    # Find genes that the CNV contains completely.
    df = df[df.blockSizes == df.thickStart - df.strand]
    gb = df[['name', 'gene']].groupby('name')
    se = pd.Series(dict(list(gb['gene'])))
    cnv_info['contains_gene'] = se.apply(lambda x: set(x))  

    # Annotate with genes where the CNV overlaps exonic regions.
    tg = pd.read_table(transcript_to_gene, index_col=0, header=None, squeeze=True)
    res = cnv_bt.intersect(exons, sorted=True, wo=True)
    df = res.to_dataframe()
    df['gene'] = df.thickEnd.apply(lambda x: tg[x])
    gb = df[['name', 'gene']].groupby('name')
    se = pd.Series(dict(list(gb['gene'])))
    cnv_info['overlaps_gene_exon'] = se.apply(lambda x: set(x))    

    # Distance to nearest TSS.
    res = cnv_bt.closest(tss_bt, D='b')
    df = res.to_dataframe()
    cnv_info.ix[df.name, 'nearest_tss_dist'] = df.thickEnd.values
    
    mode = cns.mode(axis=1)[0]
    cnv_info['cn_mode'] = mode
    nvariant = [sum(cns.ix[i] != cnv_info.ix[i, 'cn_mode']) for i in cnv_info.index]
    cnv_info['diff_from_mode'] = nvariant
    cnv_info['percent_diff_from_mode'] = cnv_info.diff_from_mode / cns.shape[1]
    
    t = cns[unr_rna_meta.wgs_id]
    nvariant = [sum(t.ix[i] != cnv_info.ix[i, 'cn_mode']) for i in cnv_info.index]
    cnv_info['unrelated_diff_from_mode'] = nvariant
    cnv_info['unrelated_percent_diff_from_mode'] = cnv_info.unrelated_diff_from_mode / unr_rna_meta.shape[0]
    rna_meta_eqtl = rna_meta[rna_meta.in_eqtl]
    vc = rna_meta_eqtl.family_id.value_counts()
    vc = vc[vc > 4]
    for x in vc.index:
        t = cns[rna_meta_eqtl.ix[rna_meta_eqtl.family_id == x, 'wgs_id']]
        pvariant = [sum(t.ix[i] != cnv_info.ix[i, 'cn_mode']) for i in cnv_info.index]
        cnv_info['{}_{}_diff_from_mode'.format(x, vc[x])] = np.array(pvariant)
        cnv_info['{}_{}_percent_diff_from_mode'.format(x, vc[x])] = np.array(pvariant) / float(vc[x])
    family_cols = ['{}_{}_percent_diff_from_mode'.format(x, vc[x]) for x in vc.index]    
    return cnv_info, cnv_bt
gs_combined_info = gs_info.drop(to_remove)

In [27]:
tdf = pd.DataFrame([x.split('_') for x in combined.keys()], columns=['cnv', 'chrom', 'start', 'end'],
                   index=combined.keys()).drop('cnv', axis=1)
tdf.start = tdf.start.astype(int)
tdf.end = tdf.end.astype(int)
a,b = annotate_merged_genomestrip_cnvs(tdf, new_genotypes)
gs_combined_info = pd.concat([a, gs_info.drop(to_remove)])
fn = os.path.join(outdir, 'gs_combined_info.pickle')
gs_combined_info.to_pickle(fn)

Functional annotations

I'm going to annotate the CNVs with more info.


In [28]:
def cnv_counts(cnv_bt, cnv_info, url, name):
    """For each CNV in cnv_bt, count the intervals in the (gzipped) narrowPeak
    file at url that overlap it. Returns a single-column data frame named name
    and indexed like cnv_info."""
    bt = pbt.BedTool(cpb.general.read_gzipped_text_url(url), from_string=True)
    bt = bt.sort()
    res = cnv_bt.intersect(bt, sorted=True, wo=True)
    df = res.to_dataframe(names=range(len(res[0].fields)))
    vc = df[3].value_counts()
    counts = pd.DataFrame(0, columns=[name], index=cnv_info.index)
    counts.ix[vc.index, name] = vc
    return counts

def cluster_setup():
    from ipyparallel import Client
    parallel_client = Client(profile='parallel')
    dview = parallel_client[:]
    print('Cluster has {} engines.'.format(len(parallel_client.ids)))
    with dview.sync_imports():
        import cdpybio
        import pandas
        import pybedtools
    %px cpb = cdpybio
    %px pd = pandas
    %px pbt = pybedtools
    dview.push(dict(cnv_counts=cnv_counts));
    dview.push(dict(gs_bt=gs_bt));
    return dview

cluster_ready = False

Roadmap DNase and ChIP-seq


In [34]:
out = os.path.join(outdir, 'gs_roadmap_overlap.tsv')
if not os.path.exists(out):
    url = ('http://egg2.wustl.edu/roadmap/data/byFileType'
           '/peaks/consolidated/narrowPeak/')
    website = urllib2.urlopen(url)
    html = website.read()
    files = re.findall('href="(.*\.gz)"', html)
    lines = [x for x in roadmap_ids.index if ('ES' in roadmap_ids[x] and 'Cultured' not in roadmap_ids[x])]
    lines += [x for x in roadmap_ids.index if 'iPS' in roadmap_ids[x]]
    files = [x for x in files if x.split('-')[0] in lines]
    files = [x for x in files if 'hotspot' not in x]

    roadmap_peak_pvals = pd.DataFrame(
        -1, index=lines, 
        columns=set([x.split('-')[1].split('.')[0] for x in files]))
    roadmap_peak_oddsratios = pd.DataFrame(
        0, index=lines, 
        columns=set([x.split('-')[1].split('.')[0] for x in files]))

    urls = ['http://egg2.wustl.edu/roadmap/data/byFileType/peaks/consolidated/narrowPeak/{}'.format(n)
            for n in files]
    lines = [roadmap_ids[re.findall('E\d{3}', url)[0]][0:-10].replace(' ', '_') for url in urls]
    btype = [re.findall('-(.+)\.', url)[0].split('.')[0] for url in urls]
    names = ['{}_{}'.format(lines[i], btype[i]) for i in range(len(lines))]
    todo = zip(urls, names)
    if cluster_ready == False:
        dview = cluster_setup()
        cluster_ready = True
    counts = dview.map_sync(lambda x: cnv_counts(gs_bt, gs_info, x[0], x[1]), todo)
    roadmap_dnase_counts = pd.concat(counts, axis=1)
    roadmap_dnase_counts.to_csv(out, sep='\t')
else:
    roadmap_dnase_counts = pd.read_table(out, index_col=0)

Roadmap chromatin state

lines = [x for x in roadmap_ids.index if 'iPS' in roadmap_ids[x]]
for line in lines:
    fn = ('/publicdata/roadmap_25_state_20151104/'
          '{}_25_imputed12marks_mnemonics_sorted.bed'.format(line))
    bt = pbt.BedTool(fn)
    res = cnv_bt.intersect(bt, sorted=True, wo=True)
    df = res.to_dataframe(names=range(len(res[0].fields)))
    c = df.groupby([3, 7]).sum()[8]
    c = c.sort_values(ascending=False)
    t = c.unstack()
    t[t.isnull()] = 0
    name = roadmap_ids[line][:-10].replace(' ', '_')
    t.to_csv(os.path.join(outdir, 'gs_{}_chromatin_state.tsv'.format(name)), sep='\t')

ENCODE DNase


In [36]:
out = os.path.join(outdir, 'gs_encode_dnase_overlap.tsv')
if not os.path.exists(out):
    encode_dnase = pd.read_table(os.path.join(ciepy.root, 'output', 
                                              'functional_annotation_analysis', 
                                              'encode_dnase.tsv'), index_col=0)
    encode_dnase = encode_dnase[encode_dnase.biosample_type == 'stem cell']
    encode_dnase['name'] = encode_dnase.cell_type + '_' + 'DNase'
    if cluster_ready == False:
        dview = cluster_setup()
        cluster_ready = True
    dview.push(dict(encode_dnase=encode_dnase));
    counts = dview.map_sync(lambda i: cnv_counts(gs_bt, gs_info, 
                                                 encode_dnase.ix[i, 'narrowPeak_url'], 
                                                 encode_dnase.ix[i, 'name']), 
                            encode_dnase.index)
    encode_dnase_counts = pd.concat(counts, axis=1)
    encode_dnase_counts.to_csv(out, sep='\t')
else:
    encode_dnase_counts = pd.read_table(out, index_col=0)

ENCODE ChIP-seq


In [37]:
out = os.path.join(outdir, 'gs_encode_tf_chip_seq_overlap.tsv')
if not os.path.exists(out):
    encode_tf_chip_seq = pd.read_table(os.path.join(ciepy.root, 'output', 
                                                    'functional_annotation_analysis', 
                                                    'encode_stem_cell_chip_seq.tsv'), index_col=0)
    encode_tf_chip_seq = encode_tf_chip_seq.drop_duplicates(subset='target')
    encode_tf_chip_seq['name'] = encode_tf_chip_seq.cell_type + '-' + encode_tf_chip_seq.target
    if cluster_ready == False:
        dview = cluster_setup()
        cluster_ready = True
    dview.push(dict(encode_tf_chip_seq=encode_tf_chip_seq));
    counts = dview.map_sync(lambda i: cnv_counts(gs_bt, gs_info,
                                                 encode_tf_chip_seq.ix[i, 'narrowPeak_url'], 
                                                 encode_tf_chip_seq.ix[i, 'name']), 
                            encode_tf_chip_seq.index)
    encode_tf_chip_seq_counts = pd.concat(counts, axis=1)
    encode_tf_chip_seq_counts.to_csv(out, sep='\t')
else:
    encode_tf_chip_seq_counts = pd.read_table(out, index_col=0)

EMMAX CNVs

I want to find CNVs that I can include in the EMMAX association analysis.

GenomeStrip

For GS, I can use CNVs that have at most three copy number states and a minor allele frequency greater than 1% (i.e., at least 1% of samples differ from the mode copy number state). I can also separately test mCNVs for association with gene expression. The biallelic coding is sketched below.
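As a minimal sketch (hypothetical copy numbers), a CNV with at most three copy number states maps cleanly onto biallelic genotypes under the convention applied later in this notebook (minimum copy number → 0/0, maximum → 1/1, intermediate → 0/1):

# Hypothetical diploid copy numbers for six samples at one CNV.
cn = pd.Series([2, 2, 1, 2, 3, 2])
gt = pd.Series('0/1', index=cn.index)
gt[cn == cn.min()] = '0/0'  # minimum copy number state
gt[cn == cn.max()] = '1/1'  # maximum copy number state
print(gt.tolist())  # ['0/1', '0/1', '0/0', '0/1', '1/1', '0/1']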


In [38]:
# Get copy number states for samples in eQTL analysis.
cns_f = gs_cns[rna_meta[rna_meta.in_eqtl].wgs_id]
# Get the number of distinct copy number states for each CNV.
num_states = cns_f.apply(lambda x: len(set(x)), axis=1)
# Keep CNVs with two or three states.
cns_emmax = cns_f.ix[num_states[num_states.apply(lambda x: x in [2, 3])].index]
# Remove CNVs where less than 1% of the samples are different from the mode.
b = (cns_emmax.apply(lambda x: x.value_counts().max() < 
                     cns_emmax.shape[1] - np.floor(rna_meta.in_eqtl.sum() * 0.01), axis=1))
cns_emmax = cns_emmax[b]

In [39]:
maj_af = cns_emmax.apply(lambda x: x.value_counts().max(), axis=1) / cns_emmax.shape[1]
maj_af.hist(bins=np.arange(0, 1.01, 0.01))
plt.ylabel('Number of CNVs')
plt.xlabel('Mode CN frequency (major allele frequency)');



In [40]:
t = gs_info.ix[cns_emmax.index]
s = '\n'.join(t.chrom + '\t' + t.start.astype(str) + '\t' + t.end.astype(str) + 
              '\t' + t.name) + '\n'
cns_emmax_bt = pbt.BedTool(s, from_string=True)
cns_emmax_bt = cns_emmax_bt.sort()

fn = os.path.join(ciepy.root, 'output', 'eqtl_input', 'variant_regions.bed')
variant_regions = pbt.BedTool(fn)
genes_todo = set(exp.index)

res = cns_emmax_bt.intersect(variant_regions, sorted=True, wo=True)
g = []
for r in res:
    g.append(r.fields[-3].split('_')[0])
cnv_gene_vc = pd.Series(g).value_counts()
cnv_gene_vc = cnv_gene_vc[set(cnv_gene_vc.index) & genes_todo]

In [41]:
cnv_gene_vc.hist(bins=range(cnv_gene_vc.max()))
plt.ylabel('Number of genes')
plt.xlabel('Number of CNVs overlapping variant window');


I think I can collapse some mCNVs to biallelic. For instance, the CNV below has one person with copy number 4; if we exclude that person, only three copy number states remain. The copy number 4 call is likely an error (the person is probably copy number 3), so it wouldn't hurt to either set them to copy number 3 or simply exclude them from the analysis.


In [42]:
report.ix['CNV_19_40372246_40375846']


Out[42]:
CALLRATE               0.989
CNMIN                      1
CNMAX                      4
CNALLELES                  3
NNONREF                  140
NVARIANT                 140
CNCATEGORY             MIXED
CNDIST        0,109,131,30,1
Name: CNV_19_40372246_40375846, dtype: object
cns_f = cns[rna_meta[rna_meta.in_eqtl].wgs_id]
cns_emmax = cns_f.ix[num_states[num_states.apply(lambda x: x in [2, 3])].index]
b = cns_emmax.apply(lambda x: x.value_counts().max() < 
                    cns_emmax.shape[1] - np.floor(207 * 0.05), axis=1)
cns_emmax = cns_emmax[b]

In [43]:
t = cns_f.ix[set(cns_f.index) - set(cns_emmax.index)]
t = t[t.apply(lambda x: len(set(x)) > 3, axis=1)]
# b has the number of samples that have one of the top three copy number states
b = t.apply(lambda x: x.value_counts()[0:3].sum(), axis=1)

In [44]:
b.hist(bins=range(208))
plt.xlim(0, 209)
plt.ylabel('Number of CNVs')
plt.xlabel('Number of individuals with one of top three CNs');


This plot shows that many CNVs are almost biallelic: if we set a few samples' copy numbers to missing, we can make more sites biallelic (see the toy sketch below).
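As a toy illustration (hypothetical copy numbers, mirroring the masking logic used below), setting the samples outside the top three copy number states to missing leaves a three-state site:

# Hypothetical copy numbers for eleven samples at one mCNV.
se = pd.Series([2, 2, 2, 2, 2, 3, 3, 3, 1, 1, 4])
vc = se.value_counts()  # states by frequency: 2 (x5), 3 (x3), 1 (x2), 4 (x1)
se[se.apply(lambda x: x not in vc.index[0:3])] = np.nan
print(se.nunique())  # 3 -- the lone copy number 4 sample is now missing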


In [45]:
sum(b >= rna_meta.in_eqtl.sum() * 0.95)


Out[45]:
807

There are 807 CNVs for which I can remove the people with rare copy numbers and still keep a 95% call rate. I'll set the rare copy numbers to missing and include these CNVs.


In [46]:
rows = []
for i in b[b >= rna_meta.in_eqtl.sum() * 0.95].index:
    se = cns_f.ix[i]
    vc = se.value_counts()
    se[se.apply(lambda x: x not in vc.index[0:3])] = np.nan
    rows.append(se)
df = pd.DataFrame(rows, index=b[b >= rna_meta.in_eqtl.sum() * 0.95].index, 
                  columns=cns_f.columns)

In [47]:
b = df.apply(lambda x: x.value_counts().max() < df.shape[1] - np.floor(rna_meta.in_eqtl.sum() * 0.05), axis=1)
df = df[b]

In [48]:
cns_emmax = pd.concat([cns_emmax, df])
cns_emmax.to_csv(os.path.join(private_outdir, 'gs_emmax_cnvs.tsv'), sep='\t')

In [49]:
cns_emmax.shape


Out[49]:
(5348, 215)

I want to remove CNVs in the MHC region because these are likely alignment artifacts. Bioinformatics and Functional Genomics by Jonathan Pevsner gives the MHC as roughly 29.6 to 33.1 Mb on chromosome 6, so I'll use that interval.


In [50]:
t = gs_info.ix[cns_emmax.index]
s = '\n'.join(t.chrom + '\t' + t.start.astype(str) + '\t' + t.end.astype(str) + 
              '\t' + t.name) + '\n'
cns_emmax_bt = pbt.BedTool(s, from_string=True)
cns_emmax_bt = cns_emmax_bt.sort()
res = cns_emmax_bt.intersect(mhc_bt, wa=True, v=True)
df = res.to_dataframe()
cns_emmax = cns_emmax.ix[df.name]

In [51]:
cns_emmax.shape


Out[51]:
(5287, 215)

I'll write a VCF file with the CNV genotypes.


In [52]:
cns_emmax_genotypes = cns_emmax.copy(deep=True).astype(str)
# Code each CNV as a pseudo-biallelic marker: minimum copy number -> 0/0,
# maximum -> 1/1, intermediate -> 0/1, missing -> ./.
mins = cns_emmax.apply(lambda x: x == x.min(), axis=1)
maxs = cns_emmax.apply(lambda x: x == x.max(), axis=1)
nulls = cns_emmax.isnull()
hets = (mins.astype(int) + maxs.astype(int) + nulls.astype(int) == 0)
cns_emmax_genotypes[mins] = '0/0'
cns_emmax_genotypes[hets] = '0/1'
cns_emmax_genotypes[maxs] = '1/1'
cns_emmax_genotypes[nulls] = './.'

In [53]:
cns_emmax_info = gs_info.ix[cns_emmax.index, ['chrom', 'start']]
cns_emmax_info.chrom = cns_emmax_info.chrom.apply(lambda x: x[3:])
#cns_emmax_info = cns_emmax_info.drop(['end', 'length'], axis=1)
cns_emmax_info.start = cns_emmax_info.start.astype(str)
cns_emmax_info['id'] = cns_emmax_info.index
cns_emmax_info['ref'] = 'N'
cns_emmax_info['alt'] = 'N'
cns_emmax_info['qual'] = '100000'
cns_emmax_info['filter'] = 'PASS'
cns_emmax_info['info'] = ''
cns_emmax_info['format'] = 'GT'
cns_emmax_info = cns_emmax_info.join(cns_emmax_genotypes[sorted(cns_emmax_genotypes.columns)])

In [54]:
with open('/projects/CARDIPS/analysis/cardips-ipsc-eqtl/misc/header.vcf') as f:
    lines = f.readlines()
vs = lines[-1].strip().split('\t')[9:]
colnames = lines[-1].strip().split('\t')[0:9]

In [55]:
out = os.path.join(private_outdir, 'gs_emmax_sorted.vcf')
if not os.path.exists(out):
    with open(os.path.join(private_outdir, 'gs_emmax.vcf'), 'w') as f:
        f.write(''.join(lines[0:-1]))
        f.write('\t'.join(colnames + list(cns_emmax_info.columns[9:])) + '\n')
        f.write('\n'.join(cns_emmax_info.apply(lambda x: '\t'.join(x), axis=1)) + '\n')
    !cat {os.path.join(private_outdir, 'gs_emmax.vcf')} | vcf-sort > {out}
    !bgzip {out}
    !bcftools index -f {out}.gz
    !bcftools index -f -t {out}.gz
    !rm {os.path.join(private_outdir, 'gs_emmax.vcf')}

Let's see which genes have CNVs to test now that we've added in some more CNVs.


In [56]:
t = gs_info.ix[cns_emmax.index]
s = '\n'.join(t.chrom + '\t' + t.start.astype(str) + '\t' + t.end.astype(str) + 
              '\t' + t.name) + '\n'
cns_emmax_bt = pbt.BedTool(s, from_string=True)
cns_emmax_bt = cns_emmax_bt.sort()

res = cns_emmax_bt.intersect(variant_regions, sorted=True, wo=True)
g = []
for r in res:
    g.append(r.fields[-3].split('_')[0])
cnv_gene_vc = pd.Series(g).value_counts()
cnv_gene_vc = cnv_gene_vc[set(cnv_gene_vc.index) & genes_todo]

In [57]:
t = gs_info.ix[cns_emmax.index]
s = '\n'.join(t.chrom + '\t' + t.start.astype(str) + '\t' + t.end.astype(str) + 
              '\t' + t.name) + '\n'
cns_emmax_bt = pbt.BedTool(s, from_string=True)
cns_emmax_bt = cns_emmax_bt.sort()

In [58]:
cnv_gene_vc.hist(bins=range(cnv_gene_vc.max()))
plt.ylabel('Number of genes')
plt.xlabel('Number of CNVs overlapping variant window');


Lumpy


In [59]:
def read_merged_lumpy_vcf(fn):
    """Read and parse a merged VCF of lumpy results."""
    # The first 641 lines are '##' meta lines; the '#CHROM' header follows.
    df = pd.read_table(fn, skiprows=641, low_memory=False)
    df.columns = [x.replace('#', '') for x in df.columns]
    df = df[df.ALT.apply(lambda x: x in ['<DEL>', '<DUP>'])]
    gcols = df.columns[8:]
    genotypes = df[gcols]
    genotypes = genotypes.apply(lambda x: [y.split(':')[0] for y in x], axis=1)
    df = df.drop(gcols, axis=1)
    cols = [x.split('=')[0] for x in df.INFO[0].split(';')]
    imprecise = []
    rows = []
    for i in df.index:
        vals = list(df.ix[i, 'INFO'].split(';'))
        if 'IMPRECISE' in vals:
            imprecise.append(True)
            vals.remove('IMPRECISE')
        else:
            imprecise.append(False)
        rows.append(dict([x.split('=') for x in vals]))
    df['imprecise'] = imprecise
    tdf = pd.DataFrame(rows, index=df.index)
    df = df.join(tdf)
    df = df.drop('INFO', axis=1)
    df.CHROM = 'chr' + df.CHROM.astype(str)
    # cols = df.FORMAT[0].split(':')
    # ds = df.apply(lambda x: pd.Series(dict(zip(x['FORMAT'].split(':'), x[df.columns[8]].split(':')))), axis=1)
    # ds = ds.drop(set(df.columns) & set(ds.columns), axis=1)
    # df = df.join(ds)
    # df = df.drop(['FORMAT', df.columns[8]], axis=1)
    df.ALT = df.ALT.apply(lambda x: x[1:4])
    df = df[df.END.isnull() == False]
    for c in ['POS', 'END', 'PE', 'SR', 'SU', 'SVLEN']:
        df[c] = df[c].astype(int)
    return df, genotypes

In [60]:
fn = '/frazer01/home/djakubosky/sandbox/svtools/Lumpy_274_Merged.vcf'
lumpy_info, lumpy_genotypes = read_merged_lumpy_vcf(fn)
lumpy_info['name'] = (lumpy_info.ALT + '_' + lumpy_info.CHROM.apply(lambda x: x[3:]) + 
                      '_' + lumpy_info.POS.astype(str) + '_' + lumpy_info.END.astype(str))
# Some lumpy SVs show up multiple times in the VCF (not sure why) so I'll just keep one.
lumpy_info = lumpy_info.drop_duplicates(subset=['name'])
lumpy_genotypes = lumpy_genotypes.ix[lumpy_info.index]
lumpy_info.index = lumpy_info.name
lumpy_genotypes.index = lumpy_info.name

In [61]:
s = '\n'.join(lumpy_info.CHROM + '\t' + lumpy_info.POS.astype(str) + '\t' + 
              lumpy_info.END.astype(str) + '\t' + lumpy_info.name) + '\n'
lumpy_bt = pbt.BedTool(s, from_string=True).sort()

In [62]:
# Find genes that the CNV overlaps.
res = lumpy_bt.intersect(genes, sorted=True, wo=True)
df = res.to_dataframe()
df['gene'] = df.thickEnd
gb = df[['name', 'gene']].groupby('name')
se = pd.Series(dict(list(gb['gene'])))
lumpy_info['overlaps_gene'] = se.apply(lambda x: set(x))

# Find genes that the CNV contains completely.
df = df[df.blockSizes == df.thickStart - df.strand]
gb = df[['name', 'gene']].groupby('name')
se = pd.Series(dict(list(gb['gene'])))
lumpy_info['contains_gene'] = se.apply(lambda x: set(x))  

# Annotate with genes where the CNV overlaps exonic regions.
tg = pd.read_table(cpy.gencode_transcript_gene, index_col=0, header=None, squeeze=True)
res = lumpy_bt.intersect(exons, sorted=True, wo=True)
df = res.to_dataframe()
df['gene'] = df.thickEnd.apply(lambda x: tg[x])
gb = df[['name', 'gene']].groupby('name')
se = pd.Series(dict(list(gb['gene'])))
lumpy_info['overlaps_gene_exon'] = se.apply(lambda x: set(x))    

# Distance to nearest TSS.
tss_bt = pbt.BedTool(tss_bed)
res = lumpy_bt.closest(tss_bt, D='b')
df = res.to_dataframe()
lumpy_info.ix[df.name, 'nearest_tss_dist'] = df.thickEnd.values

In [63]:
np.log10(lumpy_info.SU).hist()
plt.ylabel('Number of SVs')
plt.xlabel('$log_{10}$ SU');



In [64]:
lumpy_genotypes_f = lumpy_genotypes[rna_meta[rna_meta.in_eqtl].wgs_id]

In [65]:
# Genotypes are strings like '0/0'; count reference ('0') and alternate ('1')
# alleles across all samples for each SV (missing './.' calls contribute neither).
lumpy_ref_ac = lumpy_genotypes_f.apply(lambda x: ''.join(x).count('0'), axis=1)
lumpy_alt_ac = lumpy_genotypes_f.apply(lambda x: ''.join(x).count('1'), axis=1)

lumpy_ref_af = lumpy_ref_ac.astype(float) / (lumpy_ref_ac + lumpy_alt_ac)
lumpy_alt_af = lumpy_alt_ac.astype(float) / (lumpy_ref_ac + lumpy_alt_ac)

In [66]:
plt.scatter(lumpy_ref_af, lumpy_info.SU)
plt.ylabel('SU')
plt.xlabel('Reference allele frequency');



In [67]:
lumpy_ref_af[lumpy_info[lumpy_info.SU == 4].index].hist()
plt.xlabel('Reference allele frequency')
plt.ylabel('Number of SVs with SU == 4');



In [68]:
lumpy_af = pd.DataFrame({'ref':lumpy_ref_af, 'alt':lumpy_alt_af})
lumpy_af['maj_af'] = lumpy_af.max(axis=1)
lumpy_af['min_af'] = lumpy_af.min(axis=1)

In [69]:
n = lumpy_af[lumpy_af.min_af >= 0.01].shape[0]
print('{:,} lumpy SVs have minor allele frequency > 1%.'.format(n))


11,301 lumpy SVs have minor allele frequency > 1%.

I'll make a VCF file like I did for GS, formatted the same way.


In [70]:
lumpy_keep = lumpy_af[lumpy_af.min_af >= 0.01].index

In [71]:
t = lumpy_info.ix[lumpy_keep]
s = '\n'.join(t.CHROM + '\t' + t.POS.astype(str) + '\t' + t.END.astype(str) + 
              '\t' + t.name) + '\n'
lumpy_emmax_bt = pbt.BedTool(s, from_string=True)
lumpy_emmax_bt = lumpy_emmax_bt.sort()
mhc_bt = pbt.BedTool('chr6\t{}\t{}'.format(int(29.6e6), int(33.1e6)), from_string=True)
res = lumpy_emmax_bt.intersect(mhc_bt, wa=True, v=True)
df = res.to_dataframe()
lumpy_keep = list(df.name)

In [72]:
lumpy_emmax_info = lumpy_info.ix[lumpy_keep, ['CHROM', 'POS','ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'END']]
lumpy_emmax_info.CHROM = lumpy_emmax_info.CHROM.apply(lambda x: x[3:])
lumpy_emmax_info.ID = (lumpy_emmax_info.ALT + '_' + lumpy_emmax_info.CHROM + 
                       '_' + lumpy_emmax_info.POS.astype(str) + '_' + lumpy_emmax_info.END.astype(str))
lumpy_emmax_info.ALT = 'N'
lumpy_emmax_info.FILTER = 'PASS'
lumpy_emmax_info = lumpy_emmax_info.drop('END', axis=1)
lumpy_emmax_info['INFO'] = ''
lumpy_emmax_info['FORMAT'] = 'GT'
lumpy_emmax_info = lumpy_emmax_info.join(lumpy_genotypes_f[sorted(lumpy_genotypes_f.columns)])
lumpy_emmax_info = lumpy_emmax_info.astype(str)

In [73]:
with open('/projects/CARDIPS/analysis/cardips-ipsc-eqtl/misc/header.vcf') as f:
    lines = f.readlines()
vs = lines[-1].strip().split('\t')[9:]
colnames = lines[-1].strip().split('\t')[0:9]

In [74]:
out = os.path.join(private_outdir, 'lumpy_emmax_sorted.vcf')
if not os.path.exists(out):
    with open(os.path.join(private_outdir, 'lumpy_emmax.vcf'), 'w') as f:
        f.write(''.join(lines[0:-1]))
        f.write('\t'.join(colnames + list(lumpy_emmax_info.columns[9:])) + '\n')
        f.write('\n'.join(lumpy_emmax_info.apply(lambda x: '\t'.join(x), axis=1)) + '\n')
    !cat {os.path.join(private_outdir, 'lumpy_emmax.vcf')} | vcf-sort > {out}
    !bgzip {out}
    !bcftools index -f {out}.gz
    !bcftools index -f -t {out}.gz
    !rm {os.path.join(private_outdir, 'lumpy_emmax.vcf')}

In [75]:
lumpy_info.columns = [x.lower() for x in lumpy_info.columns]

In [76]:
lumpy_info['start'] = lumpy_info.pos
lumpy_info = lumpy_info.drop('pos', axis=1)

In [77]:
lumpy_info = lumpy_info.drop(['ref', 'qual', 'alg'], axis=1)
lumpy_info['ref_ac'] = lumpy_ref_ac[lumpy_info.index]
lumpy_info['alt_ac'] = lumpy_alt_ac[lumpy_info.index]
lumpy_info['ref_af'] = lumpy_ref_af[lumpy_info.index]
lumpy_info['alt_af'] = lumpy_alt_af[lumpy_info.index]
lumpy_info['maf'] = lumpy_info[['ref_af', 'alt_af']].min(axis=1)

In [78]:
fig,axs = plt.subplots(1, 2)
ax = axs[0]
np.log10(lumpy_info.svlen.abs()).hist(ax=ax)
ax.set_ylabel('Number of SVs')
ax.set_xlabel('$log_{10}$ length');
ax.set_title('All')
ax = axs[1]
np.log10(lumpy_info[lumpy_info.su > 14].svlen.abs()).hist(ax=ax)
ax.set_ylabel('Number of SVs')
ax.set_xlabel('$log_{10}$ length')
ax.set_title('SU > 14');
fig.tight_layout()



In [79]:
lumpy_info.svtype.value_counts()


Out[79]:
DEL    40185
DUP     2445
Name: svtype, dtype: int64

In [80]:
print(lumpy_info.svlen.abs().median())
print(lumpy_info[lumpy_info.svtype == 'DEL'].svlen.abs().median())
print(lumpy_info[lumpy_info.svtype == 'DUP'].svlen.abs().median())


301.0
294.0
524.0

In [81]:
cPickle.dump(lumpy_info, open(os.path.join(outdir, 'lumpy_info.pickle'), 'w'))

In [82]:
conv = {'0/0':0, '0/1':1, '1/1':2, './.':np.nan}
t = lumpy_genotypes.apply(lambda x: [conv[y] for y in x])
fn = os.path.join(private_outdir, 'lumpy_genotypes.tsv')
t.to_csv(fn, sep='\t')

In [83]:
out = os.path.join(outdir, 'lumpy_roadmap_overlap.tsv')
if not os.path.exists(out):
    url = ('http://egg2.wustl.edu/roadmap/data/byFileType'
           '/peaks/consolidated/narrowPeak/')
    website = urllib2.urlopen(url)
    html = website.read()
    files = re.findall('href="(.*\.gz)"', html)
    lines = [x for x in roadmap_ids.index if ('ES' in roadmap_ids[x] and 'Cultured' not in roadmap_ids[x])]
    lines += [x for x in roadmap_ids.index if 'iPS' in roadmap_ids[x]]
    files = [x for x in files if x.split('-')[0] in lines]
    files = [x for x in files if 'hotspot' not in x]

    roadmap_peak_pvals = pd.DataFrame(
        -1, index=lines, 
        columns=set([x.split('-')[1].split('.')[0] for x in files]))
    roadmap_peak_oddsratios = pd.DataFrame(
        0, index=lines, 
        columns=set([x.split('-')[1].split('.')[0] for x in files]))

    urls = ['http://egg2.wustl.edu/roadmap/data/byFileType/peaks/consolidated/narrowPeak/{}'.format(n)
            for n in files]
    lines = [roadmap_ids[re.findall('E\d{3}', url)[0]][0:-10].replace(' ', '_') for url in urls]
    btype = [re.findall('-(.+)\.', url)[0].split('.')[0] for url in urls]
    names = ['{}_{}'.format(lines[i], btype[i]) for i in range(len(lines))]
    todo = zip(urls, names)
    if cluster_ready == False:
        dview = cluster_setup()
        cluster_ready = True
    # Push after setup so dview is guaranteed to exist.
    dview.push(dict(lumpy_bt=lumpy_bt, lumpy_info=lumpy_info));
    counts = dview.map_sync(lambda x: cnv_counts(lumpy_bt, lumpy_info, x[0], x[1]), todo)
    roadmap_dnase_counts = pd.concat(counts, axis=1)
    roadmap_dnase_counts.to_csv(out, sep='\t')
else:
    roadmap_dnase_counts = pd.read_table(out, index_col=0)

Combined info


In [84]:
gs_info['svlen'] = gs_info.length
gs_info['svtype'] = gs_info.CNCATEGORY
gs_info['variant_caller'] = 'genomestrip'
lumpy_info['variant_caller'] = 'lumpy'
lumpy_info['emmax'] = False
lumpy_info.ix[lumpy_emmax_info.index, 'emmax'] = True
gs_info['emmax'] = False
gs_info.ix[cns_emmax_info.index, 'emmax'] = True
combined_info = pd.concat([lumpy_info, gs_info], join='inner')
fn = os.path.join(outdir, 'combined_info.pickle')
cPickle.dump(combined_info, open(fn, 'w'))

In [85]:
combined_info.head()


Out[85]:
chrom start end name overlaps_gene contains_gene overlaps_gene_exon nearest_tss_dist svlen svtype variant_caller emmax
name
DEL_1_756790_757179 chr1 756790 757179 DEL_1_756790_757179 NaN NaN NaN -3699 -389 DEL lumpy False
DEL_1_757414_758302 chr1 757414 758302 DEL_1_757414_758302 NaN NaN NaN -4323 -888 DEL lumpy False
DEL_1_758133_758844 chr1 758133 758844 DEL_1_758133_758844 NaN NaN NaN 4042 -711 DEL lumpy False
DEL_1_859331_859613 chr1 859331 859613 DEL_1_859331_859613 NaN NaN NaN 647 -282 DEL lumpy False
DEL_1_869725_870228 chr1 869725 870228 DEL_1_869725_870228 {ENSG00000187634.6} NaN NaN -3281 -503 DEL lumpy True

mCNVs

I want to identify mCNVs that I can test for associations with gene expression; I'll filter them further in the analysis notebook. For now, I define mCNVs as CNVs with more than three copy number states among my 131 unrelated individuals. I'll keep mCNVs with more than 5 samples different from the mode copy number and more than 5 samples whose copy number estimates fall outside the top three states.

mcnvs = cns_f.ix[num_states[num_states > 3].index]
mcnvs_f = mcnvs.ix[set(mcnvs.index) - set(cns_emmax.index)]
# Remove CNVs in MHC.
t = gs_info.ix[mcnvs_f.index]
s = '\n'.join(t.chrom + '\t' + t.start.astype(str) + '\t' + t.end.astype(str) + 
              '\t' + t.name) + '\n'
mcnv_bt = pbt.BedTool(s, from_string=True)
df = mcnv_bt.intersect(mhc_bt, wa=True, v=True).to_dataframe()
mcnvs_f = mcnvs_f.ix[df.name]
mcnvs_f.to_csv(os.path.join(private_outdir, 'mcnvs.tsv'), sep='\t')

In [86]:
t = gs_combined_cns[unr_rna_meta.wgs_id]
ns = t.apply(lambda x: len(set(x)), axis=1)
mcnvs = t[ns > 3]
# Remove CNVs in MHC.
t = gs_combined_info.ix[mcnvs.index]
s = '\n'.join(t.chrom + '\t' + t.start.astype(str) + '\t' + t.end.astype(str) + 
              '\t' + t.name) + '\n'
mcnv_bt = pbt.BedTool(s, from_string=True)
df = mcnv_bt.intersect(mhc_bt, wa=True, v=True).to_dataframe()
mcnvs = mcnvs.ix[df.name]

t = gs_combined_cns.ix[mcnvs.index, unr_rna_meta.wgs_id]
mode = t.mode(axis=1)[0]
nvariant = [sum(t.ix[i] != mode[i]) for i in t.index]
mcnvs = mcnvs[np.array(nvariant) > 5]
keep = []
for i in t.index:
    vc = t.round().ix[i].value_counts()
    if vc[vc.index[3:]].sum() > 5:
        keep.append(i)
mcnvs = mcnvs.ix[keep]

mcnvs.to_csv(os.path.join(private_outdir, 'mcnvs.tsv'), sep='\t')