notebook.community

Edit and run



In [ ]:

    
from __future__ import division
import os
import math
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import pylab
from matplotlib import rc
import pandas as pd
import scipy.stats as stats

from sklearn.decomposition import PCA
from IPython.html.widgets import interact
from IPython.html import widgets
from IPython.display import display


from clustering import cluster_pathway_score, combined_heatmap, get_data
from survival import survival_analysis
from clean_data import CANCER_TYPES



In [ ]:

    
def KM(memb, clinical_df, cancer_type, clusters, drop=''):
    """Regroup cluster memberships and run the survival analysis on them.

    Parameters
    ----------
    memb : pandas.Series
        Cluster label per sample. A copy is modified; the caller's Series is
        left untouched. NOTE(review): the Series has to be named for the
        ``join`` below -- presumably 'groups', confirm against
        ``cluster_pathway_score``.
    clinical_df : pandas.DataFrame
        Clinical table; it is inner-joined with the regrouped memberships.
    cancer_type : str
        Passed to ``survival_analysis`` as the plot title.
    clusters : int
        Each decimal digit is one cluster label to keep as its own group
        (e.g. ``13`` keeps clusters 1 and 3). All other labels are merged
        into a single group.
    drop : int, optional
        ``> 0``: each decimal digit is a cluster label whose samples are
        removed. ``< 0``: every sample outside ``clusters`` is removed.
        A falsy value (the default ``''``, or ``0``) removes nothing.

    Returns
    -------
    (pandas.Series, pandas.DataFrame)
        The regrouped memberships and the joined clinical table.
    """
    memb = memb.copy()
    selected = [int(digit) for digit in str(clusters)]
    # Labels that were not explicitly selected; computed before any dropping.
    rest = list(set(memb.unique()).difference(selected))

    if drop:
        if drop > 0:
            ignore = [int(digit) for digit in str(drop)]
            memb = memb[~memb.isin(ignore)]
        elif drop < 0:
            memb = memb[~memb.isin(rest)]

    # Collapse all unselected clusters onto one representative label.
    # Guarded: when `clusters` names every label there is nothing to merge,
    # and indexing the empty list used to raise IndexError.
    if rest:
        memb[memb.isin(rest)] = rest[0]

    # lsuffix renames a same-named column already present in clinical_df.
    clinical_df = clinical_df.join(memb, how='inner', lsuffix='old')
    survival_analysis(clinical_df, title=cancer_type)

    return memb, clinical_df



In [ ]:

    
# A cancer type is ready for analysis only when its silent and non-silent
# pathway score files and its clinical table are all present on disk.
can_types = [
    c for c in CANCER_TYPES
    if all(os.path.exists(path) for path in (
        '../results/' + c + os.sep + 'silent_mutation_pathway_score.txt',
        '../results/' + c + os.sep + 'nsilent_mutation_pathway_score.txt',
        '../data/processed/' + c + os.sep + c + '_clinical.csv',
    ))
]

# Single-argument print(...) behaves the same with or without print_function.
print("There are %d cancer types ready to be analysed" % len(can_types))

# The selection made in this dropdown is read by the cells below.
can_type_wid = widgets.Dropdown(options=can_types, description="Select Cancer Type")
display(can_type_wid)



In [ ]:

    
# Number of pathways handed to the clustering (and reused in output file names).
num_pathways = 30
# Cancer type currently selected in the dropdown widget.
cancer_type = can_type_wid.value

memb, clinical_df, df = cluster_pathway_score(cancer_type, num_pathways)
# Cluster sizes.
print(memb.value_counts())



In [ ]:

    
print(clinical_df.shape)

# NOTE: this rebinds clinical_df to the table joined inside KM(), so running
# the cell a second time joins the membership column again (the earlier copy
# gets the 'old' suffix). Re-run the clustering cell first for a clean state.
idx, clinical_df = KM(memb, clinical_df, cancer_type, 6)

outdir = '../notes/draft-manuscript/figures/heatmaps/scratch/'
# savefig does not create missing directories; make sure the target exists.
if not os.path.isdir(outdir):
    os.makedirs(outdir)

# The file name records the pathway count and the number of clusters in the
# original (un-merged) membership.
outfile = ''.join([
    outdir, cancer_type,
    '_KM', str(num_pathways),
    '_clust', str(len(memb.value_counts())),
    '.png',
])
# Saves the current pyplot figure -- presumably the plot drawn by
# survival_analysis() inside KM(); confirm.
plt.savefig(outfile, transparent=True, bbox_inches='tight', pad_inches=0)

# print memb.value_counts()



In [ ]:

    
# Number of genes passed to the heatmap (and recorded in the file name).
num_genes = 10

# A copy of the 'groups' column is passed so the heatmap code cannot modify
# clinical_df in place.
fig = combined_heatmap(cancer_type, num_pathways, num_genes,
                       groups=clinical_df['groups'].copy())

outdir = '../notes/draft-manuscript/figures/heatmaps/scratch/'
# savefig does not create missing directories; make sure the target exists.
if not os.path.isdir(outdir):
    os.makedirs(outdir)

outfile = ''.join([
    outdir, cancer_type,
    '_heatmap_pathways', str(num_pathways),
    '_genes', str(num_genes),
    '.png',
])
fig.savefig(outfile, transparent=True, bbox_inches='tight', pad_inches=0)



In [ ]:

    
# # http://en.wikipedia.org/wiki/DNA_codon_table
# amino_acids = 'arndcqeghi'
# amino_acids += 'lkmfpstwyv'
# amino_acids += 'bo'
# codons = '4622222423'
# codons += '6212464124'
# codons += '13'
# spr = 1; nspr = 1;
# spr =  sum([int(x)-1 for x in codons])/(22*27)
# nspr = 1 - spr



In [ ]:

    
def preprocess_pathway_data(filename=None):
    """Load GSEA MSigDB Broad Pathway DB into a DataFrame.

    Each non-blank line of the tab-separated file is read as: pathway name,
    one field that is ignored, then the member genes.

    Parameters
    ----------
    filename : str, optional
        Path of the pathway file. Defaults to
        ``../data/pathways/kegg_biocarta_pid_positional.txt``.

    Returns
    -------
    pandas.DataFrame
        One column per pathway holding its genes; shorter pathways are padded
        with missing values, so call ``.dropna()`` on a column to get the
        actual gene list.
    """
    if filename is None:
        input_dir = '../data/pathways'
        filename = input_dir + os.sep + 'kegg_biocarta_pid_positional.txt'

    pathways = {}
    with open(filename, 'r') as f:
        for line in f:
            fields = line.strip().split('\t')
            # A blank line splits to [''] and would otherwise add a bogus
            # pathway named '' with no genes.
            if not fields[0]:
                continue
            pathways[fields[0]] = fields[2:]

    # orient='index' tolerates gene lists of different lengths; transposing
    # turns the pathways into columns.
    return pd.DataFrame.from_dict(pathways, orient='index').transpose()

# Cancer type currently selected in the dropdown widget.
cancer_type = can_type_wid.value

# Pathway -> member genes table (one column per pathway).
path_df = preprocess_pathway_data()

# NOTE: rebinds clinical_df, replacing the table produced by earlier cells.
sig_nsdf, clinical_df = get_data(cancer_type, 'mut', nsdf_norm_factor=1)

# Peek at the first genes of one pathway column.
path_df['chr4p11'].head()



In [ ]:

    
# For each of the first five rows of sig_nsdf, report the gene overlap between
# that pathway and the next one. Walking consecutive pairs (instead of
# indexing i + 1) avoids an IndexError when sig_nsdf has five rows or fewer.
leading_pathways = sig_nsdf.index[:6]
for p, next_p in zip(leading_pathways[:5], leading_pathways[1:]):
    print(p)
    # Columns are padded with missing values; dropna() leaves the real genes.
    path_genes1 = path_df[p].dropna()
    path_genes2 = path_df[next_p].dropna()
    shared_genes = set(path_genes1).intersection(set(path_genes2))
    # Same space-separated output as the original three-item print statement.
    print("%s %s %s" % (len(path_genes1), len(path_genes2), shared_genes))
#     path_genes_df = mut_df.loc[path_genes, sig_nsdf.columns[idx2]].fillna(0)
#     path_genes_df = path_genes_df.loc[path_genes_df.sum(axis=1) != 0, :]
#     sorted_df = path_genes_df.sum(axis=1).order(ascending=False)
#     path_genes_df = path_genes_df.loc[sorted_df.index[:num_genes]]