In [ ]:
from __future__ import division
import os
import math
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import pylab
from matplotlib import rc
import pandas as pd
import scipy.stats as stats
from sklearn.decomposition import PCA
from IPython.html.widgets import interact
from IPython.html import widgets
from IPython.display import display
from clustering import cluster_pathway_score, combined_heatmap, get_data
from survival import survival_analysis
from clean_data import CANCER_TYPES
In [ ]:
def KM(memb, clinical_df, cancer_type, clusters, drop=''):
    """Regroup cluster membership and run the survival analysis on the result.

    Parameters
    ----------
    memb :
        Cluster label per sample (used as a pandas Series: ``copy``,
        ``unique``, ``isin`` and boolean indexing are called on it).  The
        caller's object is not modified; a copy is relabelled.
    clinical_df :
        Clinical table; ``memb`` is inner-joined onto it.
    cancer_type :
        Passed to ``survival_analysis`` as the plot title.
    clusters :
        Every digit of ``str(clusters)`` is read as one cluster id to keep as
        its own group (``6`` -> ``[6]``, ``13`` -> ``[1, 3]``), so ids of two
        or more digits cannot be expressed.  All other ids found in ``memb``
        are merged into a single group.
    drop :
        Falsy (default): keep every sample.  Positive: every digit is a
        cluster id whose samples are removed.  Negative: remove the samples
        of all clusters not listed in ``clusters``.

    Returns
    -------
    tuple
        ``(memb, clinical_df)``: the relabelled membership and the clinical
        table joined with it.
    """
    memb = memb.copy()
    selected = [int(digit) for digit in str(clusters)]
    # Sorted so the label given to the merged group (the smallest remaining
    # id) does not depend on set iteration order.
    rest = sorted(set(memb.unique()).difference(selected))

    if drop and drop > 0:
        ignore = [int(digit) for digit in str(drop)]
        memb = memb[~memb.isin(ignore)]
    if drop and drop < 0:
        memb = memb[~memb.isin(rest)]

    # When `clusters` already names every cluster there is nothing to merge;
    # indexing rest[0] unconditionally raised IndexError in that case.
    if rest:
        memb[memb.isin(rest)] = rest[0]
    # print memb.value_counts()

    # lsuffix renames an already-present column of the same name in
    # clinical_df instead of failing on the overlap.
    clinical_df = clinical_df.join(memb, how='inner', lsuffix='old')
    survival_analysis(clinical_df, title=cancer_type)
    return memb, clinical_df
In [ ]:
# Collect the cancer types for which both pathway-score files (silent and
# non-silent mutations) and the clinical table exist on disk, then offer them
# in a dropdown widget.
can_types = []
for ctype in CANCER_TYPES:
    required_files = (
        '../results/' + ctype + os.sep + 'silent_mutation_pathway_score.txt',
        '../results/' + ctype + os.sep + 'nsilent_mutation_pathway_score.txt',
        '../data/processed/' + ctype + os.sep + ctype + '_clinical.csv',
    )
    if all(os.path.exists(path) for path in required_files):
        can_types.append(ctype)

# Single parenthesised argument: prints the same text as the Python 2 statement.
print("There are %d cancer types ready to be analysed" % len(can_types))

can_type_wid = widgets.Dropdown(options=can_types, description="Select Cancer Type")
display(can_type_wid)
In [ ]:
# Cluster the samples of the cancer type currently selected in the dropdown.
# NOTE(review): num_pathways is handed straight to cluster_pathway_score;
# presumably the number of pathways used for clustering -- confirm there.
num_pathways = 30
cancer_type = can_type_wid.value
memb, clinical_df, df = cluster_pathway_score(cancer_type, num_pathways)

# Number of samples per cluster label.
print(memb.value_counts())
In [ ]:
# Run the survival analysis with cluster 6 kept as its own group and save the
# current matplotlib figure.
print(clinical_df.shape)
idx, clinical_df = KM(memb, clinical_df, cancer_type, 6)

# NOTE(review): the cluster count in the file name comes from the original
# `memb`, not from the regrouped labels returned in `idx` -- confirm intended.
outdir = '../notes/draft-manuscript/figures/heatmaps/scratch/'
outfile = ''.join([
    outdir,
    cancer_type,
    '_KM',
    str(num_pathways),
    '_clust',
    str(len(memb.value_counts())),
    '.png',
])
plt.savefig(outfile, bbox_inches='tight', pad_inches=0, transparent=True)
# print memb.value_counts()
In [ ]:
# Draw the combined heatmap for the selected cancer type, passing a copy of
# the 'groups' column of the clinical table, and save it to the scratch folder.
num_genes = 10
fig = combined_heatmap(
    cancer_type,
    num_pathways,
    num_genes,
    groups=clinical_df['groups'].copy(),
)

outdir = '../notes/draft-manuscript/figures/heatmaps/scratch/'
outfile = ''.join([
    outdir,
    cancer_type,
    '_heatmap_pathways',
    str(num_pathways),
    '_genes',
    str(num_genes),
    '.png',
])
fig.savefig(outfile, bbox_inches='tight', pad_inches=0, transparent=True)
In [ ]:
# # http://en.wikipedia.org/wiki/DNA_codon_table
# amino_acids = 'arndcqeghi'
# amino_acids += 'lkmfpstwyv'
# amino_acids += 'bo'
# codons = '4622222423'
# codons += '6212464124'
# codons += '13'
# spr = 1; nspr = 1;
# spr = sum([int(x)-1 for x in codons])/(22*27)
# nspr = 1 - spr
In [ ]:
def preprocess_pathway_data(filename=None):
    """Load the GSEA MSigDB Broad pathway database into a DataFrame.

    Each non-blank line of the file is split on tabs: the first field is the
    pathway name, the second field is skipped, and the remaining fields are
    the pathway's genes.

    Parameters
    ----------
    filename : str, optional
        Path of the tab-separated pathway file.  Defaults to
        ``../data/pathways/kegg_biocarta_pid_positional.txt``.

    Returns
    -------
    pandas.DataFrame
        One column per pathway holding its genes; shorter pathways are padded
        with missing values at the bottom.
    """
    if filename is None:
        filename = os.path.join('../data/pathways',
                                'kegg_biocarta_pid_positional.txt')

    pathways = {}
    with open(filename, 'r') as handle:
        for line in handle:
            fields = line.strip().split('\t')
            # A blank line splits to [''] and would otherwise register a
            # pathway with an empty name and no genes.
            if not fields[0]:
                continue
            pathways[fields[0]] = fields[2:]

    # orient='index' tolerates gene lists of unequal length; transposing then
    # puts pathways in columns.
    return pd.DataFrame.from_dict(pathways, orient='index').transpose()
# Pathway -> genes table, one column per pathway.
path_df = preprocess_pathway_data()

# Mutation data for the cancer type currently selected in the dropdown.
cancer_type = can_type_wid.value
sig_nsdf, clinical_df = get_data(cancer_type, 'mut', nsdf_norm_factor=1)

# Last expression of the cell: the notebook displays the first rows of one
# pathway column as a sanity check.
path_df['chr4p11'].head()
In [ ]:
# For the first five pathways in sig_nsdf, print the pathway name, the gene
# counts of that pathway and of the next one, and the genes they share.
# Iterating over consecutive pairs stops cleanly when sig_nsdf has five or
# fewer rows; indexing sig_nsdf.index[i + 1] raised IndexError in that case.
leading_pathways = sig_nsdf.index[:6]
for pathway, next_pathway in zip(leading_pathways[:-1], leading_pathways[1:]):
    print(pathway)
    path_genes1 = path_df[pathway].dropna()
    path_genes2 = path_df[next_pathway].dropna()
    shared_genes = set(path_genes1).intersection(path_genes2)
    # One pre-formatted string prints the same text as the Python 2
    # statement `print a, b, c`.
    print("%d %d %s" % (len(path_genes1), len(path_genes2), shared_genes))
    # path_genes_df = mut_df.loc[path_genes, sig_nsdf.columns[idx2]].fillna(0)
    # path_genes_df = path_genes_df.loc[path_genes_df.sum(axis=1) != 0, :]
    # sorted_df = path_genes_df.sum(axis=1).order(ascending=False)
    # path_genes_df = path_genes_df.loc[sorted_df.index[:num_genes]]