In [129]:
def get_id(line):
    """Build the row identifier "<chromosome_name>_<start>" for one table row.

    `line` is any mapping-like row (e.g. a pandas Series) exposing the
    'chromosome_name' and 'start' fields; both are converted with str().
    """
    chromosome = str(line['chromosome_name'])
    position = str(line['start'])
    return chromosome + "_" + position
import pandas as pd
# Load the per-mutation table and index it by the "<chromosome>_<start>" id
# produced by get_id.
table = pd.read_table("A1.tsv")
table['id'] = table.apply(get_id, axis=1)
table = table.set_index('id')

# Column prefixes in the input file and the short site labels they are renamed
# to. The two lists are positional pairs: breast, adrenal, liver, lung, spinal.
site_prefixes = ['tumor', 'adrenalmet', 'livermet', 'lungmet', 'spinalmet']
cols = ['breast', 'adrenal', 'liver', 'lung', 'spinal']
ref_cols = [prefix + '.rcnt.llr3_ref' for prefix in site_prefixes]
var_cols = [prefix + '.rcnt.llr3_var' for prefix in site_prefixes]

# Keep only the cluster label and the ref/var columns, then rename them to
# 'ref-<site>' / 'var-<site>'.
table = table[['cluster'] + ref_cols + var_cols]
table.columns = ['cluster'] + ['ref-' + c for c in cols] + ['var-' + c for c in cols]
table.head()
Out[129]:
In [130]:
# Parse the treeomics mutation assignment file.
#
# Every line is split on single spaces. The first two tokens have their last
# character dropped (presumably a trailing separator -- TODO confirm against
# the file) and are stored as tree[<second token>] = (<first token>, line_no),
# with line_no counted from 1. All remaining tokens are the mutation ids that
# belong to that line.
filename = "../../../../result/hoadley_2016/treeomics/A1_sub/mutations.txt"
clusters = []
tree = {}
with open(filename) as f:
    for i, line in enumerate(f):
        fields = line.strip().split(" ")
        linked = fields[0][:-1]
        node = fields[1][:-1]
        # The duplicate check has to use the same (stripped) form that is used
        # as the dictionary key; otherwise a repeated node silently overwrites
        # its first entry instead of being stored under "<node>_2".
        # NOTE(review): a third occurrence would still overwrite "<node>_2".
        if node in tree:
            node += "_2"
        tree[node] = (linked, i + 1)
        clusters.append(fields[2:])

# Label every mutation in `table` with the 1-based line number it was listed
# on; mutations that are not listed anywhere keep tcluster == 0.
table['tcluster'] = 0
for i, cluster in enumerate(clusters):
    for idx in cluster:
        # Ids in the file look like "<chrom>__<start>__..."; the table index
        # uses "<chrom>_<start>".
        v = "_".join(idx.split('__')[:2])
        if v in table.index:
            # Single .loc call with (row, column): chained indexing
            # (table.loc[v]["tcluster"] = ...) may assign to a temporary copy.
            table.loc[v, "tcluster"] = i + 1
        else:
            # Report ids that have no matching row in the table.
            print(idx)
In [131]:
# For every tree entry whose name does not start with 'SC', print the name and
# then the line numbers met while following tree[name][0] links until a name
# that is not a key of `tree` is reached.
for sample in tree:
    curs = sample
    if sample.startswith('SC'):
        continue
    print(sample)
    # Explicit membership test instead of a bare `except:` as the loop exit, so
    # unrelated errors (and Ctrl-C) are no longer swallowed.
    while curs in tree:
        linked, line_no = tree[curs]
        print(line_no)
        curs = linked
In [132]:
def get_vaf(row, sam):
    """Return the variant allele fraction of one row for sample `sam`.

    The fraction is var / (var + ref), read from the 'var-<sam>' and
    'ref-<sam>' fields of `row`.

    Returns NaN when the site has no reads at all (var + ref == 0) instead of
    raising ZeroDivisionError, so one uncovered site cannot abort a row-wise
    DataFrame.apply.
    """
    var_reads = row['var-' + sam]
    depth = float(var_reads + row['ref-' + sam])
    if depth == 0:
        return float('nan')
    return float(var_reads) / depth
# Variant allele fractions: one column per site in `cols`, computed row by row
# with get_vaf, plus both cluster labels copied over from `table`.
vafs = pd.DataFrame()
for sam in cols:
    vafs[sam] = table.apply(get_vaf, axis=1, args=(sam,))
for label in ['cluster', 'tcluster']:
    vafs[label] = table[label]
In [133]:
# Mean of the remaining columns for every (tcluster, cluster) combination.
vafs.groupby(by=['tcluster', 'cluster']).mean()
Out[133]:
In [143]:
%matplotlib inline
from matplotlib import pyplot
v_sub = vafs[(vafs['tcluster'] == 8) | (vafs['tcluster'] == 8)]
import seaborn as sns
g = sns.PairGrid(v_sub, hue = 'tcluster', vars = ['breast', 'liver', 'adrenal'])
g = g.map_upper(pyplot.scatter)
g = g.map_diag(pyplot.hist, alpha = 0.6)
g = g.set(xlim=(0,0.7), ylim=(0,0.7))
In [139]:
from matplotlib import pyplot
import seaborn as sns

# Pairwise breast / liver / adrenal VAF plots for the mutations labelled
# tcluster 7 or 8, coloured by tcluster.
in_tcluster_7 = vafs['tcluster'] == 7
in_tcluster_8 = vafs['tcluster'] == 8
v_sub = vafs[in_tcluster_7 | in_tcluster_8]

g = sns.PairGrid(v_sub, vars=['breast', 'liver', 'adrenal'], hue='tcluster')
g = g.map_upper(pyplot.scatter)
g = g.map_diag(pyplot.hist, alpha=0.6)
g = g.set(xlim=(0, 0.7), ylim=(0, 0.7))
In [136]:
# Size of tcluster 8, and how many of its mutations have at least one variant
# read in breast.
print(vafs[vafs['tcluster'] == 8].shape)
print(sum(table[table['tcluster'] == 8]['var-breast'] > 0))

# First bar dark, the following ten green.
DARK_THEN_GREEN = ["#34495e"] + ["#2ecc71"] * 10
# Tick positions used by the last two panels (labels 0..10 placed at 0.95 * k).
SCALED_TICKS = [0.95 * k for k in range(0, 11)]


def _plot_breast_read_counts(position, tcluster, bar_colors, xlim, tick_positions=None):
    """Draw one panel of the 2x2 grid.

    position       -- 1-based subplot index inside the 2x2 grid
    tcluster       -- value of table['tcluster'] whose rows are counted
    bar_colors     -- list of colours handed to sns.color_palette
    xlim           -- (low, high) limits for the x axis
    tick_positions -- optional x tick locations; labelled 0..10 when given,
                      otherwise the ticks chosen by countplot are kept
    """
    pyplot.subplot(2, 2, position)
    # Pass the data as x= so the call does not depend on the position of the
    # parameter in countplot's signature.
    sns.countplot(x=table[table['tcluster'] == tcluster]['var-breast'],
                  palette=sns.color_palette(bar_colors))
    pyplot.xlabel("Number of Variant Reads in Breast")
    pyplot.ylabel("Number of Mutations")
    pyplot.xlim(xlim)
    if tick_positions is not None:
        pyplot.xticks(tick_positions, range(0, 11))


# (tcluster, bar colours, x limits, tick positions), listed in grid order.
panels = [
    (8, ["#2ecc71"], (-2, 9), None),
    (10, DARK_THEN_GREEN, (-1, 10), None),
    (6, DARK_THEN_GREEN, (-1, 10), SCALED_TICKS),
    (9, DARK_THEN_GREEN, (-1, 10), SCALED_TICKS),
]
for position, (tcluster, bar_colors, xlim, tick_positions) in enumerate(panels, start=1):
    _plot_breast_read_counts(position, tcluster, bar_colors, xlim, tick_positions)
pyplot.gcf().set_size_inches(10, 10)
In [228]:
# Bar charts of the number of variant reads in breast for tclusters 8, 6 and 9,
# written to Read_Distribution.pdf.
sns.set_style('dark')
for j, cluster in enumerate([8, 6, 9]):
    pyplot.subplot(1, 3, j + 1)
    counts = table[table['tcluster'] == cluster]['var-breast'].value_counts()
    # Give every read count 0..10 a bar, even when no mutation has that count.
    for n_reads in range(0, 11):
        if n_reads not in counts.index:
            counts[n_reads] = 0
    counts = counts.sort_index()

    sns.set_palette(sns.color_palette(["#34495e"] + ["#2ecc71"] * 10))
    counts.plot(kind='bar', rot=0)
    # The zero-read bar is drawn a second time after switching the palette,
    # presumably so that it gets a different colour from the other bars.
    sns.set_palette(sns.color_palette("Set2", 10))
    counts[[0]].plot(kind='bar', rot=0)

    pyplot.xlim((-1, 11))
    pyplot.xticks(range(0, 11), range(0, 11))
    pyplot.xlabel('# Variant reads in breast')
    # NOTE(review): the y label is set on the first and the third panel only.
    if j in [0, 2]:
        pyplot.ylabel('count')

pyplot.gcf().set_size_inches(14, 6)
# Apply the layout before saving: calling savefig first writes the PDF without
# the adjusted subplot spacing.
pyplot.tight_layout()
pyplot.savefig('Read_Distribution.pdf')
In [230]:
# Normalised VAF histograms per site: mutations with tcluster == 8 against all
# other mutations, written to Hist_cluster2.pdf.
sns.set_context("paper", font_scale=1.6)
sns.set_palette(sns.color_palette(["#3498db", "#34495e"]))

# Boolean flag: does the mutation belong to tcluster 8?
vafs['c8'] = (vafs['tcluster'] == 8)

for i, site in enumerate(['breast', 'liver', 'adrenal']):
    pyplot.subplot(1, 3, i + 1)
    # Mutations with a VAF of exactly 0 at this site are left out.
    vsub = vafs[vafs[site] != 0]
    is_c8 = vsub['c8']

    # NOTE(review): `normed` was replaced by `density` in newer matplotlib
    # releases; switch the keyword when the environment is upgraded.
    pyplot.hist(vsub[~is_c8][site], bins=30, label='Other Mutations', normed=True)
    # Fewer bins for the tcluster-8 histogram in breast than at the other sites.
    c8_bins = 5 if site == 'breast' else 30
    pyplot.hist(vsub[is_c8][site], bins=c8_bins, label='Cluster 2/4',
                alpha=0.6, normed=True)

    pyplot.xlim(0, 0.6)
    pyplot.legend()
    pyplot.xlabel('VAF in ' + site)
    if i == 0:
        # Both histograms are normalised, so the axis shows a density rather
        # than a raw count.
        pyplot.ylabel('density')

pyplot.gcf().set_size_inches(14, 6)
pyplot.savefig('Hist_cluster2.pdf')
In [177]:
# Length of the boolean mask, i.e. the number of rows in vafs (not the number
# of rows that belong to tcluster 8).
is_tcluster_8 = vafs['tcluster'] == 8
print(len(is_tcluster_8))
In [ ]:
# NOTE(review): `tips` is not defined anywhere in the visible notebook and this
# cell has no execution count; it will raise NameError on a full run unless
# `tips` is provided elsewhere.
ax = sns.violinplot(
    x="day",
    y="total_bill",
    hue="smoker",
    data=tips,
    palette="muted",
    split=True,
)
In [178]:
# Number of rows in the column named by `site`. `site` is not assigned in this
# cell: it is whatever an earlier cell left bound (the histogram loop above
# uses it as its loop variable).
site_column = vafs[site]
print(len(site_column))
In [179]:
# Display the boolean mask that marks the rows with tcluster == 8.
vafs['tcluster'] == 8
Out[179]:
In [ ]: