In [1]:
from collections import defaultdict
from random import random
import pysam
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()
import glob
import os
import sklearn
from sklearn import cluster
In [2]:
#reading functions
In [3]:
def read_kal_out(t2gFile, txpList, ecMatFile, countTsvFile, dfname):
with open(txpList) as f:
tlist = pd.read_table(f, header=None).rename(columns={0:'tid'})
with open(countTsvFile) as f:
counts = pd.DataFrame(pd.read_table(f, header=None).set_index(0)[2]).rename(columns={2:'counts'})
sorted_idx = sorted(counts.to_dict()['counts'].keys())
eqIdx = [0] * (sorted_idx[-1]+1)
for elem in sorted_idx:
eqIdx[elem] = 1
kal_gene_count = defaultdict(int)
other = 0
eqCount = 0
with open(ecMatFile) as f:
for idx, line in enumerate(f):
if eqIdx[idx]:
eqCount += 1
txps = line.strip().split()[1].split(',')
gSet = set([])
for txp in txps:
gSet.add(t2g[tlist.loc[int(txp)]['tid']])
if len(gSet) == 1:
kal_gene_count[list(gSet)[0]] += counts.loc[idx]['counts']
else:
other += counts.loc[idx]['counts']
print eqCount
kal_count = pd.DataFrame([kal_gene_count], columns=kal_gene_count.keys()).T.rename(columns={0:dfname})
return kal_count
In [4]:
def read_eq(eqFile, dfname):
with open(t2g_fname) as f:
t2g = pd.read_table(f, sep=",").set_index('TX_NAME').to_dict()['GENE_NAME']
with open(eqFile) as f:
T = int(f.readline())
E = int(f.readline())
txps = []
for _ in range(T):
txps.append(f.readline().strip())
counts = defaultdict(int)
for line in f:
toks = line.strip().split()
t = int(toks[0])
eqCount = int(toks[t+1])
gSet = set([])
for tid in range(t):
gSet.add(t2g[txps[int(toks[tid+1])]])
if len(gSet) == 1:
counts[list(gSet)[0]] += eqCount
return pd.DataFrame(counts.items()).set_index(0).rename(columns={1:dfname})
In [5]:
def read_sal(sfFile, dfname):
data = pd.DataFrame(pd.read_table(sfFile).set_index('Name')['NumReads'])
with open(t2g_fname) as f:
t2g = pd.read_table(f, sep=",").set_index('TX_NAME')['GENE_NAME']
ct = pd.concat([data, t2g], axis=1)
gene_ct = ct.groupby('GENE_NAME').sum()
return gene_ct[(gene_ct['NumReads'] != 0)].rename(columns={'NumReads':dfname})
In [6]:
def read_ut_out(utFile, dfname):
ut_data = pd.read_table(open(utFile))
cell_ut_data = ut_data.set_index('gene')
del cell_ut_data['cell']
cell_ut_data = cell_ut_data.rename(columns={'count':dfname})
return cell_ut_data
In [7]:
t2g_fname = "../../data/mohu/gtf/txp2gene.tsv"
txpList_fname = "../kalData/txpList.txt"
In [8]:
with open(t2g_fname) as f:
t2g = pd.read_table(f, sep=",").set_index('TX_NAME').to_dict()['GENE_NAME']
In [8]:
#read data
In [9]:
#kal-input
first = True
for dname in glob.glob("../kalData/*"):
if dname != "../kalData/txpList.txt":
if first:
first=False
kal = read_kal_out(t2g_fname, txpList_fname, dname+"/matrix.ec", dname+"/matrix.tsv", os.path.basename(dname))
else:
temp = read_kal_out(t2g_fname, txpList_fname, dname+"/matrix.ec", dname+"/matrix.tsv", os.path.basename(dname))
kal = pd.concat([kal, temp], axis=1)
In [17]:
first = True
for dname in glob.glob("../salOut/*"):
if first:
first = False
sf = read_sal( dname+'/alevin/quant.sf', os.path.basename(dname))
else:
temp = read_sal( dname+'/alevin/quant.sf', os.path.basename(dname))
sf = pd.concat([sf, temp], axis=1)
In [9]:
first = True
for dname in glob.glob("../salOut/alevin/cell/*"):
if first:
first = False
malv = read_sal( dname+'/quant.sf', os.path.basename(dname))
else:
temp = read_sal( dname+'/quant.sf', os.path.basename(dname))
malv = pd.concat([malv, temp], axis=1)
In [11]:
first = True
for dname in glob.glob("../salOut/*"):
if first:
first = False
alv = read_eq( dname+'/alevin/counts.alv', os.path.basename(dname))
else:
temp = read_eq( dname+'/alevin/counts.alv', os.path.basename(dname))
alv = pd.concat([alv, temp], axis=1)
In [10]:
#utools import
first = True
for dname in glob.glob("../utData/*"):
if first:
first = False
utl = read_ut_out( dname+'/utools.count', os.path.basename(dname))
else:
temp = read_ut_out( dname+'/utools.count', os.path.basename(dname))
utl = pd.concat([utl, temp], axis=1)
In [ ]:
In [13]:
#create clusters
In [19]:
kal = kal.fillna(0)
#from https://stackoverflow.com/a/30111487/6871844
# Convert DataFrame to matrix
mat = kal.T.as_matrix()
# Using sklearn
km = sklearn.cluster.KMeans(n_clusters=2)
km.fit(mat)
# Get cluster assignment labels
labels = km.labels_
# Format results as a DataFrame
kal_results = pd.DataFrame([kal.T.index,labels]).T.set_index(0).rename(columns={1:'kallisto'})
In [24]:
#from https://stackoverflow.com/a/30111487/6871844
# Convert DataFrame to matrix
alv = alv.fillna(0)
mat = alv.T.as_matrix()
# Using sklearn
km = sklearn.cluster.KMeans(n_clusters=2)
km.fit(mat)
# Get cluster assignment labels
labels = km.labels_
# Format results as a DataFrame
alv_results = pd.DataFrame([alv.T.index,labels]).T.set_index(0).rename(columns={1:'alv'})
In [ ]:
In [11]:
#from https://stackoverflow.com/a/30111487/6871844
# Convert DataFrame to matrix
malv = malv.fillna(0)
mat = malv.T.as_matrix()
# Using sklearn
km = sklearn.cluster.KMeans(n_clusters=2)
km.fit(mat)
# Get cluster assignment labels
labels = km.labels_
# Format results as a DataFrame
malv_results = pd.DataFrame([malv.T.index,labels]).T.set_index(0).rename(columns={1:'malv'})
In [26]:
#from https://stackoverflow.com/a/30111487/6871844
# Convert DataFrame to matrix
sf = sf.fillna(0)
mat = sf.T.as_matrix()
# Using sklearn
km = sklearn.cluster.KMeans(n_clusters=2)
km.fit(mat)
# Get cluster assignment labels
labels = km.labels_
# Format results as a DataFrame
sf_results = pd.DataFrame([sf.T.index,labels]).T.set_index(0).rename(columns={1:'sf'})
In [12]:
#from https://stackoverflow.com/a/30111487/6871844
# Convert DataFrame to matrix
utl = utl.fillna(0)
mat = utl.T.as_matrix()
# Using sklearn
km = sklearn.cluster.KMeans(n_clusters=2)
km.fit(mat)
# Get cluster assignment labels
labels = km.labels_
# Format results as a DataFrame
utl_results = pd.DataFrame([utl.T.index,labels]).T.set_index(0).rename(columns={1:'umi_tools'})
In [ ]:
In [ ]:
In [13]:
tr_results = pd.read_table("../../data/10x/mohu/100/whitelist.tsv", header=None).set_index(0)
tr_results['tr'] = [0]*50 + [1]*50
In [ ]:
In [14]:
#combine data
In [15]:
# results = pd.concat([tr_results, alv_results, sf_results, utl_results, kal_results], axis=1)
results = pd.concat([tr_results, utl_results, malv_results], axis=1)
In [16]:
results.sum()
Out[16]:
In [52]:
# alv_intra = alv[results[results['alv']==1].index]
malv_intra = malv[results[results['malv']==0].index]
utl_intra = utl[results[results['umi_tools']==1].index]
# sf_intra = sf[results[results['sf']==0].index]
# kal_intra = kal[results[results['kallisto']==0].index]
In [27]:
# genes = set(alv_intra.index) & set(utl_intra.index) & set(sf_intra.index) & set(kal_intra.index)
genes = set(malv_intra.index) & set(utl_intra.index) & set(sf_intra.index)
In [ ]:
In [28]:
# alv_intra = alv[results[results['alv']==0].index].corr(method="spearman")
# utl_intra = utl[results[results['umi_tools']==1].index].corr(method="spearman")
# sf_intra = sf[results[results['sf']==1].index].corr(method="spearman")
# kal_intra = kal[results[results['kallisto']==1].index].corr(method="spearman")
# alv_intra = alv_intra.loc[genes].corr(method="spearman")
malv_intra = malv_intra.corr(method="spearman")
utl_intra = utl_intra.corr(method="spearman")
# sf_intra = sf_intra.loc[genes].corr(method="spearman")
# kal_intra = kal_intra.loc[genes].corr(method="spearman")
In [29]:
#make intra cluster correlations
In [30]:
# alv_intra = alv[results[results['alv']==1].index].corr(method="spearman")
# utl_intra = utl[results[results['umi_tools']==0].index].corr(method="spearman")
# sf_intra = sf[results[results['sf']==0].index].corr(method="spearman")
# kal_intra = kal[results[results['kallisto']==0].index].corr(method="spearman")
In [ ]:
In [31]:
# plt.subplot(2, 2, 1)
# plt.axis('off')
# plt.title("alevin")
# sns.heatmap(alv_intra)
plt.subplot(2, 2, 1)
plt.axis('off')
plt.title("multi_alevin")
sns.heatmap(malv_intra)
# plt.subplot(2, 2, 2)
# plt.axis('off')
# plt.title("sf")
# sns.heatmap(sf_intra)
plt.subplot(2, 2, 2)
plt.axis('off')
plt.title("umi_tools")
sns.heatmap(utl_intra)
# plt.subplot(2, 2, 4)
# plt.axis('off')
# plt.title("kallisto")
# sns.heatmap(kal_intra)
Out[31]:
In [32]:
# alv_list = []
# sf_list = []
utl_list = []
malv_list = []
# kal_list = []
# for x in alv_intra.values:
# for e in x.tolist():
# if e !=0 and e != 1:
# alv_list.append(e)
for x in malv_intra.values:
for e in x.tolist():
if e !=0 and e != 1:
malv_list.append(e)
# for x in sf_intra.values:
# for e in x.tolist():
# if e !=0 and e != 1:
# sf_list.append(e)
for x in utl_intra.values:
for e in x.tolist():
if e !=0 and e != 1:
utl_list.append(e)
# for x in kal_intra.values:
# for e in x.tolist():
# if e !=0 and e != 1:
# kal_list.append(e)
In [33]:
# import seaborn
# np.random.seed(0)
# n_bins = 30
# # x = np.random.randn(1000, 3)
# x = [alv_list, sf_list, utl_list]
# colors = ['red', 'blue', 'lime']
# n, x, _ = plt.hist(x, n_bins, histtype='bar', color=colors, label=['alevin', 'eq_alevin', 'umi_tools'])
# plt.legend(prop={'size': 10})
# plt.show()
In [ ]:
In [34]:
# sns.kdeplot(np.array(alv_list), label= "eq_class")
# sns.kdeplot(np.array(kal_list), label= "kallisto")
sns.kdeplot(np.array(malv_list), label= "malv")
# sns.kdeplot(np.array(sf_list), label = "alevin")
sns.kdeplot(np.array(utl_list), label = "Umi-tools")
plt.legend()
Out[34]:
In [147]:
sns.kdeplot(np.array(alv_list), label= "eq_class")
sns.kdeplot(np.array(kal_list), label= "kallisto")
sns.kdeplot(np.array(sf_list), label = "alevin")
sns.kdeplot(np.array(utl_list), label = "Umi-tools")
plt.legend()
Out[147]:
In [35]:
len(malv_list), len(utl_list)
Out[35]:
In [36]:
malv.shape, utl.shape
Out[36]:
In [39]:
malv_intra.shape, utl_intra.shape
Out[39]:
In [37]:
malv_intra.to_csv("malv.mat")
utl_intra.to_csv("utl.mat")
In [40]:
In [53]:
malv_intra.rename(columns=lambda x: x+"-1", inplace=True)
In [54]:
utl_intra.rename(columns=lambda x: x+"-2", inplace=True)
In [57]:
ct = pd.concat([malv_intra, utl_intra], axis=1).fillna(0)
In [59]:
ct.to_csv("agg.mat")
In [ ]: