In [1]:
from collections import defaultdict
from random import random
import pysam
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()
import glob
import os
import sklearn
from sklearn import cluster
In [28]:
#reading functions
In [29]:
def read_kal_out(t2gFile, txpList, ecMatFile, countTsvFile, dfname):
with open(txpList) as f:
tlist = pd.read_table(f, header=None).rename(columns={0:'tid'})
with open(countTsvFile) as f:
counts = pd.DataFrame(pd.read_table(f, header=None).set_index(0)[2]).rename(columns={2:'counts'})
sorted_idx = sorted(counts.to_dict()['counts'].keys())
eqIdx = [0] * (sorted_idx[-1]+1)
for elem in sorted_idx:
eqIdx[elem] = 1
kal_gene_count = defaultdict(int)
other = 0
eqCount = 0
with open(ecMatFile) as f:
for idx, line in enumerate(f):
if eqIdx[idx]:
eqCount += 1
txps = line.strip().split()[1].split(',')
gSet = set([])
for txp in txps:
gSet.add(t2g[tlist.loc[int(txp)]['tid']])
if len(gSet) == 1:
kal_gene_count[list(gSet)[0]] += counts.loc[idx]['counts']
else:
other += counts.loc[idx]['counts']
print eqCount
kal_count = pd.DataFrame([kal_gene_count], columns=kal_gene_count.keys()).T.rename(columns={0:dfname})
return kal_count
In [30]:
def read_eq(eqFile, dfname):
with open(t2g_fname) as f:
t2g = pd.read_table(f, sep=",").set_index('TX_NAME').to_dict()['GENE_NAME']
with open(eqFile) as f:
T = int(f.readline())
E = int(f.readline())
txps = []
for _ in range(T):
txps.append(f.readline().strip())
counts = defaultdict(int)
for line in f:
toks = line.strip().split()
t = int(toks[0])
eqCount = int(toks[t+1])
gSet = set([])
for tid in range(t):
gSet.add(t2g[txps[int(toks[tid+1])]])
if len(gSet) == 1:
counts[list(gSet)[0]] += eqCount
return pd.DataFrame(counts.items()).set_index(0).rename(columns={1:dfname})
In [31]:
def read_sal(sfFile, dfname):
data = pd.DataFrame(pd.read_table(sfFile).set_index('Name')['NumReads'])
with open(t2g_fname) as f:
t2g = pd.read_table(f, sep=",").set_index('TX_NAME')['GENE_NAME']
ct = pd.concat([data, t2g], axis=1)
gene_ct = ct.groupby('GENE_NAME').sum()
return gene_ct[(gene_ct['NumReads'] != 0)].rename(columns={'NumReads':dfname})
In [32]:
def read_ut_out(utFile, dfname):
ut_data = pd.read_table(open(utFile))
cell_ut_data = ut_data.set_index('gene')
del cell_ut_data['cell']
cell_ut_data = cell_ut_data.rename(columns={'count':dfname})
return cell_ut_data
In [33]:
t2g_fname = "../../data/mohu/gtf/txp2gene.tsv"
txpList_fname = "../kalData/txpList.txt"
In [34]:
with open(t2g_fname) as f:
t2g = pd.read_table(f, sep=",").set_index('TX_NAME').to_dict()['GENE_NAME']
In [9]:
#read data
In [10]:
#kal-input
first = True
for dname in glob.glob("../../1kcell_testing/kalData/*"):
if dname != "../kalData/txpList.txt":
if first:
first=False
kal = read_kal_out(t2g_fname, txpList_fname, dname+"/matrix.ec", dname+"/matrix.tsv", os.path.basename(dname))
else:
temp = read_kal_out(t2g_fname, txpList_fname, dname+"/matrix.ec", dname+"/matrix.tsv", os.path.basename(dname))
kal = pd.concat([kal, temp], axis=1)
In [ ]:
first = True
for dname in glob.glob("../../1kcell_testing/salOut/alevin/cell/*"):
if first:
first = False
sf = read_sal( dname+'/quant.sf', os.path.basename(dname))
else:
temp = read_sal( dname+'/quant.sf', os.path.basename(dname))
sf = pd.concat([sf, temp], axis=1)
In [ ]:
first = True
for dname in glob.glob("../../1kcell_testing/salOutBcUmi/alevin/cell/*"):
if first:
first = False
malv = read_sal( dname+'/quant.sf', os.path.basename(dname))
else:
temp = read_sal( dname+'/quant.sf', os.path.basename(dname))
malv = pd.concat([malv, temp], axis=1)
In [ ]:
first = True
for dname in glob.glob("../../1kcell_testing/salOutBc/alevin/cell/*"):
if first:
first = False
alv = read_sal( dname+'/quant.sf', os.path.basename(dname))
else:
temp = read_sal( dname+'/quant.sf', os.path.basename(dname))
alv = pd.concat([alv, temp], axis=1)
In [84]:
#utools import
first = True
for dname in glob.glob("../../1kcell_testing/utData/*"):
if first:
first = False
utl = read_ut_out( dname+'/utools.count', os.path.basename(dname))
else:
temp = read_ut_out( dname+'/utools.count', os.path.basename(dname))
utl = pd.concat([utl, temp], axis=1)
In [ ]:
In [43]:
kal.to_csv("kal-1k.mat")
utl.to_csv("utl-1k.mat")
malv.to_csv("malv-1k.mat")
sf.to_csv("sf-1k.mat")
alv.to_csv("alvBc-1k.mat")
In [13]:
#create clusters
In [2]:
kal = pd.read_csv('./kal-1k.mat', index_col=[0])
utl = pd.read_csv('./utl-1k.mat', index_col=[0])
malv = pd.read_csv('./malv-1k.mat', index_col=[0])
sf = pd.read_csv('./sf-1k.mat', index_col=[0])
alv = pd.read_csv('./alvBc-1k.mat', index_col=[0])
In [ ]:
In [3]:
# kal = kal.fillna(0)
# #from https://stackoverflow.com/a/30111487/6871844
# # Convert DataFrame to matrix
# mat = kal.T.as_matrix()
# # Using sklearn
# km = sklearn.cluster.KMeans(n_clusters=2)
# km.fit(mat)
# # Get cluster assignment labels
# labels = km.labels_
# # Format results as a DataFrame
# kal_results = pd.DataFrame([kal.T.index,labels]).T.set_index(0).rename(columns={1:'kallisto'})
In [4]:
# #from https://stackoverflow.com/a/30111487/6871844
# # Convert DataFrame to matrix
# alv = alv.fillna(0)
# mat = alv.T.as_matrix()
# # Using sklearn
# km = sklearn.cluster.KMeans(n_clusters=2)
# km.fit(mat)
# # Get cluster assignment labels
# labels = km.labels_
# # Format results as a DataFrame
# alv_results = pd.DataFrame([alv.T.index,labels]).T.set_index(0).rename(columns={1:'alv'})
In [ ]:
In [5]:
# # from https //stackoverflow.com/a/30111487/6871844
# # Convert DataFrame to matrix
# malv = malv.fillna(0)
# mat = malv.T.as_matrix()
# # Using sklearn
# km = sklearn.cluster.KMeans(n_clusters=2)
# km.fit(mat)
# # Get cluster assignment labels
# labels = km.labels_
# # Format results as a DataFrame
# malv_results = pd.DataFrame([malv.T.index,labels]).T.set_index(0).rename(columns={1:'malv'})
In [6]:
# #from https://stackoverflow.com/a/30111487/6871844
# # Convert DataFrame to matrix
# sf = sf.fillna(0)
# mat = sf.T.as_matrix()
# # Using sklearn
# km = sklearn.cluster.KMeans(n_clusters=2)
# km.fit(mat)
# # Get cluster assignment labels
# labels = km.labels_
# # Format results as a DataFrame
# sf_results = pd.DataFrame([sf.T.index,labels]).T.set_index(0).rename(columns={1:'sf'})
In [7]:
# #from https://stackoverflow.com/a/30111487/6871844
# # Convert DataFrame to matrix
# utl = utl.fillna(0)
# mat = utl.T.as_matrix()
# # Using sklearn
# km = sklearn.cluster.KMeans(n_clusters=2)
# km.fit(mat)
# # Get cluster assignment labels
# labels = km.labels_
# # Format results as a DataFrame
# utl_results = pd.DataFrame([utl.T.index,labels]).T.set_index(0).rename(columns={1:'umi_tools'})
In [ ]:
In [ ]:
In [3]:
tr_results = pd.read_table("../../data/10x/mohu/1k/whitelist.tsv", header=None).set_index(0)
tr_results['tr'] = [0]*504 + [1]*516
In [4]:
import collections
print [item for item, count in collections.Counter(tr_results.index).items() if count > 1]
In [5]:
tr_results = tr_results.drop(['AGCTCCTGTTGTGGAG', 'ACGCCAGTCGGATGGA', 'GGCAATTAGGAATCGC'])
In [6]:
first_cluster = tr_results[tr_results['tr'] == 1].index
In [7]:
second_cluster = tr_results[tr_results['tr'] == 0].index
In [ ]:
In [8]:
malv_results = malv[first_cluster].fillna(0)
sf_results = sf[first_cluster].fillna(0)
utl_results = utl[first_cluster].fillna(0)
kal_results = kal[first_cluster].fillna(0)
alv_results = alv[first_cluster].fillna(0)
In [ ]:
In [92]:
#combine data
In [93]:
# results = pd.concat([tr_results, malv_results, sf_results, utl_results, kal_results], axis=1)
# results = pd.concat([tr_results, kal_results], axis=1)
In [94]:
# results.sum()
In [95]:
# alv_intra = alv[results[results['alv']==1].index]
# malv_intra = malv[results[results['malv']==0].index]
# utl_intra = utl[results[results['umi_tools']==1].index]
# sf_intra = sf[results[results['sf']==0].index]
# kal_intra = kal[results[results['kallisto']==0].index]
In [9]:
# genes = set(alv_intra.index) & set(utl_intra.index) & set(sf_intra.index) & set(kal_intra.index)
genes = set(malv_results.index) & set(kal_results.index)
In [10]:
len(genes), len(malv_results.index)
Out[10]:
In [ ]:
# alv_intra = alv[results[results['alv']==0].index].corr(method="spearman")
# utl_intra = utl[results[results['umi_tools']==1].index].corr(method="spearman")
# sf_intra = sf[results[results['sf']==1].index].corr(method="spearman")
# kal_intra = kal[results[results['kallisto']==0].index].corr(method="spearman")
# alv_intra = alv_results.loc[genes].corr(method="spearman")
# malv_intra = malv_results.corr(method="spearman")
# utl_intra = utl_results.corr(method="spearman")
# sf_intra = sf_results.loc[genes].corr(method="spearman")
# kal_intra = kal_results.loc[genes].corr(method="spearman")
alv_intra = alv_results.corr(method="spearman")
malv_intra = malv_results.corr(method="spearman")
utl_intra = utl_results.corr(method="spearman")
sf_intra = sf_results.corr(method="spearman")
kal_intra = kal_results.corr(method="spearman")
In [ ]:
In [112]:
#make intra cluster correlations
In [113]:
# alv_intra = alv[results[results['alv']==1].index].corr(method="spearman")
# utl_intra = utl[results[results['umi_tools']==0].index].corr(method="spearman")
# sf_intra = sf[results[results['sf']==0].index].corr(method="spearman")
# kal_intra = kal[results[results['kallisto']==0].index].corr(method="spearman")
In [12]:
malv_intra.shape, kal_intra.shape
Out[12]:
In [14]:
# plt.subplot(2, 3, 1)
# plt.axis('off')
# plt.title("alevin-bc")
# sns.heatmap(alv_intra)
plt.subplot(2, 2, 1)
plt.axis('off')
plt.title("multi_alevin")
sns.heatmap(malv_intra)
# plt.subplot(2, 3, 3)
# plt.axis('off')
# plt.title("old_alevin")
# sns.heatmap(sf_intra)
# plt.subplot(2, 3, 4)
# plt.axis('off')
# plt.title("umi_tools")
# sns.heatmap(utl_intra)
plt.subplot(2, 2, 2)
plt.axis('off')
plt.title("kallisto")
sns.heatmap(kal_intra)
Out[14]:
In [ ]:
alv_list = []
sf_list = []
utl_list = []
malv_list = []
kal_list = []
for x in alv_intra.values:
for e in x.tolist():
if e !=0 and e != 1:
alv_list.append(e)
for x in malv_intra.values:
for e in x.tolist():1
if e !=0 and e != 1:
malv_list.append(e)
for x in sf_intra.values:
for e in x.tolist():
if e !=0 and e != 1:
sf_list.append(e)
for x in utl_intra.values:
for e in x.tolist():
if e !=0 and e != 1:
utl_list.append(e)
for x in kal_intra.values:
for e in x.tolist():
if e !=0 and e != 1:
kal_list.append(e)
In [ ]:
# import seaborn
# np.random.seed(0)
# n_bins = 30
# # x = np.random.randn(1000, 3)
# x = [alv_list, sf_list, utl_list]
# colors = ['red', 'blue', 'lime']
# n, x, _ = plt.hist(x, n_bins, histtype='bar', color=colors, label=['alevin', 'eq_alevin', 'umi_tools'])
# plt.legend(prop={'size': 10})
# plt.show()
In [90]:
# sns.kdeplot(np.array(alv_list), label= "alv-bc")
sns.kdeplot(np.array(kal_list), label= "kallisto")
sns.kdeplot(np.array(malv_list), label= "malv")
# sns.kdeplot(np.array(sf_list), label = "alevin")
sns.kdeplot(np.array(utl_list), label = "Umi-tools")
plt.legend()
Out[90]:
In [110]:
# sns.kdeplot(np.array(alv_list), label= "alv-bc")
sns.kdeplot(np.array(kal_list), label= "kallisto")
sns.kdeplot(np.array(malv_list), label= "malv")
# sns.kdeplot(np.array(sf_list), label = "alevin")
sns.kdeplot(np.array(utl_list), label = "Umi-tools")
plt.legend()
Out[110]:
In [16]:
# sns.kdeplot(np.array(alv_list), label= "alv-bc")
sns.kdeplot(np.array(kal_list), label= "kallisto")
sns.kdeplot(np.array(malv_list), label= "malv")
# sns.kdeplot(np.array(sf_list), label = "alevin")
# sns.kdeplot(np.array(utl_list), label = "Umi-tools")
plt.legend()
Out[16]:
In [21]:
# sns.kdeplot(np.array(alv_list), label= "alv-bc")
sns.kdeplot(np.array(kal_list), label= "kallisto")
sns.kdeplot(np.array(malv_list), label= "malv")
# sns.kdeplot(np.array(sf_list), label = "alevin")
sns.kdeplot(np.array(utl_list), label = "Umi-tools")
plt.legend()
Out[21]:
In [2]:
# len(malv_list), len(utl_list)
In [3]:
# malv.shape, utl.shape
In [4]:
# malv_intra.shape, utl_intra.shape
In [37]:
# malv_intra.to_csv("malv.mat")
# utl_intra.to_csv("utl.mat")
In [40]:
In [53]:
# malv_intra.rename(columns=lambda x: x+"-1", inplace=True)
In [54]:
# utl_intra.rename(columns=lambda x: x+"-2", inplace=True)
In [57]:
# ct = pd.concat([malv_intra, utl_intra], axis=1).fillna(0)
In [59]:
# ct.to_csv("agg.mat")
In [ ]: