In [1]:
from collections import defaultdict
from random import random
import pysam
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()
import glob
import os
import sklearn
from sklearn import cluster
import cPickle as pickle
In [2]:
def read_eq(eqFile, dfname):
with open(eqFile) as f:
T = int(f.readline())
# E = int(f.readline())
txps = []
for _ in range(T):
txps.append(f.readline().strip())
counts = defaultdict(int)
for line in f:
toks = line.strip().split()
t = int(toks[0])
eqCount = int(toks[t+1])
gSet = set([])
for tid in range(t):
gSet.add(t2g[txps[int(toks[tid+1])]])
if len(gSet) == 1:
counts[list(gSet)[0]] += eqCount
return pd.DataFrame(counts.items()).set_index(0).rename(columns={1:dfname})
def read_sal(sfFile, dfname):
data = pd.DataFrame(pd.read_table(sfFile).set_index('Name')['NumMolecules'])
with open(t2g_fname) as f:
t2g = pd.read_table(f, sep=",").set_index('TX_NAME')['GENE_NAME']
ct = pd.concat([data, t2g], axis=1)
gene_ct = ct.groupby('GENE_NAME').sum()
return gene_ct[(gene_ct['NumMolecules'] != 0)].rename(columns={'NumMolecules':dfname})
def read_ut_out(utFile, dfname):
ut_data = pd.read_table(open(utFile))
cell_ut_data = ut_data.set_index('gene')
del cell_ut_data['cell']
cell_ut_data = cell_ut_data.rename(columns={'count':dfname})
return cell_ut_data
def read_kal_out(t2gFile, numCells, txpList, ecMatFile, countTsvFile, dfname):
with open(txpList) as f:
tlist = pd.read_table(f, header=None).rename(columns={0:'tid'})
eclass = []
with open(ecMatFile) as f:
for line in f:
eclass.append( tuple(line.strip().split()[1].split(',')) )
kal_count = [defaultdict(int) for _ in range(numCells)]
progress = 0
with open(countTsvFile) as f:
for line in f:
eqId, cell_id, ct = line.strip().split()
txps = eclass[int(eqId)]
progress = cell_id
gSet = set([])
for txp in txps:
gSet.add(t2g[tlist.loc[int(txp)]['tid']])
if len(gSet) == 1:
kal_count[int(cell_id)][list(gSet)[0]] += int(ct)
print "\r Done"+str(progress),
return kal_count
In [ ]:
In [ ]:
In [ ]:
In [3]:
t2g_fname = "../../data/mohu/gtf/txp2gene.tsv"
txpList_fname = "../kalData/txpList.txt"
with open(t2g_fname) as f:
t2g = pd.read_table(f, sep=",").set_index('TX_NAME').to_dict()['GENE_NAME']
In [ ]:
In [ ]:
In [4]:
alv_full = pd.read_csv('./fast-eq-1k.mat', index_col=[0])
qalv_full = pd.read_csv('./fast-malv-1k.mat', index_col=[0])
utl_full = pd.read_csv('./utl-1k.mat', index_col=[0])
kal_full = pd.read_csv('./kal-1k.mat', index_col=[0])
In [ ]:
In [ ]:
In [ ]:
In [5]:
first = True
count = 0
for dname in glob.glob("/mnt/scratch5/avi/alevin/subsample_testing/alv_quarter/alevin/cell/*"):
count += 1
print "\r"+str(count),
if first:
first = False
qalv_quart = read_sal( dname+'/quant.sf', os.path.basename(dname))
else:
temp = read_sal( dname+'/quant.sf', os.path.basename(dname))
qalv_quart = pd.concat([qalv_quart, temp], axis=1)
In [9]:
first = True
count = 0
for dname in glob.glob("/mnt/scratch5/avi/alevin/subsample_testing/alv_quarter/alevin/cell/*"):
count += 1
print "\r"+str(count),
if first:
first = False
alv_quart = read_eq( dname+'/cell_eq_classes.txt', os.path.basename(dname))
else:
temp = read_eq( dname+'/cell_eq_classes.txt', os.path.basename(dname))
alv_quart = pd.concat([alv_quart, temp], axis=1)
In [8]:
first = True
for dname in glob.glob("/mnt/scratch5/avi/alevin/subsample_testing/utData/*"):
if first:
first = False
utl_quart = read_ut_out( dname+'/utools.count', os.path.basename(dname))
else:
temp = read_ut_out( dname+'/utools.count', os.path.basename(dname))
utl_quart = pd.concat([utl_quart, temp], axis=1)
In [5]:
# #kal-input
first = True
for dname in glob.glob("../../subsample_testing/kalData_quarter/quants/"):
kal_quart = read_kal_out(t2g_fname, 1017, txpList_fname, dname+"/matrix.ec", dname+"/matrix.tsv", os.path.basename(dname))
kal_quart = pd.DataFrame.from_dict(kal_quart).T
In [ ]:
In [6]:
# alv_quart.to_csv('./alv_quarter.mat')
# utl_quart.to_csv('./utl_quarter.mat')
# kal_quart.to_csv('./kal_quarter.mat')
qalv_quart.to_csv('./qalv_quarter.mat')
In [ ]:
In [7]:
alv_quart = pd.read_csv('./alv_quarter.mat', index_col=[0])
utl_quart = pd.read_csv('./utl_quarter.mat', index_col=[0])
kal_quart = pd.read_csv('./kal_quarter.mat', index_col=[0])
In [8]:
alv_full.shape, utl_full.shape, kal_full.shape, qalv_full.shape
Out[8]:
In [9]:
alv_quart.shape, utl_quart.shape, kal_quart.shape, qalv_quart.shape
Out[9]:
In [ ]:
In [10]:
alv_full.fillna(0, inplace=True)
utl_full.fillna(0, inplace=True)
kal_full.fillna(0, inplace=True)
qalv_full.fillna(0, inplace=True)
alv_quart.fillna(0, inplace=True)
utl_quart.fillna(0, inplace=True)
kal_quart.fillna(0, inplace=True)
qalv_quart.fillna(0, inplace=True)
In [11]:
alv_full = alv_full[(alv_full.T != 0).any()]
utl_full = utl_full[(utl_full.T != 0).any()]
kal_full = kal_full[(kal_full.T != 0).any()]
qalv_full = qalv_full[(qalv_full.T != 0).any()]
alv_quart = alv_quart[(alv_quart.T != 0).any()]
utl_quart = utl_quart[(utl_quart.T != 0).any()]
kal_quart = kal_quart[(kal_quart.T != 0).any()]
qalv_quart = qalv_quart[(qalv_quart.T != 0).any()]
In [12]:
alv_full.shape, utl_full.shape, kal_full.shape, qalv_full.shape
Out[12]:
In [13]:
alv_quart.shape, utl_quart.shape, kal_quart.shape, qalv_quart.shape
Out[13]:
In [14]:
sum(alv_full.sum()), sum(utl_full.sum()), sum(kal_full.sum()), sum(qalv_full.sum())
Out[14]:
In [15]:
sum(alv_quart.sum()), sum(utl_quart.sum()), sum(kal_quart.sum()), sum(qalv_quart.sum())
Out[15]:
In [16]:
cell_names=[]
with open("../../subsample_testing/kalData_quarter/batch.txt") as f:
for line in f:
cell_names.append(line.strip().split()[1].split("/")[1])
In [ ]:
In [30]:
name_dict = {}
for inx, cn in enumerate(cell_names):
name_dict[str(inx)] = cn
kal_quart.rename(columns=name_dict)
In [ ]:
In [32]:
corrs_utl = []
corrs_alv = []
corrs_kal = []
corrs_qalv = []
for count, cell in enumerate(alv_full):
print "\r Done " + str(count),
temp = pd.concat([utl_full[cell], utl_quart[cell]], axis=1).fillna(0)
temp = temp[(temp.T != 0).any()]
corrs_utl.append(temp.corr(method="spearman").iloc[1][0])
ind = temp.index
full = alv_full[cell]
quart = alv_quart[cell]
temp = pd.concat([pd.DataFrame(full.loc[ind]), quart.loc[ind]], axis=1).fillna(0)
temp = temp[(temp.T != 0).any()]
corrs_alv.append(temp.corr(method="spearman").iloc[1][0])
full = kal_full[cell]
quart = kal_quart[cell]
temp = pd.concat([pd.DataFrame(full.loc[ind]), quart.loc[ind]], axis=1).fillna(0)
temp = temp[(temp.T != 0).any()]
corrs_kal.append(temp.corr(method="spearman").iloc[1][0])
full = qalv_full[cell]
quart = qalv_quart[cell]
temp = pd.concat([pd.DataFrame(full.loc[ind]), quart.loc[ind]], axis=1).fillna(0)
temp = temp[(temp.T != 0).any()]
corrs_qalv.append(temp.corr(method="spearman").iloc[1][0])
In [35]:
sns.kdeplot(np.array(corrs_utl), color='red', label = "utools")
sns.kdeplot(np.array(corrs_alv), color='blue', label = "alevin")
sns.kdeplot(np.array(corrs_kal), color='green', label = "kallisto")
sns.kdeplot(np.array(corrs_qalv), color='cyan', label = "Full_alv")
plt.legend(loc=2)
Out[35]:
In [ ]:
In [21]:
corrs_utl = []
corrs_alv = []
corrs_cross = []
corrs_rev = []
corrs_full = []
corrs_quart = []
for count, cell in enumerate(alv_full):
print "\r Done " + str(count),
temp = pd.concat([utl_full[cell], alv_quart[cell]], axis=1).fillna(0)
temp = temp[(temp.T != 0).any()]
corrs_cross.append(temp.corr(method="spearman").iloc[1][0])
temp = pd.concat([utl_full[cell], alv_full[cell]], axis=1).fillna(0)
temp = temp[(temp.T != 0).any()]
corrs_full.append(temp.corr(method="spearman").iloc[1][0])
temp = pd.concat([utl_quart[cell], alv_quart[cell]], axis=1).fillna(0)
temp = temp[(temp.T != 0).any()]
corrs_quart.append(temp.corr(method="spearman").iloc[1][0])
temp = pd.concat([alv_full[cell], utl_quart[cell]], axis=1).fillna(0)
temp = temp[(temp.T != 0).any()]
corrs_rev.append(temp.corr(method="spearman").iloc[1][0])
temp = pd.concat([utl_full[cell], utl_quart[cell]], axis=1).fillna(0)
temp = temp[(temp.T != 0).any()]
corrs_utl.append(temp.corr(method="spearman").iloc[1][0])
ind = temp.index
full = alv_full[cell]
quart = alv_quart[cell]
temp = pd.concat([pd.DataFrame(full.loc[ind]), quart.loc[ind]], axis=1).fillna(0)
temp = temp[(temp.T != 0).any()]
corrs_alv.append(temp.corr(method="spearman").iloc[1][0])
In [22]:
sns.kdeplot(np.array(corrs_utl), color='red', label = "utools")
sns.kdeplot(np.array(corrs_alv), color='blue', label = "alevin")
sns.kdeplot(np.array(corrs_cross), color='green', label = "utl full v alv quarter")
sns.kdeplot(np.array(corrs_rev), color='yellow', label = "alv full v utl quarter")
sns.kdeplot(np.array(corrs_full), color='black', label = "full v full")
sns.kdeplot(np.array(corrs_quart), color='brown', label = "quarter v quarter")
plt.legend(loc=2)
Out[22]:
In [ ]: