In [1]:
from collections import defaultdict
from random import random
import pysam
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()
import glob
import os
import sklearn
from sklearn import cluster
import cPickle as pickle


/home/avi/miniconda2/lib/python2.7/site-packages/IPython/html.py:14: ShimWarning: The `IPython.html` package has been deprecated since IPython 4.0. You should import from `notebook` instead. `IPython.html.widgets` has moved to `ipywidgets`.
  "`IPython.html.widgets` has moved to `ipywidgets`.", ShimWarning)

In [2]:
def read_eq(eqFile, dfname):
    with open(eqFile) as f:
        T = int(f.readline())
#         E = int(f.readline())
        txps = []
        for _ in range(T):
            txps.append(f.readline().strip())

        counts = defaultdict(int)
        for line in f:
            toks = line.strip().split()
            t = int(toks[0])
            eqCount = int(toks[t+1])

            gSet = set([])
            for tid in range(t):
                gSet.add(t2g[txps[int(toks[tid+1])]])
            if len(gSet) == 1:
                counts[list(gSet)[0]] += eqCount
    return pd.DataFrame(counts.items()).set_index(0).rename(columns={1:dfname})

def read_sal(sfFile, dfname):
    data = pd.DataFrame(pd.read_table(sfFile).set_index('Name')['NumMolecules'])
    with open(t2g_fname) as f:
        t2g = pd.read_table(f, sep=",").set_index('TX_NAME')['GENE_NAME']
    ct = pd.concat([data, t2g], axis=1)
    gene_ct = ct.groupby('GENE_NAME').sum()
    return gene_ct[(gene_ct['NumMolecules'] != 0)].rename(columns={'NumMolecules':dfname})

def read_ut_out(utFile, dfname):
    ut_data = pd.read_table(open(utFile))
    cell_ut_data = ut_data.set_index('gene')
    del cell_ut_data['cell']
    cell_ut_data = cell_ut_data.rename(columns={'count':dfname})
    return cell_ut_data

def read_kal_out(t2gFile, numCells, txpList, ecMatFile, countTsvFile, dfname):
    with open(txpList) as f:
        tlist = pd.read_table(f, header=None).rename(columns={0:'tid'})

    eclass = []
    with open(ecMatFile) as f:
        for line in f:
            eclass.append( tuple(line.strip().split()[1].split(',')) )
            
    kal_count = [defaultdict(int) for _ in range(numCells)]
    progress = 0
    with open(countTsvFile) as f:
        for line in f:
            eqId, cell_id, ct = line.strip().split()
            txps = eclass[int(eqId)]
            progress = cell_id
            gSet = set([])
            for txp in txps:
                gSet.add(t2g[tlist.loc[int(txp)]['tid']])
            if len(gSet) == 1:
                kal_count[int(cell_id)][list(gSet)[0]] += int(ct)
            print "\r Done"+str(progress),
    return kal_count

In [ ]:


In [ ]:


In [ ]:


In [3]:
t2g_fname = "../../data/mohu/gtf/txp2gene.tsv"
txpList_fname = "../kalData/txpList.txt"
with open(t2g_fname) as f:
    t2g = pd.read_table(f, sep=",").set_index('TX_NAME').to_dict()['GENE_NAME']

In [ ]:


In [ ]:


In [4]:
alv_full = pd.read_csv('./fast-eq-1k.mat', index_col=[0])
qalv_full = pd.read_csv('./fast-malv-1k.mat', index_col=[0])
utl_full = pd.read_csv('./utl-1k.mat', index_col=[0])
kal_full = pd.read_csv('./kal-1k.mat', index_col=[0])

In [ ]:


In [ ]:


In [ ]:


In [5]:
first = True
count = 0
for dname in glob.glob("/mnt/scratch5/avi/alevin/subsample_testing/alv_quarter/alevin/cell/*"):
    count += 1
    print "\r"+str(count),
    if first:
        first = False
        qalv_quart = read_sal( dname+'/quant.sf', os.path.basename(dname))
    else:
        temp = read_sal( dname+'/quant.sf', os.path.basename(dname))
        qalv_quart = pd.concat([qalv_quart, temp], axis=1)


1017                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                     

In [9]:
first = True
count = 0
for dname in glob.glob("/mnt/scratch5/avi/alevin/subsample_testing/alv_quarter/alevin/cell/*"):
    count += 1
    print "\r"+str(count),
    if first:
        first = False
        alv_quart = read_eq( dname+'/cell_eq_classes.txt', os.path.basename(dname))
    else:
        temp = read_eq( dname+'/cell_eq_classes.txt', os.path.basename(dname))
        alv_quart = pd.concat([alv_quart, temp], axis=1)


1017                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                               

In [8]:
first = True
for dname in glob.glob("/mnt/scratch5/avi/alevin/subsample_testing/utData/*"):
    if first:
        first = False
        utl_quart = read_ut_out( dname+'/utools.count', os.path.basename(dname))
    else:
        temp = read_ut_out( dname+'/utools.count', os.path.basename(dname))
        utl_quart = pd.concat([utl_quart, temp], axis=1)

In [5]:
# #kal-input
first = True
for dname in glob.glob("../../subsample_testing/kalData_quarter/quants/"):
    kal_quart = read_kal_out(t2g_fname, 1017, txpList_fname, dname+"/matrix.ec", dname+"/matrix.tsv", os.path.basename(dname))

kal_quart = pd.DataFrame.from_dict(kal_quart).T


 Done1016                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                 

In [ ]:


In [6]:
# alv_quart.to_csv('./alv_quarter.mat')
# utl_quart.to_csv('./utl_quarter.mat')
# kal_quart.to_csv('./kal_quarter.mat')
qalv_quart.to_csv('./qalv_quarter.mat')

In [ ]:


In [7]:
alv_quart = pd.read_csv('./alv_quarter.mat', index_col=[0])
utl_quart = pd.read_csv('./utl_quarter.mat', index_col=[0])
kal_quart = pd.read_csv('./kal_quarter.mat', index_col=[0])

In [8]:
alv_full.shape, utl_full.shape, kal_full.shape, qalv_full.shape


Out[8]:
((53284, 1017), (44021, 1017), (51937, 1017), (61185, 1017))

In [9]:
alv_quart.shape, utl_quart.shape, kal_quart.shape, qalv_quart.shape


Out[9]:
((43414, 1017), (36313, 1017), (42817, 1017), (51992, 1017))

In [ ]:


In [10]:
alv_full.fillna(0, inplace=True)
utl_full.fillna(0, inplace=True)
kal_full.fillna(0, inplace=True)
qalv_full.fillna(0, inplace=True)

alv_quart.fillna(0, inplace=True)
utl_quart.fillna(0, inplace=True)
kal_quart.fillna(0, inplace=True)
qalv_quart.fillna(0, inplace=True)

In [11]:
alv_full = alv_full[(alv_full.T != 0).any()]
utl_full = utl_full[(utl_full.T != 0).any()]
kal_full = kal_full[(kal_full.T != 0).any()]
qalv_full = qalv_full[(qalv_full.T != 0).any()]

alv_quart = alv_quart[(alv_quart.T != 0).any()]
utl_quart = utl_quart[(utl_quart.T != 0).any()]
kal_quart = kal_quart[(kal_quart.T != 0).any()]
qalv_quart = qalv_quart[(qalv_quart.T != 0).any()]

In [12]:
alv_full.shape, utl_full.shape, kal_full.shape, qalv_full.shape


Out[12]:
((53284, 1017), (44021, 1017), (51937, 1017), (59671, 1017))

In [13]:
alv_quart.shape, utl_quart.shape, kal_quart.shape, qalv_quart.shape


Out[13]:
((43414, 1017), (36313, 1017), (42817, 1017), (50478, 1017))

In [14]:
sum(alv_full.sum()), sum(utl_full.sum()), sum(kal_full.sum()), sum(qalv_full.sum())


Out[14]:
(28396841.0, 26494794.0, 28662799.0, 37562052.697470345)

In [15]:
sum(alv_quart.sum()), sum(utl_quart.sum()), sum(kal_quart.sum()), sum(qalv_quart.sum())


Out[15]:
(8216180.0, 8066631.0, 8250236.0, 10720778.777274361)

In [16]:
cell_names=[]
with open("../../subsample_testing/kalData_quarter/batch.txt") as f:
    for line in f:
        cell_names.append(line.strip().split()[1].split("/")[1])

In [ ]:


In [30]:
name_dict = {}
for inx, cn in enumerate(cell_names):
    name_dict[str(inx)] = cn
kal_quart.rename(columns=name_dict)

In [ ]:


In [32]:
corrs_utl = []
corrs_alv = []
corrs_kal = []
corrs_qalv = []

for count, cell in enumerate(alv_full):
    print "\r Done " + str(count),
    
    temp = pd.concat([utl_full[cell], utl_quart[cell]], axis=1).fillna(0)
    temp = temp[(temp.T != 0).any()]
    corrs_utl.append(temp.corr(method="spearman").iloc[1][0])
    
    ind = temp.index
    full = alv_full[cell]
    quart = alv_quart[cell]
    temp = pd.concat([pd.DataFrame(full.loc[ind]), quart.loc[ind]], axis=1).fillna(0)
    temp = temp[(temp.T != 0).any()]
    corrs_alv.append(temp.corr(method="spearman").iloc[1][0])
    
    full = kal_full[cell]
    quart = kal_quart[cell]
    temp = pd.concat([pd.DataFrame(full.loc[ind]), quart.loc[ind]], axis=1).fillna(0)
    temp = temp[(temp.T != 0).any()]
    corrs_kal.append(temp.corr(method="spearman").iloc[1][0])
    
    full = qalv_full[cell]
    quart = qalv_quart[cell]
    temp = pd.concat([pd.DataFrame(full.loc[ind]), quart.loc[ind]], axis=1).fillna(0)
    temp = temp[(temp.T != 0).any()]
    corrs_qalv.append(temp.corr(method="spearman").iloc[1][0])


 Done 1016                                                                                                                                                                                

In [35]:
sns.kdeplot(np.array(corrs_utl), color='red', label = "utools")
sns.kdeplot(np.array(corrs_alv), color='blue', label = "alevin")
sns.kdeplot(np.array(corrs_kal), color='green', label = "kallisto")
sns.kdeplot(np.array(corrs_qalv), color='cyan', label = "Full_alv")
plt.legend(loc=2)


Out[35]:
<matplotlib.legend.Legend at 0x7fe831a97350>

In [ ]:


In [21]:
corrs_utl = []
corrs_alv = []
corrs_cross = []
corrs_rev = []
corrs_full = []
corrs_quart = []

for count, cell in enumerate(alv_full):
    print "\r Done " + str(count),
    
    temp = pd.concat([utl_full[cell], alv_quart[cell]], axis=1).fillna(0)
    temp = temp[(temp.T != 0).any()]
    corrs_cross.append(temp.corr(method="spearman").iloc[1][0])

    temp = pd.concat([utl_full[cell], alv_full[cell]], axis=1).fillna(0)
    temp = temp[(temp.T != 0).any()]
    corrs_full.append(temp.corr(method="spearman").iloc[1][0])
    
    temp = pd.concat([utl_quart[cell], alv_quart[cell]], axis=1).fillna(0)
    temp = temp[(temp.T != 0).any()]
    corrs_quart.append(temp.corr(method="spearman").iloc[1][0])
    
    temp = pd.concat([alv_full[cell], utl_quart[cell]], axis=1).fillna(0)
    temp = temp[(temp.T != 0).any()]
    corrs_rev.append(temp.corr(method="spearman").iloc[1][0])
    
    temp = pd.concat([utl_full[cell], utl_quart[cell]], axis=1).fillna(0)
    temp = temp[(temp.T != 0).any()]
    corrs_utl.append(temp.corr(method="spearman").iloc[1][0])
    
    ind = temp.index
    full = alv_full[cell]
    quart = alv_quart[cell]
    temp = pd.concat([pd.DataFrame(full.loc[ind]), quart.loc[ind]], axis=1).fillna(0)
    temp = temp[(temp.T != 0).any()]
    corrs_alv.append(temp.corr(method="spearman").iloc[1][0])


 Done 1016                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                        

In [22]:
sns.kdeplot(np.array(corrs_utl), color='red', label = "utools")
sns.kdeplot(np.array(corrs_alv), color='blue', label = "alevin")
sns.kdeplot(np.array(corrs_cross), color='green', label = "utl full v alv quarter")
sns.kdeplot(np.array(corrs_rev), color='yellow', label = "alv full v utl quarter")
sns.kdeplot(np.array(corrs_full), color='black', label = "full v full")
sns.kdeplot(np.array(corrs_quart), color='brown', label = "quarter v quarter")
plt.legend(loc=2)


Out[22]:
<matplotlib.legend.Legend at 0x7f10985c7090>

In [ ]: