In [1]:
from collections import defaultdict
from random import random
import pysam
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()
import glob
import os
import sklearn
from sklearn import cluster


/home/avi/miniconda2/lib/python2.7/site-packages/IPython/html.py:14: ShimWarning: The `IPython.html` package has been deprecated since IPython 4.0. You should import from `notebook` instead. `IPython.html.widgets` has moved to `ipywidgets`.
  "`IPython.html.widgets` has moved to `ipywidgets`.", ShimWarning)

In [2]:
#reading functions

In [3]:
def read_kal_out(t2gFile, txpList, ecMatFile, countTsvFile, dfname):
    with open(txpList) as f:
        tlist = pd.read_table(f, header=None).rename(columns={0:'tid'})

    with open(countTsvFile) as f:
        counts = pd.DataFrame(pd.read_table(f, header=None).set_index(0)[2]).rename(columns={2:'counts'})
    sorted_idx = sorted(counts.to_dict()['counts'].keys())
    eqIdx = [0] * (sorted_idx[-1]+1)
    for elem in sorted_idx:
        eqIdx[elem] = 1
        
    kal_gene_count = defaultdict(int)
    other = 0
    eqCount = 0
    with open(ecMatFile) as f:
        for idx, line in enumerate(f):
            if eqIdx[idx]:
                eqCount += 1
                txps = line.strip().split()[1].split(',')
                gSet = set([])
                for txp in txps:
                    gSet.add(t2g[tlist.loc[int(txp)]['tid']])
                if len(gSet) == 1:
                    kal_gene_count[list(gSet)[0]] += counts.loc[idx]['counts']
                else:
                    other += counts.loc[idx]['counts']
        print eqCount
    
    kal_count = pd.DataFrame([kal_gene_count], columns=kal_gene_count.keys()).T.rename(columns={0:dfname})
    return kal_count

In [4]:
def read_eq(eqFile, dfname):
    with open(t2g_fname) as f:
        t2g = pd.read_table(f, sep=",").set_index('TX_NAME').to_dict()['GENE_NAME']
    with open(eqFile) as f:
        T = int(f.readline())
        E = int(f.readline())
        txps = []
        for _ in range(T):
            txps.append(f.readline().strip())

        counts = defaultdict(int)
        for line in f:
            toks = line.strip().split()
            t = int(toks[0])
            eqCount = int(toks[t+1])

            gSet = set([])
            for tid in range(t):
                gSet.add(t2g[txps[int(toks[tid+1])]])
            if len(gSet) == 1:
                counts[list(gSet)[0]] += eqCount
    return pd.DataFrame(counts.items()).set_index(0).rename(columns={1:dfname})

In [5]:
def read_sal(sfFile, dfname):
    data = pd.DataFrame(pd.read_table(sfFile).set_index('Name')['NumReads'])
    with open(t2g_fname) as f:
        t2g = pd.read_table(f, sep=",").set_index('TX_NAME')['GENE_NAME']
    ct = pd.concat([data, t2g], axis=1)
    gene_ct = ct.groupby('GENE_NAME').sum()
    return gene_ct[(gene_ct['NumReads'] != 0)].rename(columns={'NumReads':dfname})

In [6]:
def read_ut_out(utFile, dfname):
    ut_data = pd.read_table(open(utFile))
    cell_ut_data = ut_data.set_index('gene')
    del cell_ut_data['cell']
    cell_ut_data = cell_ut_data.rename(columns={'count':dfname})
    return cell_ut_data

In [7]:
t2g_fname = "../../data/mohu/gtf/txp2gene.tsv"
txpList_fname = "../kalData/txpList.txt"

In [8]:
with open(t2g_fname) as f:
    t2g = pd.read_table(f, sep=",").set_index('TX_NAME').to_dict()['GENE_NAME']

In [8]:
#read data

In [9]:
#kal-input

first = True
for dname in glob.glob("../kalData/*"):
    if dname != "../kalData/txpList.txt":
        if first:
            first=False
            kal = read_kal_out(t2g_fname, txpList_fname, dname+"/matrix.ec", dname+"/matrix.tsv", os.path.basename(dname))
        else:
            temp = read_kal_out(t2g_fname, txpList_fname, dname+"/matrix.ec", dname+"/matrix.tsv", os.path.basename(dname))
            kal = pd.concat([kal, temp], axis=1)


10613
13815
6840
16943
8656
11176
15025
12288
17066
7647
7730
8450
11061
9122
8954
12313
10692
11576
10306
17357
11269
7838
4975
6846
8030
9935
7655
11568
6513
7205
6558
13833
6295
8409
12504
14724
12537
13378
7529
7645
12268
6281
7121
14149
13726
13192
12639
11852
13469
8590
9637
13613
7411
14189
10901
6718
14904
12513
11894
7694
8767
9026
13066
15763
11975
3171
13423
1944
8018
13631
6544
10839
11182
7754
8022
6729
8075
11728
8258
7175
7743
8809
7063
10937
7367
8410
15286
12557
8367
11866
12893
12280
6165
13180
6931
10397
9878
13904
8171
15923

In [17]:
first = True
for dname in glob.glob("../salOut/*"):
    if first:
        first = False
        sf = read_sal( dname+'/alevin/quant.sf', os.path.basename(dname))
    else:
        temp = read_sal( dname+'/alevin/quant.sf', os.path.basename(dname))
        sf = pd.concat([sf, temp], axis=1)

In [9]:
first = True
for dname in glob.glob("../salOut/alevin/cell/*"):
    if first:
        first = False
        malv = read_sal( dname+'/quant.sf', os.path.basename(dname))
    else:
        temp = read_sal( dname+'/quant.sf', os.path.basename(dname))
        malv = pd.concat([malv, temp], axis=1)

In [11]:
first = True
for dname in glob.glob("../salOut/*"):
    if first:
        first = False
        alv = read_eq( dname+'/alevin/counts.alv', os.path.basename(dname))
    else:
        temp = read_eq( dname+'/alevin/counts.alv', os.path.basename(dname))
        alv = pd.concat([alv, temp], axis=1)

In [10]:
#utools import
first = True
for dname in glob.glob("../utData/*"):
    if first:
        first = False
        utl = read_ut_out( dname+'/utools.count', os.path.basename(dname))
    else:
        temp = read_ut_out( dname+'/utools.count', os.path.basename(dname))
        utl = pd.concat([utl, temp], axis=1)

In [ ]:


In [13]:
#create clusters

In [19]:
kal = kal.fillna(0)

#from https://stackoverflow.com/a/30111487/6871844
# Convert DataFrame to matrix
mat = kal.T.as_matrix()
# Using sklearn
km = sklearn.cluster.KMeans(n_clusters=2)
km.fit(mat)
# Get cluster assignment labels
labels = km.labels_
# Format results as a DataFrame
kal_results = pd.DataFrame([kal.T.index,labels]).T.set_index(0).rename(columns={1:'kallisto'})

In [24]:
#from https://stackoverflow.com/a/30111487/6871844
# Convert DataFrame to matrix
alv = alv.fillna(0)
mat = alv.T.as_matrix()
# Using sklearn
km = sklearn.cluster.KMeans(n_clusters=2)
km.fit(mat)
# Get cluster assignment labels
labels = km.labels_
# Format results as a DataFrame
alv_results = pd.DataFrame([alv.T.index,labels]).T.set_index(0).rename(columns={1:'alv'})


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-24-41dfe2cefe0e> in <module>()
      3 #from https://stackoverflow.com/a/30111487/6871844
      4 # Convert DataFrame to matrix
----> 5 alv = alv.fillna(0)
      6 mat = alv.T.as_matrix()
      7 # Using sklearn

NameError: name 'alv' is not defined

In [ ]:


In [11]:
#from https://stackoverflow.com/a/30111487/6871844
# Convert DataFrame to matrix
malv = malv.fillna(0)
mat = malv.T.as_matrix()
# Using sklearn
km = sklearn.cluster.KMeans(n_clusters=2)
km.fit(mat)
# Get cluster assignment labels
labels = km.labels_
# Format results as a DataFrame
malv_results = pd.DataFrame([malv.T.index,labels]).T.set_index(0).rename(columns={1:'malv'})

In [26]:
#from https://stackoverflow.com/a/30111487/6871844
# Convert DataFrame to matrix
sf = sf.fillna(0)
mat = sf.T.as_matrix()
# Using sklearn
km = sklearn.cluster.KMeans(n_clusters=2)
km.fit(mat)
# Get cluster assignment labels
labels = km.labels_
# Format results as a DataFrame
sf_results = pd.DataFrame([sf.T.index,labels]).T.set_index(0).rename(columns={1:'sf'})

In [12]:
#from https://stackoverflow.com/a/30111487/6871844
# Convert DataFrame to matrix
utl = utl.fillna(0)
mat = utl.T.as_matrix()
# Using sklearn
km = sklearn.cluster.KMeans(n_clusters=2)
km.fit(mat)
# Get cluster assignment labels
labels = km.labels_
# Format results as a DataFrame
utl_results = pd.DataFrame([utl.T.index,labels]).T.set_index(0).rename(columns={1:'umi_tools'})

In [ ]:


In [ ]:


In [13]:
tr_results = pd.read_table("../../data/10x/mohu/100/whitelist.tsv", header=None).set_index(0)
tr_results['tr'] = [0]*50 + [1]*50

In [ ]:


In [14]:
#combine data

In [15]:
# results = pd.concat([tr_results, alv_results, sf_results, utl_results, kal_results], axis=1)
results = pd.concat([tr_results, utl_results, malv_results], axis=1)

In [16]:
results.sum()


Out[16]:
tr           50.0
umi_tools    50.0
malv         50.0
dtype: float64

In [52]:
# alv_intra = alv[results[results['alv']==1].index]
malv_intra = malv[results[results['malv']==0].index]
utl_intra = utl[results[results['umi_tools']==1].index]
# sf_intra = sf[results[results['sf']==0].index]
# kal_intra = kal[results[results['kallisto']==0].index]

In [27]:
# genes = set(alv_intra.index) & set(utl_intra.index) & set(sf_intra.index) & set(kal_intra.index)
genes = set(malv_intra.index) & set(utl_intra.index) & set(sf_intra.index)


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-27-5c11a5194c31> in <module>()
      1 # genes = set(alv_intra.index) & set(utl_intra.index) & set(sf_intra.index) & set(kal_intra.index)
----> 2 genes = set(malv_intra.index) & set(utl_intra.index) & set(sf_intra.index)

NameError: name 'sf_intra' is not defined

In [ ]:


In [28]:
# alv_intra = alv[results[results['alv']==0].index].corr(method="spearman")
# utl_intra = utl[results[results['umi_tools']==1].index].corr(method="spearman")
# sf_intra = sf[results[results['sf']==1].index].corr(method="spearman")
# kal_intra = kal[results[results['kallisto']==1].index].corr(method="spearman")

# alv_intra = alv_intra.loc[genes].corr(method="spearman")
malv_intra = malv_intra.corr(method="spearman")
utl_intra = utl_intra.corr(method="spearman")
# sf_intra = sf_intra.loc[genes].corr(method="spearman")
# kal_intra = kal_intra.loc[genes].corr(method="spearman")

In [29]:
#make intra cluster correlations

In [30]:
# alv_intra = alv[results[results['alv']==1].index].corr(method="spearman")
# utl_intra = utl[results[results['umi_tools']==0].index].corr(method="spearman")
# sf_intra = sf[results[results['sf']==0].index].corr(method="spearman")
# kal_intra = kal[results[results['kallisto']==0].index].corr(method="spearman")

In [ ]:


In [31]:
# plt.subplot(2, 2, 1)
# plt.axis('off')
# plt.title("alevin")
# sns.heatmap(alv_intra)

plt.subplot(2, 2, 1)
plt.axis('off')
plt.title("multi_alevin")
sns.heatmap(malv_intra)
# plt.subplot(2, 2, 2)
# plt.axis('off')
# plt.title("sf")
# sns.heatmap(sf_intra)
plt.subplot(2, 2, 2)
plt.axis('off')
plt.title("umi_tools")
sns.heatmap(utl_intra)
# plt.subplot(2, 2, 4)
# plt.axis('off')
# plt.title("kallisto")
# sns.heatmap(kal_intra)


Out[31]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fce75130e10>

In [32]:
# alv_list = []
# sf_list = []
utl_list = []
malv_list = []
# kal_list = []
# for x in alv_intra.values:
#     for e in x.tolist():
#         if e !=0 and e != 1:
#             alv_list.append(e)
for x in malv_intra.values:
    for e in x.tolist():
        if e !=0 and e != 1:
            malv_list.append(e)
# for x in sf_intra.values:
#     for e in x.tolist():
#         if e !=0 and e != 1:
#             sf_list.append(e)
for x in utl_intra.values:
    for e in x.tolist():
        if e !=0 and e != 1:
            utl_list.append(e)
# for x in kal_intra.values:
#     for e in x.tolist():
#         if e !=0 and e != 1:
#             kal_list.append(e)

In [33]:
# import seaborn
# np.random.seed(0)

# n_bins = 30
# # x = np.random.randn(1000, 3)
# x = [alv_list, sf_list, utl_list]

# colors = ['red', 'blue', 'lime']
# n, x, _ = plt.hist(x, n_bins, histtype='bar', color=colors, label=['alevin', 'eq_alevin', 'umi_tools'])
# plt.legend(prop={'size': 10})

# plt.show()

In [ ]:


In [34]:
# sns.kdeplot(np.array(alv_list), label= "eq_class")
# sns.kdeplot(np.array(kal_list), label= "kallisto")
sns.kdeplot(np.array(malv_list), label= "malv")
# sns.kdeplot(np.array(sf_list), label = "alevin")
sns.kdeplot(np.array(utl_list), label = "Umi-tools")
plt.legend()


Out[34]:
<matplotlib.legend.Legend at 0x7fce6d5aa710>

In [147]:
sns.kdeplot(np.array(alv_list), label= "eq_class")
sns.kdeplot(np.array(kal_list), label= "kallisto")
sns.kdeplot(np.array(sf_list), label = "alevin")
sns.kdeplot(np.array(utl_list), label = "Umi-tools")
plt.legend()


Out[147]:
<matplotlib.legend.Legend at 0x7fbe5a9f5990>

In [35]:
len(malv_list), len(utl_list)


Out[35]:
(2450, 2450)

In [36]:
malv.shape, utl.shape


Out[36]:
((42596, 100), (30729, 100))

In [39]:
malv_intra.shape, utl_intra.shape


Out[39]:
((42596, 50), (30729, 50))

In [37]:
malv_intra.to_csv("malv.mat")
utl_intra.to_csv("utl.mat")

In [40]:


In [53]:
malv_intra.rename(columns=lambda x: x+"-1", inplace=True)

In [54]:
utl_intra.rename(columns=lambda x: x+"-2", inplace=True)

In [57]:
ct = pd.concat([malv_intra, utl_intra], axis=1).fillna(0)

In [59]:
ct.to_csv("agg.mat")

In [ ]: