In [1]:
from collections import defaultdict
from random import random
import pysam
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()
import glob
import os
import sklearn
from sklearn import cluster


/home/avi/miniconda2/lib/python2.7/site-packages/IPython/html.py:14: ShimWarning: The `IPython.html` package has been deprecated since IPython 4.0. You should import from `notebook` instead. `IPython.html.widgets` has moved to `ipywidgets`.
  "`IPython.html.widgets` has moved to `ipywidgets`.", ShimWarning)

In [28]:
#reading functions

In [29]:
def read_kal_out(t2gFile, txpList, ecMatFile, countTsvFile, dfname):
    with open(txpList) as f:
        tlist = pd.read_table(f, header=None).rename(columns={0:'tid'})

    with open(countTsvFile) as f:
        counts = pd.DataFrame(pd.read_table(f, header=None).set_index(0)[2]).rename(columns={2:'counts'})
    sorted_idx = sorted(counts.to_dict()['counts'].keys())
    eqIdx = [0] * (sorted_idx[-1]+1)
    for elem in sorted_idx:
        eqIdx[elem] = 1
        
    kal_gene_count = defaultdict(int)
    other = 0
    eqCount = 0
    with open(ecMatFile) as f:
        for idx, line in enumerate(f):
            if eqIdx[idx]:
                eqCount += 1
                txps = line.strip().split()[1].split(',')
                gSet = set([])
                for txp in txps:
                    gSet.add(t2g[tlist.loc[int(txp)]['tid']])
                if len(gSet) == 1:
                    kal_gene_count[list(gSet)[0]] += counts.loc[idx]['counts']
                else:
                    other += counts.loc[idx]['counts']
        print eqCount
    
    kal_count = pd.DataFrame([kal_gene_count], columns=kal_gene_count.keys()).T.rename(columns={0:dfname})
    return kal_count

In [30]:
def read_eq(eqFile, dfname):
    with open(t2g_fname) as f:
        t2g = pd.read_table(f, sep=",").set_index('TX_NAME').to_dict()['GENE_NAME']
    with open(eqFile) as f:
        T = int(f.readline())
        E = int(f.readline())
        txps = []
        for _ in range(T):
            txps.append(f.readline().strip())

        counts = defaultdict(int)
        for line in f:
            toks = line.strip().split()
            t = int(toks[0])
            eqCount = int(toks[t+1])

            gSet = set([])
            for tid in range(t):
                gSet.add(t2g[txps[int(toks[tid+1])]])
            if len(gSet) == 1:
                counts[list(gSet)[0]] += eqCount
    return pd.DataFrame(counts.items()).set_index(0).rename(columns={1:dfname})

In [31]:
def read_sal(sfFile, dfname):
    data = pd.DataFrame(pd.read_table(sfFile).set_index('Name')['NumReads'])
    with open(t2g_fname) as f:
        t2g = pd.read_table(f, sep=",").set_index('TX_NAME')['GENE_NAME']
    ct = pd.concat([data, t2g], axis=1)
    gene_ct = ct.groupby('GENE_NAME').sum()
    return gene_ct[(gene_ct['NumReads'] != 0)].rename(columns={'NumReads':dfname})

In [32]:
def read_ut_out(utFile, dfname):
    ut_data = pd.read_table(open(utFile))
    cell_ut_data = ut_data.set_index('gene')
    del cell_ut_data['cell']
    cell_ut_data = cell_ut_data.rename(columns={'count':dfname})
    return cell_ut_data

In [33]:
t2g_fname = "../../data/mohu/gtf/txp2gene.tsv"
txpList_fname = "../kalData/txpList.txt"

In [34]:
with open(t2g_fname) as f:
    t2g = pd.read_table(f, sep=",").set_index('TX_NAME').to_dict()['GENE_NAME']

In [9]:
#read data

In [10]:
#kal-input

first = True
for dname in glob.glob("../../1kcell_testing/kalData/*"):
    if dname != "../kalData/txpList.txt":
        if first:
            first=False
            kal = read_kal_out(t2g_fname, txpList_fname, dname+"/matrix.ec", dname+"/matrix.tsv", os.path.basename(dname))
        else:
            temp = read_kal_out(t2g_fname, txpList_fname, dname+"/matrix.ec", dname+"/matrix.tsv", os.path.basename(dname))
            kal = pd.concat([kal, temp], axis=1)


9765
10465
5642
10596
5708
11656
9513
8254
11082
5637
8767
10497
11855
6989
8027
7525
10649
11361
11587
15142
12488
8435
10120
10231
8753
12642
22150
11340
14168
13728
7968
7727
6709
10006
4243
6077
9030
11075
14641
12753
7969
6991
8338
12616
15883
7118
10515
4723
8406
13498
12973
8726
7892
12931
13296
8522
11380
12537
7432
10557
16489
7808
12610
16147
7960
4214
10811
7955
8851
9872
13234
19153
8989
15958
8849
6218
4147
7617
7872
5087
13378
17526
7542
10677
8827
4717
5437
8799
8179
12830
13454
7335
11459
12695
10630
4191
13164
7598
15472
17609
9387
17267
16582
12357
6867
12855
7680
7546
14830
13931
7651
10727
11735
4110
14908
9025
13058
8545
8662
7677
10143
10870
10835
3136
7377
7913
12126
12959
8063
8486
9695
17527
12955
14242
10330
16068
7409
8632
6232
8565
8730
2564
10321
10785
8970
10964
11178
9071
13555
9510
2543
14011
15394
11438
14037
14767
14644
10498
12504
6250
3219
9951
8593
9145
12065
15435
8127
12257
9917
10762
10516
6280
5966
11009
11518
10262
12817
5010
6205
11910
17156
9454
6573
7031
14604
13287
12677
9220
15307
17180
12645
10626
8016
6466
9924
5294
8868
11044
10127
10144
13541
12063
4438
15520
6860
14382
10951
7975
10801
12806
13990
7263
8318
5496
13800
9355
12478
12051
7580
13445
14763
13024
13814
8352
6353
8430
8166
13127
12119
10624
11468
11929
8599
8865
11063
13873
6184
8400
14432
8322
12790
15114
10910
11983
12735
12998
7945
8178
7337
7837
13953
10691
12729
17692
8971
7122
10120
9705
14803
4063
6057
3734
8179
8468
14259
3490
14576
12072
6922
6791
13611
6828
10064
7886
7561
15415
14928
8849
7467
6883
12436
8476
10622
6689
11881
8528
11685
12040
13909
11673
6448
5692
13347
10183
12412
11166
8833
4837
2598
9327
15796
2608
10800
10322
7568
9918
6197
7658
6186
6025
10956
13840
11150
14612
17531
14612
14789
12710
14734
6317
4891
9621
6449
12867
8710
12489
11164
10631
6315
6458
7720
6950
7572
4962
6297
9741
14038
7709
7846
21326
3820
5927
9001
13253
11305
8706
10920
10328
7074
5180
8218
12269
7208
9606
15339
6776
8011
9077
1841
7586
8265
14871
10555
9145
14080
6061
8691
7479
13213
10639
9703
12996
9085
7750
3607
9659
11500
16904
16036
8793
2485
8002
7440
13441
7021
11589
13380
14159
14577
8933
11069
7478
4677
13924
7239
6749
6479
7334
6670
9500
10812
4363
9242
13281
6843
8830
7624
9482
8046
8034
5315
10044
9446
8399
6102
8414
11141
7429
8304
8052
11520
12085
13370
13925
15122
11943
12164
10788
11919
9054
12941
13955
2525
9962
9101
11308
6750
13443
8376
10832
11930
2672
14558
5168
9826
9254
7966
7828
7998
10130
7544
3057
12709
10635
11789
14424
15458
12655
13328
6704
9053
11735
10646
12418
15307
7735
12401
4217
13008
10543
9150
7778
15094
8407
9982
9630
2993
12497
11334
8034
12515
7103
13931
8367
15520
12160
8904
5334
7678
13765
16997
11872
10486
13014
9207
10108
10323
23264
7379
8585
14269
14844
9245
15831
7965
8063
4067
15175
9053
7134
8631
11786
8394
9378
8989
12574
8544
12882
7034
7100
6297
16875
8269
8955
13917
7431
16256
14567
8054
4716
7709
13867
16994
7143
13347
8735
3628
9430
11919
7856
11219
7609
7722
6971
12413
7836
6969
6131
12133
12339
13880
8160
6768
10937
5058
9603
6715
14258
6674
12637
7721
13799
8060
13922
8822
15375
8835
9315
6140
7723
6907
14326
7164
7814
11494
7670
9625
9957
7056
6866
14997
13645
11143
9806
8910
9060
7362
10860
7859
7569
13123
14104
9057
7254
11979
10566
12626
6715
9170
8805
7800
7309
12290
8208
7123
17084
7698
7606
6049
11096
13608
8583
13200
6420
9606
14401
9174
7665
9022
8212
13228
9103
13600
13434
8506
7138
12109
6485
14462
11056
11884
16566
15419
7210
9894
7009
7425
15614
12864
5142
11216
6094
7735
7545
10453
10142
13888
19515
20960
8247
13156
14089
5103
16635
12339
8554
7961
3406
4025
8480
14289
9593
12265
14386
3382
7477
9854
7577
14658
7171
13642
12810
9961
11272
7392
12689
13574
10563
13483
8259
11754
7998
12958
8054
2712
11341
6565
11421
12397
10164
15404
6878
14898
13693
8566
14832
12707
18104
13771
2848
15174
6726
7936
8362
7892
5085
6530
7426
7213
4245
13794
10147
14129
6675
12338
14157
3229
6547
8602
11342
9678
15669
11277
4381
7708
13332
6945
8135
5438
7950
7768
11143
6859
7759
12928
10836
8951
5495
13035
13045
7508
7119
12673
12042
9090
14504
4702
10090
9195
16320
13170
2179
9884
11088
12539
12525
14526
5741
7958
12294
14379
11617
9051
9560
8269
9969
14680
9556
3846
9738
7610
5848
10958
9869
13497
14261
8841
8361
8381
7463
8147
14245
2208
6612
11724
10248
10115
10556
13714
16173
9004
12172
9152
7367
14000
15868
6739
10262
6421
11068
13241
9469
14243
13925
9095
14124
10093
9608
2543
9633
9284
13506
14011
9297
8291
12907
11320
10392
7113
11747
17201
10590
5710
10368
7788
12281
8791
10092
8939
7904
11572
10099
8140
9773
5890
9998
14122
7328
8336
13691
8107
7462
10474
13809
12529
13308
16701
10569
15959
8445
15345
8044
14656
12988
4616
15923
15228
5301
5750
14578
12858
13300
7121
6837
7002
6809
10036
12868
11118
7650
8281
12667
8986
14487
10408
7919
8480
13144
8308
12489
12680
17094
6325
8214
13879
7990
6414
13873
8751
14549
13372
14003
7516
7986
2560
5660
13296
6839
8533
11260
10634
18610
6813
14250
10545
10602
7406
10869
14954
11497
12630
2465
16492
8212
14993
9232
7485
5200
11734
2402
7727
4655
11436
11438
7459
13395
12598
9172
12764
12640
8272
7707
8566
10381
7283
6532
10751
9331
13638
4235
10005
7666
9695
17487
6875
11911
7625
10112
7361
7601
10541
13903
10021
7641
9036
13177
5615
14502
8436
14310
10071
14842
7293
8883
12377
11181
6999
8924
8528
6721
9849
16427
14813
15949
7721
14223
4118
7629
14493
8210
7007
12842
9588
12829
7653
12525
8713
6968
5718
11587
7863
5922
8247
10418
5071
10847
16022
11626
3342
17565
8741
3638
15305
6847
7991
14139
18236
10920
12472
13246
11076
11747
9517
4453
7716
14656
8227

In [ ]:
first = True
for dname in glob.glob("../../1kcell_testing/salOut/alevin/cell/*"):
    if first:
        first = False
        sf = read_sal( dname+'/quant.sf', os.path.basename(dname))
    else:
        temp = read_sal( dname+'/quant.sf', os.path.basename(dname))
        sf = pd.concat([sf, temp], axis=1)

In [ ]:
first = True
for dname in glob.glob("../../1kcell_testing/salOutBcUmi/alevin/cell/*"):
    if first:
        first = False
        malv = read_sal( dname+'/quant.sf', os.path.basename(dname))
    else:
        temp = read_sal( dname+'/quant.sf', os.path.basename(dname))
        malv = pd.concat([malv, temp], axis=1)

In [ ]:
first = True
for dname in glob.glob("../../1kcell_testing/salOutBc/alevin/cell/*"):
    if first:
        first = False
        alv = read_sal( dname+'/quant.sf', os.path.basename(dname))
    else:
        temp = read_sal( dname+'/quant.sf', os.path.basename(dname))
        alv = pd.concat([alv, temp], axis=1)

In [84]:
#utools import
first = True
for dname in glob.glob("../../1kcell_testing/utData/*"):
    if first:
        first = False
        utl = read_ut_out( dname+'/utools.count', os.path.basename(dname))
    else:
        temp = read_ut_out( dname+'/utools.count', os.path.basename(dname))
        utl = pd.concat([utl, temp], axis=1)

In [ ]:


In [43]:
kal.to_csv("kal-1k.mat")
utl.to_csv("utl-1k.mat")
malv.to_csv("malv-1k.mat")
sf.to_csv("sf-1k.mat")
alv.to_csv("alvBc-1k.mat")

In [13]:
#create clusters

In [2]:
kal = pd.read_csv('./kal-1k.mat', index_col=[0])
utl = pd.read_csv('./utl-1k.mat', index_col=[0])
malv = pd.read_csv('./malv-1k.mat', index_col=[0])
sf = pd.read_csv('./sf-1k.mat', index_col=[0])
alv = pd.read_csv('./alvBc-1k.mat', index_col=[0])

In [ ]:


In [3]:
# kal = kal.fillna(0)

# #from https://stackoverflow.com/a/30111487/6871844
# # Convert DataFrame to matrix
# mat = kal.T.as_matrix()
# # Using sklearn
# km = sklearn.cluster.KMeans(n_clusters=2)
# km.fit(mat)
# # Get cluster assignment labels
# labels = km.labels_
# # Format results as a DataFrame
# kal_results = pd.DataFrame([kal.T.index,labels]).T.set_index(0).rename(columns={1:'kallisto'})

In [4]:
# #from https://stackoverflow.com/a/30111487/6871844
# # Convert DataFrame to matrix
# alv = alv.fillna(0)
# mat = alv.T.as_matrix()
# # Using sklearn
# km = sklearn.cluster.KMeans(n_clusters=2)
# km.fit(mat)
# # Get cluster assignment labels
# labels = km.labels_
# # Format results as a DataFrame
# alv_results = pd.DataFrame([alv.T.index,labels]).T.set_index(0).rename(columns={1:'alv'})

In [ ]:


In [5]:
# # from https //stackoverflow.com/a/30111487/6871844
# # Convert DataFrame to matrix
# malv = malv.fillna(0)
# mat = malv.T.as_matrix()
# # Using sklearn
# km = sklearn.cluster.KMeans(n_clusters=2)
# km.fit(mat)
# # Get cluster assignment labels
# labels = km.labels_
# # Format results as a DataFrame
# malv_results = pd.DataFrame([malv.T.index,labels]).T.set_index(0).rename(columns={1:'malv'})

In [6]:
# #from https://stackoverflow.com/a/30111487/6871844
# # Convert DataFrame to matrix
# sf = sf.fillna(0)
# mat = sf.T.as_matrix()
# # Using sklearn
# km = sklearn.cluster.KMeans(n_clusters=2)
# km.fit(mat)
# # Get cluster assignment labels
# labels = km.labels_
# # Format results as a DataFrame
# sf_results = pd.DataFrame([sf.T.index,labels]).T.set_index(0).rename(columns={1:'sf'})

In [7]:
# #from https://stackoverflow.com/a/30111487/6871844
# # Convert DataFrame to matrix
# utl = utl.fillna(0)
# mat = utl.T.as_matrix()
# # Using sklearn
# km = sklearn.cluster.KMeans(n_clusters=2)
# km.fit(mat)
# # Get cluster assignment labels
# labels = km.labels_
# # Format results as a DataFrame
# utl_results = pd.DataFrame([utl.T.index,labels]).T.set_index(0).rename(columns={1:'umi_tools'})

In [ ]:


In [ ]:


In [3]:
tr_results = pd.read_table("../../data/10x/mohu/1k/whitelist.tsv", header=None).set_index(0)
tr_results['tr'] = [0]*504 + [1]*516

In [4]:
import collections
print [item for item, count in collections.Counter(tr_results.index).items() if count > 1]


['AGCTCCTGTTGTGGAG', 'ACGCCAGTCGGATGGA', 'GGCAATTAGGAATCGC']

In [5]:
tr_results = tr_results.drop(['AGCTCCTGTTGTGGAG', 'ACGCCAGTCGGATGGA', 'GGCAATTAGGAATCGC'])

In [6]:
first_cluster = tr_results[tr_results['tr'] == 1].index

In [7]:
second_cluster = tr_results[tr_results['tr'] == 0].index

In [ ]:


In [8]:
malv_results = malv[first_cluster].fillna(0)
sf_results = sf[first_cluster].fillna(0)
utl_results = utl[first_cluster].fillna(0)
kal_results = kal[first_cluster].fillna(0)
alv_results = alv[first_cluster].fillna(0)

In [ ]:


In [92]:
#combine data

In [93]:
# results = pd.concat([tr_results, malv_results, sf_results, utl_results, kal_results], axis=1)
# results = pd.concat([tr_results, kal_results], axis=1)

In [94]:
# results.sum()

In [95]:
# alv_intra = alv[results[results['alv']==1].index]
# malv_intra = malv[results[results['malv']==0].index]
# utl_intra = utl[results[results['umi_tools']==1].index]
# sf_intra = sf[results[results['sf']==0].index]
# kal_intra = kal[results[results['kallisto']==0].index]

In [9]:
# genes = set(alv_intra.index) & set(utl_intra.index) & set(sf_intra.index) & set(kal_intra.index)
genes = set(malv_results.index) & set(kal_results.index)

In [10]:
len(genes), len(malv_results.index)


Out[10]:
(50989, 61187)

In [ ]:
# alv_intra = alv[results[results['alv']==0].index].corr(method="spearman")
# utl_intra = utl[results[results['umi_tools']==1].index].corr(method="spearman")
# sf_intra = sf[results[results['sf']==1].index].corr(method="spearman")
# kal_intra = kal[results[results['kallisto']==0].index].corr(method="spearman")

# alv_intra = alv_results.loc[genes].corr(method="spearman")
# malv_intra = malv_results.corr(method="spearman")
# utl_intra = utl_results.corr(method="spearman")
# sf_intra = sf_results.loc[genes].corr(method="spearman")
# kal_intra = kal_results.loc[genes].corr(method="spearman")

alv_intra = alv_results.corr(method="spearman")
malv_intra = malv_results.corr(method="spearman")
utl_intra = utl_results.corr(method="spearman")
sf_intra = sf_results.corr(method="spearman")
kal_intra = kal_results.corr(method="spearman")

In [ ]:


In [112]:
#make intra cluster correlations

In [113]:
# alv_intra = alv[results[results['alv']==1].index].corr(method="spearman")
# utl_intra = utl[results[results['umi_tools']==0].index].corr(method="spearman")
# sf_intra = sf[results[results['sf']==0].index].corr(method="spearman")
# kal_intra = kal[results[results['kallisto']==0].index].corr(method="spearman")

In [12]:
malv_intra.shape, kal_intra.shape


Out[12]:
((513, 513), (513, 513))

In [14]:
# plt.subplot(2, 3, 1)
# plt.axis('off')
# plt.title("alevin-bc")
# sns.heatmap(alv_intra)
plt.subplot(2, 2, 1)
plt.axis('off')
plt.title("multi_alevin")
sns.heatmap(malv_intra)
# plt.subplot(2, 3, 3)
# plt.axis('off')
# plt.title("old_alevin")
# sns.heatmap(sf_intra)
# plt.subplot(2, 3, 4)
# plt.axis('off')
# plt.title("umi_tools")
# sns.heatmap(utl_intra)
plt.subplot(2, 2, 2)
plt.axis('off')
plt.title("kallisto")
sns.heatmap(kal_intra)


Out[14]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fe9cae7fed0>

In [ ]:
alv_list = []
sf_list = []
utl_list = []
malv_list = []
kal_list = []
for x in alv_intra.values:
    for e in x.tolist():
        if e !=0 and e != 1:
            alv_list.append(e)
for x in malv_intra.values:
    for e in x.tolist():1
        if e !=0 and e != 1:
            malv_list.append(e)
for x in sf_intra.values:
    for e in x.tolist():
        if e !=0 and e != 1:
            sf_list.append(e)
for x in utl_intra.values:
    for e in x.tolist():
        if e !=0 and e != 1:
            utl_list.append(e)
for x in kal_intra.values:
    for e in x.tolist():
        if e !=0 and e != 1:
            kal_list.append(e)

In [ ]:
# import seaborn
# np.random.seed(0)

# n_bins = 30
# # x = np.random.randn(1000, 3)
# x = [alv_list, sf_list, utl_list]

# colors = ['red', 'blue', 'lime']
# n, x, _ = plt.hist(x, n_bins, histtype='bar', color=colors, label=['alevin', 'eq_alevin', 'umi_tools'])
# plt.legend(prop={'size': 10})

# plt.show()

In [90]:
# sns.kdeplot(np.array(alv_list), label= "alv-bc")
sns.kdeplot(np.array(kal_list), label= "kallisto")
sns.kdeplot(np.array(malv_list), label= "malv")
# sns.kdeplot(np.array(sf_list), label = "alevin")
sns.kdeplot(np.array(utl_list), label = "Umi-tools")
plt.legend()


Out[90]:
<matplotlib.legend.Legend at 0x7f6d8d94c9d0>

In [110]:
# sns.kdeplot(np.array(alv_list), label= "alv-bc")
sns.kdeplot(np.array(kal_list), label= "kallisto")
sns.kdeplot(np.array(malv_list), label= "malv")
# sns.kdeplot(np.array(sf_list), label = "alevin")
sns.kdeplot(np.array(utl_list), label = "Umi-tools")
plt.legend()


Out[110]:
<matplotlib.legend.Legend at 0x7f6da4cbab10>

In [16]:
# sns.kdeplot(np.array(alv_list), label= "alv-bc")
sns.kdeplot(np.array(kal_list), label= "kallisto")
sns.kdeplot(np.array(malv_list), label= "malv")
# sns.kdeplot(np.array(sf_list), label = "alevin")
# sns.kdeplot(np.array(utl_list), label = "Umi-tools")
plt.legend()


Out[16]:
<matplotlib.legend.Legend at 0x7fe93ff072d0>

In [21]:
# sns.kdeplot(np.array(alv_list), label= "alv-bc")
sns.kdeplot(np.array(kal_list), label= "kallisto")
sns.kdeplot(np.array(malv_list), label= "malv")
# sns.kdeplot(np.array(sf_list), label = "alevin")
sns.kdeplot(np.array(utl_list), label = "Umi-tools")
plt.legend()


Out[21]:
<matplotlib.legend.Legend at 0x7fe88c1f9c10>

In [2]:
#  len(malv_list), len(utl_list)

In [3]:
# malv.shape, utl.shape

In [4]:
# malv_intra.shape, utl_intra.shape

In [37]:
# malv_intra.to_csv("malv.mat")
# utl_intra.to_csv("utl.mat")

In [40]:


In [53]:
# malv_intra.rename(columns=lambda x: x+"-1", inplace=True)

In [54]:
# utl_intra.rename(columns=lambda x: x+"-2", inplace=True)

In [57]:
# ct = pd.concat([malv_intra, utl_intra], axis=1).fillna(0)

In [59]:
# ct.to_csv("agg.mat")

In [ ]: