In [1]:
from collections import defaultdict
from random import random
import pysam
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()
import glob
import os
import sklearn
from sklearn import cluster
import cPickle as pickle
In [2]:
def read_eq(eqFile, dfname, tx2gene=None):
    """Collapse one cell's equivalence classes into per-gene counts.

    The file is parsed as: a first line holding the transcript count ``T``,
    then ``T`` lines of transcript names, then one line per equivalence
    class laid out as ``k tid_1 ... tid_k count`` where each ``tid_i`` is a
    0-based position in the transcript list.  A class adds its count to a
    gene only when every transcript in the class maps to that single gene;
    classes spanning several genes are ignored.

    Parameters
    ----------
    eqFile : str
        Path to the equivalence-class file.
    dfname : str
        Name given to the single count column of the result.
    tx2gene : mapping, optional
        Transcript name -> gene name.  When omitted, the notebook-level
        ``t2g`` dictionary is used and must already be defined.

    Returns
    -------
    pandas.DataFrame
        One row per gene (index named ``0``) with one column ``dfname``.
        Has zero rows when no class resolves to a single gene.

    Raises
    ------
    KeyError
        If a transcript listed in the file is absent from the mapping.
    """
    if tx2gene is None:
        tx2gene = t2g  # notebook-level transcript -> gene dictionary

    counts = defaultdict(int)
    with open(eqFile) as f:
        num_txps = int(f.readline())
        # Only one header line is consumed; a file carrying a second header
        # line (e.g. the number of classes) is not supported by this parser.
        txps = [f.readline().strip() for _ in range(num_txps)]
        for line in f:
            toks = line.split()
            if not toks:
                # Blank / trailing lines carry no class; skip instead of crashing.
                continue
            k = int(toks[0])
            genes = set(tx2gene[txps[int(tid)]] for tid in toks[1:k + 1])
            if len(genes) == 1:
                counts[genes.pop()] += int(toks[k + 1])

    # Explicit column labels keep set_index(0) valid even when counts is empty,
    # and list(...) materialises the items view before handing it to pandas.
    return (
        pd.DataFrame(list(counts.items()), columns=[0, 1])
        .set_index(0)
        .rename(columns={1: dfname})
    )
In [3]:
def read_sal(sfFile, dfname, tx2gene=None):
    """Sum one cell's per-transcript ``NumMolecules`` up to gene level.

    Parameters
    ----------
    sfFile : str
        Path to a tab-separated table with at least the columns ``Name``
        (transcript) and ``NumMolecules``.
    dfname : str
        Name given to the single count column of the result.
    tx2gene : pandas.Series or mapping, optional
        Transcript name -> gene name.  Pass it once from the caller to avoid
        re-parsing the mapping file for every cell.  When omitted, the
        mapping is read from the notebook-level ``t2g_fname`` (comma
        separated, columns ``TX_NAME`` and ``GENE_NAME``), as before.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``GENE_NAME`` with one column ``dfname``; genes whose
        summed value is exactly 0 are left out.
    """
    if tx2gene is None:
        with open(t2g_fname) as f:
            tx2gene = pd.read_table(f, sep=",").set_index('TX_NAME')['GENE_NAME']
    # Normalise to a Series named GENE_NAME so the groupby key below exists
    # whether the caller handed in a dict or a Series.
    gene_of_txp = pd.Series(tx2gene, name='GENE_NAME')

    molecules = pd.read_table(sfFile).set_index('Name')[['NumMolecules']]
    # Column-wise concat aligns on transcript name; transcripts without a gene
    # get a missing GENE_NAME and are dropped by the groupby.
    per_txp = pd.concat([molecules, gene_of_txp], axis=1)
    per_gene = per_txp.groupby('GENE_NAME').sum()
    expressed = per_gene[per_gene['NumMolecules'] != 0]
    return expressed.rename(columns={'NumMolecules': dfname})
In [4]:
def read_ut_out(utFile, dfname):
    """Load one cell's count table and return it as a single-column frame.

    Parameters
    ----------
    utFile : str
        Path to a tab-separated table with the columns ``gene``, ``cell``
        and ``count``.
    dfname : str
        Name given to the ``count`` column in the result.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``gene``; the ``cell`` column is removed and ``count`` is
        renamed to ``dfname``.
    """
    # The context manager closes the handle; the bare open() used previously
    # leaked one file descriptor per call.
    with open(utFile) as handle:
        cell_ut_data = pd.read_table(handle).set_index('gene')
    del cell_ut_data['cell']
    return cell_ut_data.rename(columns={'count': dfname})
In [5]:
# Input locations used by the loaders in this notebook.
t2g_fname = "../../data/mohu/gtf/txp2gene.tsv"
txpList_fname = "../kalData/txpList.txt"

# Transcript -> gene lookup as a plain dict. The mapping file is parsed as
# comma-separated (despite its .tsv name) with TX_NAME / GENE_NAME columns.
with open(t2g_fname) as f:
    t2g = (
        pd.read_table(f, sep=",")
        .set_index('TX_NAME')['GENE_NAME']
        .to_dict()
    )
In [ ]:
In [ ]:
In [6]:
# Load a previously saved matrix from ./fast-eq-1k.mat, using its first column
# as the row index. Later cells iterate over its columns as cells.
# NOTE(review): presumably the full-depth counterpart of alv_quart — confirm.
alv_full = pd.read_csv('./fast-eq-1k.mat', index_col=[0])
In [8]:
# Load a previously saved matrix from ./fast-malv-1k.mat, using its first
# column as the row index.
# NOTE(review): presumably the full-depth counterpart of qalv_quart — confirm.
qalv_full = pd.read_csv('./fast-malv-1k.mat', index_col=[0])
In [7]:
# Load a previously saved matrix from ./utl-1k.mat, using its first column as
# the row index.
# NOTE(review): presumably the full-depth counterpart of utl_quart — confirm.
utl_full = pd.read_csv('./utl-1k.mat', index_col=[0])
In [ ]:
In [9]:
# Build alv_quart: one read_eq() column per cell directory, joined on gene.
# NOTE(review): the variable says "quart" but the directory is "alv_half" —
# confirm which subsampling level this really is.
# Sorting makes the column order reproducible (glob order is arbitrary).
cell_dirs = sorted(glob.glob("/mnt/scratch5/avi/alevin/subsample_testing/alv_half/alevin/cell/*"))

# Collect every per-cell column first and join once: concatenating inside the
# loop re-copies the growing frame on each iteration. With no matching
# directories pd.concat raises immediately, instead of leaving alv_quart
# undefined (or stale from an earlier run).
alv_quart = pd.concat(
    [read_eq(dname + '/cell_eq_classes.txt', os.path.basename(dname)) for dname in cell_dirs],
    axis=1,
)
In [9]:
# Build qalv_quart: one read_sal() column per cell directory, joined on gene.
# NOTE(review): the variable says "quart" but the directory is "alv_half" —
# confirm which subsampling level this really is.
# Sorting makes the column order reproducible (glob order is arbitrary).
cell_dirs = sorted(glob.glob("/mnt/scratch5/avi/alevin/subsample_testing/alv_half/alevin/cell/*"))

# Collect every per-cell column first and join once: concatenating inside the
# loop re-copies the growing frame on each iteration. With no matching
# directories pd.concat raises immediately, instead of leaving qalv_quart
# undefined (or stale from an earlier run).
qalv_quart = pd.concat(
    [read_sal(dname + '/quant.sf', os.path.basename(dname)) for dname in cell_dirs],
    axis=1,
)
In [8]:
# Build utl_quart: one read_ut_out() column per cell directory, joined on gene.
# NOTE(review): the variable says "quart" but the directory is "utData_half" —
# confirm which subsampling level this really is.
# Sorting makes the column order reproducible (glob order is arbitrary).
cell_dirs = sorted(glob.glob("/mnt/scratch5/avi/alevin/subsample_testing/utData_half/*"))

# Collect every per-cell column first and join once: concatenating inside the
# loop re-copies the growing frame on each iteration. With no matching
# directories pd.concat raises immediately, instead of leaving utl_quart
# undefined (or stale from an earlier run).
utl_quart = pd.concat(
    [read_ut_out(dname + '/utools.count', os.path.basename(dname)) for dname in cell_dirs],
    axis=1,
)
In [10]:
# Persist qalv_quart to disk; the writes for alv_quart and utl_quart are
# currently disabled.
# NOTE(review): a later cell reads ./alv_half.mat and ./utl_half.mat, which no
# visible cell writes while these two lines stay commented out — confirm the
# files exist from an earlier run, or re-enable the writes.
# alv_quart.to_csv('./alv_half.mat')
# utl_quart.to_csv('./utl_half.mat')
qalv_quart.to_csv('./qalv_half.mat')
In [ ]:
In [11]:
# Reload alv_quart and utl_quart from their saved CSVs (first column becomes
# the row index). This replaces whatever in-memory frames those names held.
alv_quart = pd.read_csv('./alv_half.mat', index_col=[0])
utl_quart = pd.read_csv('./utl_half.mat', index_col=[0])
# qalv_quart is not reloaded here; the in-memory frame keeps being used.
# qalv_quart = pd.read_csv('./qalv_half.mat', index_col=[0])
In [12]:
# (rows, columns) of each matrix, in the order:
# alv_full, utl_full, qalv_full, alv_quart, utl_quart, qalv_quart
tuple(
    matrix.shape
    for matrix in (alv_full, utl_full, qalv_full, alv_quart, utl_quart, qalv_quart)
)
Out[12]:
In [13]:
# Replace missing entries with 0 in every matrix. The frames are modified in
# place, so the names keep pointing at the same (now NaN-free) objects.
for _matrix in (alv_full, utl_full, qalv_full, alv_quart, utl_quart, qalv_quart):
    _matrix.fillna(0, inplace=True)
In [14]:
# (rows, columns) of each matrix after the NaN fill, in the order:
# alv_full, utl_full, qalv_full, alv_quart, utl_quart, qalv_quart
tuple(
    matrix.shape
    for matrix in (alv_full, utl_full, qalv_full, alv_quart, utl_quart, qalv_quart)
)
Out[14]:
In [15]:
corrs_utl = []
corrs_alv = []
corrs_qalv = []
for count, cell in enumerate(alv_full):
print "\r Done " + str(count),
temp = pd.concat([utl_full[cell], utl_quart[cell]], axis=1).fillna(0)
temp = temp[(temp.T != 0).any()]
corrs_utl.append(temp.corr(method="spearman").iloc[1][0])
ind = temp.index
full = alv_full[cell]
quart = alv_quart[cell]
temp = pd.concat([pd.DataFrame(full.loc[ind]), quart.loc[ind]], axis=1).fillna(0)
temp = temp[(temp.T != 0).any()]
corrs_alv.append(temp.corr(method="spearman").iloc[1][0])
ind = temp.index
full = qalv_full[cell]
quart = qalv_quart[cell]
temp = pd.concat([pd.DataFrame(full.loc[ind]), quart.loc[ind]], axis=1).fillna(0)
temp = temp[(temp.T != 0).any()]
corrs_qalv.append(temp.corr(method="spearman").iloc[1][0])
In [17]:
# Overlay one density curve per method for the per-cell correlations.
for _corrs, _colour, _label in (
    (corrs_utl, 'red', "utools"),
    (corrs_alv, 'blue', "alevin"),
    (corrs_qalv, 'green', "full_alevin"),
):
    sns.kdeplot(np.array(_corrs), color=_colour, label=_label)
plt.legend()
Out[17]:
In [ ]:
In [16]:
corrs_utl = []
corrs_alv = []
corrs_cross = []
corrs_rev = []
corrs_full = []
corrs_quart = []
for count, cell in enumerate(alv_full):
print "\r Done " + str(count),
temp = pd.concat([utl_full[cell], alv_quart[cell]], axis=1).fillna(0)
temp = temp[(temp.T != 0).any()]
corrs_cross.append(temp.corr(method="spearman").iloc[1][0])
temp = pd.concat([utl_full[cell], alv_full[cell]], axis=1).fillna(0)
temp = temp[(temp.T != 0).any()]
corrs_full.append(temp.corr(method="spearman").iloc[1][0])
temp = pd.concat([utl_quart[cell], alv_quart[cell]], axis=1).fillna(0)
temp = temp[(temp.T != 0).any()]
corrs_quart.append(temp.corr(method="spearman").iloc[1][0])
temp = pd.concat([alv_full[cell], utl_quart[cell]], axis=1).fillna(0)
temp = temp[(temp.T != 0).any()]
corrs_rev.append(temp.corr(method="spearman").iloc[1][0])
temp = pd.concat([utl_full[cell], utl_quart[cell]], axis=1).fillna(0)
temp = temp[(temp.T != 0).any()]
corrs_utl.append(temp.corr(method="spearman").iloc[1][0])
ind = temp.index
full = alv_full[cell]
quart = alv_quart[cell]
temp = pd.concat([pd.DataFrame(full.loc[ind]), quart.loc[ind]], axis=1).fillna(0)
temp = temp[(temp.T != 0).any()]
corrs_alv.append(temp.corr(method="spearman").iloc[1][0])
In [17]:
# Overlay one density curve per comparison for the per-cell correlations.
for _corrs, _colour, _label in (
    (corrs_utl, 'red', "utools"),
    (corrs_alv, 'blue', "alevin"),
    (corrs_cross, 'green', "utl full v alv quarter"),
    (corrs_rev, 'yellow', "alv full v utl quarter"),
    (corrs_full, 'black', "full v full"),
    (corrs_quart, 'brown', "quarter v quarter"),
):
    sns.kdeplot(np.array(_corrs), color=_colour, label=_label)
plt.legend(loc=2)
Out[17]:
In [ ]: