In [1]:
import pandas as pd
import numpy as np
import pylab as plt
import seaborn as sns
from sklearn.cluster import MiniBatchKMeans
#plt.rcParams['font.family']='Serif'

def import_data():

    path = '../data/stem-cell/'
    file_names=['NG_norm_concat.csv','NN_norm_concat.csv','4FI_norm_concat.csv']

    NN_filenames = ['NN_00.export.csv',
    'NN_02.export.csv',
    'NN_04.export.csv',
    'NN_06.export.csv',
    'NN_08.export.csv',
    'NN_10.export.csv',
    'NN_12.export.csv',
    'NN_14.export.csv',
    'NN_16.export.csv',
    'NN_18.export.csv',
    'NN_20.export.csv',
    'NN_22.export.csv',
    'NN_24.export.csv',
    'NN_30.export.csv']

    NG_filenames = ['NG_00.export.csv',
    'NG_02.export.csv',
    'NG_04.export.csv',
    'NG_06.export.csv',
    'NG_08.export.csv',
    'NG_10.export.csv',
    'NG_12.export.csv',
    'NG_14.export.csv',
    'NG_16.export.csv',
    'NG_18.export.csv',
    'NG_20.export.csv',
    'NG_22.export.csv',
    'NG_24.export.csv',
    'NG_30.export.csv']

    Oct4_filenames = [
    '4FI_00.export.csv',
    '4FI_01.export.csv',
    '4FI_02.export.csv',
    '4FI_04.export.csv',
    '4FI_06.export.csv',
    '4FI_08.export.csv',
    '4FI_10.export.csv',
    '4FI_12.export.csv',
    '4FI_14.export.csv',
    '4FI_16.export.csv',
    '4FI_17.export.csv',
    '4FI_18.export.csv',
    '4FI_20.export.csv']

    NG = [pd.read_csv(path+'NG/'+fname) for fname in NG_filenames]
    NN = [pd.read_csv(path+'NN/'+fname) for fname in NN_filenames]
    Oct4 = [pd.read_csv(path+'4FI/'+fname) for fname in Oct4_filenames]

    # short, nonredundant lists of the days covered
    days_ng = [int(s[3:5]) for s in NG_filenames]
    days_nn = [int(s[3:5]) for s in NN_filenames]
    days_oct = [int(s[4:6]) for s in Oct4_filenames]

    # full-length arrays assigning each observation to a day
    times_ng = np.hstack(np.ones(len(n))*days_ng[i] for i,n in enumerate(NG))
    times_nn = np.hstack(np.ones(len(n))*days_nn[i] for i,n in enumerate(NN))
    times_oct = np.hstack(np.ones(len(n))*days_oct[i] for i,n in enumerate(Oct4))


    #if dataframes:
    #    return NG,NN,Oct4,times_ng,times_nn,times_oct
    #else:
    data_NG = np.vstack(n for n in NG)[:,2:-3]
    data_NN = np.vstack(n for n in NN)[:,2:-3]
    data_Oct4 = np.vstack(n for n in Oct4)[:,2:-3]

    return data_NG,data_NN,data_Oct4,times_ng,times_nn,times_oct

def plot_cells_per_day():
    plt.plot(times,[len(n) for n in NG],label='Nanog-GFP')
    plt.plot(times,[len(n) for n in NN],label='Nanog-Neo')
    plt.plot(times_oct,[len(n) for n in Oct4],label='Oct4-GFP')
    plt.legend(loc='best')
    plt.ylabel('Number of cells')
    plt.xlabel('Day')
    plt.title('Cells per day')
    plt.savefig('../figures/cells_per_day.pdf')

def process_data(X,cofactor=15):
    return np.arcsinh(X/cofactor)

def compare_2d_embeddings(X,Y,algos):
    raise NotImplementedError('[WIP]')

    X_ = dict()
    for i,algo in enumerate(algos):
        X_[i] = algo.fit_transform(X)

def cluster_prog_analysis(X,times,num_clust=20):
    algorithm=MiniBatchKMeans(num_clust)
    y = algorithm.fit_predict(X)

    def num_in_clust(Y,num_clusters=50):
        assert(num_clusters >= max(Y))
        occupancy = np.zeros(num_clusters)
        for y in Y:
            occupancy[y] += 1
        return occupancy

    clust_ind = sorted(range(num_clust),key=lambda i:(y==i).dot(np.arange(len(y))))
    clust_mat = np.array([num_in_clust(y[times==t],num_clust)[clust_ind] for t in sorted(set(times))]).T


    aspect = float(clust_mat.shape[1]) / clust_mat.shape[0]
    plt.imshow(clust_mat,aspect=aspect,
              interpolation='none',cmap='Blues')
    plt.xlabel('Timestep')
    plt.ylabel('Cluster ID')
    plt.title('Cluster occupancy per time step')
    plt.colorbar()
    plt.savefig('../figures/clust_prog_analysis.pdf')

    return clust_mat, y

NG_raw,NN_raw,Oct4_raw, t_NG,t_NN,t_Oct4 = import_data()
NG,NN,Oct4 = (process_data(dataset) for dataset in (NG_raw,NN_raw,Oct4_raw))

X = np.vstack((NG,NN,Oct4))
#Y = np.vstack((t_NG,t_NN+2*t_NG.max(),t_Oct4+2*(t_NG.max()+t_NN.max())))

#compare_2d_embeddings(X,Y)
clust_mat,y = cluster_prog_analysis(NG,t_NG)

In [2]:
NG.shape,NN.shape,Oct4.shape


Out[2]:
((203017, 44), (559654, 44), (296682, 44))

In [3]:
Y = np.hstack((np.zeros(len(NG)),np.ones(len(NN)),2*np.ones(len(Oct4)))).T

In [4]:
from sklearn.decomposition import PCA
pca = PCA()

In [5]:
pca.fit(X)


Out[5]:
PCA(copy=True, n_components=None, whiten=False)

In [6]:
%matplotlib inline

In [7]:
plt.plot(pca.explained_variance_ratio_)


Out[7]:
[<matplotlib.lines.Line2D at 0x10cb06dd8>]

In [12]:
ind = np.random.rand(len(X)) < 0.01
X_ = pca.transform(X)
X__ = X_[ind]
plt.scatter(X__[:,0],X__[:,1],c=Y[ind],cmap='rainbow',alpha=0.2)


Out[12]:
<matplotlib.collections.PathCollection at 0x1125bd710>

In [13]:
Y.shape,X_.shape


Out[13]:
((1059353,), (1059353, 44))

In [14]:
#downsample
np.random.seed(0)
ind = np.random.rand(len(X))<0.01
X_s = X_[ind]
Y_s = Y[ind]

In [15]:
# all three treatments on same axes, with same color scheme
sns.kdeplot(X_s[Y_s==0][:,0],X_s[Y_s==0][:,1])
sns.kdeplot(X_s[Y_s==1][:,0],X_s[Y_s==1][:,1])
sns.kdeplot(X_s[Y_s==2][:,0],X_s[Y_s==2][:,1])


Out[15]:
<matplotlib.axes._subplots.AxesSubplot at 0x1124dacf8>

In [16]:
# separate axes, seperate color schemes
sns.kdeplot(X_s[Y_s==0][:,0],X_s[Y_s==0][:,1],cmap='Blues')
plt.figure()
sns.kdeplot(X_s[Y_s==1][:,0],X_s[Y_s==1][:,1],cmap='Greens')
plt.figure()
sns.kdeplot(X_s[Y_s==2][:,0],X_s[Y_s==2][:,1],cmap='Reds')


Out[16]:
<matplotlib.axes._subplots.AxesSubplot at 0x1127b6e48>

In [17]:
# all on the same axes, but different colorschemes
sns.kdeplot(X_s[Y_s==0][:,0],X_s[Y_s==0][:,1],cmap='Blues')
sns.kdeplot(X_s[Y_s==1][:,0],X_s[Y_s==1][:,1],cmap='Greens')
sns.kdeplot(X_s[Y_s==2][:,0],X_s[Y_s==2][:,1],cmap='Reds')


Out[17]:
<matplotlib.axes._subplots.AxesSubplot at 0x10cb64a58>

In [ ]:
sns.kdeplot(X_s[Y_s==0][:,0],X_s[Y_s==0][:,1],cmap='Blues')
plt.figure()
sns.kdeplot(X_s[Y_s==1][:,0],X_s[Y_s==1][:,1],cmap='Greens')
plt.figure()
sns.kdeplot(X_s[Y_s==2][:,0],X_s[Y_s==2][:,1],cmap='Reds')

In [18]:
X_nn = pca.transform(NN)
X_nn_f = X_nn[t_NN==max(t_NN)]
sns.kdeplot(X_nn_f[:,0],X_nn_f[:,1])


Out[18]:
<matplotlib.axes._subplots.AxesSubplot at 0x10d66be80>

In [19]:
# NN
for t in set(t_NN):
    X_nn = pca.transform(NN)
    X_nn_f = X_nn[t_NN==t]
    sns.kdeplot(X_nn_f[:,0],X_nn_f[:,1],cmap='Blues')
    name = 'Day {0}'.format(int(t))
    plt.title(name)
    plt.xlim(-10,10)
    plt.ylim(-8,10)
    plt.savefig('../figures/density-animation/NN-' + name + '.jpg')
    plt.close()

In [20]:
# NG
for t in set(t_NG):
    X_ng = pca.transform(NG)
    X_ng_f = X_ng[t_NG==t]
    sns.kdeplot(X_ng_f[:,0],X_ng_f[:,1],cmap='Blues')
    name = 'Day {0}'.format(int(t))
    plt.title(name)
    plt.xlim(-10,10)
    plt.ylim(-8,10)
    plt.savefig('../figures/density-animation/NG-' + name + '.jpg')
    plt.close()

In [21]:
set(t_NG)==set(t_NN)


Out[21]:
True

In [22]:
# both
for t in set(t_NG):
    X_ng = pca.transform(NG)
    X_ng_f = X_ng[t_NG==t]
    sns.kdeplot(X_ng_f[:,0],X_ng_f[:,1],cmap='Blues')
    
    X_nn = pca.transform(NN)
    X_nn_f = X_nn[t_NN==t]
    sns.kdeplot(X_nn_f[:,0],X_nn_f[:,1],cmap='Greens')
    
    name = 'Day {0}'.format(int(t))
    plt.title(name)
    plt.xlim(-10,10)
    plt.ylim(-8,10)
    plt.savefig('../figures/density-animation/NG+NN-' + name + '.jpg')
    plt.close()

In [58]:
len(set(t_NN))


Out[58]:
14

In [52]:
[t for t in set(t_NG)]


Out[52]:
[0.0, 2.0, 4.0, 6.0, 8.0, 10.0, 12.0, 14.0, 16.0, 18.0, 20.0, 22.0, 24.0, 30.0]

In [42]:
X_nn.shape,t_NG.shape


Out[42]:
((203017, 44), (203017,))

In [47]:
NN.shape,NG.shape,Oct4.shape


Out[47]:
((203017, 44), (203017, 44), (203017, 44))

In [41]:
X_ng = pca.transform(NG)
X_ng_f = X_ng[t_NG==max(t_NG)] # error: need to fix t_NG doesn't match up with NN
sns.kdeplot(X_ng_f[:,0],X_ng_f[:,1])


Out[41]:
<matplotlib.axes._subplots.AxesSubplot at 0x10ee696a0>

In [37]:
X_nn.shape,X_ng.shape,t_NN.shape,t_NG.shape,t_Oct4.shape


Out[37]:
((203017, 44), (559654,), (203017,), (296682,))

In [36]:
X_nn_f.shape,t_NN_f.shape


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-36-06d2fcb41d2f> in <module>()
----> 1 X_nn_f.shape,t_NN_f.shape

NameError: name 'X_nn_f' is not defined

In [23]:
X_[Y==0]


Out[23]:
array([[ -1.31565059e+00,   5.60858060e-01,  -3.27357196e+00, ...,
         -1.51114063e-01,   1.13190719e-01,  -8.25093753e-02],
       [  3.78952897e+00,   3.87097434e-01,  -2.18600034e+00, ...,
          1.91832145e-01,  -6.20886794e-02,   7.95127610e-02],
       [ -2.51574924e+00,   6.97657461e-01,  -4.27735369e+00, ...,
         -2.55300550e-01,  -1.58705254e-01,   1.58778777e-05],
       ..., 
       [  5.54144996e+00,  -1.05691653e+00,   8.26916473e-01, ...,
         -8.23163783e-02,   2.85532291e-01,  -2.90732540e-02],
       [ -5.70408197e+00,  -3.20146607e+00,  -2.80741914e+00, ...,
          2.51765556e-01,  -1.39491910e-01,   4.36253435e-01],
       [ -2.26574481e+00,  -1.54490405e+00,  -2.36573552e+00, ...,
          4.95447800e-01,  -1.77994365e-01,  -5.97305592e-03]])

In [22]:
X_.shape,Y.shape


Out[22]:
((609051, 44), (609051,))

In [ ]:
# have each time point a separate array

In [ ]:
# idea: use kernel density estimator

In [59]:
from sklearn.neighbors import KernelDensity

In [60]:
kde = KernelDensity()

In [ ]:
kde.fit(X)

In [61]:
X.shape


Out[61]:
(609051, 44)

In [ ]:
kde.fit()