In [1]:
from pathlib import Path
import pandas as pd
import numpy as np
# Resolve the parent of the current working directory once and reuse it.
root = Path('../').resolve()
# Expression matrix used by the (currently disabled) loading code below.
# Other file names noted by the author:
# sanger_cell_line_proj.npy / nci60.npy / GSE36133.npy
nci_val_pth = root.joinpath('src', 'nci60.npy')
# Disabled loading / NaN screening, kept for reference:
#nci_val=np.load(nci_val_pth.as_posix(),mmap_mode='r')
#pd.isnull(nci_val).any(1).nonzero()[0]
# SANGER: counting from row[0], row[22275] through row[22280] contain NaN
#nci_val=nci_val[~np.isnan(nci_val).any(axis=1)]
#pd.isnull(nci_val).any(1).nonzero()[0]

In [3]:
import sklearn
from sklearn.decomposition import PCA

In [95]:
# Read the spreadsheet of test values and expose it as a plain NumPy array.
val_pth = Path('../').resolve().joinpath('src', 'raw', 'ptest_data.xlsx')
val_frame = pd.read_excel(val_pth.as_posix())
# DataFrame.as_matrix() was deprecated in pandas 0.23 and removed in 1.0;
# .values is the documented replacement and works on old and new pandas alike.
val = val_frame.values
val


Out[95]:
array([[-5.366 , -5.7631, -5.6608, -5.6256, -4.8057, -5.6661, -5.5468,
        -5.3902, -5.0451, -5.2662, -4.8927, -4.9939, -4.3508, -4.3943,
        -5.421 , -5.2474],
       [-7.1705, -7.3635, -5.2312, -5.1687, -7.5804, -6.7095, -6.1366,
        -7.5169, -6.9669, -6.0586, -7.5595, -5.2557, -7.2247, -5.8687,
        -7.0951, -6.9543],
       [-6.1506, -6.4969, -5.673 , -5.4194, -6.6505, -5.8046, -5.7578,
        -6.7381, -5.7962, -5.5539, -6.1081, -4.7379, -6.3432, -5.4007,
        -5.9156, -5.9778],
       [-7.4163, -7.6428, -6.2846, -6.0678, -7.7376, -7.6556, -6.4619,
        -7.7967, -7.8114, -6.2607, -7.6419, -5.7602, -7.5193, -6.1356,
        -7.8308, -7.8084],
       [-6.7061, -7.0073, -5.8592, -5.5943, -6.6342, -6.8321, -6.0719,
        -7.2331, -7.0227, -5.86  , -6.987 , -5.1277, -6.334 , -5.5927,
        -7.0556, -7.1553],
       [-7.2542, -7.5315, -6.0611, -6.0768, -7.5042, -6.9675, -6.3497,
        -7.2016, -6.7728, -6.1676, -7.3919, -5.5271, -7.2688, -5.9041,
        -7.0795, -7.216 ],
       [-5.351 , -6.4082, -5.7879, -5.6501, -6.4023, -6.5835, -6.2056,
        -7.2589, -6.7977, -6.0623, -7.1961, -5.5584, -7.0688, -5.8349,
        -6.5194, -6.1492],
       [-5.6758, -5.6102, -4.732 , -4.6727, -6.1917, -5.7479, -5.8774,
        -5.8232, -5.0315, -5.1327, -6.2051, -4.4296, -5.7149, -4.9915,
        -5.6598, -5.8521],
       [-6.7913, -7.1181, -5.9237, -5.7847, -7.1502, -6.9125, -6.1993,
        -7.1933, -6.8246, -5.9417, -7.0551, -5.3498, -6.9104, -5.7297,
        -6.9821, -7.0431],
       [-4.7406, -5.8436, -5.389 , -5.1811, -6.6555, -5.7675, -5.699 ,
        -6.8278, -6.1393, -5.8158, -6.5356, -5.3201, -6.2358, -5.1938,
        -5.8467, -5.9764]])

In [96]:
# Alternative inputs used during exploration (left disabled):
#val=nci_val
#val=np.array([[-4.8927,-4.9939,-4.3508,-4.3943],
#[-7.5595,-5.2557,-7.2247,-5.8687],
#[-6.1081,-4.7379,-6.3432,-5.4007],
#[-7.6419,-5.7602,-7.5193,-6.1356],
#[-6.987,-5.1277,-6.334,-5.5927],
#[-7.3919,-5.5271,-7.2688,-5.9041],
#[-7.1961,-5.5584,-7.0688,-5.8349],
#[-6.2051,-4.4296,-5.7149,-4.9915],
#[-7.0551,-5.3498,-6.9104,-5.7297],
#[-6.5356,-5.3201,-6.2358,-5.1938],
#
#])

# Swap the axes so that every column of val becomes one row of t_val.
t_val = val.T
t_val


Out[96]:
array([[-5.366 , -7.1705, -6.1506, -7.4163, -6.7061, -7.2542, -5.351 ,
        -5.6758, -6.7913, -4.7406],
       [-5.7631, -7.3635, -6.4969, -7.6428, -7.0073, -7.5315, -6.4082,
        -5.6102, -7.1181, -5.8436],
       [-5.6608, -5.2312, -5.673 , -6.2846, -5.8592, -6.0611, -5.7879,
        -4.732 , -5.9237, -5.389 ],
       [-5.6256, -5.1687, -5.4194, -6.0678, -5.5943, -6.0768, -5.6501,
        -4.6727, -5.7847, -5.1811],
       [-4.8057, -7.5804, -6.6505, -7.7376, -6.6342, -7.5042, -6.4023,
        -6.1917, -7.1502, -6.6555],
       [-5.6661, -6.7095, -5.8046, -7.6556, -6.8321, -6.9675, -6.5835,
        -5.7479, -6.9125, -5.7675],
       [-5.5468, -6.1366, -5.7578, -6.4619, -6.0719, -6.3497, -6.2056,
        -5.8774, -6.1993, -5.699 ],
       [-5.3902, -7.5169, -6.7381, -7.7967, -7.2331, -7.2016, -7.2589,
        -5.8232, -7.1933, -6.8278],
       [-5.0451, -6.9669, -5.7962, -7.8114, -7.0227, -6.7728, -6.7977,
        -5.0315, -6.8246, -6.1393],
       [-5.2662, -6.0586, -5.5539, -6.2607, -5.86  , -6.1676, -6.0623,
        -5.1327, -5.9417, -5.8158],
       [-4.8927, -7.5595, -6.1081, -7.6419, -6.987 , -7.3919, -7.1961,
        -6.2051, -7.0551, -6.5356],
       [-4.9939, -5.2557, -4.7379, -5.7602, -5.1277, -5.5271, -5.5584,
        -4.4296, -5.3498, -5.3201],
       [-4.3508, -7.2247, -6.3432, -7.5193, -6.334 , -7.2688, -7.0688,
        -5.7149, -6.9104, -6.2358],
       [-4.3943, -5.8687, -5.4007, -6.1356, -5.5927, -5.9041, -5.8349,
        -4.9915, -5.7297, -5.1938],
       [-5.421 , -7.0951, -5.9156, -7.8308, -7.0556, -7.0795, -6.5194,
        -5.6598, -6.9821, -5.8467],
       [-5.2474, -6.9543, -5.9778, -7.8084, -7.1553, -7.216 , -6.1492,
        -5.8521, -7.0431, -5.9764]])

In [97]:
# Fit a 3-component PCA on t_val (the transposed value matrix) and keep the
# fitted model in `pca` so later cells can read its attributes.
pca= PCA(n_components=3)
# fit_transform fits the model and returns the projected coordinates:
# one row per row of t_val, one column per retained component.
X = pca.fit_transform(t_val)
print(X)


[[  3.57909214e-01   1.45167116e+00  -8.34760590e-01]
 [  1.43705365e+00   6.57368909e-01   1.69662377e-01]
 [ -2.14339445e+00   2.44562198e-01   5.10341660e-01]
 [ -2.55896716e+00   2.95749196e-01   3.19789509e-01]
 [  1.84064926e+00  -3.40068949e-01  -5.35856542e-01]
 [  6.97232647e-01   3.67917331e-01   4.34733141e-01]
 [ -8.78073683e-01  -4.08845839e-02   1.33606554e-01]
 [  2.12257996e+00  -5.11199088e-01   5.66596069e-01]
 [  8.08135778e-01  -2.44727149e-01   3.84347661e-01]
 [ -1.47559264e+00  -3.31625912e-01   8.32919369e-02]
 [  1.87500035e+00  -6.39713082e-01  -6.66600459e-02]
 [ -3.36681271e+00  -3.94783019e-01  -1.07724701e-01]
 [  1.18423070e+00  -8.76125244e-01  -5.41964184e-01]
 [ -2.13883587e+00  -4.43423820e-01  -7.24914269e-01]
 [  1.10211353e+00   3.52543067e-01   2.07640348e-01]
 [  1.13677141e+00   4.52738990e-01   1.87107653e-03]]

In [98]:
# Per-component share of the total variance, as reported by the fitted PCA.
t = pca.explained_variance_ratio_
# Share covered by the first two components only.
# NOTE(review): the model is fitted with three components and the plot uses
# all three - confirm that summing two here is intentional.
print('propotion:', sum(t[:2]))
# Wider cut-offs tried with larger inputs (left disabled):
#print('propotion:',sum(t[0:10]))
#print('propotion:',sum(t[0:20]))
#print('propotion:',sum(t[0:30]))


propotion: 0.887234952209

In [100]:
import pylab  # no longer used in this cell; kept in case other cells still reference it
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D  # registers the '3d' projection on older Matplotlib

# 3D scatter of the three principal-component scores in X, one point per row.
fig = plt.figure()
# Axes3D(fig) stopped attaching itself to the figure in recent Matplotlib
# releases (blank output); add_subplot creates and registers the 3D axes.
ax = fig.add_subplot(111, projection='3d')

# One colour per row of X. Earlier colourings are kept for reference:
#r=gse36133,b=sanger,g=nci60
#col=['r','r','b','b','r','g','b','r','g','b','r','b','r','b','g','g']
#r=TT,g=UACC-62,b=LOXIMVI,c=A3-KAW,m=COLO-741
col=['r','r','r','r','g','g','g','b','b','b','c','c','m','m','g','g']
#col=['r']*15+['b']*18+['g']*21+['m']*18+['k']*26+['c']*26+['y']*26+['#05f6f0']*21+['#75f605']*6+['#f472ce']*23

# Fail with a clear message instead of an opaque Matplotlib error when the
# colour list and the projected points get out of sync.
if len(col) != len(X):
    raise ValueError(
        'colour list has %d entries but X has %d rows' % (len(col), len(X))
    )

# The column slices are already 1-D, so no extra array/flatten copies are needed.
xx, yy, zz = X[:, 0], X[:, 1], X[:, 2]

# marker=(5, 3) was the legacy tuple spelling of a circle; that style was
# deprecated in Matplotlib 3.0 and later removed, so use 'o' directly.
ax.scatter(xx, yy, zz, s=80, marker='o', c=col)
ax.set_xlabel('PC1')
ax.set_ylabel('PC2')
ax.set_zlabel('PC3')
plt.show()

In [ ]:
# Bootstrap Django inside the notebook so the project's ORM models can be used.
# The order matters: the settings module must be named in the environment
# before django.setup() runs, and the models are imported only after setup.
import django
import os
os.environ['DJANGO_SETTINGS_MODULE'] = 'Carkinos.settings.local'
django.setup()
from probes.models import Sample,ProbeID,CellLine,Dataset,Platform
# 'offset' values of every Sample whose dataset is named 'NCI60'
# (flat=True yields bare values rather than 1-tuples).
# NOTE(review): presumably each offset is the sample's column index in the
# expression matrix - confirm against the model definition.
nci60_offset=Sample.objects.filter(dataset_id__name__in=['NCI60']).values_list('offset',flat=True)

In [ ]:
# nci_val is otherwise only assigned in commented-out code, so on a fresh
# kernel this cell raised NameError. Load it here, memory-mapped and
# read-only, from the path defined in the first cell.
nci_val = np.load(nci_val_pth.as_posix(), mmap_mode='r')

# All samples whose dataset is named 'NCI60'.
nci_sample = Sample.objects.filter(dataset_id__name__in=['NCI60'])
# Probes of platform 3 and their 'offset' values.
# NOTE(review): platform=3 is a hard-coded key - confirm which platform it denotes.
pprobe = ProbeID.objects.filter(platform=3)
poffset = pprobe.values_list('offset', flat=True)
# Peek at the 3x2 top-left corner (rows 0-2, columns 0-1) of the matrix.
nci_val[np.ix_([0, 1, 2], [0, 1])]

In [ ]:
# Spot-check two entries of the NCI60 sample queryset.
# NOTE(review): the queryset has no explicit order_by in this notebook, so
# which rows sit at positions 1 and 15 depends on the default ordering.
sample_at_1 = nci_sample[1]
print(sample_at_1.offset)
# Shown as the cell output: primary site of the cell line linked to entry 15.
nci_sample[15].cell_line_id.primary_site

In [ ]:
#pd.isnull(new_sanger_val).any(1).nonzero()[0]

In [ ]:


In [ ]: