In [79]:
from pathlib import Path
import pandas as pd
import numpy as np

In [80]:
root=Path('../').resolve()
root


Out[80]:
WindowsPath('C:/Users/user/Documents/GitHub/Carkinos')

In [81]:
val_pth=Path('../').resolve().joinpath('src','raw','ptest_data.xlsx')
val=pd.read_excel(val_pth.as_posix())

In [82]:
nci_val_pth=Path('../').resolve().joinpath('src','nci60.npy')
nci_val=np.load(nci_val_pth.as_posix(),mmap_mode='r')

In [83]:
val #assign datas in


Out[83]:
GSM803621 GSM803622 GSM803623 GSM803625 GSM803673 GSM803627 GSM803628 GSM803629 GSM803630 GSM803631
0 8.6411 9.1225 11.7064 9.4505 10.7963 9.0860 9.5048 8.7645 9.9437 9.7562
1 8.7376 7.8232 8.3946 8.7498 8.2350 9.0479 8.4730 8.4184 9.0894 8.6690
2 6.5835 6.7591 6.7469 6.7086 6.7443 6.5651 6.6788 6.6414 6.7041 6.7211
3 8.0363 7.8883 8.0737 8.0391 7.8640 7.8576 7.8939 7.7059 7.9346 7.8148
4 5.8235 5.8564 5.9594 5.8791 5.9691 5.9568 5.9607 5.9074 5.9644 5.9673
5 6.9572 7.4644 6.9672 7.3993 6.9801 7.1211 7.6634 8.0684 6.9568 7.5553

In [86]:
val=val.as_matrix()
val


Out[86]:
array([[  8.6411,   9.1225,  11.7064,   9.4505,  10.7963,   9.086 ,
          9.5048,   8.7645,   9.9437,   9.7562],
       [  8.7376,   7.8232,   8.3946,   8.7498,   8.235 ,   9.0479,
          8.473 ,   8.4184,   9.0894,   8.669 ],
       [  6.5835,   6.7591,   6.7469,   6.7086,   6.7443,   6.5651,
          6.6788,   6.6414,   6.7041,   6.7211],
       [  8.0363,   7.8883,   8.0737,   8.0391,   7.864 ,   7.8576,
          7.8939,   7.7059,   7.9346,   7.8148],
       [  5.8235,   5.8564,   5.9594,   5.8791,   5.9691,   5.9568,
          5.9607,   5.9074,   5.9644,   5.9673],
       [  6.9572,   7.4644,   6.9672,   7.3993,   6.9801,   7.1211,
          7.6634,   8.0684,   6.9568,   7.5553]])

In [103]:
t_val=np.transpose(val)
mean_val=np.mean(t_val,axis=0)

print(mean_val)
t_val


[ 9.6772   8.56379  6.68529  7.91082  5.92441  7.31332]
Out[103]:
array([[  8.6411,   8.7376,   6.5835,   8.0363,   5.8235,   6.9572],
       [  9.1225,   7.8232,   6.7591,   7.8883,   5.8564,   7.4644],
       [ 11.7064,   8.3946,   6.7469,   8.0737,   5.9594,   6.9672],
       [  9.4505,   8.7498,   6.7086,   8.0391,   5.8791,   7.3993],
       [ 10.7963,   8.235 ,   6.7443,   7.864 ,   5.9691,   6.9801],
       [  9.086 ,   9.0479,   6.5651,   7.8576,   5.9568,   7.1211],
       [  9.5048,   8.473 ,   6.6788,   7.8939,   5.9607,   7.6634],
       [  8.7645,   8.4184,   6.6414,   7.7059,   5.9074,   8.0684],
       [  9.9437,   9.0894,   6.7041,   7.9346,   5.9644,   6.9568],
       [  9.7562,   8.669 ,   6.7211,   7.8148,   5.9673,   7.5553]])

In [88]:
new_val=t_val-mean_val
print(new_val)


[[-1.0361   0.17381 -0.10179  0.12548 -0.10091 -0.35612]
 [-0.5547  -0.74059  0.07381 -0.02252 -0.06801  0.15108]
 [ 2.0292  -0.16919  0.06161  0.16288  0.03499 -0.34612]
 [-0.2267   0.18601  0.02331  0.12828 -0.04531  0.08598]
 [ 1.1191  -0.32879  0.05901 -0.04682  0.04469 -0.33322]
 [-0.5912   0.48411 -0.12019 -0.05322  0.03239 -0.19222]
 [-0.1724  -0.09079 -0.00649 -0.01692  0.03629  0.35008]
 [-0.9127  -0.14539 -0.04389 -0.20492 -0.01701  0.75508]
 [ 0.2665   0.52561  0.01881  0.02378  0.03999 -0.35652]
 [ 0.079    0.10521  0.03581 -0.09602  0.04289  0.24198]]

In [89]:
covMat=np.cov(new_val,rowvar=0) 
covMat


Out[89]:
array([[  8.99127709e-01,  -5.69096111e-02,   4.11522667e-02,
          3.90759689e-02,   3.08588189e-02,  -1.69038224e-01],
       [ -5.69096111e-02,   1.44565543e-01,  -1.50450401e-02,
          6.46081133e-03,   4.90854456e-03,  -4.29139109e-02],
       [  4.11522667e-02,  -1.50450401e-02,   4.63118989e-03,
          9.26296889e-04,   8.77191222e-04,  -2.14810889e-04],
       [  3.90759689e-02,   6.46081133e-03,   9.26296889e-04,
          1.29251484e-02,  -1.70618356e-03,  -2.88856416e-02],
       [  3.08588189e-02,   4.90854456e-03,   8.77191222e-04,
         -1.70618356e-03,   2.90853878e-03,  -1.72013356e-03],
       [ -1.69038224e-01,  -4.29139109e-02,  -2.14810889e-04,
         -2.88856416e-02,  -1.72013356e-03,   1.44798331e-01]])

In [90]:
eigVals,eigVects=np.linalg.eig(np.mat(covMat))

In [91]:
eigVals


Out[91]:
array([  9.42864684e-01,   1.84947153e-01,   7.18180225e-02,
         7.48096339e-03,   3.32767384e-04,   1.51286919e-03])

In [92]:
eigVects


Out[92]:
matrix([[-0.97436534,  0.07893162,  0.19836893, -0.01022694, -0.05108231,
          0.04814316],
        [ 0.05869008, -0.79012727,  0.59968756, -0.04890489, -0.07034452,
         -0.07272659],
        [-0.04380134,  0.08260814, -0.01602174, -0.08467942, -0.01681163,
         -0.99173959],
        [-0.04688762, -0.11056189, -0.17240662, -0.92627271,  0.30501319,
          0.06956579],
        [-0.03201291, -0.01208304,  0.1169786 ,  0.29033814,  0.9481855 ,
         -0.04234615],
        [ 0.20500203,  0.59184006,  0.74656538, -0.21921623, -0.00841533,
          0.04704334]])

In [93]:
n=3
eigValIndice=np.argsort(eigVals)            #对特征值从小到大排序  
n_eigValIndice=eigValIndice[-1:-(n+1):-1]   #最大的n个特征值的下标  
n_eigVect=eigVects[:,n_eigValIndice]        #最大的n个特征值对应的特征向量  
lowDDataMat=new_val*n_eigVect               #低维特征空间的数据  
reconMat=(lowDDataMat*n_eigVect.T)+mean_val  #重构数据

In [94]:
reconMat


Out[94]:
matrix([[  8.63823723,   8.73650269,   6.61288339,   7.98498761,
           5.85282197,   6.94302851],
        [  9.12468386,   7.82110332,   6.72220495,   7.89228277,
           5.84752675,   7.46702222],
        [ 11.70315308,   8.39315805,   6.78190459,   7.99643959,
           5.99181687,   6.94705087],
        [  9.4501112 ,   8.73916566,   6.66257641,   7.89552905,
           5.9301686 ,   7.36580061],
        [ 10.79748356,   8.24090607,   6.75361037,   7.9848839 ,
           5.93548997,   7.00842362],
        [  9.08568998,   9.056482  ,   6.61431634,   7.93687032,
           5.91791644,   7.13974375],
        [  9.50558715,   8.4751306 ,   6.69413378,   7.83936967,
           5.9347453 ,   7.65328245],
        [  8.76195682,   8.42074281,   6.67616667,   7.74861594,
           5.92326902,   8.07531799],
        [  9.94687621,   9.08662606,   6.64688921,   7.97460214,
           5.95346297,   6.96790801],
        [  9.75822092,   8.66808275,   6.6882143 ,   7.85461901,
           5.9568821 ,   7.56562196]])

In [95]:
lowDDataMat


Out[95]:
matrix([[ 0.94854103, -0.45094184, -0.39897226],
        [ 0.52798701,  0.64020111, -0.44662244],
        [-2.06952301,  0.07566037,  0.01769228],
        [ 0.24384639, -0.12568876,  0.10297725],
        [-1.17983983,  0.1604166 , -0.21159271],
        [ 0.57177469, -0.54737232,  0.04442431],
        [ 0.23433508,  0.2662153 ,  0.17997943],
        [ 1.04763842,  0.50895851,  0.32951964],
        [-0.30512668, -0.60682484,  0.10217838],
        [-0.0196331 ,  0.07937587,  0.28041612]])

In [96]:
np.array(lowDDataMat[:,0]).flatten()


Out[96]:
array([ 0.94854103,  0.52798701, -2.06952301,  0.24384639, -1.17983983,
        0.57177469,  0.23433508,  1.04763842, -0.30512668, -0.0196331 ])

In [97]:
#import matplotlib.pyplot as plt
#from mpl_toolkits.mplot3d import Axes3D
#import pylab
#fig = pylab.figure()
#ax = Axes3D(fig)  
#x=np.array(lowDDataMat[:,0]).flatten()
#y=np.array(lowDDataMat[:,1]).flatten()
#z=np.array(lowDDataMat[:,2]).flatten()
#ax.scatter(x,y,z,s=50,marker=(5,3),c=['r','b'],alpha=0.5,lw=2)   
#plt.show()

In [98]:
import sklearn
from sklearn.decomposition import PCA
import pylab
import matplotlib.pyplot as plt

In [99]:
pca= PCA(n_components=3)
X = pca.fit_transform(t_val)
(X)


Out[99]:
array([[-0.94854103, -0.45094184,  0.39897226],
       [-0.52798701,  0.64020111,  0.44662244],
       [ 2.06952301,  0.07566037, -0.01769228],
       [-0.24384639, -0.12568876, -0.10297725],
       [ 1.17983983,  0.1604166 ,  0.21159271],
       [-0.57177469, -0.54737232, -0.04442431],
       [-0.23433508,  0.2662153 , -0.17997943],
       [-1.04763842,  0.50895851, -0.32951964],
       [ 0.30512668, -0.60682484, -0.10217838],
       [ 0.0196331 ,  0.07937587, -0.28041612]])

In [100]:
lowDDataMat


Out[100]:
matrix([[ 0.94854103, -0.45094184, -0.39897226],
        [ 0.52798701,  0.64020111, -0.44662244],
        [-2.06952301,  0.07566037,  0.01769228],
        [ 0.24384639, -0.12568876,  0.10297725],
        [-1.17983983,  0.1604166 , -0.21159271],
        [ 0.57177469, -0.54737232,  0.04442431],
        [ 0.23433508,  0.2662153 ,  0.17997943],
        [ 1.04763842,  0.50895851,  0.32951964],
        [-0.30512668, -0.60682484,  0.10217838],
        [-0.0196331 ,  0.07937587,  0.28041612]])

In [102]:
fig = pylab.figure()
ax = Axes3D(fig)  
x=np.array(lowDDataMat[:,0]).flatten()
y=np.array(lowDDataMat[:,1]).flatten()
z=np.array(lowDDataMat[:,2]).flatten()
#col=['r']*15+['b']*18+['g']*21+['y']*18+['k']*26+['c']*26+['m']*26+['#bcbcbc']*21+['#6d904f']*6+['#bc82bd']*23
ax.scatter(x,y,z,s=50,marker=(5,3),c='b',alpha=0.5,lw=2) 
xx=np.array(X[:,0]).flatten()
yy=np.array(X[:,1]).flatten()
zz=np.array(X[:,2]).flatten()
ax.scatter(xx,yy,zz,s=50,marker=(5,3),c='r',alpha=0.5,lw=2)
plt.show()

In [ ]:


In [ ]: