In [1]:
from pathlib import Path
import pandas as pd
import numpy as np
import django
import os
# Select the project's local settings module and initialise Django here, so
# that django.setup() has run before the probes.models import just below.
os.environ['DJANGO_SETTINGS_MODULE'] = 'Carkinos.settings.local'
django.setup()
from probes.models import Dataset,Platform,Sample,CellLine,ProbeID
# Resolve the parent of the current working directory once and derive every
# input location from it.
root = Path('../').resolve()
_src_dir = root / 'src'
plus2_path = _src_dir / 'raw' / 'Affy_U133plus2_probe_info.csv'
val_pth = _src_dir / 'PCA_TEST.quantile_normalized.tsv'
In [2]:
import sklearn
from sklearn.decomposition import PCA
In [3]:
import pylab
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
In [4]:
# Read the tab-separated table stored at val_pth.
val = pd.read_csv(val_pth.as_posix(), sep='\t')
# Second name bound to the very same DataFrame object (no copy is made), so
# in-place changes made through `val` are visible through `original_data`.
original_data = val
In [5]:
# Preview the first rows of the freshly loaded table.
val.head()
Out[5]:
In [6]:
# Use the 'Unnamed: 0' column as the row index. The assignment changes the
# frame in place, so `original_data` (the same object as `val`) receives the
# new index as well; the later label-based `.loc` lookup on `original_data`
# relies on that.
val.index = val['Unnamed: 0']
# Drop the index name so it is not shown as a header.
val.index.name = None
# Every column except the first one; not referenced again in the visible cells.
val_test = val.iloc[:, 1:]
val_test.head()
Out[6]:
In [7]:
# Column names in the order the samples should be arranged. Entries are laid
# out in groups: each group opens with a "*.CEL.gz" file and is followed by the
# "*hp133a11.cel.gz" files listed after it.
new_name = [
    "GSM886845.CEL.gz",
    "GSM803640_113409hp133a11.cel.gz",
    "GSM803699_113468hp133a11.cel.gz",
    "GSM803758_118194hp133a11.cel.gz",

    "GSM886856.CEL.gz",
    "GSM803641_113410hp133a11.cel.gz",
    "GSM803700_113469hp133a11.cel.gz",
    "GSM803759_118195hp133a11.cel.gz",

    "GSM886858.CEL.gz",
    "GSM803648_113417hp133a11.cel.gz",
    "GSM803707_113476hp133a11.cel.gz",
    "GSM803765_118202hp133a11.cel.gz",

    "GSM886863.CEL.gz",
    "GSM803642_113411hp133a11.cel.gz",
    "GSM803701_113470hp133a11.cel.gz",
    "GSM803760_118196hp133a11.cel.gz",

    "GSM886894.CEL.gz",
    "GSM803621_113390hp133a11.cel.gz",
    "GSM803680_113449hp133a11.cel.gz",
    "GSM803739_118175hp133a11.cel.gz",

    # This group has only three entries.
    "GSM886902.CEL.gz",
    "GSM803643_113412hp133a11.cel.gz",
    "GSM803702_113471hp133a11.cel.gz",

    "GSM886940.CEL.gz",
    "GSM803633_113402hp133a11.cel.gz",
    "GSM803692_113461hp133a11.cel.gz",
    "GSM803751_118187hp133a11.cel.gz",

    "GSM886988.CEL.gz",
    "GSM803663_113432hp133a11.cel.gz",
    "GSM803721_113490hp133a11.cel.gz",
    "GSM803779_118217hp133a11.cel.gz",

    "GSM887063.CEL.gz",
    "GSM803636_113405hp133a11.cel.gz",
    "GSM803695_113464hp133a11.cel.gz",
    "GSM803754_118190hp133a11.cel.gz",

    "GSM887083.CEL.gz",
    "GSM803616_113385hp133a11.cel.gz",
    "GSM803675_113444hp133a11.cel.gz",
    "GSM803734_118170hp133a11.cel.gz",
]
In [8]:
# Arrange (and restrict) the columns to the order given by `new_name`.
# DataFrame.reindex(columns=...) is the supported spelling of the former
# reindex_axis(..., axis=1), which was deprecated and then removed from pandas.
val = val.reindex(columns=new_name)
val
Out[8]:
In [9]:
# Switch from a DataFrame to a plain NumPy array for the PCA cells below.
# `.values` is used because DataFrame.as_matrix() was deprecated and then
# removed from pandas; `.values` is available in old and new releases alike.
val = val.values
val
Out[9]:
In [10]:
# PCA treats each row as one observation, so put the columns of `val` on rows.
t_val = np.transpose(val)
# Keep every available component. Leaving n_components unset makes PCA keep
# min(n_samples, n_features) components; the previously hard-coded 54675 is
# larger than the number of rows of t_val (one per entry of `new_name`), which
# scikit-learn rejects with a ValueError.
pca = PCA()
X = pca.fit_transform(t_val)
In [11]:
# Per-component share of the variance, taken from the most recently fitted PCA.
t = pca.explained_variance_ratio_
# Total over (at most) the first 40 entries; slicing past the end is harmless.
sum(t[:40])
Out[11]:
In [12]:
# Print the cumulative explained-variance ratio for the first 1..39 components.
# The number shown is the count of components actually included in the sum;
# the earlier version printed n+1 next to the sum of only n components.
# np.cumsum computes all running totals in one pass instead of re-summing.
for n, running_total in enumerate(np.cumsum(t[:39]), start=1):
    print('propotion ', n, ':', running_total)
In [13]:
# Number of consecutive samples in each group, in plotting order
# (the sixth group has three samples, all others four).
group_sizes = [4, 4, 4, 4, 4, 3, 4, 4, 4, 4]
# One color per group, used by the "colored by cell line" panels.
group_palette = ['r', 'b', 'g', 'm', 'k', 'c', 'y', '#05f6f0', '#75f605', '#f472ce']

# Used by the "colored by data set" panels: the first sample of every group is
# red and the remaining ones are blue.
col1 = [shade for size in group_sizes for shade in ['r'] + ['b'] * (size - 1)]
# Repeat each group's color once per sample in that group.
col = [shade for shade, size in zip(group_palette, group_sizes) for _ in range(size)]
print(col1[:6])
print(col[:16])
In [14]:
# Two side-by-side 3D scatter plots of the samples on the first three principal
# components of X: once colored with `col`, once with `col1`.
fig = plt.figure(figsize=plt.figaspect(0.5))
# Both panels share the same coordinates, so extract them once.
xx = np.array(X[:, 0]).flatten()
yy = np.array(X[:, 1]).flatten()
zz = np.array(X[:, 2]).flatten()
panels = [
    (col[:], "colored by cell line"),
    (col1[:], "colored by data set"),
]
for position, (colors, title) in enumerate(panels, start=1):
    ax = fig.add_subplot(1, 2, position, projection='3d')
    # 'o' replaces the (5, 3) tuple marker: the (n, 3) tuple form drew a circle
    # and was deprecated in Matplotlib 3.0 in favour of 'o'.
    ax.scatter(xx, yy, zz, s=80, marker='o', c=colors)
    ax.set_title(title)
    ax.set_xlabel('X')
    ax.set_ylabel('Y')
    ax.set_zlabel('Z')
plt.show()
In [121]:
# Row labels to pull out of the full table.
reference_probes = [
    '212581_x_at', '213453_x_at', '217398_x_at',
    'AFFX-HUMGAPDH/M33197_3_at',
    'AFFX-HUMGAPDH/M33197_5_at',
    'AFFX-HUMGAPDH/M33197_M_at',
]
# Label-based row selection followed by dropping the first column. The labels
# resolve because the index of this same object was set in place earlier
# through the `val` name.
tt = original_data.loc[reference_probes, :].iloc[:, 1:]
tt
Out[121]:
In [128]:
# Per-column mean of the selected rows, as a 1-D NumPy array.
# The means are re-ordered to `new_name` so that each value lines up with the
# matching column of `val` (whose columns were arranged in that order) when
# the two are subtracted; without this the means follow the file's original
# column order. `.values` replaces Series.as_matrix(), which was deprecated and
# then removed from pandas.
means = tt.mean().reindex(new_name).values
means
Out[128]:
In [137]:
# NOTE(review): scratch arithmetic on two hard-coded numbers; presumably a
# manual difference of values read off earlier outputs — confirm or remove.
8.95166541-13.69418395
Out[137]:
In [135]:
# Display the current `val` (a NumPy array by now if the cells above ran in
# order; note this cell's execution count is out of sequence).
val
Out[135]:
In [138]:
# Element-wise difference; assuming both operands are NumPy arrays, the 1-D
# `means` is broadcast along the last axis of `val`.
norm_val = np.subtract(val, means)
In [139]:
# PCA treats rows as observations, so flip norm_val before fitting.
t_norm_val = norm_val.T
# Re-binds the shared `pca` name to a fresh three-component model.
pca = PCA(n_components=3)
norm_X = pca.fit_transform(t_norm_val)
print(norm_X)
In [140]:
# Variance shares of the most recently fitted PCA.
norm_t = pca.explained_variance_ratio_
# Combined share of the first two components.
first_two = norm_t[:2]
print('propotion:', sum(first_two))
In [170]:
# Two side-by-side 3D scatter plots of norm_X (first three components):
# once colored with `col`, once with `col1`.
fig = plt.figure(figsize=plt.figaspect(0.5))
# Both panels share the same coordinates, so extract them once.
xx = np.array(norm_X[:, 0]).flatten()
yy = np.array(norm_X[:, 1]).flatten()
zz = np.array(norm_X[:, 2]).flatten()
panels = [
    (col[:], "norm_data colored by cell line"),
    (col1[:], "norm_data colored by data set"),
]
for position, (colors, title) in enumerate(panels, start=1):
    ax = fig.add_subplot(1, 2, position, projection='3d')
    # 'o' replaces the (5, 3) tuple marker: the (n, 3) tuple form drew a circle
    # and was deprecated in Matplotlib 3.0 in favour of 'o'.
    ax.scatter(xx, yy, zz, s=80, marker='o', c=colors)
    ax.set_title(title)
    ax.set_xlabel('X')
    ax.set_ylabel('Y')
    ax.set_zlabel('Z')
plt.show()
In [171]:
# Hand-entered 5 x 12 table of values; it is transposed before PCA in the next
# cell, so each of the 12 columns becomes one observation there.
_old_rows = [
    [8.9283, 8.8862, 8.9374, 9.0085, 10.151, 9.2932, 9.2658, 9.7032, 9.675, 9.4598, 9.4706, 9.4283],
    [10.0503, 8.6776, 8.597, 7.9725, 9.5993, 8.45, 8.4428, 8.9239, 9.6814, 8.6094, 8.6629, 8.3004],
    [6.8854, 6.6461, 6.6673, 6.6335, 6.9292, 6.879, 6.8687, 6.9176, 6.8493, 6.761, 6.6705, 6.6293],
    [10.502, 9.5326, 9.41, 9.1377, 10.2953, 8.8643, 8.9176, 8.8962, 8.064, 7.8083, 7.9111, 7.969],
    [6.8712, 5.8737, 6.0063, 5.963, 6.7891, 5.836, 5.9821, 5.9546, 6.7754, 5.986, 5.9624, 5.8532],
]
old_data = np.asarray(_old_rows)
In [181]:
# PCA treats rows as observations, so flip old_data before fitting.
old_tval = old_data.T
# Re-binds the shared `pca` name to a fresh three-component model.
pca = PCA(n_components=3)
old_X = pca.fit_transform(old_tval)
print(old_X)
In [182]:
# Variance shares of the most recently fitted PCA.
old_norm_t = pca.explained_variance_ratio_
# Combined share of the first two components.
first_two = old_norm_t[:2]
print('propotion:', sum(first_two))
In [183]:
# Two side-by-side 3D scatter plots of old_X (first three components), colored
# with the first 12 entries of `col` and of `col1` respectively.
fig = plt.figure(figsize=plt.figaspect(0.5))
# Both panels share the same coordinates, so extract them once.
xx = np.array(old_X[:, 0]).flatten()
yy = np.array(old_X[:, 1]).flatten()
zz = np.array(old_X[:, 2]).flatten()
panels = [
    (col[:12], "old_data colored by cell line"),
    (col1[:12], "old_data colored by data set"),
]
for position, (colors, title) in enumerate(panels, start=1):
    ax = fig.add_subplot(1, 2, position, projection='3d')
    # 'o' replaces the (5, 3) tuple marker: the (n, 3) tuple form drew a circle
    # and was deprecated in Matplotlib 3.0 in favour of 'o'.
    ax.scatter(xx, yy, zz, s=80, marker='o', c=colors)
    ax.set_title(title)
    ax.set_xlabel('X')
    ax.set_ylabel('Y')
    ax.set_zlabel('Z')
plt.show()
In [175]:
# Hand-entered 5 x 12 table of values; it is transposed before PCA in the next
# cell, so each of the 12 columns becomes one observation there.
_old_norm_rows = [
    [-5.5208, -4.7194, -4.6519, -4.4318, -4.1799, -4.1518, -4.1751, -3.8435, -4.7586, -4.1699, -4.1486, -4.0774],
    [-4.3988, -4.928, -4.9924, -5.4678, -4.7316, -4.995, -4.9981, -4.6228, -4.7522, -5.0204, -4.9562, -5.2052],
    [-7.5637, -6.9595, -6.922, -6.8069, -7.4017, -6.566, -6.5722, -6.6292, -7.5843, -6.8687, -6.9487, -6.8763],
    [-3.9471, -4.1793, -4.1793, -4.3026, -4.0355, -4.5807, -4.5234, -4.6506, -6.3696, -5.8215, -5.7081, -5.5366],
    [-7.5779, -7.7319, -7.5813, -7.4773, -7.5417, -7.6091, -7.4589, -7.5922, -7.6582, -7.6437, -7.6567, -7.6524],
]
old_norm_data = np.asarray(_old_norm_rows)
In [176]:
# PCA treats rows as observations, so flip old_norm_data before fitting.
old_norm_tval = old_norm_data.T
# Re-binds the shared `pca` name to a fresh three-component model.
pca = PCA(n_components=3)
old_norm_X = pca.fit_transform(old_norm_tval)
print(old_norm_X)
In [177]:
# Variance shares of the most recently fitted PCA.
old_n_t = pca.explained_variance_ratio_
# Combined share of the first two components.
first_two = old_n_t[:2]
print('propotion:', sum(first_two))
In [180]:
# Two side-by-side 3D scatter plots of old_norm_X (first three components),
# colored with the first 12 entries of `col` and of `col1` respectively.
fig = plt.figure(figsize=plt.figaspect(0.5))
# Both panels share the same coordinates, so extract them once.
xx = np.array(old_norm_X[:, 0]).flatten()
yy = np.array(old_norm_X[:, 1]).flatten()
zz = np.array(old_norm_X[:, 2]).flatten()
panels = [
    (col[:12], "old_norm_data colored by cell line"),
    (col1[:12], "old_norm_data colored by data set"),
]
for position, (colors, title) in enumerate(panels, start=1):
    ax = fig.add_subplot(1, 2, position, projection='3d')
    # 'o' replaces the (5, 3) tuple marker: the (n, 3) tuple form drew a circle
    # and was deprecated in Matplotlib 3.0 in favour of 'o'.
    ax.scatter(xx, yy, zz, s=80, marker='o', c=colors)
    ax.set_title(title)
    ax.set_xlabel('X')
    ax.set_ylabel('Y')
    ax.set_zlabel('Z')
plt.show()
In [185]:
# Read the spreadsheet at ../src/raw/ptest_data.xlsx and keep only its values
# as a NumPy array. `.values` is used because DataFrame.as_matrix() was
# deprecated and then removed from pandas.
ppth = Path('../').resolve().joinpath('src', 'raw', 'ptest_data.xlsx')
pp = pd.read_excel(ppth.as_posix())
pp = pp.values
pp
Out[185]:
In [187]:
# PCA treats rows as observations, so flip pp before fitting.
# NOTE(review): this re-uses the old_norm_tval / old_norm_X names for the
# spreadsheet data, replacing the values computed from old_norm_data.
old_norm_tval = pp.T
pca = PCA(n_components=3)
old_norm_X = pca.fit_transform(old_norm_tval)
print(old_norm_X)
In [188]:
# Two side-by-side 3D scatter plots of old_norm_X (which holds the spreadsheet
# PCA result at this point), each with its own hand-written list of 16 colors.
fig = plt.figure(figsize=plt.figaspect(0.5))
# Both panels share the same coordinates, so extract them once.
xx = np.array(old_norm_X[:, 0]).flatten()
yy = np.array(old_norm_X[:, 1]).flatten()
zz = np.array(old_norm_X[:, 2]).flatten()
panels = [
    (['r','r','r','r','g','g','g','b','b','b','c','c','m','m','g','g'],
     "ppdata colored by cell line"),
    (['r','r','b','b','r','g','b','r','g','b','r','b','r','b','g','g'],
     "ppdata colored by data set"),
]
for position, (colors, title) in enumerate(panels, start=1):
    ax = fig.add_subplot(1, 2, position, projection='3d')
    # 'o' replaces the (5, 3) tuple marker: the (n, 3) tuple form drew a circle
    # and was deprecated in Matplotlib 3.0 in favour of 'o'.
    ax.scatter(xx, yy, zz, s=80, marker='o', c=colors)
    ax.set_title(title)
    ax.set_xlabel('X')
    ax.set_ylabel('Y')
    ax.set_zlabel('Z')
plt.show()
In [ ]: