In [1]:
from __future__ import division
from glob import glob
import skimage.feature as ft
from skimage import data, color, exposure
from scipy import misc
import pandas as pd
import numpy as np
from time import time
import matplotlib.pyplot as plt
%matplotlib inline
In [2]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score,train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.decomposition import PCA
In [3]:
basedir = '../../BYindex/'
directories = [basedir + "cellsData/ClassAna",
               basedir + "cellsData/Celulas classificadas - ZeH/Curva de crescimento/0dias/08-07-2016 (Menk)",
               basedir + "cellsData/Celulas classificadas - ZeH/IM/16-09",
               basedir + "cellsData/SC_13-07-2016"]
features = pd.DataFrame()
for saveDir in directories:
    allFiles = glob(saveDir + "/*.csv")
    for file_ in allFiles:
        df = pd.read_csv(file_, index_col=None, header=0)
        df["file"] = df.file.apply(lambda x: saveDir + '/' + x)   # prepend the directory so image paths resolve later
        features = pd.concat([features, df], ignore_index=True)
In [4]:
print(features.head())
print(features.shape)
In [5]:
for classy in features['class'].unique():
    print('Number of items in class ' + str(classy) + ': ' + str(sum(features['class'] == classy)))
print('DROPPING DISALLOWED CLASSES\n...\n...\n...')
allowed_classes = ['unclassifiable', 'not a cell', 'interfase', 'mitose']
features = features[features['class'].apply(lambda x: x in allowed_classes)]
for classy in features['class'].unique():
    print('Number of items in class ' + str(classy) + ': ' + str(sum(features['class'] == classy)))
In [6]:
print(features.head())
In [7]:
%time features["photo"] = features.file.apply(lambda x: misc.imread(x))
print(features.photo[3].shape)
print(features.head())
In [8]:
%time features["photo"] = features.photo.apply(lambda x: x[:,:,-1])
print(features.photo[3].shape)
print(features.head())
In [9]:
features['shape'] = features['photo'].apply(np.shape)
is_100x100 = features['shape'].apply(lambda s: s == (100, 100))
print(np.sum(~is_100x100))   # number of crops that are not 100x100
features = features[is_100x100]
In [10]:
plt.imshow(features.photo[30])

def getHoG(image):
    # histogram of oriented gradients, flattened to a single row vector
    return ft.hog(image).reshape(1, -1)

def getLBP(image):
    # concatenated 50-bin histograms of uniform LBP codes over a grid of (P, R) settings
    hLBP = np.array([]).reshape(1, -1)
    for P in range(4, 20, 2):
        for R in range(2, 10, 2):
            LBP = ft.local_binary_pattern(image, P, R, 'uniform').reshape(-1, 1)
            hLBP = np.hstack((np.histogram(LBP, 50, range=(0, 50))[0].reshape(1, -1), hLBP))
    return hLBP[0]
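Both descriptors reduce a 100x100 crop to a fixed-length vector: HoG histograms local gradient orientations, while getLBP concatenates 50-bin histograms of uniform local-binary-pattern codes over 8 x 4 = 32 (P, R) settings. A minimal sketch of the resulting shapes on a synthetic image (the exact HoG length depends on skimage's default hog parameters):

import numpy as np
import skimage.feature as ft

dummy = np.random.rand(100, 100)
print(getHoG(dummy).shape)   # (1, n_hog): one row vector per image
print(getLBP(dummy).shape)   # (1600,): 32 settings x 50 bins, concatenated
print(ft.local_binary_pattern(dummy, 8, 2, 'uniform').max())   # uniform codes run from 0 to P+1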
In [11]:
%time features['HoG'] = features.photo.apply(getHoG)
%time features['LBP'] = features.photo.apply(getLBP)
In [12]:
from numpy.fft import fft2
def abs_pha_fft(image):
    # magnitude and phase of the 2-D Fourier transform, flattened to one row
    imfft = fft2(image)
    return np.hstack([np.abs(imfft).reshape(1, -1), np.angle(imfft).reshape(1, -1)])

features['fft'] = features.photo.apply(abs_pha_fft)
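abs_pha_fft turns each 100x100 crop into the magnitude and phase of its 2-D spectrum, so every image contributes 2 * 100 * 100 = 20000 FFT features. A quick sanity check, assuming the (100, 100) crops filtered above:

spec = abs_pha_fft(features.photo.iloc[0])
print(spec.shape)   # (1, 20000): 10000 magnitudes followed by 10000 phases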
In [13]:
photo_features = features.copy()   # all four classes: used for the good-photo task
phase_classes = ['interfase', 'mitose']
features = features[features['class'].apply(lambda x: x in phase_classes)]   # the phase task keeps only the two cell-cycle classes
LE = LabelEncoder()
LE.fit(phase_classes)
Out[13]:
In [14]:
def isGoodPhoto(label):
    # 1 if the crop shows an actual cell (either phase), 0 for junk and unclassifiable crops
    return 1 if label in ['interfase', 'mitose'] else 0

photo_features['class'] = photo_features['class'].apply(isGoodPhoto)
In [15]:
Phog = np.array(photo_features.HoG.tolist())[:, 0, :]   # HoG entries are stored as (1, n) arrays, hence the middle index
Plbp = np.array(photo_features.LBP.tolist())
pcaHoG = PCA()
pcaLBP = PCA()
Phog_pc = pcaHoG.fit_transform(Phog)
Plbp_pc = pcaLBP.fit_transform(Plbp)
In [16]:
Xhog = np.array(features.HoG.tolist())[:,0,:]
Xlbp = np.array(features.LBP.tolist())[:,:]
pcaHoG = PCA()
pcaLBP = PCA()
Xhog_pc = pcaHoG.fit_transform(Xhog)
Xlbp_pc = pcaLBP.fit_transform(Xlbp)
In [17]:
plt.plot(np.cumsum(pcaHoG.explained_variance_ratio_))
plt.ylabel('HoG Variance explained')
plt.xlabel('Number of components')
plt.title('PCA on HoG features')
Out[17]:
In [18]:
plt.plot(np.cumsum(pcaLBP.explained_variance_ratio_))
plt.ylabel('LBP Variance explained')
plt.xlabel('Number of components');plt.xlim([0,2000])
plt.title('PCA on LBP features')
Out[18]:
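The 1000 HoG and 500 LBP components kept below were presumably read off the two cumulative-variance curves above; a programmatic alternative (assuming a 95% variance target, which is an arbitrary choice here) is:

n_hog = int(np.searchsorted(np.cumsum(pcaHoG.explained_variance_ratio_), 0.95)) + 1
n_lbp = int(np.searchsorted(np.cumsum(pcaLBP.explained_variance_ratio_), 0.95)) + 1
print(n_hog, n_lbp)   # smallest component counts reaching 95% explained variance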
In [19]:
X_mult_pc = np.hstack([Xhog_pc[:,:1000],
Xlbp_pc[:,:500]])
P_mult_pc = np.hstack([Phog_pc[:,:1000],
Plbp_pc[:,:500]])
In [20]:
Yphoto = np.array(photo_features['class'].tolist())
Y = np.array(LE.transform(features['class']).tolist())
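LabelEncoder orders classes alphabetically, so 'interfase' maps to 0 and 'mitose' to 1; mitosis is therefore the positive class in all the F1 scores below. A quick way to confirm the mapping:

print(dict(zip(LE.classes_, LE.transform(LE.classes_))))   # {'interfase': 0, 'mitose': 1}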
In [23]:
from sklearn.metrics import f1_score
# five random baselines per task: an unbiased coin and a coin biased towards the negative class
randomGuess_phase = (np.random.random([5, Y.shape[0]]) > .5)
biasedGuess_phase = (np.random.random([5, Y.shape[0]]) > .8)
randomGuess_photo = (np.random.random([5, Yphoto.shape[0]]) > .5)
biasedGuess_photo = (np.random.random([5, Yphoto.shape[0]]) > .8)
#photo
scrPhotoBias = np.array([f1_score(Yphoto, y) for y in biasedGuess_photo])
scrPhotoRdn = np.array([f1_score(Yphoto, y) for y in randomGuess_photo])
#phase
scrBias = np.array([f1_score(Y, y) for y in biasedGuess_phase])
scrRdn = np.array([f1_score(Y, y) for y in randomGuess_phase])
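The hand-rolled baselines above estimate the F1 a no-information classifier achieves on each task. An equivalent, more standard sketch uses sklearn's DummyClassifier (here with the stratified strategy, which predicts labels at random according to the training class frequencies):

from sklearn.dummy import DummyClassifier
dummy = DummyClassifier(strategy='stratified')
print(cross_val_score(dummy, X_mult_pc, Y, cv=5, scoring='f1').mean())
print(cross_val_score(dummy, P_mult_pc, Yphoto, cv=5, scoring='f1').mean())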
In [24]:
print('              mean  std')
print('Phase bias    ' + str(scrBias.mean()) + '  ' + str(scrBias.std()))
print('Phase random  ' + str(scrRdn.mean()) + '  ' + str(scrRdn.std()))
print('Photo bias    ' + str(scrPhotoBias.mean()) + '  ' + str(scrPhotoBias.std()))
print('Photo random  ' + str(scrPhotoRdn.mean()) + '  ' + str(scrPhotoRdn.std()))
In [23]:
FFTphoto = np.array(photo_features.fft.tolist())[:,0,:]
FFT = np.array(features.fft.tolist())[:,0,:]
In [66]:
benchSVM = SVC()   # default RBF SVM as a benchmark on the raw FFT features
benchphase = cross_val_score(benchSVM, FFT, Y, cv=5, scoring='f1')
benchphoto = cross_val_score(benchSVM, FFTphoto, Yphoto, cv=5, scoring='f1')
In [68]:
print('Phase results:')
print(benchphase)
print(benchphase.mean())
print(benchphase.std())
print('\n\nPhoto results:')
print(benchphoto)
print(benchphoto.mean())
print(benchphoto.std())
In [112]:
classifiers = [
KNeighborsClassifier(),
SVC(),
LinearSVC(),
GaussianProcessClassifier(),
GaussianNB(),
DecisionTreeClassifier(),
RandomForestClassifier(),
LogisticRegression()
]
In [113]:
%%time
phasescores = []
for clf in classifiers:
    %time phasescores.append(cross_val_score(clf, X_mult_pc, Y, cv=5, scoring='f1'))
In [114]:
np.array(phasescores).mean(axis=1)
Out[114]:
In [115]:
np.array(phasescores).std(axis=1)
Out[115]:
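The two arrays above follow the order of the classifiers list, which is easy to lose track of; a small helper pairing each score with its classifier name (assuming phasescores still holds the eight five-fold arrays):

for clf, scr in zip(classifiers, phasescores):
    print('{:30s} {:.3f} +/- {:.3f}'.format(type(clf).__name__, scr.mean(), scr.std()))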
In [25]:
X_not_pc = np.hstack([Xhog[:,:],
Xlbp[:,:]])
In [127]:
classifiers = [
KNeighborsClassifier(),
SVC(),
LinearSVC(),
GaussianProcessClassifier(),
GaussianNB(),
DecisionTreeClassifier(),
RandomForestClassifier(),
LogisticRegression()
]
In [128]:
%%time
phasescores = []
for clf in classifiers:
    %time phasescores.append(cross_val_score(clf, X_not_pc, Y, cv=5, scoring='f1'))
In [129]:
np.array(phasescores).mean(axis=1)
Out[129]:
In [130]:
np.array(phasescores).std(axis=1)
Out[130]:
In [131]:
classifiers = [
KNeighborsClassifier(),
SVC(),
LinearSVC(),
GaussianProcessClassifier(),
GaussianNB(),
DecisionTreeClassifier(),
RandomForestClassifier(),
LogisticRegression()
]
In [166]:
%%time
photoscores = []
for clf in classifiers:
    %time photoscores.append(cross_val_score(clf, P_mult_pc, Yphoto, cv=3, scoring='f1'))
In [167]:
np.array(photoscores).mean(axis=1)
Out[167]:
In [168]:
np.array(photoscores).std(axis=1)
Out[168]:
In [169]:
P_not_pc = np.hstack([Phog[:,:],
Plbp[:,:]])
In [170]:
classifiers = [
KNeighborsClassifier(),
SVC(),
LinearSVC(),
GaussianProcessClassifier(),
GaussianNB(),
DecisionTreeClassifier(),
RandomForestClassifier(class_weight= 'balanced'),
LogisticRegression()
]
In [171]:
%%time
photoscores = []
for clf in classifiers:
    %time photoscores.append(cross_val_score(clf, P_not_pc, Yphoto, cv=5, scoring='f1'))
In [172]:
np.array(photoscores).mean(axis=1)
Out[172]:
In [173]:
np.array(photoscores).std(axis=1)
Out[173]:
In [196]:
params1 = {'criterion':['gini','entropy'], 'class_weight':[{0:1,1:1},'balanced'], 'min_samples_split':[2,10,20,40,80,160]}
clf1 = DecisionTreeClassifier()
params2 = {'C':[.01,.1,1,2,4,8,16], 'class_weight':[None,'balanced']}
clf2 = LogisticRegression()
grid1 = GridSearchCV(clf1,params1,cv=3,scoring='f1')
grid1.fit(X_mult_pc,Y)
grid2 = GridSearchCV(clf2,params2,cv=3,scoring='f1')
grid2.fit(X_mult_pc,Y)
print('Best parameters:')
print('Decision Tree:')
print(grid1.best_params_)
print('Logistic Regression:')
print(grid2.best_params_)
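best_params_ only reports the winning combination; the full grid lives in cv_results_, which is handy for checking how close the runners-up were. A sketch, reusing the pandas import from the top of the notebook:

res1 = pd.DataFrame(grid1.cv_results_)
cols = ['params', 'mean_test_score', 'std_test_score']
print(res1[cols].sort_values('mean_test_score', ascending=False).head())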
In [197]:
clf1 = grid1.best_estimator_
result1 = cross_val_score(clf1,X_mult_pc,Y,cv=5,scoring='f1')
clf2 = grid2.best_estimator_
result2 = cross_val_score(clf2,X_mult_pc,Y,cv=5,scoring='f1')
In [198]:
print(result1.mean())
print(result1.std())
print(result2.mean())
print(result2.std())
In [76]:
gamma = np.arange(0, 2, .1) * 1 / len(P_mult_pc[0])   # gamma grid as fractions of 1/n_features of the array fitted below
params1 = {'C':[.01,.1,1,2,4,8,16], 'gamma':gamma}
clf1 = SVC()
params2 = {'min_samples_split':[2,10,20,40,80,160],'criterion':['gini','entropy'],'max_depth':[2,3,4,5,6]}
clf2 = RandomForestClassifier(class_weight='balanced')
grid1 = GridSearchCV(clf1,params1,cv=5,scoring='f1')
grid1.fit(P_mult_pc,Yphoto)
grid2 = GridSearchCV(clf2,params2,cv=5,scoring='f1')
grid2.fit(P_mult_pc,Yphoto)
print('Best parameters:')
print('Support vector machine:')
print(grid1.best_params_)
print('Random Forest:')
print(grid2.best_params_)
In [27]:
clf1 = grid1.best_estimator_
clf2 = grid2.best_estimator_
In [28]:
result1 = cross_val_score(clf1,P_mult_pc,Yphoto,cv=5,scoring='f1')
result2 = cross_val_score(clf2,P_mult_pc,Yphoto,cv=5,scoring='f1')
In [29]:
print(result1.mean())
print(result1.std())
print(result2.mean())
print(result2.std())
In [30]:
gamma = np.arange(.5,7,.5)*1e-5
params = {'C':[.8,.9,1,1.1,1.2], 'gamma':gamma}
clf = SVC()
grid = GridSearchCV(clf,params,cv=5,scoring='f1')
grid.fit(P_mult_pc,Yphoto)
print('Best parameters:')
print(grid.best_params_)
In [39]:
clf = grid.best_estimator_
result = cross_val_score(clf,P_mult_pc,Yphoto,cv=5,scoring='f1')
print(result.mean())
print(result.std())
In [78]:
clf = SVC(C=1.1, gamma=1.5e-5)
result = cross_val_score(clf, P_not_pc, Yphoto, cv=5, scoring='f1')   # same SVM on the raw (non-PCA) HoG+LBP features
print(result)
print(result.mean())
print(result.std())
In [80]:
clf = SVC(C=1.1, gamma=1.5e-5)
Pfft = np.hstack([FFTphoto, P_not_pc])   # FFT magnitude/phase appended to the raw HoG+LBP features
result = cross_val_score(clf, Pfft, Yphoto, cv=5, scoring='f1')
print(result)
print(result.mean())
print(result.std())
In [201]:
params = {'C':[0.06,0.08,0.1,0.12,0.16,0.2,0.5]}
clf = LogisticRegression(class_weight='balanced')
grid = GridSearchCV(clf,params,cv=5,scoring='f1')
grid.fit(X_mult_pc,Y)
print('Best parameters:')
print(grid.best_params_)
In [202]:
clf = grid.best_estimator_
result = cross_val_score(clf,X_mult_pc,Y,cv=5,scoring='f1')
print(result.mean())
print(result.std())
In [203]:
clf = grid.best_estimator_
result = cross_val_score(clf,X_not_pc,Y,cv=5,scoring='f1')
print(result)
print(result.mean())
print(result.std())
In [48]:
Xfft = np.hstack([FFT, X_not_pc])   # FFT magnitude/phase appended to the raw HoG+LBP features, phase task
In [207]:
clf = grid.best_estimator_
result = cross_val_score(clf,Xfft,Y,cv=5,scoring='f1')
print(result)
print(result.mean())
print(result.std())
In [208]:
params = {'C':[0.16,0.3,0.5,1,2]}
clf = LogisticRegression(class_weight='balanced')
grid = GridSearchCV(clf,params,cv=5,scoring='f1')
grid.fit(Xfft,Y)
print('Best parameters:')
print(grid.best_params_)
In [209]:
clf = grid.best_estimator_
%time result = cross_val_score(clf,Xfft,Y,cv=5,scoring='f1')
print(result.mean())
print(result.std())
In [56]:
params = {'C':[2,3,4]}
clf = LogisticRegression(class_weight='balanced')
grid = GridSearchCV(clf,params,cv=5,scoring='f1')
grid.fit(Xfft,Y)
print('Best parameters:')
print(grid.best_params_)
In [57]:
clf = grid.best_estimator_
%time result = cross_val_score(clf,Xfft,Y,cv=5,scoring='f1')
print(result.mean())
print(result.std())
In [58]:
clf = grid.best_estimator_
# crude ablation: zero out successive 2000-column blocks; the first ten blocks exactly cover the 20000 FFT columns of Xfft
for i in range(10):
    Xrob = Xfft.copy()
    Xrob[:, 2000*i:2000*(i+1)] = 0
    result = cross_val_score(clf, Xrob, Y, cv=3, scoring='f1')
    print(result.mean())
    print(result.std())
In [105]:
clf = grid.best_estimator_
clf.fit(Xfft, Y)
coefs = clf.coef_   # shape (1, n_features) for binary logistic regression
In [106]:
sortidx = np.argsort(np.abs(coefs))[:, ::-1]   # descending, so the most heavily weighted features come first
print(sortidx[:, :2])
ids = sortidx[0, [0, 2]]
ids
Out[106]:
In [115]:
ids = sortidx[0,[1,5]]
toPlotX = Xfft[:,ids]
mitX = toPlotX[Y==1]
intX = toPlotX[Y==0]
plt.title('Most weighted features')
plt.plot(mitX[:,0],mitX[:,1],'o',label='Mitosis')
plt.plot(intX[:,0],intX[:,1],'x',label='Interphase')
plt.xlabel('Feature 25903')
plt.ylabel('Feature 27308')
plt.legend()
Out[115]:
In [24]:
ids = sortidx[0,[3,40]]
toPlotX = Xfft[:,ids]
mitX = toPlotX[Y==1]
intX = toPlotX[Y==0]
plt.plot(mitX[:,0],mitX[:,1],'o')
plt.plot(intX[:,0],intX[:,1],'x')
Out[24]:
In [29]:
ids = sortidx[0,[0,2]]
toPlotX = Xfft[:,ids]
mitX = toPlotX[Y==1]
intX = toPlotX[Y==0]
plt.title('Closer look at most weighted')
plt.plot(mitX[:,0],mitX[:,1],'o',label='Mitosis')
plt.plot(intX[:,0],intX[:,1],'x',label='Interphase')
plt.xlabel('Feature 25903'); plt.xlim([0,500])
plt.ylabel('Feature 27308'); plt.ylim([0,500])
plt.legend()
Out[29]:
In [34]:
Xtrain, Xtest, Ytrain, Ytest = train_test_split(Xfft, Y)
clf = LogisticRegression(class_weight='balanced', C=3.8)
clf.fit(Xtrain, Ytrain)
coefs = clf.coef_
sortidx = np.argsort(coefs)   # ascending by signed weight: the ends hold the most negative and most positive coefficients
print(sortidx[:, :2])
In [35]:
Ypred = clf.predict(Xtest)
In [55]:
intX.shape
Out[55]:
In [92]:
ids = sortidx[0, [0, -1]]   # most negative and most positive coefficients
mitX = Xtest[Ytest==1]; mitX = mitX[:,ids]
intX = Xtest[Ytest==0]; intX = intX[:,ids]
mitrX = Xtest[np.logical_and(Ypred==1, Ytest==1), :]; mitrX = mitrX[:,ids]   # true positives: mitosis classified as mitosis
mitwX = Xtest[np.logical_and(Ypred==0, Ytest==1), :]; mitwX = mitwX[:,ids]   # false negatives: mitosis missed
intrX = Xtest[np.logical_and(Ypred==0, Ytest==0), :]; intrX = intrX[:,ids]   # true negatives: interphase kept as interphase
intwX = Xtest[np.logical_and(Ypred==1, Ytest==0), :]; intwX = intwX[:,ids]   # false positives: interphase flagged as mitosis
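The four groups above are exactly the cells of the confusion matrix with mitosis as the positive class; confusion_matrix gives the same counts directly, which is a quick cross-check on the split:

from sklearn.metrics import confusion_matrix
print(confusion_matrix(Ytest, Ypred))   # rows: true class (0=interphase, 1=mitosis); columns: predicted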
In [93]:
mitwX.shape
Out[93]:
In [94]:
plt.title('Most relevant features')
plt.plot(mitrX[:,0],mitrX[:,1],'o',color='blue',label='Right Mitosis')
plt.plot(mitwX[:,0],mitwX[:,1],'o',color='red',label='Wrong Mitosis')
plt.plot(intrX[:,0],intrX[:,1],'x',color='blue',label='Right Interphase')
plt.plot(intwX[:,0],intwX[:,1],'x',color='red',label='Wrong Interphase')
plt.xlabel('Feature 26699')
plt.ylabel('Feature 18177')
plt.legend()
Out[94]:
In [95]:
plt.title('Closer look at most relevant')
plt.plot(mitrX[:,0],mitrX[:,1],'o',color='blue',label='Right Mitosis')
plt.plot(mitwX[:,0],mitwX[:,1],'o',color='red',label='Wrong Mitosis')
plt.plot(intrX[:,0],intrX[:,1],'x',color='blue',label='Right Interphase')
plt.plot(intwX[:,0],intwX[:,1],'x',color='red',label='Wrong Interphase')
plt.xlabel('Feature 26699')
plt.ylabel('Feature 18177')
plt.xlim([0,2000])
plt.ylim([0,2000])
plt.legend()
Out[95]: