In [1]:
from __future__ import division
from glob import glob
import skimage.feature as ft
from skimage import data, color, exposure
from scipy import misc

import pandas as pd
import numpy as np

from time import time

import matplotlib.pyplot as plt

%matplotlib inline


/home/estevao/py2_kernel/local/lib/python2.7/site-packages/matplotlib/font_manager.py:273: UserWarning: Matplotlib is building the font cache using fc-list. This may take a moment.
  warnings.warn('Matplotlib is building the font cache using fc-list. This may take a moment.')

In [2]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import cross_val_score,train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.decomposition import PCA

In [3]:
basedir = '../../BYindex/'

directories=[basedir+"cellsData/ClassAna", 
             basedir+"cellsData/Celulas classificadas - ZeH/Curva de crescimento/0dias/08-07-2016 (Menk)",
             basedir+"cellsData/Celulas classificadas - ZeH/IM/16-09",
             basedir+"cellsData/SC_13-07-2016"]

 
features = pd.DataFrame()
for saveDir in directories:
    allFiles = glob(saveDir+"/*.csv")
    for file_ in allFiles:
        df = pd.read_csv(file_,index_col=None, header=0)
        df["file"] = df.file.apply(lambda x: saveDir +'/'+ x)
        features = pd.concat([features,df],ignore_index=True)
features

In [4]:
print(features.head())
print(features.shape)


                                                file           class
0  ../../BYindex/cellsData/ClassAna/BY2_5dias (61...  unclassifiable
1  ../../BYindex/cellsData/ClassAna/BY2_5dias (61...      not a cell
2  ../../BYindex/cellsData/ClassAna/BY2_5dias (61...       interfase
3  ../../BYindex/cellsData/ClassAna/BY2_5dias (88...       interfase
4  ../../BYindex/cellsData/ClassAna/BY2_5dias (88...       interfase
(2673, 2)

Preprocessing


In [5]:
for classy in features['class'].unique():
    print('Number of items considered '+str(classy)+':  ' +str(sum(features['class']==classy) ) )

    
print('DELETING UNALLOWED CLASSES\n...\n...\n...')
allowed_classes = ['unclassifiable','not a cell','interfase','mitose']

features = features[features['class'].apply(lambda x: x in allowed_classes)]

for classy in features['class'].unique():
    print('Number of items considered '+str(classy)+':  ' +str(sum(features['class']==classy) ) )


Number of items considered unclassifiable:  52
Number of items considered not a cell:  50
Number of items considered interfase:  1358
Number of items considered unknown:  753
Number of items considered mitose:  456
Number of items considered BY2_5dias (98)i:  1
Number of items considered 15:  1
Number of items considered BY2_5dias (66)i:  1
Number of items considered 28:  1
DELETING UNALLOWED CLASSES
...
...
...
Number of items considered unclassifiable:  52
Number of items considered not a cell:  50
Number of items considered interfase:  1358
Number of items considered mitose:  456

In [6]:
print(features.head())


                                                file           class
0  ../../BYindex/cellsData/ClassAna/BY2_5dias (61...  unclassifiable
1  ../../BYindex/cellsData/ClassAna/BY2_5dias (61...      not a cell
2  ../../BYindex/cellsData/ClassAna/BY2_5dias (61...       interfase
3  ../../BYindex/cellsData/ClassAna/BY2_5dias (88...       interfase
4  ../../BYindex/cellsData/ClassAna/BY2_5dias (88...       interfase

In [7]:
%time features["photo"] = features.file.apply(lambda x: misc.imread(x))
print(features.photo[3].shape)
print(features.head())


CPU times: user 1.34 s, sys: 108 ms, total: 1.45 s
Wall time: 3.82 s
(100, 100, 3)
                                                file           class  \
0  ../../BYindex/cellsData/ClassAna/BY2_5dias (61...  unclassifiable   
1  ../../BYindex/cellsData/ClassAna/BY2_5dias (61...      not a cell   
2  ../../BYindex/cellsData/ClassAna/BY2_5dias (61...       interfase   
3  ../../BYindex/cellsData/ClassAna/BY2_5dias (88...       interfase   
4  ../../BYindex/cellsData/ClassAna/BY2_5dias (88...       interfase   

                                               photo  
0  [[[0, 12, 36], [0, 13, 36], [0, 13, 37], [0, 1...  
1  [[[0, 8, 29], [0, 9, 30], [0, 10, 31], [0, 9, ...  
2  [[[0, 10, 30], [0, 11, 31], [0, 11, 32], [0, 1...  
3  [[[0, 5, 10], [0, 5, 10], [0, 5, 11], [0, 5, 1...  
4  [[[1, 5, 11], [1, 5, 11], [0, 5, 11], [1, 5, 1...  

In [8]:
%time features["photo"] = features.photo.apply(lambda x: x[:,:,-1])
print(features.photo[3].shape)
print(features.head())


CPU times: user 4 ms, sys: 0 ns, total: 4 ms
Wall time: 3.05 ms
(100, 100)
                                                file           class  \
0  ../../BYindex/cellsData/ClassAna/BY2_5dias (61...  unclassifiable   
1  ../../BYindex/cellsData/ClassAna/BY2_5dias (61...      not a cell   
2  ../../BYindex/cellsData/ClassAna/BY2_5dias (61...       interfase   
3  ../../BYindex/cellsData/ClassAna/BY2_5dias (88...       interfase   
4  ../../BYindex/cellsData/ClassAna/BY2_5dias (88...       interfase   

                                               photo  
0  [[36, 36, 37, 37, 37, 37, 38, 39, 39, 40, 41, ...  
1  [[29, 30, 31, 31, 32, 33, 33, 33, 34, 35, 36, ...  
2  [[30, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, ...  
3  [[10, 10, 11, 11, 11, 11, 12, 12, 12, 12, 12, ...  
4  [[11, 11, 11, 11, 11, 11, 11, 11, 11, 10, 10, ...  

In [9]:
features['shape'] = features['photo'].apply(np.shape)
print(np.sum(features['shape']!= (100,100)))
features = features[features['shape']== (100,100)]


1

In [10]:
plt.imshow(features.photo[30])

def getHoG(image):
    HOG = ft.hog(image).reshape(1,-1)
    return HOG

def getLBP(image):
    hLBP = np.array([]).reshape(1,-1)
    for P in range(4,20,2):
        for R in range(2,10,2):
            LBP = ft.local_binary_pattern(image, P, R, 'uniform').reshape(-1,1)
            hLBP = np.hstack((np.histogram(LBP,50,range=(0,50))[0].reshape(1,-1), hLBP))

    return hLBP[0]



In [11]:
%time features['HoG'] = features.photo.apply(getHoG)
%time features['LBP'] = features.photo.apply(getLBP)


CPU times: user 4.56 s, sys: 48 ms, total: 4.6 s
Wall time: 4.58 s
CPU times: user 2min 24s, sys: 44 ms, total: 2min 24s
Wall time: 2min 24s

In [12]:
from numpy.fft import fft2
def abs_pha_fft(image):
    imfft = fft2(image)
    return np.hstack([np.abs(imfft).reshape(1,-1), np.angle(imfft).reshape(1,-1)])
features['fft'] = features.photo.apply(abs_pha_fft)

In [13]:
photo_features = features.copy()

phase_classes = ['interfase','mitose']
features = features[features['class'].apply(lambda x: x in phase_classes)]

LE = LabelEncoder()
LE.fit(phase_classes)


Out[13]:
LabelEncoder()

In [14]:
def isGoodPhoto(label):
    if label in ['interfase','mitose']: return 1
    else: return 0

photo_features['class'] = photo_features['class'].apply(isGoodPhoto)

PCA


In [15]:
Phog = np.array(photo_features.HoG.tolist())[:,0,:]
Plbp = np.array(photo_features.LBP.tolist())[:,:]

pcaHoG = PCA()
pcaLBP = PCA()

Phog_pc = pcaHoG.fit_transform(Phog)
Plbp_pc  = pcaLBP.fit_transform(Plbp)

In [16]:
Xhog = np.array(features.HoG.tolist())[:,0,:]
Xlbp = np.array(features.LBP.tolist())[:,:]

pcaHoG = PCA()
pcaLBP = PCA()

Xhog_pc = pcaHoG.fit_transform(Xhog)
Xlbp_pc  = pcaLBP.fit_transform(Xlbp)

In [17]:
plt.plot(np.cumsum(pcaHoG.explained_variance_ratio_))
plt.ylabel('HoG Variance explained')
plt.xlabel('Number of components')
plt.title('PCA on HoG features')


Out[17]:
<matplotlib.text.Text at 0x7f3c7f77da50>

In [18]:
plt.plot(np.cumsum(pcaLBP.explained_variance_ratio_))
plt.ylabel('LBP Variance explained')
plt.xlabel('Number of components');plt.xlim([0,2000])
plt.title('PCA on LBP features')


Out[18]:
<matplotlib.text.Text at 0x7f3c7f6e4c90>

In [19]:
X_mult_pc = np.hstack([Xhog_pc[:,:1000],
                    Xlbp_pc[:,:500]])
P_mult_pc = np.hstack([Phog_pc[:,:1000],
                    Plbp_pc[:,:500]])

In [20]:
Yphoto = np.array(photo_features['class'].tolist())
Y = np.array(LE.transform(features['class']).tolist())

Implementation

All classifiers testing

Benchmarks


In [23]:
from sklearn.metrics import f1_score
randomGuess_phase = (np.random.random([5,X.shape[0]])>.5)
biasedGuess_phase = (np.random.random([5,X.shape[0]])>.8)
randomGuess_photo = (np.random.random([5,Xphoto.shape[0]])>.5)
biasedGuess_photo = (np.random.random([5,Xphoto.shape[0]])>.8)


#photo
scrPhotoBias=np.array([f1_score(y, Yphoto) for y in biasedGuess_photo ])
scrPhotoRdn =np.array([f1_score(y, Yphoto) for y in randomGuess_photo ])

#phase
scrBias=np.array([f1_score(y, Y) for y in biasedGuess_phase ])
scrRdn =np.array([f1_score(y, Y) for y in randomGuess_phase ])

In [24]:
print '                      mean                   std'
print 'Phase bias     ' + str(scrBias.mean())+ '      '+ str(scrBias.std())
print 'Phase random   ' + str(scrRdn.mean())+ '      '+ str(scrRdn.std())
print 'Photo bias     ' + str(scrPhotoBias.mean())+ '      '+ str(scrPhotoBias.std())
print 'Photo random   ' + str(scrPhotoRdn.mean()) + '      '+ str(scrPhotoRdn.std())


                      mean                   std
Phase bias     0.226352906371      0.00602583984541
Phase random   0.333974147558      0.00503161832731
Photo bias     0.332729815228      0.0103044383537
Photo random   0.65821026927      0.00890418356653

FFT


In [23]:
FFTphoto = np.array(photo_features.fft.tolist())[:,0,:]
FFT = np.array(features.fft.tolist())[:,0,:]

In [66]:
benchSVM = SVC()
benchphase=[]
benchphoto=[]
benchphase = cross_val_score(benchSVM, FFT, Y, cv=5, scoring='f1')
benchphoto = cross_val_score(benchSVM, FFTphoto, Yphoto, cv=5, scoring='f1')

In [68]:
print 'Phase results:'
print benchphase
print benchphase.mean()
print benchphase.std()
print '\n\nPhoto Results:'
print benchphoto
print benchphoto.mean()
print benchphoto.std()


Phase results:
[ 0.62666667  0.35897436  0.40350877  0.40350877  0.38938053]
0.436407820095
0.0965097199896


Photo Results:
[ 0.98231293  0.98507463  0.97580645  0.98092643  0.98499318]
0.981822722577
0.00340193556651

Phase classifier

With PCA


In [112]:
classifiers = [
    KNeighborsClassifier(),
    SVC(),
    LinearSVC(),
    GaussianProcessClassifier(),
    GaussianNB(),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    LogisticRegression()
]

In [113]:
%%time
phasescores=[]
for clf in classifiers:
    %time phasescores.append(cross_val_score(clf, X_mult_pc, Y, cv=5, scoring='f1'))


CPU times: user 2.81 s, sys: 0 ns, total: 2.81 s
Wall time: 2.81 s
CPU times: user 24 s, sys: 12 ms, total: 24 s
Wall time: 24 s
CPU times: user 26.4 s, sys: 1.17 s, total: 27.6 s
Wall time: 26.2 s
CPU times: user 23.9 s, sys: 10.9 s, total: 34.9 s
Wall time: 18.7 s
CPU times: user 404 ms, sys: 304 ms, total: 708 ms
Wall time: 366 ms
CPU times: user 14.2 s, sys: 0 ns, total: 14.2 s
Wall time: 14.1 s
CPU times: user 1.77 s, sys: 0 ns, total: 1.77 s
Wall time: 1.77 s
CPU times: user 2min 53s, sys: 1.29 s, total: 2min 55s
Wall time: 2min 53s
CPU times: user 4min 27s, sys: 13.7 s, total: 4min 41s
Wall time: 4min 21s

In [114]:
np.array(phasescores).mean(axis=1)


Out[114]:
array([ 0.47481809,  0.43640782,  0.54962273,  0.43640782,  0.46497331,
        0.56110927,  0.42505499,  0.64376917])

In [115]:
np.array(phasescores).std(axis=1)


Out[115]:
array([ 0.09671241,  0.09650972,  0.06542592,  0.09650972,  0.10815983,
        0.07579519,  0.06583178,  0.08022015])

Without PCA


In [25]:
X_not_pc = np.hstack([Xhog[:,:],
                    Xlbp[:,:]])

In [127]:
classifiers = [
    KNeighborsClassifier(),
    SVC(),
    LinearSVC(),
    GaussianProcessClassifier(),
    GaussianNB(),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    LogisticRegression()
]

In [128]:
%%time
phasescores=[]
for clf in classifiers:
    %time phasescores.append(cross_val_score(clf, X_not_pc, Y, cv=5, scoring='f1'))


CPU times: user 27.7 s, sys: 144 ms, total: 27.9 s
Wall time: 27.8 s
CPU times: user 2min 22s, sys: 248 ms, total: 2min 22s
Wall time: 2min 22s
CPU times: user 1min 38s, sys: 1.46 s, total: 1min 40s
Wall time: 1min 38s
CPU times: user 1min 59s, sys: 11.4 s, total: 2min 10s
Wall time: 1min 55s
CPU times: user 1.82 s, sys: 452 ms, total: 2.28 s
Wall time: 2.12 s
CPU times: user 55 s, sys: 88 ms, total: 55.1 s
Wall time: 55.1 s
CPU times: user 3.37 s, sys: 56 ms, total: 3.42 s
Wall time: 3.42 s
CPU times: user 4min 25s, sys: 1.56 s, total: 4min 26s
Wall time: 4min 25s
CPU times: user 11min 53s, sys: 15.4 s, total: 12min 8s
Wall time: 11min 49s

In [129]:
np.array(phasescores).mean(axis=1)


Out[129]:
array([ 0.47629376,  0.43640782,  0.62597124,  0.43640782,  0.50460824,
        0.60708125,  0.53671735,  0.67267363])

In [130]:
np.array(phasescores).std(axis=1)


Out[130]:
array([ 0.09795529,  0.09650972,  0.1042698 ,  0.09650972,  0.04498977,
        0.0466631 ,  0.08377888,  0.04815626])

Photo classifier

With PCA


In [131]:
classifiers = [
    KNeighborsClassifier(),
    SVC(),
    LinearSVC(),
    GaussianProcessClassifier(),
    GaussianNB(),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    LogisticRegression()
]

In [166]:
%%time
photoscores=[]
for clf in classifiers:
    %time photoscores.append(cross_val_score(clf, P_mult_pc, Yphoto, cv=3,scoring='f1'))


CPU times: user 2.64 s, sys: 0 ns, total: 2.64 s
Wall time: 2.64 s
CPU times: user 12.7 s, sys: 0 ns, total: 12.7 s
Wall time: 12.6 s
CPU times: user 15.6 s, sys: 652 ms, total: 16.2 s
Wall time: 15.6 s
CPU times: user 13.2 s, sys: 6.03 s, total: 19.2 s
Wall time: 11.3 s
CPU times: user 288 ms, sys: 220 ms, total: 508 ms
Wall time: 279 ms
CPU times: user 12.1 s, sys: 16 ms, total: 12.1 s
Wall time: 12.1 s
CPU times: user 720 ms, sys: 16 ms, total: 736 ms
Wall time: 737 ms
CPU times: user 1min 47s, sys: 888 ms, total: 1min 48s
Wall time: 1min 47s
CPU times: user 2min 44s, sys: 7.83 s, total: 2min 52s
Wall time: 2min 42s

In [167]:
np.array(photoscores).mean(axis=1)


Out[167]:
array([ 0.97425592,  0.98181025,  0.59257811,  0.47882438,  0.97691859,
        0.9667074 ,  0.98155403,  0.8259437 ])

In [168]:
np.array(photoscores).std(axis=1)


Out[168]:
array([ 0.0017507 ,  0.00135777,  0.06328935,  0.17840869,  0.00216439,
        0.00838531,  0.00164513,  0.19315872])

Without PCA


In [169]:
P_not_pc = np.hstack([Phog[:,:],
                    Plbp[:,:]])

In [170]:
classifiers = [
    KNeighborsClassifier(),
    SVC(),
    LinearSVC(),
    GaussianProcessClassifier(),
    GaussianNB(),
    DecisionTreeClassifier(),
    RandomForestClassifier(class_weight= 'balanced'),
    LogisticRegression()
]

In [171]:
%%time
photoscores=[]
for clf in classifiers:
    %time photoscores.append(cross_val_score(clf, P_not_pc, Yphoto, cv=5,scoring='f1'))


CPU times: user 30.5 s, sys: 408 ms, total: 30.9 s
Wall time: 30.8 s
CPU times: user 2min 39s, sys: 864 ms, total: 2min 40s
Wall time: 2min 40s
CPU times: user 1min 10s, sys: 1.41 s, total: 1min 12s
Wall time: 1min 10s
CPU times: user 2min 8s, sys: 14.6 s, total: 2min 23s
Wall time: 2min 5s
CPU times: user 1.76 s, sys: 380 ms, total: 2.14 s
Wall time: 1.9 s
CPU times: user 1min 47s, sys: 132 ms, total: 1min 48s
Wall time: 1min 47s
CPU times: user 2.44 s, sys: 68 ms, total: 2.51 s
Wall time: 2.51 s
CPU times: user 1min 15s, sys: 1.26 s, total: 1min 16s
Wall time: 1min 14s
CPU times: user 9min 36s, sys: 19.2 s, total: 9min 55s
Wall time: 9min 34s

In [172]:
np.array(photoscores).mean(axis=1)


Out[172]:
array([ 0.97587275,  0.98182272,  0.94902283,  0.47874962,  0.96028892,
        0.9696855 ,  0.98128742,  0.97548319])

In [173]:
np.array(photoscores).std(axis=1)


Out[173]:
array([ 0.00160194,  0.00340194,  0.06715102,  0.1786438 ,  0.00792675,
        0.01200587,  0.00339384,  0.00507288])

Grid searching

Phase classifier

Support Vector Machine

In [196]:
params1 = {'criterion':['gini','entropy'], 'class_weight':[{0:1,1:1},'balanced'], 'min_samples_split':[2,10,20,40,80,160]}
clf1 = DecisionTreeClassifier()

params2 = {'C':[.01,.1,1,2,4,8,16], 'class_weight':[None,'balanced']}
clf2 = LogisticRegression()


grid1 = GridSearchCV(clf1,params1,cv=3,scoring='f1')
grid1.fit(X_mult_pc,Y)

grid2 = GridSearchCV(clf2,params2,cv=3,scoring='f1')
grid2.fit(X_mult_pc,Y)

print('Melhores parametros: ') 
print('Decision Tree:')
print(grid1.best_params_)
print('Logistic Regression:')
print(grid2.best_params_)


Melhores parametros: 
Support vector machine:
{'min_samples_split': 2, 'criterion': 'entropy', 'class_weight': 'balanced'}
Logistic Regression:
{'C': 0.1, 'class_weight': 'balanced'}

In [197]:
clf1 = grid1.best_estimator_
result1 = cross_val_score(clf1,X_mult_pc,Y,cv=5,scoring='f1')

clf2 = grid2.best_estimator_
result2 = cross_val_score(clf2,X_mult_pc,Y,cv=5,scoring='f1')

In [198]:
print(result1.mean())
print(result1.std())

print(result2.mean())
print(result2.std())


0.564498583599
0.0382862250387
0.644928144692
0.0728574422891

Photo classifier


In [76]:
gamma = np.arange(0,2,.1)*1/len(Xpc[0])
params1 = {'C':[.01,.1,1,2,4,8,16], 'gamma':gamma}
clf1 = SVC()

params2 = {'min_samples_split':[2,10,20,40,80,160],'criterion':['gini','entropy'],'max_depth':[2,3,4,5,6]}
clf2 = RandomForestClassifier(class_weight='balanced')


grid1 = GridSearchCV(clf1,params1,cv=5,scoring='f1')
grid1.fit(P_mult_pc,Yphoto)

grid2 = GridSearchCV(clf2,params2,cv=5,scoring='f1')
grid2.fit(P_mult_pc,Yphoto)

print('Melhores parametros: ') 
print('Support vector machine:')
print(grid1.best_params_)
print('Random Forest:')
print(grid2.best_params_)


Melhores parametros: 
Support vector machine:
{'C': 1, 'gamma': 6.666666666666667e-05}
Gaussian Process:
{'min_samples_split': 2, 'criterion': 'gini', 'max_depth': 6}

In [27]:
clf1 = grid1.best_estimator_
clf2 = grid2.best_estimator_

In [28]:
result1 = cross_val_score(clf1,P_mult_pc,Yphoto,cv=5,scoring='f1')
result2 = cross_val_score(clf2,P_mult_pc,Yphoto,cv=5,scoring='f1')

In [29]:
print(result1.mean())
print(result1.std())

print(result2.mean())
print(result2.std())


0.981822722577
0.00340193556651
0.98046510094
0.0029733429454

Refinement

Photo


In [30]:
gamma = np.arange(.5,7,.5)*1e-5
params = {'C':[.8,.9,1,1.1,1.2], 'gamma':gamma}
clf = SVC()

grid = GridSearchCV(clf,params,cv=5,scoring='f1')
grid.fit(Xphoto_pc,Yphoto)

print('Melhores parametros: ') 
print(grid.best_params_)


Melhores parametros: 
{'C': 1.1, 'gamma': 1.5000000000000002e-05}

In [39]:
clf = grid.best_estimator_
result = cross_val_score(clf,Xphoto_pc,Yphoto,cv=5,scoring='f1')
print(result.mean())
print(result.std())


0.982099291193
0.00369226539286
All features

In [78]:
clf = SVC(C=1.1,gamma=1.5e-5)
result = cross_val_score(clf,Xphoto,Yphoto,cv=5,scoring='f1')
print(result)
print(result.mean())
print(result.std())


[ 0.98231293  0.98507463  0.97580645  0.98092643  0.98499318]
0.981822722577
0.00340193556651

In [80]:
clf = SVC(C=1.1,gamma=1.5e-5)
result = cross_val_score(clf,Xphotofft,Yphoto,cv=5,scoring='f1')
print(result)
print(result.mean())
print(result.std())


[ 0.98231293  0.98507463  0.97580645  0.98092643  0.98499318]
0.981822722577
0.00340193556651

Phase


In [201]:
params = {'C':[0.06,0.08,0.1,0.12,0.16,0.2,0.5]}
clf = LogisticRegression(class_weight='balanced')

grid = GridSearchCV(clf,params,cv=5,scoring='f1')
grid.fit(X_mult_pc,Y)

print('Melhores parametros: ')
print(grid.best_params_)


Melhores parametros: 
{'C': 0.16}

In [202]:
clf = grid.best_estimator_
result = cross_val_score(clf,X_mult_pc,Y,cv=5,scoring='f1')
print(result.mean())
print(result.std())


0.648686795335
0.0745223935243
All Features

In [203]:
clf = grid.best_estimator_
result = cross_val_score(clf,X_not_pc,Y,cv=5,scoring='f1')
print(result)
print(result.mean())
print(result.std())


[ 0.74166667  0.65945946  0.58064516  0.70056497  0.69565217]
0.675597686616
0.0541610235394

In [48]:
Xfft = np.hstack([FFT,X_not_pc])

In [207]:
clf = grid.best_estimator_
result = cross_val_score(clf,Xfft,Y,cv=5,scoring='f1')
print(result)
print(result.mean())
print(result.std())


[ 0.73777778  0.72131148  0.67039106  0.72189349  0.71515152]
0.713305064183
0.0227235491078

In [208]:
params = {'C':[0.16,0.3,0.5,1,2]}
clf = LogisticRegression(class_weight='balanced')

grid = GridSearchCV(clf,params,cv=5,scoring='f1')
grid.fit(Xfft,Y)

print('Melhores parametros: ')
print(grid.best_params_)


Melhores parametros: 
{'C': 2}

In [209]:
clf = grid.best_estimator_
%time result = cross_val_score(clf,Xfft,Y,cv=5,scoring='f1')
print(result.mean())
print(result.std())


CPU times: user 13min, sys: 2.58 s, total: 13min 3s
Wall time: 13min 1s
0.713728199846
0.0210488112225

In [56]:
params = {'C':[2,3,4]}
clf = LogisticRegression(class_weight='balanced')

grid = GridSearchCV(clf,params,cv=5,scoring='f1')
grid.fit(Xfft,Y)

print('Melhores parametros: ')
print(grid.best_params_)


Melhores parametros: 
{'C': 2}

Robustness


In [57]:
clf = grid.best_estimator_
%time result = cross_val_score(clf,Xfft,Y,cv=5,scoring='f1')
print(result.mean())
print(result.std())


CPU times: user 12min 56s, sys: 2.74 s, total: 12min 59s
Wall time: 12min 58s
0.713728199846
0.0210488112225

In [58]:
clf = grid.best_estimator_
for i in range(10):
    Xrob = Xfft.copy()
    Xrob[:,2000*i:2000*(i+1)]=0
    result = cross_val_score(clf,Xrob,Y,cv=3,scoring='f1')
    print(result.mean())
    print(result.std())


0.66741914529
0.0643410046319
0.667170838251
0.0618388288775
0.668329249407
0.0645646277901
0.673036950028
0.0618707478628
0.673858084925
0.0548163346545
0.673392006725
0.0561240799136
0.671890330046
0.0592894964658
0.672055715534
0.0578915330375
0.671553005053
0.0582784910189
0.672142553389
0.0549266484603

Free Visualization


In [105]:
clf = grid.best_estimator_
clf.fit(Xfft,Y)
coefs = clf.coef_


[[28100  5132  3120 ..., 26608 27191 19109]]

In [106]:
sortidx = np.argsort(np.abs(coefs))
print(sortidx[:2])
ids = sortidx[0,[0,2]]
ids


[[28100  5132  3120 ..., 26608 27191 19109]]
Out[106]:
array([28100,  3120])

In [115]:
ids = sortidx[0,[1,5]]
toPlotX = Xfft[:,ids]
mitX = toPlotX[Y==1]
intX = toPlotX[Y==0]
plt.title('Most weighted features')
plt.plot(mitX[:,0],mitX[:,1],'o',label='Mitosis')
plt.plot(intX[:,0],intX[:,1],'x',label='Interphasis')
plt.xlabel('Feature 25903')
plt.ylabel('Feature 27308')
plt.legend()


Out[115]:
<matplotlib.legend.Legend at 0x7f28e07ae250>

In [24]:
ids = sortidx[0,[3,40]]
toPlotX = Xfft[:,ids]
mitX = toPlotX[Y==1]
intX = toPlotX[Y==0]
plt.plot(mitX[:,0],mitX[:,1],'o')
plt.plot(intX[:,0],intX[:,1],'x')


Out[24]:
[<matplotlib.lines.Line2D at 0x7f28e973d310>]

In [29]:
ids = sortidx[0,[0,2]]
toPlotX = Xfft[:,ids]
mitX = toPlotX[Y==1]
intX = toPlotX[Y==0]
plt.title('Closer look at most weighted')
plt.plot(mitX[:,0],mitX[:,1],'o',label='Mitosis')
plt.plot(intX[:,0],intX[:,1],'x',label='Interphasis')
plt.xlabel('Feature 25903'); plt.xlim([0,500])
plt.ylabel('Feature 27308'); plt.ylim([0,500])
plt.legend()


Out[29]:
<matplotlib.legend.Legend at 0x7f28e80b7090>

In [34]:
Xtrain,Xtest,Ytrain,Ytest = train_test_split(Xfft,Y)

clf = LogisticRegression(class_weight='balanced', C = 3.8)
clf.fit(Xtrain,Ytrain)
coefs = clf.coef_
sortidx = np.argsort(coefs)
print(sortidx[:2])


[[26699 19601 18177 ..., 26889 19305 26995]]

In [35]:
Ypred = clf.predict(Xtest)

In [55]:
intX.shape


Out[55]:
(343, 2)

In [92]:
ids=sortidx[0,[0,-1]]

mitX = Xtest[Ytest==1]; mitX = mitX[:,ids]
intX = Xtest[Ytest==0]; intX = intX[:,ids]

mitrX = Xtest[np.logical_and(Ypred==1,Ytest ==1),:]; mitrX = mitrX[:,ids]
mitwX = Xtest[np.logical_and(Ypred==0,Ytest ==1),:]; mitwX = mitwX[:,ids]
intrX = Xtest[np.logical_and(Ypred==0,Ytest ==0),:]; intrX = intrX[:,ids]
intwX = Xtest[np.logical_and(Ypred==1,Ytest ==0),:]; intwX = intwX[:,ids]

In [93]:
mitwX.shape


Out[93]:
(28, 2)

In [94]:
plt.title('Most relevant features')
plt.plot(mitrX[:,0],mitrX[:,1],'o',color='blue',label='Right Mitosis')
plt.plot(mitwX[:,0],mitwX[:,1],'o',color='red',label='Wrong Mitosis')
plt.plot(intrX[:,0],intrX[:,1],'x',color='blue',label='Right Interphasis')
plt.plot(intwX[:,0],intwX[:,1],'x',color='red',label='Wrong Interphasis')
plt.xlabel('Feature 26699')
plt.ylabel('Feature 18177')
plt.legend()


Out[94]:
<matplotlib.legend.Legend at 0x7f28e1e02dd0>

In [95]:
plt.title('Closer look at most relevant')
plt.plot(mitrX[:,0],mitrX[:,1],'o',color='blue',label='Right Mitosis')
plt.plot(mitwX[:,0],mitwX[:,1],'o',color='red',label='Wrong Mitosis')
plt.plot(intrX[:,0],intrX[:,1],'x',color='blue',label='Right Interphasis')
plt.plot(intwX[:,0],intwX[:,1],'x',color='red',label='Wrong Interphasis')
plt.xlabel('Feature 26699')
plt.ylabel('Feature 18177')
plt.xlim([0,2000])
plt.ylim([0,2000])
plt.legend()


Out[95]:
<matplotlib.legend.Legend at 0x7f28e1c96d90>