In [158]:
#print __doc__

from time import time
import logging
import pylab as pl
import numpy as np
import cv2
import os 

from sklearn.cross_validation import train_test_split
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.decomposition import RandomizedPCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

from sklearn.svm import SVC

# NOTE(review): %pylab does a star-import from numpy/pylab (the warning below
# shows it clobbered `clf`); `%matplotlib inline` + explicit imports is safer.
%pylab inline
# Root folder containing the two class subdirectories 'corrupted' and 'fine'.
base_path = 'simple_classes'
# Number of principal components kept by the PCA pipeline below.
n_components = 10


Populating the interactive namespace from numpy and matplotlib
/usr/lib/python3.5/site-packages/IPython/core/magics/pylab.py:161: UserWarning: pylab import has clobbered these variables: ['clf']
`%matplotlib` prevents importing * from pylab and numpy
  "\n`%matplotlib` prevents importing * from pylab and numpy"

In [159]:
def flatted_img(imgname):
    """Load an image as grayscale, resize it to 80x80 and return a flat vector.

    Parameters
    ----------
    imgname : str
        Path to the image file.

    Returns
    -------
    numpy.ndarray of shape (6400,)
        Flattened 80x80 grayscale pixel values.

    Raises
    ------
    IOError
        If the file cannot be read as an image.
    """
    img = cv2.imread(imgname, 0)  # 0 -> load as grayscale
    if img is None:
        # BUG FIX: cv2.imread silently returns None for missing/unreadable
        # files; fail loudly here instead of letting cv2.resize raise a
        # cryptic assertion later.
        raise IOError("could not read image: %s" % imgname)
    return np.array(cv2.resize(img, (80, 80))).flatten()
    
    
# --- Build the two-class dataset from images under base_path ---------------
class1_dir = os.path.join(base_path, 'corrupted')
class2_dir = os.path.join(base_path, 'fine')

class1_imgnames = os.listdir(class1_dir)
class2_imgnames = os.listdir(class2_dir)
# Sanity check: filenames present in BOTH class folders (expected: []).
print(list(set(class1_imgnames) & set(class2_imgnames)))

# Each sample is a flattened 80x80 grayscale image (6400 features).
X1 = [flatted_img(os.path.join(class1_dir, imgname)) for imgname in class1_imgnames]
X2 = [flatted_img(os.path.join(class2_dir, imgname)) for imgname in class2_imgnames]

X = np.array(X1 + X2)
n_samples = X.shape[0]
n_features = X.shape[1]

# Labels: 0 = corrupted, 1 = fine (order matches X1 + X2 concatenation).
y = np.array([0]*len(X1) + [1]*len(X2))
target_names = np.array(['corrupted', 'fine'])
n_classes = target_names.shape[0]

print ("Total dataset size:")
print ("n_samples: %d" % n_samples)
print( "n_features: %d" % n_features)
print ("n_classes: %d" % n_classes)

# NOTE(review): train_test_split comes from sklearn.cross_validation, which is
# removed in modern scikit-learn (use sklearn.model_selection) — confirm the
# installed version before migrating.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)


# Module-level slots mutated by pca_train_and_predict below so later cells
# can reuse the fitted model and predictions.
y_pred = None
eigencells = None
pca = None
clf = None
def pca_train_and_predict(n_components):
    """Fit PCA on the global X_train, grid-search an RBF SVM, report metrics.

    Reads the module-level globals X_train, X_test, y_train, y_test and
    target_names/n_classes, and MUTATES the module-level globals `pca`,
    `clf` and `y_pred` so later cells can reuse the fitted model.

    Parameters
    ----------
    n_components : int
        Number of principal components to keep.
    """
    ###############################################################################
    # Compute a PCA (eigenfaces) on the face dataset (treated as unlabeled
    # dataset): unsupervised feature extraction / dimensionality reduction
    # n_components = 150
    
    print ("Extracting the top %d eigencells from %d cells" % (n_components, X_train.shape[0]))
    t0 = time()
    global pca
    # NOTE(review): RandomizedPCA is deprecated/removed in newer scikit-learn;
    # PCA(svd_solver='randomized') is the modern equivalent — confirm the
    # installed version before migrating.
    pca = RandomizedPCA(n_components=n_components, whiten=True).fit(X_train)
    print ("done in %0.3fs" % (time() - t0))    
    
    print ("Projecting the input data on the eigencells orthonormal basis")
    t0 = time()
    X_train_pca = pca.transform(X_train)    
    X_test_pca = pca.transform(X_test)
    print ("done in %0.3fs" % (time() - t0))
    
    ###############################################################################
    # Train a SVM classification model

    print ("Fitting the classifier to the training set")
    t0 = time()
    # Grid over SVM regularization (C) and RBF kernel width (gamma).
    param_grid = {
             'C': [1e3, 5e3, 1e4, 5e4, 1e5],
              'gamma': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1],
              }
    
    global clf
    # class_weight='balanced' compensates for the class imbalance visible in
    # the test-set support (4 corrupted vs 16 fine).
    clf = GridSearchCV(SVC(kernel='rbf', class_weight='balanced'), param_grid)
    clf = clf.fit(X_train_pca, y_train)
    print ("done in %0.3fs" % (time() - t0))
    print ("Best estimator found by grid search:")
    print (clf.best_estimator_)
    
    ###############################################################################
    # Quantitative evaluation of the model quality on the test set

    print ("Predicting the cell quality on the testing set")
    t0 = time()
    global y_pred
    y_pred = clf.predict(X_test_pca)
    print ("done in %0.3fs" % (time() - t0))

    print (classification_report(y_test, y_pred, target_names=target_names))
    print (confusion_matrix(y_test, y_pred, labels=range(n_classes)))

# Run the full pipeline with the configured n_components (10).
pca_train_and_predict(n_components)


[]
Total dataset size:
n_samples: 79
n_features: 6400
n_classes: 2
Extracting the top 10 eigencells from 59 cells
done in 0.119s
Projecting the input data on the eigencells orthonormal basis
done in 0.009s
Fitting the classifier to the training set
done in 0.246s
Best estimator found by grid search:
SVC(C=1000.0, cache_size=200, class_weight='balanced', coef0=0.0,
  decision_function_shape=None, degree=3, gamma=0.01, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)
Predicting the cell quality on the testing set
done in 0.000s
             precision    recall  f1-score   support

  corrupted       1.00      0.75      0.86         4
       fine       0.94      1.00      0.97        16

avg / total       0.95      0.95      0.95        20

[[ 3  1]
 [ 0 16]]

In [128]:
def flatted_img(imgname):
    """Return the image at `imgname` as a flat 80x80 grayscale feature vector."""
    grayscale = cv2.imread(imgname, 0)
    resized = cv2.resize(grayscale, (80, 80))
    return np.array(resized).flatten()
    
    
# NOTE(review): this cell duplicates the dataset-building cell above almost
# verbatim; consider extracting a load_dataset() helper instead.
class1_dir = os.path.join(base_path, 'corrupted')
class2_dir = os.path.join(base_path, 'fine')

class1_imgnames = os.listdir(class1_dir)
class2_imgnames = os.listdir(class2_dir)

# Each sample is a flattened 80x80 grayscale image (6400 features).
X1 = [flatted_img(os.path.join(class1_dir, imgname)) for imgname in class1_imgnames]
X2 = [flatted_img(os.path.join(class2_dir, imgname)) for imgname in class2_imgnames]

X = np.array(X1 + X2)
n_samples = X.shape[0]
n_features = X.shape[1]

# Labels: 0 = corrupted, 1 = fine (order matches X1 + X2 concatenation).
y = np.array([0]*len(X1) + [1]*len(X2))
target_names = np.array(['corrupted', 'fine'])
n_classes = target_names.shape[0]

print ("Total dataset size:")
print ("n_samples: %d" % n_samples)
print( "n_features: %d" % n_features)
print ("n_classes: %d" % n_classes)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)


# Module-level slots mutated by the function below.
y_pred = None
eigencells = None

def pca_train_and_predict(n_components):
    """LDA + RBF-SVM variant of the earlier pipeline (shadows the PCA version).

    NOTE(review): despite its name this function uses LinearDiscriminantAnalysis,
    not PCA, and the progress messages still say "eigencells". Redefining the
    same function name silently replaces the PCA version defined above.

    Reads the module-level globals X_train, X_test, y_train, y_test and
    target_names/n_classes, and mutates the module-level global `y_pred`.
    The output below shows the projection collapses to 1 dimension for this
    two-class problem — presumably LDA caps components at n_classes-1; verify.

    Parameters
    ----------
    n_components : int
        Requested number of discriminant components.
    """
    ###############################################################################
    # Compute a PCA (eigenfaces) on the face dataset (treated as unlabeled
    # dataset): unsupervised feature extraction / dimensionality reduction
    # n_components = 150
    
    print ("Extracting the top %d eigencells from %d cells" % (n_components, X_train.shape[0]))
    t0 = time()
    # Supervised projection: LDA uses y_train, unlike the PCA variant.
    lda = LinearDiscriminantAnalysis(n_components=n_components).fit(X_train, y_train)
    print ("done in %0.3fs" % (time() - t0))    
    
    print ("Projecting the input data on the eigencells orthonormal basis")
    t0 = time()
    X_train_lda = lda.transform(X_train)    
    X_test_lda = lda.transform(X_test)
    print(X_train.shape, X_train_lda.shape)
    print ("done in %0.3fs" % (time() - t0))
    
    ###############################################################################
    # Train a SVM classification model

    print ("Fitting the classifier to the training set")
    t0 = time()
    # Grid over SVM regularization (C) and RBF kernel width (gamma).
    param_grid = {
             'C': [1e3, 5e3, 1e4, 5e4, 1e5],
              'gamma': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1],
              }
    # `clf` is local here (unlike the PCA variant, which publishes a global).
    clf = GridSearchCV(SVC(kernel='rbf', class_weight='balanced'), param_grid)
    clf = clf.fit(X_train_lda, y_train)
    print ("done in %0.3fs" % (time() - t0))
    print ("Best estimator found by grid search:")
    print (clf.best_estimator_)
    
    ###############################################################################
    # Quantitative evaluation of the model quality on the test set

    print ("Predicting the cell quality on the testing set")
    t0 = time()
    global y_pred
    y_pred = clf.predict(X_test_lda)
    print ("done in %0.3fs" % (time() - t0))

    print (classification_report(y_test, y_pred, target_names=target_names))
    print (confusion_matrix(y_test, y_pred, labels=range(n_classes)))

# For 2 classes LDA yields a single useful component (see shape print above).
pca_train_and_predict(1)


Total dataset size:
n_samples: 79
n_features: 6400
n_classes: 2
Extracting the top 1 eigencells from 59 cells
done in 0.169s
Projecting the input data on the eigencells orthonormal basis
(59, 6400) (59, 1)
done in 0.003s
Fitting the classifier to the training set
/usr/lib/python3.5/site-packages/sklearn/discriminant_analysis.py:387: UserWarning: Variables are collinear.
  warnings.warn("Variables are collinear.")
done in 0.257s
Best estimator found by grid search:
SVC(C=1000.0, cache_size=200, class_weight='balanced', coef0=0.0,
  decision_function_shape=None, degree=3, gamma=0.0001, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)
Predicting the cell quality on the testing set
done in 0.000s
             precision    recall  f1-score   support

  corrupted       0.67      1.00      0.80         4
       fine       1.00      0.88      0.93        16

avg / total       0.93      0.90      0.91        20

[[ 4  0]
 [ 2 14]]

In [154]:
def process_img(imgname):
    """Read an image as grayscale, show its Otsu threshold map, and return the
    flattened 80x80 version of the ORIGINAL (unthresholded) image.

    Parameters
    ----------
    imgname : str
        Path to the image file.

    Returns
    -------
    numpy.ndarray of shape (6400,)
    """
    img = cv2.imread(imgname, 0)    
    # BUG FIX: `blured` was referenced without ever being defined, so calling
    # this function raised NameError. Define it the same way as the later
    # revision of this function does.
    blured = cv2.medianBlur(img, 5)
    ret, thresh = cv2.threshold(-blured,0,255,cv2.THRESH_BINARY+cv2.THRESH_OTSU)
    
    # Visual check of the threshold map (figure is the side effect).
    plt.imshow(thresh)
    return np.array(cv2.resize(img, (80, 80))).flatten()

# NOTE(review): hardcoded absolute local path — parameterize via config.
test_path = '/home/falcon/Workspace/Klyushin/summer/data'
test_imgnames = os.listdir(test_path)
# NOTE(review): this OVERWRITES the global X built by the training cell above
# (hidden-state hazard on re-run); also, process_img is defined in this cell
# but flatted_img is what is actually used — confirm which is intended.
X = np.array([flatted_img(os.path.join(test_path, imgname)) for imgname in test_imgnames
     if os.path.isfile(os.path.join(test_path, imgname)) and imgname.split(".")[-1] == 'bmp'])
print(X.shape)


(153, 6400)

In [157]:
def process_img(imgname):
    """Crop an image to the bounding box of its Otsu foreground, then return
    the crop resized to 80x80 and flattened.

    Pipeline: grayscale read -> median blur (noise reduction) -> Otsu threshold
    of the inverted image -> bounding box of non-zero pixels -> crop -> resize.

    Parameters
    ----------
    imgname : str
        Path to the image file.

    Returns
    -------
    numpy.ndarray of shape (6400,)

    Raises
    ------
    IOError
        If the file cannot be read as an image.
    """
    img = cv2.imread(imgname, 0)
    if img is None:
        # cv2.imread silently returns None on unreadable files.
        raise IOError("could not read image: %s" % imgname)
    # noise reduction and thresholding (Otsu picks the threshold automatically)
    blured = cv2.medianBlur(img, 5)
    ret, thresh = cv2.threshold(-blured, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)

    # Bounding box of all non-zero (foreground) pixels. Vectorized replacement
    # for the original O(H*W) pure-Python double loop: np.nonzero gives the
    # row/col indices of every foreground pixel in one pass.
    rows, cols = np.nonzero(thresh)
    if rows.size == 0:
        # No foreground at all: fall back to the whole image instead of
        # producing an empty crop (the original would have indexed with
        # its sentinel values and crashed in cv2.resize).
        subimg = img
    else:
        top, bottom = rows.min(), rows.max()
        left, right = cols.min(), cols.max()
        subimg = img[top:bottom + 1, left:right + 1]
    return np.array(cv2.resize(subimg, (80, 80))).flatten()

# NOTE(review): hardcoded absolute local path — parameterize via config.
test_path = '/home/falcon/Workspace/Klyushin/summer/data'
test_imgnames = os.listdir(test_path)
# NOTE(review): process_img is defined in this cell but flatted_img is what is
# actually used — confirm which preprocessing is intended.
X_test = np.array([flatted_img(os.path.join(test_path, imgname)) for imgname in test_imgnames
     if os.path.isfile(os.path.join(test_path, imgname)) and imgname.split(".")[-1] == 'bmp'])
# BUG FIX: y_test was sized from the stale global X left over by a previous
# cell; size it (and the shape print) from the X_test this cell just built.
# All external samples are assumed to be class 1 ('fine').
y_test = np.array([1]*X_test.shape[0])
print(X_test.shape, y_test.shape)

# Project the new samples with the PCA fitted by the training cell above.
X_test_pca = pca.transform(X_test)
###############################################################################
# Quantitative evaluation of the model quality on the test set

print ("Predicting the cell quality on the testing set")
t0 = time()
y_pred = clf.predict(X_test_pca)
print ("done in %0.3fs" % (time() - t0))

# NOTE(review): with only one true class present, precision/recall for the
# absent class are ill-defined (see the UndefinedMetricWarning in the output).
print (classification_report(y_test, y_pred, target_names=target_names))
print (confusion_matrix(y_test, y_pred, labels=range(n_classes)))


(153, 6400) (153,)
Predicting the cell quality on the testing set
done in 0.001s
             precision    recall  f1-score   support

  corrupted       0.00      0.00      0.00         0
       fine       0.00      0.00      0.00       153

avg / total       0.00      0.00      0.00       153

[[  0   0]
 [153   0]]
/usr/lib/python3.5/site-packages/sklearn/metrics/classification.py:1074: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples.
  'precision', 'predicted', average, warn_for)
/usr/lib/python3.5/site-packages/sklearn/metrics/classification.py:1076: UndefinedMetricWarning: Recall and F-score are ill-defined and being set to 0.0 in labels with no true samples.
  'recall', 'true', average, warn_for)

In [ ]: