In [158]:
#print __doc__
# Standard library
from time import time
import logging
import os

# Third-party
import cv2
import numpy as np
import pylab as pl

# scikit-learn (NOTE: cross_validation / grid_search modules and
# RandomizedPCA are deprecated; removed in scikit-learn 0.20)
from sklearn.cross_validation import train_test_split
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.decomposition import RandomizedPCA
from sklearn.decomposition import PCA  # supported replacement for RandomizedPCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.svm import SVC
%pylab inline
base_path = 'simple_classes'
n_components = 10
In [159]:
def flatted_img(imgname):
img = cv2.imread(imgname, 0)
return np.array(cv2.resize(img, (80, 80))).flatten()
class1_dir = os.path.join(base_path, 'corrupted')
class2_dir = os.path.join(base_path, 'fine')
class1_imgnames = os.listdir(class1_dir)
class2_imgnames = os.listdir(class2_dir)
print(list(set(class1_imgnames) & set(class2_imgnames)))
X1 = [flatted_img(os.path.join(class1_dir, imgname)) for imgname in class1_imgnames]
X2 = [flatted_img(os.path.join(class2_dir, imgname)) for imgname in class2_imgnames]
X = np.array(X1 + X2)
n_samples = X.shape[0]
n_features = X.shape[1]
y = np.array([0]*len(X1) + [1]*len(X2))
target_names = np.array(['corrupted', 'fine'])
n_classes = target_names.shape[0]
print ("Total dataset size:")
print ("n_samples: %d" % n_samples)
print( "n_features: %d" % n_features)
print ("n_classes: %d" % n_classes)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)
y_pred = None
eigencells = None
pca = None
clf = None
def pca_train_and_predict(n_components):
###############################################################################
# Compute a PCA (eigenfaces) on the face dataset (treated as unlabeled
# dataset): unsupervised feature extraction / dimensionality reduction
# n_components = 150
print ("Extracting the top %d eigencells from %d cells" % (n_components, X_train.shape[0]))
t0 = time()
global pca
pca = RandomizedPCA(n_components=n_components, whiten=True).fit(X_train)
print ("done in %0.3fs" % (time() - t0))
print ("Projecting the input data on the eigencells orthonormal basis")
t0 = time()
X_train_pca = pca.transform(X_train)
X_test_pca = pca.transform(X_test)
print ("done in %0.3fs" % (time() - t0))
###############################################################################
# Train a SVM classification model
print ("Fitting the classifier to the training set")
t0 = time()
param_grid = {
'C': [1e3, 5e3, 1e4, 5e4, 1e5],
'gamma': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1],
}
global clf
clf = GridSearchCV(SVC(kernel='rbf', class_weight='balanced'), param_grid)
clf = clf.fit(X_train_pca, y_train)
print ("done in %0.3fs" % (time() - t0))
print ("Best estimator found by grid search:")
print (clf.best_estimator_)
###############################################################################
# Quantitative evaluation of the model quality on the test set
print ("Predicting the cell quality on the testing set")
t0 = time()
global y_pred
y_pred = clf.predict(X_test_pca)
print ("done in %0.3fs" % (time() - t0))
print (classification_report(y_test, y_pred, target_names=target_names))
print (confusion_matrix(y_test, y_pred, labels=range(n_classes)))
pca_train_and_predict(n_components)
In [128]:
def flatted_img(imgname):
img = cv2.imread(imgname, 0)
return np.array(cv2.resize(img, (80, 80))).flatten()
class1_dir = os.path.join(base_path, 'corrupted')
class2_dir = os.path.join(base_path, 'fine')
class1_imgnames = os.listdir(class1_dir)
class2_imgnames = os.listdir(class2_dir)
X1 = [flatted_img(os.path.join(class1_dir, imgname)) for imgname in class1_imgnames]
X2 = [flatted_img(os.path.join(class2_dir, imgname)) for imgname in class2_imgnames]
X = np.array(X1 + X2)
n_samples = X.shape[0]
n_features = X.shape[1]
y = np.array([0]*len(X1) + [1]*len(X2))
target_names = np.array(['corrupted', 'fine'])
n_classes = target_names.shape[0]
print ("Total dataset size:")
print ("n_samples: %d" % n_samples)
print( "n_features: %d" % n_features)
print ("n_classes: %d" % n_classes)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)
y_pred = None
eigencells = None
def pca_train_and_predict(n_components):
###############################################################################
# Compute a PCA (eigenfaces) on the face dataset (treated as unlabeled
# dataset): unsupervised feature extraction / dimensionality reduction
# n_components = 150
print ("Extracting the top %d eigencells from %d cells" % (n_components, X_train.shape[0]))
t0 = time()
lda = LinearDiscriminantAnalysis(n_components=n_components).fit(X_train, y_train)
print ("done in %0.3fs" % (time() - t0))
print ("Projecting the input data on the eigencells orthonormal basis")
t0 = time()
X_train_lda = lda.transform(X_train)
X_test_lda = lda.transform(X_test)
print(X_train.shape, X_train_lda.shape)
print ("done in %0.3fs" % (time() - t0))
###############################################################################
# Train a SVM classification model
print ("Fitting the classifier to the training set")
t0 = time()
param_grid = {
'C': [1e3, 5e3, 1e4, 5e4, 1e5],
'gamma': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1],
}
clf = GridSearchCV(SVC(kernel='rbf', class_weight='balanced'), param_grid)
clf = clf.fit(X_train_lda, y_train)
print ("done in %0.3fs" % (time() - t0))
print ("Best estimator found by grid search:")
print (clf.best_estimator_)
###############################################################################
# Quantitative evaluation of the model quality on the test set
print ("Predicting the cell quality on the testing set")
t0 = time()
global y_pred
y_pred = clf.predict(X_test_lda)
print ("done in %0.3fs" % (time() - t0))
print (classification_report(y_test, y_pred, target_names=target_names))
print (confusion_matrix(y_test, y_pred, labels=range(n_classes)))
pca_train_and_predict(1)
In [154]:
def process_img(imgname):
img = cv2.imread(imgname, 0)
ret, thresh = cv2.threshold(-blured,0,255,cv2.THRESH_BINARY+cv2.THRESH_OTSU)
plt.imshow(thresh)
return np.array(cv2.resize(img, (80, 80))).flatten()
test_path = '/home/falcon/Workspace/Klyushin/summer/data'
test_imgnames = os.listdir(test_path)
X = np.array([flatted_img(os.path.join(test_path, imgname)) for imgname in test_imgnames
if os.path.isfile(os.path.join(test_path, imgname)) and imgname.split(".")[-1] == 'bmp'])
print(X.shape)
In [157]:
def process_img(imgname):
img = cv2.imread(imgname, 0)
# noise reduction and thresholding
blured = cv2.medianBlur(img,5)
ret, thresh = cv2.threshold(-blured,0,255,cv2.THRESH_BINARY+cv2.THRESH_OTSU)
left, right, top, bottom = 65536, -1, 65536, -1
for row in range(img.shape[0]):
for col in range(img.shape[1]):
if (thresh[row][col] == 0):
continue
if col < left:
left = col
if col > right:
right = col
if row < top:
top = row
if row > bottom:
bottom = row
subimg = img[top:bottom+1, left:right+1]
return np.array(cv2.resize(subimg, (80, 80))).flatten()
test_path = '/home/falcon/Workspace/Klyushin/summer/data'
test_imgnames = os.listdir(test_path)
X_test = np.array([flatted_img(os.path.join(test_path, imgname)) for imgname in test_imgnames
if os.path.isfile(os.path.join(test_path, imgname)) and imgname.split(".")[-1] == 'bmp'])
y_test = np.array([1]*X.shape[0])
print(X.shape, y_test.shape)
X_test_pca = pca.transform(X_test)
###############################################################################
# Quantitative evaluation of the model quality on the test set
print ("Predicting the cell quality on the testing set")
t0 = time()
y_pred = clf.predict(X_test_pca)
print ("done in %0.3fs" % (time() - t0))
print (classification_report(y_test, y_pred, target_names=target_names))
print (confusion_matrix(y_test, y_pred, labels=range(n_classes)))
In [ ]: