In [17]:
import numpy as np
import mahotas as mh
from mahotas.features import surf
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import *
from sklearn.cluster import MiniBatchKMeans
import glob

In [18]:
all_instance_filenames = []
all_instance_targets = []

number_images = 1000

data_path = './data/dog-v-cat/train/{}.{}.jpg'

for sp in ['cat', 'dog']:
    for n in range(1, number_images+1):
        target = 1 if sp == 'cat' else 0
        path = data_path.format(sp, n)
        all_instance_filenames.append(path)
        all_instance_targets.append(target)
    
surf_features = []
counter = 0
for f in all_instance_filenames:
    if counter % 100 == 0:
        print "Read {} images".format(counter)
    counter += 1
    image = mh.imread(f, as_grey=True)
    surf_features.append(surf.surf(image)[:, 5:])
print '*Finished reading images*'


Read 0 images
Read 100 images
Read 200 images
Read 300 images
Read 400 images
Read 500 images
Read 600 images
Read 700 images
Read 800 images
Read 900 images
Read 1000 images
Read 1100 images
Read 1200 images
Read 1300 images
Read 1400 images
Read 1500 images
Read 1600 images
Read 1700 images
Read 1800 images
Read 1900 images

In [19]:
train_len = int(len(all_instance_filenames) * .60)
X_train_surf_features = np.concatenate(surf_features[:train_len])
X_test_surf_features = np.concatenate(surf_features[train_len:])
y_train = all_instance_targets[:train_len]
y_test = all_instance_targets[train_len:]

In [20]:
n_clusters = 300
print 'Clustering', len(X_train_surf_features), 'features'
estimator = MiniBatchKMeans(n_clusters=n_clusters)
estimator.fit_transform(X_train_surf_features)

# find the cluster associated with each of the extracted SURF descriptors and count.
X_train = []
for instance in surf_features[:train_len]:
    clusters = estimator.predict(instance)
    features = np.bincount(clusters)
    if len(features) < n_clusters:
        features = np.append(features, np.zeros((1, n_clusters - len(features))))
    X_train.append(features)


Clustering 753173 features

In [21]:
X_test = []

for instance in surf_features[train_len:]:
    clusters = estimator.predict(instance)
    features = np.bincount(clusters)
    if len(features) < n_clusters:
        features = np.append(features, np.zeros((1, n_clusters - len(features))))
    X_test.append(features)


---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-21-f2fbb3991db1> in <module>()
      2 
      3 for instance in surf_features[train_len:]:
----> 4     clusters = estimator.predict(instance)
      5     features = np.bincount(clusters)
      6     if len(features) < n_clusters:

/usr/local/lib/python2.7/site-packages/sklearn/cluster/k_means_.pyc in predict(self, X)
   1357         self._check_fitted()
   1358         X = self._check_test_data(X)
-> 1359         return self._labels_inertia_minibatch(X)[0]

/usr/local/lib/python2.7/site-packages/sklearn/cluster/k_means_.pyc in _labels_inertia_minibatch(self, X)
   1282         results = [_labels_inertia(X[s], x_squared_norms[s],
   1283                                    self.cluster_centers_) for s in slices]
-> 1284         labels, inertia = zip(*results)
   1285         return np.hstack(labels), np.sum(inertia)
   1286 

ValueError: need more than 0 values to unpack

In [16]:
clf = LogisticRegression(C=0.001, penalty='l2')
clf.fit_transform(X_train, y_train)

predictions = clf.predict(X_test)

print classification_report(y_test, predictions)
print 'Precision:', precision_score(y_test, predictions)
print 'Recall:', recall_score(y_test, predictions)
print 'Accuracy:', accuracy_score(y_test, predictions)


---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-16-c144e9b105f3> in <module>()
      4 predictions = clf.predict(X_test)
      5 
----> 6 print classification_report(y_test, predictions)
      7 print 'Precision:', precision_score(y_test, predictions)
      8 print 'Recall:', recall_score(y_test, predictions)

/usr/local/lib/python2.7/site-packages/sklearn/metrics/metrics.pyc in classification_report(y_true, y_pred, labels, target_names, sample_weight)
   2039                                                   labels=labels,
   2040                                                   average=None,
-> 2041                                                   sample_weight=sample_weight)
   2042 
   2043     for i, label in enumerate(labels):

/usr/local/lib/python2.7/site-packages/sklearn/metrics/metrics.pyc in precision_recall_fscore_support(y_true, y_pred, beta, labels, pos_label, average, warn_for, sample_weight)
   1667         raise ValueError("beta should be >0 in the F-beta score")
   1668 
-> 1669     y_type, y_true, y_pred = _check_clf_targets(y_true, y_pred)
   1670 
   1671     label_order = labels  # save this for later

/usr/local/lib/python2.7/site-packages/sklearn/metrics/metrics.pyc in _check_clf_targets(y_true, y_pred)
    107     y_pred : array or indicator matrix
    108     """
--> 109     y_true, y_pred = check_arrays(y_true, y_pred, allow_lists=True)
    110     type_true = type_of_target(y_true)
    111     type_pred = type_of_target(y_pred)

/usr/local/lib/python2.7/site-packages/sklearn/utils/validation.pyc in check_arrays(*arrays, **options)
    252         if size != n_samples:
    253             raise ValueError("Found array with dim %d. Expected %d"
--> 254                              % (size, n_samples))
    255 
    256         if not allow_lists or hasattr(array, "shape"):

ValueError: Found array with dim 798. Expected 800

In [ ]: