In [75]:
import numpy as np
import mahotas as mh
from mahotas.features import surf
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import *
from sklearn.cluster import MiniBatchKMeans
import glob
from numpy.random import choice

In [83]:
all_instance_filenames = []
all_instance_targets = []
surf_features = []

number_images = 6000

data_path = './data/dog-v-cat/train/{}.*.jpg'

for sp in ['cat', 'dog']:
    path = data_path.format(sp)
    count = 0
    while count < number_images:
        target = 1 if sp == 'cat' else 0
        f = choice(glob.glob(path), replace=False)
        image = mh.imread(f, as_grey=True)
        surf_feature = surf.surf(image)[:, 5:]
        if len(surf_feature) > 0:
            if count % 100 == 0:
                print 'Finished reading {} images'.format(count)
            surf_features.append(surf_feature)
            all_instance_targets.append(target)
            count += 1
    
print '*Finished reading images*'


Finished reading 0 images
Finished reading 100 images
Finished reading 200 images
Finished reading 300 images
Finished reading 400 images
Finished reading 500 images
Finished reading 600 images
Finished reading 700 images
Finished reading 800 images
Finished reading 900 images
Finished reading 1000 images
Finished reading 1100 images
Finished reading 1200 images
Finished reading 1300 images
Finished reading 1400 images
Finished reading 1500 images
Finished reading 1600 images
Finished reading 1700 images
Finished reading 1800 images
Finished reading 1900 images
Finished reading 2000 images
Finished reading 2100 images
Finished reading 2200 images
Finished reading 2300 images
Finished reading 2400 images
Finished reading 2500 images
Finished reading 2600 images
Finished reading 2700 images
Finished reading 2800 images
Finished reading 2900 images
Finished reading 3000 images
Finished reading 3100 images
Finished reading 3200 images
Finished reading 3300 images
Finished reading 3400 images
Finished reading 3500 images
Finished reading 3600 images
Finished reading 3700 images
Finished reading 3800 images
Finished reading 3900 images
Finished reading 4000 images
Finished reading 4100 images
Finished reading 4200 images
Finished reading 4300 images
Finished reading 4400 images
Finished reading 4500 images
Finished reading 4600 images
Finished reading 4700 images
Finished reading 4800 images
Finished reading 4900 images
Finished reading 5000 images
Finished reading 5100 images
Finished reading 5200 images
Finished reading 5300 images
Finished reading 5400 images
Finished reading 5500 images
Finished reading 5600 images
Finished reading 5700 images
Finished reading 5800 images
Finished reading 5900 images
Finished reading 0 images
Finished reading 100 images
Finished reading 200 images
Finished reading 300 images
Finished reading 400 images
Finished reading 500 images
Finished reading 600 images
Finished reading 700 images
Finished reading 800 images
Finished reading 900 images
Finished reading 1000 images
Finished reading 1100 images
Finished reading 1200 images
Finished reading 1300 images
Finished reading 1400 images
Finished reading 1500 images
Finished reading 1600 images
Finished reading 1700 images
Finished reading 1800 images
Finished reading 1900 images
Finished reading 2000 images
Finished reading 2100 images
Finished reading 2200 images
Finished reading 2300 images
Finished reading 2400 images
Finished reading 2500 images
Finished reading 2600 images
Finished reading 2700 images
Finished reading 2800 images
Finished reading 2900 images
Finished reading 3000 images
Finished reading 3100 images
Finished reading 3200 images
Finished reading 3300 images
Finished reading 3400 images
Finished reading 3500 images
Finished reading 3600 images
Finished reading 3700 images
Finished reading 3800 images
Finished reading 3900 images
Finished reading 4000 images
Finished reading 4100 images
Finished reading 4200 images
Finished reading 4300 images
Finished reading 4400 images
Finished reading 4500 images
Finished reading 4600 images
Finished reading 4700 images
Finished reading 4800 images
Finished reading 4900 images
Finished reading 5000 images
Finished reading 5100 images
Finished reading 5200 images
Finished reading 5300 images
Finished reading 5400 images
Finished reading 5500 images
Finished reading 5600 images
Finished reading 5700 images
Finished reading 5800 images
Finished reading 5900 images
*Finished reading images*

In [84]:
train_len = int(len(surf_features) * .60)
X_train_surf_features = np.concatenate(surf_features[:train_len])
X_test_surf_features = np.concatenate(surf_features[train_len:])
y_train = all_instance_targets[:train_len]
y_test = all_instance_targets[train_len:]

In [85]:
n_clusters = 300
print 'Clustering', len(X_train_surf_features), 'features'
estimator = MiniBatchKMeans(n_clusters=n_clusters)
estimator.fit_transform(X_train_surf_features)

# find the cluster associated with each of the extracted SURF descriptors and count.
X_train = []
counter = 0

for instance in surf_features[:train_len]:
    clusters = estimator.predict(instance)
    features = np.bincount(clusters)
    if len(features) < n_clusters:
        features = np.append(features, np.zeros((1, n_clusters - len(features))))
    X_train.append(features)
    counter += 1


Clustering 4508548 features

In [86]:
X_test = []

for instance in surf_features[train_len:]:
    clusters = estimator.predict(instance)
    features = np.bincount(clusters)
    if len(features) < n_clusters:
        features = np.append(features, np.zeros((1, n_clusters - len(features))))
    X_test.append(features)

In [87]:
clf = LogisticRegression(C=0.001, penalty='l2')
clf.fit_transform(X_train, y_train)

predictions = clf.predict(X_test)

print classification_report(y_test, predictions)
print 'Precision:', precision_score(y_test, predictions)
print 'Recall:', recall_score(y_test, predictions)
print 'Accuracy:', accuracy_score(y_test, predictions)


             precision    recall  f1-score   support

          0       1.00      0.26      0.42      4800
          1       0.00      0.00      0.00         0

avg / total       1.00      0.26      0.42      4800

Precision: 0
Recall: 0
Accuracy: 0.26375

In [ ]: