In [1]:

    
import graphlab



In [2]:

    
# Limit number of worker processes. This preserves system memory, which prevents hosted notebooks from crashing.
graphlab.set_runtime_config('GRAPHLAB_DEFAULT_NUM_PYLAMBDA_WORKERS', 4)









    



[INFO] graphlab.cython.cy_server: GraphLab Create v2.1 started. Logging: /tmp/graphlab_server_1477284409.log






    



This non-commercial license of GraphLab Create for academic use is assigned to sudhanshu.shekhar.iitd@gmail.com and will expire on September 18, 2017.

Load the image dataset



In [3]:

    
image_train = graphlab.SFrame('image_train_data/')



In [4]:

    
image_test = graphlab.SFrame('image_test_data/')



In [5]:

    
image_train.head()









    Out[5]:





    
        id
        image
        label
        deep_features
        image_array
    
    
        24
        Height: 32 Width: 32
        bird
        [0.242871761322,
1.09545373917, 0.0, ...
        [73.0, 77.0, 58.0, 71.0,
68.0, 50.0, 77.0, 69.0, ...
    
    
        33
        Height: 32 Width: 32
        cat
        [0.525087952614, 0.0,
0.0, 0.0, 0.0, 0.0, ...
        [7.0, 5.0, 8.0, 7.0, 5.0,
8.0, 5.0, 4.0, 6.0, 7.0, ...
    
    
        36
        Height: 32 Width: 32
        cat
        [0.566015958786, 0.0,
0.0, 0.0, 0.0, 0.0, ...
        [169.0, 122.0, 65.0,
131.0, 108.0, 75.0, ...
    
    
        70
        Height: 32 Width: 32
        dog
        [1.12979578972, 0.0, 0.0,
0.778194487095, 0.0, ...
        [154.0, 179.0, 152.0,
159.0, 183.0, 157.0, ...
    
    
        90
        Height: 32 Width: 32
        bird
        [1.71786928177, 0.0, 0.0,
0.0, 0.0, 0.0, ...
        [216.0, 195.0, 180.0,
201.0, 178.0, 160.0, ...
    
    
        97
        Height: 32 Width: 32
        automobile
        [1.57818555832, 0.0, 0.0,
0.0, 0.0, 0.0, ...
        [33.0, 44.0, 27.0, 29.0,
44.0, 31.0, 32.0, 45.0, ...
    
    
        107
        Height: 32 Width: 32
        dog
        [0.0, 0.0,
0.220677852631, 0.0,  ...
        [97.0, 51.0, 31.0, 104.0,
58.0, 38.0, 107.0, 61.0, ...
    
    
        121
        Height: 32 Width: 32
        bird
        [0.0, 0.23753464222, 0.0,
0.0, 0.0, 0.0, ...
        [93.0, 96.0, 88.0, 102.0,
106.0, 97.0, 117.0, ...
    
    
        136
        Height: 32 Width: 32
        automobile
        [0.0, 0.0, 0.0, 0.0, 0.0,
0.0, 7.5737862587, 0.0, ...
        [35.0, 59.0, 53.0, 36.0,
56.0, 56.0, 42.0, 62.0, ...
    
    
        138
        Height: 32 Width: 32
        bird
        [0.658935725689, 0.0,
0.0, 0.0, 0.0, 0.0, ...
        [205.0, 193.0, 195.0,
200.0, 187.0, 193.0, ...
    

[10 rows x 5 columns]

Computing summary statistics of the data

Using the training data, compute the sketch summary of the ‘label’ column and interpret the results. What’s the least common category in the training data? Save this result to answer the quiz at the end.



In [6]:

    
image_train['label'].sketch_summary()









    Out[6]:





+------------------+-------+----------+
|       item       | value | is exact |
+------------------+-------+----------+
|      Length      |  2005 |   Yes    |
| # Missing Values |   0   |   Yes    |
| # unique values  |   4   |    No    |
+------------------+-------+----------+

Most frequent items:
+-------+------------+-----+-----+------+
| value | automobile | cat | dog | bird |
+-------+------------+-----+-----+------+
| count |    509     | 509 | 509 | 478  |
+-------+------------+-----+-----+------+

Creating category-specific image retrieval models



In [7]:

    
label_filter = lambda l : image_train[image_train['label'] == l]



In [8]:

    
image_train_auto = label_filter('automobile')
len(image_train_auto)









    Out[8]:





509



In [9]:

    
image_train_cat = label_filter('cat')
len(image_train_cat)









    Out[9]:





509



In [10]:

    
image_train_dog = label_filter('dog')
len(image_train_dog)









    Out[10]:





509



In [11]:

    
image_train_bird = label_filter('bird')
len(image_train_bird)









    Out[11]:





478



In [12]:

    
auto_model = graphlab.nearest_neighbors.create(image_train_auto, features=['deep_features'], label='id')
cat_model = graphlab.nearest_neighbors.create(image_train_cat, features=['deep_features'], label='id')
dog_model = graphlab.nearest_neighbors.create(image_train_dog, features=['deep_features'], label='id')
bird_model = graphlab.nearest_neighbors.create(image_train_bird, features=['deep_features'], label='id')









    




Starting brute force nearest neighbors model training.






    




Starting brute force nearest neighbors model training.






    




Starting brute force nearest neighbors model training.






    




Starting brute force nearest neighbors model training.



In [13]:

    
def get_images_from_ids(query_result):
    return image_train.filter_by(query_result['reference_label'], 'id')



In [14]:

    
show_neighbours = lambda i : get_images_from_ids(knn_model.query(image_train[i:i+1]))['image'].show()



In [15]:

    
image_test[0:1]['image'].show()









    



Canvas is accessible via web browser at the URL: http://localhost:50395/index.html
Opening Canvas in default web browser.



In [16]:

    
graphlab.canvas.set_target('ipynb')



In [17]:

    
image_test[0:1]['image'].show()

What is the nearest ‘cat’ labeled image in the training data to the cat image above (the first image in the test data)? Save this result.



In [71]:

    
cat_model.query(image_test[0:1])









    




Starting pairwise querying.






    




+--------------+---------+-------------+--------------+






    




| Query points | # Pairs | % Complete. | Elapsed Time |






    




+--------------+---------+-------------+--------------+






    




| 0            | 1       | 0.196464    | 13.066ms     |






    




| Done         |         | 100         | 74.237ms     |






    




+--------------+---------+-------------+--------------+






    Out[71]:





    
        query_label
        reference_label
        distance
        rank
    
    
        0
        16289
        34.623719208
        1
    
    
        0
        45646
        36.0068799284
        2
    
    
        0
        32139
        36.5200813436
        3
    
    
        0
        25713
        36.7548502521
        4
    
    
        0
        331
        36.8731228168
        5
    

[5 rows x 4 columns]



In [72]:

    
image_train_cat[image_train_cat['id'] == 16289]['image'].show()

What is the nearest ‘dog’ labeled image in the training data to the cat image above (the first image in the test data)? Save this result.



In [19]:

    
dog_model.query(image_test[0:1])









    




Starting pairwise querying.






    




+--------------+---------+-------------+--------------+






    




| Query points | # Pairs | % Complete. | Elapsed Time |






    




+--------------+---------+-------------+--------------+






    




| 0            | 1       | 0.196464    | 9.953ms      |






    




| Done         |         | 100         | 97.588ms     |






    




+--------------+---------+-------------+--------------+






    Out[19]:





    
        query_label
        reference_label
        distance
        rank
    
    
        0
        16976
        37.4642628784
        1
    
    
        0
        13387
        37.5666832169
        2
    
    
        0
        35867
        37.6047267079
        3
    
    
        0
        44603
        37.7065585153
        4
    
    
        0
        6094
        38.5113254907
        5
    

[5 rows x 4 columns]



In [73]:

    
image_train_dog[image_train_dog['id'] == 16976]['image'].show()

A simple example of nearest-neighbors classification

For the first image in the test data (image_test[0:1]), which we used above, compute the mean distance between this image at its 5 nearest neighbors that were labeled ‘cat’ in the training data (similarly to what you did in the previous question). Save this result.



In [20]:

    
cat_model.query(image_test[0:1])['distance'].mean()









    




Starting pairwise querying.






    




+--------------+---------+-------------+--------------+






    




| Query points | # Pairs | % Complete. | Elapsed Time |






    




+--------------+---------+-------------+--------------+






    




| 0            | 1       | 0.196464    | 12.034ms     |






    




| Done         |         | 100         | 53.609ms     |






    




+--------------+---------+-------------+--------------+






    Out[20]:





36.15573070978294

Similarly, for the first image in the test data (image_test[0:1]), which we used above, compute the mean distance between this image at its 5 nearest neighbors that were labeled ‘dog’ in the training data (similarly to what you did in the previous question). Save this result.



In [21]:

    
dog_model.query(image_test[0:1])['distance'].mean()









    




Starting pairwise querying.






    




+--------------+---------+-------------+--------------+






    




| Query points | # Pairs | % Complete. | Elapsed Time |






    




+--------------+---------+-------------+--------------+






    




| 0            | 1       | 0.196464    | 9.475ms      |






    




| Done         |         | 100         | 46.569ms     |






    




+--------------+---------+-------------+--------------+






    Out[21]:





37.77071136184157

[Challenging Question] Computing nearest neighbors accuracy using SFrame operations



In [22]:

    
label_filter_test = lambda l : image_test[image_test['label'] == l]
image_test_cat = label_filter_test('cat')
print len(image_test_cat)
image_test_dog = label_filter_test('dog')
print len(image_test_dog)
image_test_bird = label_filter_test('bird')
print len(image_test_bird)
image_test_automobile = label_filter_test('automobile')
print len(image_test_automobile)



In [23]:

    
print len(image_test)

Finding nearest neighbors in the training set for each part of the test set



In [24]:

    
dog_dog_neighbors = dog_model.query(image_test_dog, k = 1)
dog_cat_neighbors = cat_model.query(image_test_dog, k = 1)
dog_automobile_neighbors = auto_model.query(image_test_dog, k = 1)
dog_bird_neighbors = bird_model.query(image_test_dog, k = 1)









    




Starting blockwise querying.






    




max rows per data block: 4348






    




number of reference data blocks: 8






    




number of query data blocks: 1






    




+--------------+---------+-------------+--------------+






    




| Query points | # Pairs | % Complete. | Elapsed Time |






    




+--------------+---------+-------------+--------------+






    




| 1000         | 63000   | 12.3772     | 297.635ms    |






    




| Done         | 509000  | 100         | 340.52ms     |






    




+--------------+---------+-------------+--------------+






    




Starting blockwise querying.






    




max rows per data block: 4348






    




number of reference data blocks: 8






    




number of query data blocks: 1






    




+--------------+---------+-------------+--------------+






    




| Query points | # Pairs | % Complete. | Elapsed Time |






    




+--------------+---------+-------------+--------------+






    




| 1000         | 63000   | 12.3772     | 281.573ms    |






    




| Done         | 509000  | 100         | 331.25ms     |






    




+--------------+---------+-------------+--------------+






    




Starting blockwise querying.






    




max rows per data block: 4348






    




number of reference data blocks: 8






    




number of query data blocks: 1






    




+--------------+---------+-------------+--------------+






    




| Query points | # Pairs | % Complete. | Elapsed Time |






    




+--------------+---------+-------------+--------------+






    




| 1000         | 64000   | 12.5737     | 276.817ms    |






    




| Done         | 509000  | 100         | 376.33ms     |






    




+--------------+---------+-------------+--------------+






    




Starting blockwise querying.






    




max rows per data block: 4348






    




number of reference data blocks: 8






    




number of query data blocks: 1






    




+--------------+---------+-------------+--------------+






    




| Query points | # Pairs | % Complete. | Elapsed Time |






    




+--------------+---------+-------------+--------------+






    




| 1000         | 60000   | 12.5523     | 287.251ms    |






    




| Done         | 478000  | 100         | 386.24ms     |






    




+--------------+---------+-------------+--------------+



In [26]:

    
dog_distances = graphlab.SFrame({
        'dog-dog' : dog_dog_neighbors['distance'],
        'dog-cat' : dog_cat_neighbors['distance'],
        'dog-bird': dog_bird_neighbors['distance'],
        'dog-automobile': dog_automobile_neighbors['distance']
    })



In [27]:

    
dog_distances.head()









    Out[27]:





    
        dog-automobile
        dog-bird
        dog-cat
        dog-dog
    
    
        41.9579761457
        41.7538647304
        36.4196077068
        33.4773590373
    
    
        46.0021331807
        41.3382958925
        38.8353268874
        32.8458495684
    
    
        42.9462290692
        38.6157590853
        36.9763410854
        35.0397073189
    
    
        41.6866060048
        37.0892269954
        34.5750072914
        33.9010327697
    
    
        39.2269664935
        38.272288694
        34.778824791
        37.4849250909
    
    
        40.5845117698
        39.1462089236
        35.1171578292
        34.945165344
    
    
        45.1067352961
        40.523040106
        40.6095830913
        39.0957278345
    
    
        41.3221140974
        38.1947918393
        39.9036867306
        37.7696131032
    
    
        41.8244654995
        40.1567131661
        38.0674700168
        35.1089144603
    
    
        45.4976929401
        45.5597962603
        42.7258732951
        43.2422832585
    

[10 rows x 4 columns]

Computing the number of correct predictions using 1-nearest neighbors for the dog class



In [54]:

    
dog_distances[0:1]['dog-dog']









    Out[54]:





dtype: float
Rows: 1
[33.47735903726335]



In [65]:

    
def is_dog_correct(r):
    return r['dog-dog'] < r['dog-cat'] and r['dog-dog'] < r['dog-bird'] and r['dog-dog'] < r['dog-automobile']



In [66]:

    
is_dog_correct(dog_distances[0:1])









    



---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-66-25d8b0e0c7a5> in <module>()
----> 1 is_dog_correct(dog_distances[0:1])

<ipython-input-65-608785539821> in is_dog_correct(r)
      1 def is_dog_correct(r):
----> 2     return r['dog-dog'] < r['dog-cat'] and r['dog-dog'] < r['dog-bird'] and r['dog-dog'] < r['dog-automobile']

/Users/sud/anaconda3/envs/gl-env/lib/python2.7/site-packages/graphlab/data_structures/sarray.pyc in __nonzero__(self)
    752         """
    753         # message copied from Numpy
--> 754         raise ValueError("The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()")
    755 
    756     def __bool__(self):

ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()

Accuracy of predicting dog in the test data:

Using the work you did in this question, what is the accuracy of the 1-nearest neighbor classifier at classifying ‘dog’ images from the test set? Save this result to answer the quiz at the end.



In [68]:

    
dog_distances.apply(is_dog_correct).sum() / float(len(image_test_dog))









    Out[68]:





0.678



In [62]:

    
dog_distances









    Out[62]:





    
        dog-automobile
        dog-bird
        dog-cat
        dog-dog
    
    
        41.9579761457
        41.7538647304
        36.4196077068
        33.4773590373
    
    
        46.0021331807
        41.3382958925
        38.8353268874
        32.8458495684
    
    
        42.9462290692
        38.6157590853
        36.9763410854
        35.0397073189
    
    
        41.6866060048
        37.0892269954
        34.5750072914
        33.9010327697
    
    
        39.2269664935
        38.272288694
        34.778824791
        37.4849250909
    
    
        40.5845117698
        39.1462089236
        35.1171578292
        34.945165344
    
    
        45.1067352961
        40.523040106
        40.6095830913
        39.0957278345
    
    
        41.3221140974
        38.1947918393
        39.9036867306
        37.7696131032
    
    
        41.8244654995
        40.1567131661
        38.0674700168
        35.1089144603
    
    
        45.4976929401
        45.5597962603
        42.7258732951
        43.2422832585
    

[1000 rows x 4 columns]
Note: Only the head of the SFrame is printed.
You can use print_rows(num_rows=m, num_columns=n) to print more rows and columns.



In [ ]:

id	image	label	deep_features	image_array
24	Height: 32 Width: 32	bird	[0.242871761322, 1.09545373917, 0.0, ...	[73.0, 77.0, 58.0, 71.0, 68.0, 50.0, 77.0, 69.0, ...
33	Height: 32 Width: 32	cat	[0.525087952614, 0.0, 0.0, 0.0, 0.0, 0.0, ...	[7.0, 5.0, 8.0, 7.0, 5.0, 8.0, 5.0, 4.0, 6.0, 7.0, ...
36	Height: 32 Width: 32	cat	[0.566015958786, 0.0, 0.0, 0.0, 0.0, 0.0, ...	[169.0, 122.0, 65.0, 131.0, 108.0, 75.0, ...
70	Height: 32 Width: 32	dog	[1.12979578972, 0.0, 0.0, 0.778194487095, 0.0, ...	[154.0, 179.0, 152.0, 159.0, 183.0, 157.0, ...
90	Height: 32 Width: 32	bird	[1.71786928177, 0.0, 0.0, 0.0, 0.0, 0.0, ...	[216.0, 195.0, 180.0, 201.0, 178.0, 160.0, ...
97	Height: 32 Width: 32	automobile	[1.57818555832, 0.0, 0.0, 0.0, 0.0, 0.0, ...	[33.0, 44.0, 27.0, 29.0, 44.0, 31.0, 32.0, 45.0, ...
107	Height: 32 Width: 32	dog	[0.0, 0.0, 0.220677852631, 0.0, ...	[97.0, 51.0, 31.0, 104.0, 58.0, 38.0, 107.0, 61.0, ...
121	Height: 32 Width: 32	bird	[0.0, 0.23753464222, 0.0, 0.0, 0.0, 0.0, ...	[93.0, 96.0, 88.0, 102.0, 106.0, 97.0, 117.0, ...
136	Height: 32 Width: 32	automobile	[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 7.5737862587, 0.0, ...	[35.0, 59.0, 53.0, 36.0, 56.0, 56.0, 42.0, 62.0, ...
138	Height: 32 Width: 32	bird	[0.658935725689, 0.0, 0.0, 0.0, 0.0, 0.0, ...	[205.0, 193.0, 195.0, 200.0, 187.0, 193.0, ...

reference_label	distance	rank
16289	34.623719208	1
45646	36.0068799284	2
32139	36.5200813436	3
25713	36.7548502521	4
331	36.8731228168	5

reference_label	distance	rank
16976	37.4642628784	1
13387	37.5666832169	2
35867	37.6047267079	3
44603	37.7065585153	4
6094	38.5113254907	5

dog-automobile	dog-bird	dog-cat	dog-dog
41.9579761457	41.7538647304	36.4196077068	33.4773590373
46.0021331807	41.3382958925	38.8353268874	32.8458495684
42.9462290692	38.6157590853	36.9763410854	35.0397073189
41.6866060048	37.0892269954	34.5750072914	33.9010327697
39.2269664935	38.272288694	34.778824791	37.4849250909
40.5845117698	39.1462089236	35.1171578292	34.945165344
45.1067352961	40.523040106	40.6095830913	39.0957278345
41.3221140974	38.1947918393	39.9036867306	37.7696131032
41.8244654995	40.1567131661	38.0674700168	35.1089144603
45.4976929401	45.5597962603	42.7258732951	43.2422832585