In [4]:
import pandas as pd
import matplotlib.pyplot as plt
import scipy.io
import re
from collections import defaultdict
from nltk.stem.snowball import SnowballStemmer
import numpy as np
import tensorflow as tf
import os
import numpy as np
import time
%matplotlib inline

snowball = SnowballStemmer('english')

In [5]:
data_path = "~/Documents/Zhou_Yu/DS/Galvanize/Capstone_data/"
data_path_stan = "/Users/zhouyu/Documents/Zhou_Yu/DS/Galvanize/Capstone_data/stanford/file_list.mat"

1. Read in common breed names


In [3]:
# 1. from website
df = pd.read_csv(data_path+"top_dog_breed.csv")

In [4]:
df.head()


Out[4]:
Breed 2016 Rank 2015 Rank 2014 Rank 2013 Rank
0 Retrievers (Labrador) 1 1.0 1.0 1.0
1 German Shepherd Dogs 2 2.0 2.0 2.0
2 Retrievers (Golden) 3 3.0 3.0 3.0
3 Bulldogs 4 4.0 4.0 5.0
4 Beagles 5 5.0 5.0 4.0

In [5]:
# 2. from the stanford
df_stanford = scipy.io.loadmat("/Users/zhouyu/Documents/Zhou_Yu/DS/Galvanize/Capstone_data/stanford/file_list.mat")

In [6]:
# clean out the breed name
def getBreedName_stanford(file_path):
    df_stanford = scipy.io.loadmat(file_path)
    stanford_breed = set()
    for label in df_stanford['file_list']:
        name = re.findall('\d+-(\S*)\/', label[0][0])
        if 'black' in label[0][0]:
            print label[0][0]
            print name[0]
        stanford_breed.add(name[0])
    return stanford_breed

In [7]:
stanford_breed = getBreedName_stanford(data_path_stan)# total 120


n02089078-black-and-tan_coonhound/n02089078_1021.jpg
black-and-tan_coonhound
n02089078-black-and-tan_coonhound/n02089078_1025.jpg
black-and-tan_coonhound
n02089078-black-and-tan_coonhound/n02089078_1026.jpg
black-and-tan_coonhound
n02089078-black-and-tan_coonhound/n02089078_1064.jpg
black-and-tan_coonhound
n02089078-black-and-tan_coonhound/n02089078_111.jpg
black-and-tan_coonhound
n02089078-black-and-tan_coonhound/n02089078_1151.jpg
black-and-tan_coonhound
n02089078-black-and-tan_coonhound/n02089078_1174.jpg
black-and-tan_coonhound
n02089078-black-and-tan_coonhound/n02089078_1245.jpg
black-and-tan_coonhound
n02089078-black-and-tan_coonhound/n02089078_1275.jpg
black-and-tan_coonhound
n02089078-black-and-tan_coonhound/n02089078_1304.jpg
black-and-tan_coonhound
n02089078-black-and-tan_coonhound/n02089078_1366.jpg
black-and-tan_coonhound
n02089078-black-and-tan_coonhound/n02089078_141.jpg
black-and-tan_coonhound
n02089078-black-and-tan_coonhound/n02089078_1454.jpg
black-and-tan_coonhound
n02089078-black-and-tan_coonhound/n02089078_1472.jpg
black-and-tan_coonhound
n02089078-black-and-tan_coonhound/n02089078_1617.jpg
black-and-tan_coonhound
n02089078-black-and-tan_coonhound/n02089078_1629.jpg
black-and-tan_coonhound
n02089078-black-and-tan_coonhound/n02089078_1668.jpg
black-and-tan_coonhound
n02089078-black-and-tan_coonhound/n02089078_1680.jpg
black-and-tan_coonhound
n02089078-black-and-tan_coonhound/n02089078_1681.jpg
black-and-tan_coonhound
n02089078-black-and-tan_coonhound/n02089078_1735.jpg
black-and-tan_coonhound
n02089078-black-and-tan_coonhound/n02089078_1744.jpg
black-and-tan_coonhound
n02089078-black-and-tan_coonhound/n02089078_1758.jpg
black-and-tan_coonhound
n02089078-black-and-tan_coonhound/n02089078_176.jpg
black-and-tan_coonhound
n02089078-black-and-tan_coonhound/n02089078_1793.jpg
black-and-tan_coonhound
n02089078-black-and-tan_coonhound/n02089078_1825.jpg
black-and-tan_coonhound
n02089078-black-and-tan_coonhound/n02089078_183.jpg
black-and-tan_coonhound
n02089078-black-and-tan_coonhound/n02089078_1836.jpg
black-and-tan_coonhound
n02089078-black-and-tan_coonhound/n02089078_1842.jpg
black-and-tan_coonhound
n02089078-black-and-tan_coonhound/n02089078_185.jpg
black-and-tan_coonhound
n02089078-black-and-tan_coonhound/n02089078_1867.jpg
black-and-tan_coonhound
n02089078-black-and-tan_coonhound/n02089078_188.jpg
black-and-tan_coonhound
n02089078-black-and-tan_coonhound/n02089078_19.jpg
black-and-tan_coonhound
n02089078-black-and-tan_coonhound/n02089078_192.jpg
black-and-tan_coonhound
n02089078-black-and-tan_coonhound/n02089078_1951.jpg
black-and-tan_coonhound
n02089078-black-and-tan_coonhound/n02089078_1962.jpg
black-and-tan_coonhound
n02089078-black-and-tan_coonhound/n02089078_1967.jpg
black-and-tan_coonhound
n02089078-black-and-tan_coonhound/n02089078_2021.jpg
black-and-tan_coonhound
n02089078-black-and-tan_coonhound/n02089078_2025.jpg
black-and-tan_coonhound
n02089078-black-and-tan_coonhound/n02089078_2032.jpg
black-and-tan_coonhound
n02089078-black-and-tan_coonhound/n02089078_2106.jpg
black-and-tan_coonhound
n02089078-black-and-tan_coonhound/n02089078_2108.jpg
black-and-tan_coonhound
n02089078-black-and-tan_coonhound/n02089078_2110.jpg
black-and-tan_coonhound
n02089078-black-and-tan_coonhound/n02089078_2174.jpg
black-and-tan_coonhound
n02089078-black-and-tan_coonhound/n02089078_222.jpg
black-and-tan_coonhound
n02089078-black-and-tan_coonhound/n02089078_2225.jpg
black-and-tan_coonhound
n02089078-black-and-tan_coonhound/n02089078_2227.jpg
black-and-tan_coonhound
n02089078-black-and-tan_coonhound/n02089078_2232.jpg
black-and-tan_coonhound
n02089078-black-and-tan_coonhound/n02089078_2270.jpg
black-and-tan_coonhound
n02089078-black-and-tan_coonhound/n02089078_2282.jpg
black-and-tan_coonhound
n02089078-black-and-tan_coonhound/n02089078_2294.jpg
black-and-tan_coonhound
n02089078-black-and-tan_coonhound/n02089078_2333.jpg
black-and-tan_coonhound
n02089078-black-and-tan_coonhound/n02089078_2337.jpg
black-and-tan_coonhound
n02089078-black-and-tan_coonhound/n02089078_237.jpg
black-and-tan_coonhound
n02089078-black-and-tan_coonhound/n02089078_2404.jpg
black-and-tan_coonhound
n02089078-black-and-tan_coonhound/n02089078_2417.jpg
black-and-tan_coonhound
n02089078-black-and-tan_coonhound/n02089078_243.jpg
black-and-tan_coonhound
n02089078-black-and-tan_coonhound/n02089078_2464.jpg
black-and-tan_coonhound
n02089078-black-and-tan_coonhound/n02089078_2476.jpg
black-and-tan_coonhound
n02089078-black-and-tan_coonhound/n02089078_2478.jpg
black-and-tan_coonhound
n02089078-black-and-tan_coonhound/n02089078_2498.jpg
black-and-tan_coonhound
n02089078-black-and-tan_coonhound/n02089078_2542.jpg
black-and-tan_coonhound
n02089078-black-and-tan_coonhound/n02089078_256.jpg
black-and-tan_coonhound
n02089078-black-and-tan_coonhound/n02089078_2574.jpg
black-and-tan_coonhound
n02089078-black-and-tan_coonhound/n02089078_2596.jpg
black-and-tan_coonhound
n02089078-black-and-tan_coonhound/n02089078_2684.jpg
black-and-tan_coonhound
n02089078-black-and-tan_coonhound/n02089078_269.jpg
black-and-tan_coonhound
n02089078-black-and-tan_coonhound/n02089078_2728.jpg
black-and-tan_coonhound
n02089078-black-and-tan_coonhound/n02089078_2753.jpg
black-and-tan_coonhound
n02089078-black-and-tan_coonhound/n02089078_2794.jpg
black-and-tan_coonhound
n02089078-black-and-tan_coonhound/n02089078_2801.jpg
black-and-tan_coonhound
n02089078-black-and-tan_coonhound/n02089078_2829.jpg
black-and-tan_coonhound
n02089078-black-and-tan_coonhound/n02089078_2841.jpg
black-and-tan_coonhound
n02089078-black-and-tan_coonhound/n02089078_2903.jpg
black-and-tan_coonhound
n02089078-black-and-tan_coonhound/n02089078_2921.jpg
black-and-tan_coonhound
n02089078-black-and-tan_coonhound/n02089078_2934.jpg
black-and-tan_coonhound
n02089078-black-and-tan_coonhound/n02089078_2935.jpg
black-and-tan_coonhound
n02089078-black-and-tan_coonhound/n02089078_2957.jpg
black-and-tan_coonhound
n02089078-black-and-tan_coonhound/n02089078_2962.jpg
black-and-tan_coonhound
n02089078-black-and-tan_coonhound/n02089078_302.jpg
black-and-tan_coonhound
n02089078-black-and-tan_coonhound/n02089078_3038.jpg
black-and-tan_coonhound
n02089078-black-and-tan_coonhound/n02089078_3051.jpg
black-and-tan_coonhound
n02089078-black-and-tan_coonhound/n02089078_3078.jpg
black-and-tan_coonhound
n02089078-black-and-tan_coonhound/n02089078_3081.jpg
black-and-tan_coonhound
n02089078-black-and-tan_coonhound/n02089078_3088.jpg
black-and-tan_coonhound
n02089078-black-and-tan_coonhound/n02089078_3134.jpg
black-and-tan_coonhound
n02089078-black-and-tan_coonhound/n02089078_314.jpg
black-and-tan_coonhound
n02089078-black-and-tan_coonhound/n02089078_3182.jpg
black-and-tan_coonhound
n02089078-black-and-tan_coonhound/n02089078_3183.jpg
black-and-tan_coonhound
n02089078-black-and-tan_coonhound/n02089078_3188.jpg
black-and-tan_coonhound
n02089078-black-and-tan_coonhound/n02089078_3191.jpg
black-and-tan_coonhound
n02089078-black-and-tan_coonhound/n02089078_3196.jpg
black-and-tan_coonhound
n02089078-black-and-tan_coonhound/n02089078_324.jpg
black-and-tan_coonhound
n02089078-black-and-tan_coonhound/n02089078_3340.jpg
black-and-tan_coonhound
n02089078-black-and-tan_coonhound/n02089078_3355.jpg
black-and-tan_coonhound
n02089078-black-and-tan_coonhound/n02089078_3412.jpg
black-and-tan_coonhound
n02089078-black-and-tan_coonhound/n02089078_3419.jpg
black-and-tan_coonhound
n02089078-black-and-tan_coonhound/n02089078_3440.jpg
black-and-tan_coonhound
n02089078-black-and-tan_coonhound/n02089078_3443.jpg
black-and-tan_coonhound
n02089078-black-and-tan_coonhound/n02089078_351.jpg
black-and-tan_coonhound
n02089078-black-and-tan_coonhound/n02089078_353.jpg
black-and-tan_coonhound
n02089078-black-and-tan_coonhound/n02089078_3532.jpg
black-and-tan_coonhound
n02089078-black-and-tan_coonhound/n02089078_3543.jpg
black-and-tan_coonhound
n02089078-black-and-tan_coonhound/n02089078_3613.jpg
black-and-tan_coonhound
n02089078-black-and-tan_coonhound/n02089078_3615.jpg
black-and-tan_coonhound
n02089078-black-and-tan_coonhound/n02089078_3648.jpg
black-and-tan_coonhound
n02089078-black-and-tan_coonhound/n02089078_3652.jpg
black-and-tan_coonhound
n02089078-black-and-tan_coonhound/n02089078_3681.jpg
black-and-tan_coonhound
n02089078-black-and-tan_coonhound/n02089078_3759.jpg
black-and-tan_coonhound
n02089078-black-and-tan_coonhound/n02089078_376.jpg
black-and-tan_coonhound
n02089078-black-and-tan_coonhound/n02089078_3760.jpg
black-and-tan_coonhound
n02089078-black-and-tan_coonhound/n02089078_3807.jpg
black-and-tan_coonhound
n02089078-black-and-tan_coonhound/n02089078_3810.jpg
black-and-tan_coonhound
n02089078-black-and-tan_coonhound/n02089078_3821.jpg
black-and-tan_coonhound
n02089078-black-and-tan_coonhound/n02089078_3869.jpg
black-and-tan_coonhound
n02089078-black-and-tan_coonhound/n02089078_3893.jpg
black-and-tan_coonhound
n02089078-black-and-tan_coonhound/n02089078_3914.jpg
black-and-tan_coonhound
n02089078-black-and-tan_coonhound/n02089078_3923.jpg
black-and-tan_coonhound
n02089078-black-and-tan_coonhound/n02089078_393.jpg
black-and-tan_coonhound
n02089078-black-and-tan_coonhound/n02089078_3932.jpg
black-and-tan_coonhound
n02089078-black-and-tan_coonhound/n02089078_4.jpg
black-and-tan_coonhound
n02089078-black-and-tan_coonhound/n02089078_4024.jpg
black-and-tan_coonhound
n02089078-black-and-tan_coonhound/n02089078_404.jpg
black-and-tan_coonhound
n02089078-black-and-tan_coonhound/n02089078_4065.jpg
black-and-tan_coonhound
n02089078-black-and-tan_coonhound/n02089078_4098.jpg
black-and-tan_coonhound
n02089078-black-and-tan_coonhound/n02089078_4140.jpg
black-and-tan_coonhound
n02089078-black-and-tan_coonhound/n02089078_4186.jpg
black-and-tan_coonhound
n02089078-black-and-tan_coonhound/n02089078_4312.jpg
black-and-tan_coonhound
n02089078-black-and-tan_coonhound/n02089078_4331.jpg
black-and-tan_coonhound
n02089078-black-and-tan_coonhound/n02089078_4362.jpg
black-and-tan_coonhound
n02089078-black-and-tan_coonhound/n02089078_4422.jpg
black-and-tan_coonhound
n02089078-black-and-tan_coonhound/n02089078_4441.jpg
black-and-tan_coonhound
n02089078-black-and-tan_coonhound/n02089078_445.jpg
black-and-tan_coonhound
n02089078-black-and-tan_coonhound/n02089078_4466.jpg
black-and-tan_coonhound
n02089078-black-and-tan_coonhound/n02089078_4497.jpg
black-and-tan_coonhound
n02089078-black-and-tan_coonhound/n02089078_45.jpg
black-and-tan_coonhound
n02089078-black-and-tan_coonhound/n02089078_4508.jpg
black-and-tan_coonhound
n02089078-black-and-tan_coonhound/n02089078_4544.jpg
black-and-tan_coonhound
n02089078-black-and-tan_coonhound/n02089078_456.jpg
black-and-tan_coonhound
n02089078-black-and-tan_coonhound/n02089078_461.jpg
black-and-tan_coonhound
n02089078-black-and-tan_coonhound/n02089078_464.jpg
black-and-tan_coonhound
n02089078-black-and-tan_coonhound/n02089078_465.jpg
black-and-tan_coonhound
n02089078-black-and-tan_coonhound/n02089078_482.jpg
black-and-tan_coonhound
n02089078-black-and-tan_coonhound/n02089078_502.jpg
black-and-tan_coonhound
n02089078-black-and-tan_coonhound/n02089078_52.jpg
black-and-tan_coonhound
n02089078-black-and-tan_coonhound/n02089078_641.jpg
black-and-tan_coonhound
n02089078-black-and-tan_coonhound/n02089078_663.jpg
black-and-tan_coonhound
n02089078-black-and-tan_coonhound/n02089078_682.jpg
black-and-tan_coonhound
n02089078-black-and-tan_coonhound/n02089078_695.jpg
black-and-tan_coonhound
n02089078-black-and-tan_coonhound/n02089078_715.jpg
black-and-tan_coonhound
n02089078-black-and-tan_coonhound/n02089078_727.jpg
black-and-tan_coonhound
n02089078-black-and-tan_coonhound/n02089078_779.jpg
black-and-tan_coonhound
n02089078-black-and-tan_coonhound/n02089078_782.jpg
black-and-tan_coonhound
n02089078-black-and-tan_coonhound/n02089078_80.jpg
black-and-tan_coonhound
n02089078-black-and-tan_coonhound/n02089078_825.jpg
black-and-tan_coonhound
n02089078-black-and-tan_coonhound/n02089078_877.jpg
black-and-tan_coonhound
n02089078-black-and-tan_coonhound/n02089078_901.jpg
black-and-tan_coonhound
n02089078-black-and-tan_coonhound/n02089078_933.jpg
black-and-tan_coonhound
n02089078-black-and-tan_coonhound/n02089078_944.jpg
black-and-tan_coonhound
n02089078-black-and-tan_coonhound/n02089078_965.jpg
black-and-tan_coonhound

In [8]:
stanford_breed_list = set([x.replace('-', ' ').replace('_',' ').lower() for x in stanford_breed])

In [9]:
stanford_breed_list


Out[9]:
{u'affenpinscher',
 u'afghan hound',
 u'african hunting dog',
 u'airedale',
 u'american staffordshire terrier',
 u'appenzeller',
 u'australian terrier',
 u'basenji',
 u'basset',
 u'beagle',
 u'bedlington terrier',
 u'bernese mountain dog',
 u'black and tan coonhound',
 u'blenheim spaniel',
 u'bloodhound',
 u'bluetick',
 u'border collie',
 u'border terrier',
 u'borzoi',
 u'boston bull',
 u'bouvier des flandres',
 u'boxer',
 u'brabancon griffon',
 u'briard',
 u'brittany spaniel',
 u'bull mastiff',
 u'cairn',
 u'cardigan',
 u'chesapeake bay retriever',
 u'chihuahua',
 u'chow',
 u'clumber',
 u'cocker spaniel',
 u'collie',
 u'curly coated retriever',
 u'dandie dinmont',
 u'dhole',
 u'dingo',
 u'doberman',
 u'english foxhound',
 u'english setter',
 u'english springer',
 u'entlebucher',
 u'eskimo dog',
 u'flat coated retriever',
 u'french bulldog',
 u'german shepherd',
 u'german short haired pointer',
 u'giant schnauzer',
 u'golden retriever',
 u'gordon setter',
 u'great dane',
 u'great pyrenees',
 u'greater swiss mountain dog',
 u'groenendael',
 u'ibizan hound',
 u'irish setter',
 u'irish terrier',
 u'irish water spaniel',
 u'irish wolfhound',
 u'italian greyhound',
 u'japanese spaniel',
 u'keeshond',
 u'kelpie',
 u'kerry blue terrier',
 u'komondor',
 u'kuvasz',
 u'labrador retriever',
 u'lakeland terrier',
 u'leonberg',
 u'lhasa',
 u'malamute',
 u'malinois',
 u'maltese dog',
 u'mexican hairless',
 u'miniature pinscher',
 u'miniature poodle',
 u'miniature schnauzer',
 u'newfoundland',
 u'norfolk terrier',
 u'norwegian elkhound',
 u'norwich terrier',
 u'old english sheepdog',
 u'otterhound',
 u'papillon',
 u'pekinese',
 u'pembroke',
 u'pomeranian',
 u'pug',
 u'redbone',
 u'rhodesian ridgeback',
 u'rottweiler',
 u'saint bernard',
 u'saluki',
 u'samoyed',
 u'schipperke',
 u'scotch terrier',
 u'scottish deerhound',
 u'sealyham terrier',
 u'shetland sheepdog',
 u'shih tzu',
 u'siberian husky',
 u'silky terrier',
 u'soft coated wheaten terrier',
 u'staffordshire bullterrier',
 u'standard poodle',
 u'standard schnauzer',
 u'sussex spaniel',
 u'tibetan mastiff',
 u'tibetan terrier',
 u'toy poodle',
 u'toy terrier',
 u'vizsla',
 u'walker hound',
 u'weimaraner',
 u'welsh springer spaniel',
 u'west highland white terrier',
 u'whippet',
 u'wire haired fox terrier',
 u'yorkshire terrier'}

In [10]:
#2.1. Naturally there is no intersection....
stanford_breed_list & set(df['Breed'].str.lower().values)


Out[10]:
{u'great pyrenees', u'shih tzu'}

In [11]:
'bulldog' in stanford_breed_list


Out[11]:
False

In [12]:
# from ox database 
df_ox = pd.read_csv(data_path+"ox/ox_annotations/list.txt",skiprows=6,delimiter=" ", names=["image_ID", "class","species", "breed"])

In [13]:
df_ox_dogs = df_ox[df_ox['species']==2]
df_ox_dogs.head()


Out[13]:
image_ID class species breed
50 american_bulldog_100 2 2 1
51 american_bulldog_101 2 2 1
52 american_bulldog_102 2 2 1
53 american_bulldog_103 2 2 1
54 american_bulldog_104 2 2 1

In [14]:
df_ox_dogs.shape


Out[14]:
(4978, 4)

In [15]:
def remove_digit(s):
    temp = ''.join([x if not x.isdigit() else '' for x in s])
    return temp.replace('_', ' ').strip()
df_ox_dogs['breedname'] = df_ox_dogs['image_ID'].apply(lambda row:remove_digit(row))


/Users/zhouyu/Documents/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:4: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy

In [16]:
df_ox_dogs['breedname'].head()


Out[16]:
50    american bulldog
51    american bulldog
52    american bulldog
53    american bulldog
54    american bulldog
Name: breedname, dtype: object

In [17]:
df_ox_dogs['breedname'].value_counts()


Out[17]:
miniature pinscher            200
german shorthaired            200
chihuahua                     200
beagle                        200
great pyrenees                200
american bulldog              200
samoyed                       200
pomeranian                    200
pug                           200
leonberger                    200
havanese                      200
yorkshire terrier             200
japanese chin                 200
american pit bull terrier     200
english setter                200
basset hound                  200
shiba inu                     200
wheaten terrier               200
saint bernard                 200
keeshond                      199
scottish terrier              199
boxer                         199
english cocker spaniel        196
newfoundland                  196
staffordshire bull terrier    189
Name: breedname, dtype: int64

In [18]:
ox_breed_list = set(df_ox_dogs['breedname'].values)

In [19]:
list_intersect = ox_breed_list & stanford_breed_list

In [20]:
ox_breed_list - list_intersect


Out[20]:
{'american bulldog',
 'american pit bull terrier',
 'basset hound',
 'english cocker spaniel',
 'german shorthaired',
 'havanese',
 'japanese chin',
 'leonberger',
 'scottish terrier',
 'shiba inu',
 'staffordshire bull terrier',
 'wheaten terrier'}

In [21]:
stanford_breed_list - list_intersect


Out[21]:
{u'affenpinscher',
 u'afghan hound',
 u'african hunting dog',
 u'airedale',
 u'american staffordshire terrier',
 u'appenzeller',
 u'australian terrier',
 u'basenji',
 u'basset',
 u'bedlington terrier',
 u'bernese mountain dog',
 u'black and tan coonhound',
 u'blenheim spaniel',
 u'bloodhound',
 u'bluetick',
 u'border collie',
 u'border terrier',
 u'borzoi',
 u'boston bull',
 u'bouvier des flandres',
 u'brabancon griffon',
 u'briard',
 u'brittany spaniel',
 u'bull mastiff',
 u'cairn',
 u'cardigan',
 u'chesapeake bay retriever',
 u'chow',
 u'clumber',
 u'cocker spaniel',
 u'collie',
 u'curly coated retriever',
 u'dandie dinmont',
 u'dhole',
 u'dingo',
 u'doberman',
 u'english foxhound',
 u'english springer',
 u'entlebucher',
 u'eskimo dog',
 u'flat coated retriever',
 u'french bulldog',
 u'german shepherd',
 u'german short haired pointer',
 u'giant schnauzer',
 u'golden retriever',
 u'gordon setter',
 u'great dane',
 u'greater swiss mountain dog',
 u'groenendael',
 u'ibizan hound',
 u'irish setter',
 u'irish terrier',
 u'irish water spaniel',
 u'irish wolfhound',
 u'italian greyhound',
 u'japanese spaniel',
 u'kelpie',
 u'kerry blue terrier',
 u'komondor',
 u'kuvasz',
 u'labrador retriever',
 u'lakeland terrier',
 u'leonberg',
 u'lhasa',
 u'malamute',
 u'malinois',
 u'maltese dog',
 u'mexican hairless',
 u'miniature poodle',
 u'miniature schnauzer',
 u'norfolk terrier',
 u'norwegian elkhound',
 u'norwich terrier',
 u'old english sheepdog',
 u'otterhound',
 u'papillon',
 u'pekinese',
 u'pembroke',
 u'redbone',
 u'rhodesian ridgeback',
 u'rottweiler',
 u'saluki',
 u'schipperke',
 u'scotch terrier',
 u'scottish deerhound',
 u'sealyham terrier',
 u'shetland sheepdog',
 u'shih tzu',
 u'siberian husky',
 u'silky terrier',
 u'soft coated wheaten terrier',
 u'staffordshire bullterrier',
 u'standard poodle',
 u'standard schnauzer',
 u'sussex spaniel',
 u'tibetan mastiff',
 u'tibetan terrier',
 u'toy poodle',
 u'toy terrier',
 u'vizsla',
 u'walker hound',
 u'weimaraner',
 u'welsh springer spaniel',
 u'west highland white terrier',
 u'whippet',
 u'wire haired fox terrier'}

2. Condense to a breed list


In [22]:
top100 = df['Breed'].str.lower().values[:100]

In [23]:
def clean_top100(lst):
    res = [word.replace('(','') for word in lst]
    res = [word.replace(')','') for word in res]
    res = [re.sub(r'[^\x00-\x7F]+',' ', word) for word in res]
    return res

In [24]:
top100 = clean_top100(top100)
top100[:10]


Out[24]:
['retrievers labrador',
 'german shepherd dogs',
 'retrievers golden',
 'bulldogs',
 'beagles',
 'french bulldogs',
 'poodles',
 'rottweilers',
 'yorkshire terriers',
 'boxers']

In [25]:
# get popular score
def get_top_position(word, top100):
    score = np.array([breed_dist(word, topb) for topb in top100])
    top_ind = np.argmax(score)
    return (top100[top_ind], score[top_ind], top_ind)
def breed_dist(b1, b2):
    br1 = snowball.stem(b1).strip().split()
    br2 = snowball.stem(b2).strip().split()
    return len(set(br1)&set(br2))*1.0/len(set(br1)|set(br2))

In [26]:
count = 0 
ox_image_list = set()
for ox_breed in ox_breed_list:
    _, score, rank = get_top_position(ox_breed,top100)
    if rank < 70 and score >0.4:
        print ox_breed, get_top_position(ox_breed, top100)
        ox_image_list.add(ox_breed)
        count += 1
print "total top breed in stanford database only", count


yorkshire terrier ('yorkshire terriers', 1.0, 8)
wheaten terrier ('soft coated wheaten terriers', 0.5, 49)
chihuahua ('chihuahuas', 1.0, 29)
samoyed ('samoyeds', 1.0, 64)
beagle ('beagles', 1.0, 4)
great pyrenees ('great pyrenees', 1.0, 66)
american pit bull terrier ('bull terriers', 0.5, 56)
pomeranian ('pomeranians', 1.0, 21)
pug ('pugs', 1.0, 31)
havanese ('havanese', 1.0, 22)
miniature pinscher ('miniature pinschers', 1.0, 67)
american bulldog ('bulldogs', 0.5, 3)
german shorthaired ('pointers german shorthaired', 0.66666666666666663, 10)
scottish terrier ('scottish terriers', 1.0, 57)
boxer ('boxers', 1.0, 9)
basset hound ('basset hounds', 1.0, 38)
shiba inu ('shiba inu', 1.0, 43)
newfoundland ('newfoundlands', 1.0, 34)
total top breed in stanford database only 18

In [27]:
stan_only_breed = stanford_breed_list - ox_breed_list
stan_image_list = set()
count = 0 
for stan_breed in stan_only_breed:
    _, score, rank = get_top_position(stan_breed,top100)
    if rank < 70 and score >0.4:
        print stan_breed, get_top_position(stan_breed, top100)
        stan_image_list.add(stan_breed)
        count += 1
print "total top breed in stanford database only", count


rottweiler ('rottweilers', 1.0, 7)
bull mastiff ('mastiffs', 0.5, 27)
standard poodle ('poodles', 0.5, 6)
papillon ('papillons', 1.0, 52)
weimaraner ('weimaraners', 1.0, 33)
basset ('basset hounds', 0.5, 38)
border collie ('border collies', 1.0, 37)
toy poodle ('poodles', 0.5, 6)
malamute ('alaskan malamutes', 0.5, 58)
french bulldog ('french bulldogs', 1.0, 5)
shetland sheepdog ('shetland sheepdogs', 1.0, 23)
cairn ('cairn terriers', 0.5, 69)
whippet ('whippets', 1.0, 59)
malinois ('belgian malinois', 0.5, 46)
bernese mountain dog ('bernese mountain dogs', 1.0, 26)
bloodhound ('bloodhounds', 1.0, 51)
miniature poodle ('poodles', 0.5, 6)
shih tzu ('shih tzu', 1.0, 19)
rhodesian ridgeback ('rhodesian ridgebacks', 1.0, 41)
west highland white terrier ('west highland white terriers', 1.0, 40)
siberian husky ('siberian huskies', 1.0, 11)
great dane ('great danes', 1.0, 13)
collie ('collies', 1.0, 36)
doberman ('doberman pinschers', 0.5, 14)
chesapeake bay retriever ('retrievers chesapeake bay', 0.5, 42)
vizsla ('vizslas', 1.0, 30)
miniature schnauzer ('miniature schnauzers', 1.0, 16)
english springer ('spaniels english springer', 0.66666666666666663, 25)
soft coated wheaten terrier ('soft coated wheaten terriers', 1.0, 49)
tibetan mastiff ('mastiffs', 0.5, 27)
german shepherd ('german shepherd dogs', 0.66666666666666663, 1)
total top breed in stanford database only 31

3. Retrive pictures for a breed


In [28]:
ox_img_path = '/Users/zhouyu/Documents/Zhou_Yu/DS/Galvanize/Capstone_data/ox/ox_images/'

In [29]:
ox_file_names = os.listdir(ox_img_path)

In [30]:
def find_ox_file_name(ox_breed, ox_file_names):
    file_list = []
    for fi in ox_file_names:
        temp = remove_digit(fi.lower()).strip('.jpg').strip()
        if temp == ox_breed and fi.endswith('.jpg'):
            file_list.append(fi)
    return file_list

In [31]:
ox_list = list(ox_image_list)
test = find_ox_file_name(ox_list[11], ox_file_names)
test_subset = [ox_img_path+x for x in test[:4]]

In [32]:
print test_subset


['/Users/zhouyu/Documents/Zhou_Yu/DS/Galvanize/Capstone_data/ox/ox_images/miniature_pinscher_1.jpg', '/Users/zhouyu/Documents/Zhou_Yu/DS/Galvanize/Capstone_data/ox/ox_images/miniature_pinscher_10.jpg', '/Users/zhouyu/Documents/Zhou_Yu/DS/Galvanize/Capstone_data/ox/ox_images/miniature_pinscher_100.jpg', '/Users/zhouyu/Documents/Zhou_Yu/DS/Galvanize/Capstone_data/ox/ox_images/miniature_pinscher_101.jpg']

In [6]:
# use tensorflow to load and process pictures
test_subset = ['/Users/zhouyu/Documents/Zhou_Yu/DS/Galvanize/Capstone_data/ox/ox_images/miniature_pinscher_1.jpg', 
               '/Users/zhouyu/Documents/Zhou_Yu/DS/Galvanize/Capstone_data/ox/ox_images/miniature_pinscher_10.jpg', 
               '/Users/zhouyu/Documents/Zhou_Yu/DS/Galvanize/Capstone_data/ox/ox_images/miniature_pinscher_100.jpg', 
               '/Users/zhouyu/Documents/Zhou_Yu/DS/Galvanize/Capstone_data/ox/ox_images/miniature_pinscher_101.jpg']

In [9]:
# test code from https://stackoverflow.com/questions/33648322/tensorflow-image-reading-display
# read single random image
one_file_queue = tf.train.string_input_producer(test_subset[:4])
reader = tf.WholeFileReader()
key, value = reader.read(one_file_queue)

my_img = tf.image.decode_jpeg(value) # use png or jpg decoder based on your files.
resized = tf.image.resize_images(my_img,[300,300],method=1)
init_op = tf.global_variables_initializer()
proc_images_to_show = []
ori_images_to_show = []
with tf.Session() as sess:
    sess.run(init_op)

    # Start populating the filename queue.

    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(coord=coord)
    for i in range(4): #length of your filename lis
        ori_images_to_show.append(my_img.eval())
        #image = resized.eval() #here is your image Tensor :) 
        proc_images_to_show.append(resized.eval())
    coord.request_stop()
    coord.join(threads)

In [10]:
fig, axes = plt.subplots(2,4, figsize=(10,5))
for img, ax in zip(proc_images_to_show, axes[0].flatten()):
    print img.shape
    ax.imshow(img)
    ax.axis('off')
for img, ax in zip(ori_images_to_show, axes[1].flatten()):
    print img.shape
    ax.imshow(img)
    ax.axis('off')
plt.show()


(300, 300, 3)
(300, 300, 3)
(300, 300, 3)
(300, 300, 3)
(375, 500, 3)
(375, 500, 3)
(375, 500, 3)
(492, 500, 3)

In [19]:
# try another code from to batch readin https://zhuanlan.zhihu.com/p/27481108
def read_img(filenames, num_epochs, shuffle=True):
    filename_queue = tf.train.string_input_producer(filenames,    
                              num_epochs=num_epochs, shuffle=True)

    reader = tf.WholeFileReader()
    key, value = reader.read(filename_queue)
    img = tf.image.decode_jpeg(value, channels=3)
    shape = tf.shape(img)
    height = shape[0]
    width = shape[1]
    long_edge = tf.maximum(height, width)
    img_padded = tf.image.resize_image_with_crop_or_pad(img, long_edge, long_edge)
    img = tf.image.resize_images(img_padded, size=(256, 256),
                   method=tf.image.ResizeMethod.NEAREST_NEIGHBOR)

    return img

In [20]:
#create_tfrecord_start_time = time.time()
#convert_to_tfrecord()
#create_tfrecord_duration = time.time() - create_tfrecord_start_time
#print("Create TFrecord Duration:  %.3f" % (create_tfrecord_duration))

with tf.Session() as sess:
    min_after_dequeue = 1000
    capacity = min_after_dequeue + 3*4

    img = read_img(test_subset, 1, True)
    # img = read_tfrecord("training.tfrecords", 1, True)
    img_batch = tf.train.shuffle_batch([img], batch_size=4,   
                                       num_threads=8,
                                       capacity=capacity,
                                   min_after_dequeue=min_after_dequeue)


    init = (tf.global_variables_initializer(),          
            tf.local_variables_initializer())
    sess.run(init)

    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(coord=coord)
    i = 0
    read_tfrecord_start_time = time.time()
    try:
        while not coord.should_stop():
            imgs = sess.run([img_batch])
            for img in imgs:
                print(img.shape)
    except Exception, e:
        coord.request_stop(e)
    finally:
        coord.request_stop()
    coord.join(threads)
    read_tfrecord_duration = time.time() - read_tfrecord_start_time
    print("Read TFrecord Duration:   %.3f" % read_tfrecord_duration)


(4, 256, 256, 3)
Read TFrecord Duration:   0.252

In [21]:
fig, axes = plt.subplots(2,2, figsize=(10,5))
for im, ax in zip(img, axes.flatten()):
    print im.shape
    ax.imshow(im)
    ax.axis('off')
plt.show()


(256, 256, 3)
(256, 256, 3)
(256, 256, 3)
(256, 256, 3)

In [ ]: