In [4]:
import pandas as pd
import matplotlib.pyplot as plt
import scipy.io
import re
from collections import defaultdict
from nltk.stem.snowball import SnowballStemmer
import numpy as np
import tensorflow as tf
import os
import numpy as np
import time
%matplotlib inline
# Shared English Snowball stemmer instance; breed_dist() calls snowball.stem().
snowball = SnowballStemmer('english')
In [5]:
# Root folder holding every dataset used in this notebook. "~" is expanded
# once here so the value can also be handed to APIs that take the path
# literally.
data_path = os.path.expanduser("~/Documents/Zhou_Yu/DS/Galvanize/Capstone_data/")
# Stanford file index, derived from data_path instead of a second hard-coded
# absolute path that only exists on one machine.
data_path_stan = data_path + "stanford/file_list.mat"
In [3]:
# 1. Breed table from the website (top_dog_breed.csv under data_path)
top_breed_csv = data_path + "top_dog_breed.csv"
df = pd.read_csv(top_breed_csv)
In [4]:
# Preview the first rows of the website breed table.
df.head()
Out[4]:
In [5]:
# 2. Stanford file index (.mat). Uses data_path_stan from the path
# configuration cell instead of repeating the same path literal here.
df_stanford = scipy.io.loadmat(data_path_stan)
In [6]:
# clean out the breed name
def getBreedName_stanford(file_path, debug_token='black'):
    """Collect the distinct breed names listed in a Stanford ``file_list`` .mat file.

    The ``file_list`` variable of the .mat file is iterated entry by entry;
    each entry is indexed as ``label[0][0]`` to obtain a path string, and the
    breed name is whatever the pattern ``\\d+-(\\S*)/`` captures from it.

    Parameters
    ----------
    file_path : str
        Path of the .mat file, loaded with ``scipy.io.loadmat``.
    debug_token : str or None, optional
        Entries whose path contains this substring are echoed together with
        the extracted name (default ``'black'``, as in the original
        exploration). Pass ``None`` to silence the output.

    Returns
    -------
    set
        Breed names exactly as spelled in the paths. Entries that do not match
        the pattern are skipped instead of raising ``IndexError``.
    """
    # Raw string: '\d' and '\S' in a normal literal are invalid escapes on Python 3.
    breed_pattern = re.compile(r'\d+-(\S*)/')
    stanford_breed = set()
    for label in scipy.io.loadmat(file_path)['file_list']:
        entry = label[0][0]
        found = breed_pattern.findall(entry)
        if not found:
            continue
        if debug_token is not None and debug_token in entry:
            print(entry)
            print(found[0])
        stanford_breed.add(found[0])
    return stanford_breed
In [7]:
# Distinct breed names found in the Stanford file index.
stanford_breed = getBreedName_stanford(data_path_stan)  # total 120
In [8]:
# Normalise the Stanford names: '-' and '_' become spaces, then lower-case.
stanford_breed_list = {
    name.replace('-', ' ').replace('_', ' ').lower()
    for name in stanford_breed
}
In [9]:
# Display the normalised Stanford breed names.
stanford_breed_list
Out[9]:
In [10]:
# 2.1 Exact-name overlap between the Stanford names and the lower-cased
# website breed names (the original note expects no intersection).
website_breeds = set(df['Breed'].str.lower().values)
stanford_breed_list.intersection(website_breeds)
Out[10]:
In [11]:
# Spot check: is the plain name 'bulldog' among the normalised Stanford names?
'bulldog' in stanford_breed_list
Out[11]:
In [12]:
# 3. Oxford annotation list: space-delimited, first 6 lines skipped,
# columns named explicitly.
ox_annotation_file = data_path + "ox/ox_annotations/list.txt"
ox_annotation_columns = ["image_ID", "class", "species", "breed"]
df_ox = pd.read_csv(
    ox_annotation_file,
    skiprows=6,
    delimiter=" ",
    names=ox_annotation_columns,
)
In [13]:
# Keep only the rows with species == 2 (the dog rows, going by the variable
# name). The explicit .copy() makes df_ox_dogs an independent frame, so the
# column added to it later is not written onto a slice of df_ox
# (pandas' SettingWithCopyWarning).
df_ox_dogs = df_ox[df_ox['species'] == 2].copy()
df_ox_dogs.head()
Out[13]:
In [14]:
# Row/column count of the species == 2 subset.
df_ox_dogs.shape
Out[14]:
In [15]:
def remove_digit(s):
    """Drop every digit from ``s``, turn underscores into spaces and trim both ends."""
    without_digits = ''.join(ch for ch in s if not ch.isdigit())
    spaced = without_digits.replace('_', ' ')
    return spaced.strip()
# Derive a readable breed name from each image id. assign() returns a new
# frame, so the column is never written onto a filtered slice of df_ox
# (which is what triggers pandas' SettingWithCopyWarning). remove_digit is
# passed directly; the wrapping lambda added nothing.
df_ox_dogs = df_ox_dogs.assign(breedname=df_ox_dogs['image_ID'].apply(remove_digit))
In [16]:
# Preview the derived breed names.
df_ox_dogs['breedname'].head()
Out[16]:
In [17]:
# Number of annotation rows per derived breed name.
df_ox_dogs['breedname'].value_counts()
Out[17]:
In [18]:
# Distinct breed names present in the Oxford subset.
ox_breed_list = set(df_ox_dogs['breedname'].tolist())
In [19]:
# Breed names that appear in both the Oxford and the Stanford sets.
list_intersect = ox_breed_list.intersection(stanford_breed_list)
In [20]:
# Oxford names with no exact counterpart in the Stanford set.
ox_breed_list.difference(list_intersect)
Out[20]:
In [21]:
# Stanford names with no exact counterpart in the Oxford set.
stanford_breed_list.difference(list_intersect)
Out[21]:
In [22]:
# Lower-cased 'Breed' values of the first 100 rows of the website table.
top100 = df['Breed'].head(100).str.lower().values
In [23]:
def clean_top100(lst):
    """Return a new list of the names in ``lst`` with '(' and ')' removed and
    every run of non-ASCII characters replaced by a single space."""
    non_ascii_run = re.compile(r'[^\x00-\x7F]+')
    return [
        non_ascii_run.sub(' ', word.replace('(', '').replace(')', ''))
        for word in lst
    ]
In [24]:
# Clean the top-100 names and preview the first ten.
top100 = clean_top100(top100)
top100[:10]
Out[24]:
In [25]:
# get popular score
def get_top_position(word, top100):
    """Find the entry of ``top100`` that scores highest against ``word``.

    Every candidate is scored with ``breed_dist(word, candidate)`` and the
    position of the maximum is taken with ``np.argmax``.

    Returns
    -------
    tuple
        ``(best_entry, best_score, best_index)``.
    """
    candidate_scores = np.array(
        [breed_dist(word, candidate) for candidate in top100]
    )
    best_index = np.argmax(candidate_scores)
    best_entry = top100[best_index]
    best_score = candidate_scores[best_index]
    return (best_entry, best_score, best_index)
def breed_dist(b1, b2):
    """Jaccard similarity between the stemmed word sets of two breed names.

    Each name is split on whitespace and every word is stemmed on its own with
    the module-level ``snowball`` stemmer. Stemming the whole phrase in one
    call (as before) treats it as a single word, so only its ending was
    normalised and e.g. 'retrievers labrador' could not fully match
    'labrador retriever'.

    Parameters
    ----------
    b1, b2 : str
        Breed names to compare.

    Returns
    -------
    float
        ``len(intersection) / len(union)`` of the two stemmed word sets, or
        ``0.0`` when both names contain no words (avoids ZeroDivisionError).
    """
    words1 = set(snowball.stem(token) for token in b1.split())
    words2 = set(snowball.stem(token) for token in b2.split())
    union = words1 | words2
    if not union:
        return 0.0
    return len(words1 & words2) * 1.0 / len(union)
In [26]:
# Keep the Oxford breeds whose best website match sits at index < 70 of
# top100 with a similarity score above 0.4.
count = 0
ox_image_list = set()
for ox_breed in ox_breed_list:
    # Computed once and reused for both the filter and the printout.
    match = get_top_position(ox_breed, top100)
    _, score, rank = match
    if rank < 70 and score > 0.4:
        print("%s %s" % (ox_breed, match))
        ox_image_list.add(ox_breed)
        count += 1
# This loop walks the Oxford breeds; the previous message said "stanford
# database only", copied from the Stanford cell.
print("total top breed in ox database %d" % count)
In [27]:
# Same filter for the breeds that only the Stanford set has: best website
# match at index < 70 of top100 with a similarity score above 0.4.
stan_only_breed = stanford_breed_list - ox_breed_list
stan_image_list = set()
count = 0
for stan_breed in stan_only_breed:
    # Computed once and reused for both the filter and the printout.
    match = get_top_position(stan_breed, top100)
    _, score, rank = match
    if rank < 70 and score > 0.4:
        print("%s %s" % (stan_breed, match))
        stan_image_list.add(stan_breed)
        count += 1
print("total top breed in stanford database only %d" % count)
In [28]:
# Oxford image folder, derived from data_path instead of a hard-coded
# absolute user path. expanduser() resolves a leading "~" because the value
# is passed to os.listdir below.
ox_img_path = os.path.expanduser(data_path) + 'ox/ox_images/'
In [29]:
# os.listdir returns entries in arbitrary order; sorting makes the later
# "first four files" selection reproducible across machines and runs.
ox_file_names = sorted(os.listdir(ox_img_path))
In [30]:
def find_ox_file_name(ox_breed, ox_file_names):
    """Return the ``.jpg`` entries of ``ox_file_names`` that belong to ``ox_breed``.

    A file belongs to a breed when its lower-cased base name (extension
    removed), passed through ``remove_digit``, equals ``ox_breed``.

    The extension is removed with ``os.path.splitext``. The previous
    ``.strip('.jpg')`` treated its argument as a set of characters and also
    stripped leading 'j', 'p', 'g' and '.', so breeds such as 'pug' or
    'japanese chin' never matched any file.

    Parameters
    ----------
    ox_breed : str
        Breed name in the form produced by ``remove_digit``.
    ox_file_names : iterable of str
        Candidate file names (no directory part).

    Returns
    -------
    list
        Matching file names, in input order.
    """
    file_list = []
    for fi in ox_file_names:
        base, ext = os.path.splitext(fi)
        if ext != '.jpg':
            continue
        if remove_digit(base.lower()) == ox_breed:
            file_list.append(fi)
    return file_list
In [31]:
# Take the breed stored at index 11 of the selected Oxford breeds and build
# the full paths of its first four image files.
ox_list = list(ox_image_list)
picked_breed = ox_list[11]
test = find_ox_file_name(picked_breed, ox_file_names)
test_subset = []
for file_name in test[:4]:
    test_subset.append(ox_img_path + file_name)
In [32]:
# Function-call form of print: identical output for a single argument, and
# valid syntax on both Python 2 and Python 3.
print(test_subset)
In [6]:
# use tensorflow to load and process pictures
# Four sample images, spelled out so this cell does not depend on the
# earlier selection cells.
_sample_dir = '/Users/zhouyu/Documents/Zhou_Yu/DS/Galvanize/Capstone_data/ox/ox_images/'
test_subset = [
    _sample_dir + 'miniature_pinscher_%d.jpg' % image_no
    for image_no in (1, 10, 100, 101)
]
In [9]:
# test code from https://stackoverflow.com/questions/33648322/tensorflow-image-reading-display
# Read the sample images through a filename queue and keep, for each one,
# both the decoded original and a 300x300 nearest-neighbour resize.
one_file_queue = tf.train.string_input_producer(test_subset[:4])
reader = tf.WholeFileReader()
key, value = reader.read(one_file_queue)
my_img = tf.image.decode_jpeg(value) # use png or jpg decoder based on your files.
# Named constant instead of the bare integer 1 (same resize method).
resized = tf.image.resize_images(my_img, [300, 300],
                                 method=tf.image.ResizeMethod.NEAREST_NEIGHBOR)
init_op = tf.global_variables_initializer()
proc_images_to_show = []
ori_images_to_show = []
with tf.Session() as sess:
    sess.run(init_op)
    # Start populating the filename queue.
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(coord=coord)
    try:
        for i in range(len(test_subset[:4])):  # one iteration per file name
            # Fetch both tensors in ONE run() call. Evaluating them separately
            # (my_img.eval() then resized.eval()) reads a new file each time,
            # so the "original" and "resized" lists held different images.
            original, processed = sess.run([my_img, resized])
            ori_images_to_show.append(original)
            proc_images_to_show.append(processed)
    finally:
        # Always stop and join the queue threads, even if a read fails.
        coord.request_stop()
        coord.join(threads)
In [10]:
# Top row: resized images; bottom row: the images as decoded.
fig, axes = plt.subplots(2, 4, figsize=(10, 5))
for images, row_axes in zip((proc_images_to_show, ori_images_to_show), axes):
    for img, ax in zip(images, row_axes):
        # print() call form works on both Python 2 and 3 for a single argument.
        print(img.shape)
        ax.imshow(img)
        ax.axis('off')
plt.show()
In [19]:
# Batch-reading approach adapted from https://zhuanlan.zhihu.com/p/27481108
def read_img(filenames, num_epochs, shuffle=True):
    """Build the graph ops that read one JPEG from ``filenames`` and return it
    as a 256x256, 3-channel image tensor.

    The decoded image is cropped/padded to a square whose side is the longer
    of its height and width, then resized to 256x256 with nearest-neighbour
    interpolation.

    Parameters
    ----------
    filenames : list of str
        Image paths fed to ``tf.train.string_input_producer``.
    num_epochs : int or None
        Forwarded to ``tf.train.string_input_producer``.
    shuffle : bool, optional
        Forwarded to ``tf.train.string_input_producer``. Previously this
        argument was ignored and ``True`` was always passed.

    Returns
    -------
    The resized image tensor produced by ``tf.image.resize_images``.
    """
    filename_queue = tf.train.string_input_producer(
        filenames, num_epochs=num_epochs, shuffle=shuffle)
    reader = tf.WholeFileReader()
    key, value = reader.read(filename_queue)
    img = tf.image.decode_jpeg(value, channels=3)
    shape = tf.shape(img)
    height = shape[0]
    width = shape[1]
    long_edge = tf.maximum(height, width)
    img_padded = tf.image.resize_image_with_crop_or_pad(img, long_edge, long_edge)
    img = tf.image.resize_images(img_padded, size=(256, 256),
                                 method=tf.image.ResizeMethod.NEAREST_NEIGHBOR)
    return img
In [20]:
# Read the sample images for one epoch in shuffled batches of four and time it.
with tf.Session() as sess:
    min_after_dequeue = 1000
    capacity = min_after_dequeue + 3*4
    # Kept under its own name so the batch loop below cannot overwrite the
    # tensor handle.
    img_tensor = read_img(test_subset, 1, True)
    img_batch = tf.train.shuffle_batch([img_tensor], batch_size=4,
                                       num_threads=8,
                                       capacity=capacity,
                                       min_after_dequeue=min_after_dequeue)
    # Both global and local variable initializers are run before reading.
    init = (tf.global_variables_initializer(),
            tf.local_variables_initializer())
    sess.run(init)
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(coord=coord)
    read_tfrecord_start_time = time.time()
    try:
        while not coord.should_stop():
            imgs = sess.run([img_batch])
            # NOTE: `img` intentionally stays bound to the last batch after
            # this loop; the plotting cell that follows iterates over it.
            for img in imgs:
                print(img.shape)
    except Exception as e:  # "as" form is valid on Python 2.6+ and Python 3
        # Hand the error to the coordinator rather than swallowing it.
        coord.request_stop(e)
    finally:
        coord.request_stop()
    coord.join(threads)
    read_tfrecord_duration = time.time() - read_tfrecord_start_time
    print("Read TFrecord Duration: %.3f" % read_tfrecord_duration)
In [21]:
# Show the images of the last batch read by the batch-reading cell (`img`),
# one per panel of a 2x2 grid.
fig, axes = plt.subplots(2, 2, figsize=(10, 5))
for im, ax in zip(img, axes.flatten()):
    # print() call form works on both Python 2 and 3 for a single argument.
    print(im.shape)
    ax.imshow(im)
    ax.axis('off')
plt.show()
In [ ]: