In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import scipy.io
import re
from collections import defaultdict
from nltk.stem.snowball import SnowballStemmer
import numpy as np
import tensorflow as tf
import os
import numpy as np
import time
%matplotlib inline
snowball = SnowballStemmer('english')
In [3]:
# Dataset locations on the author's machine.
# data_path is a directory prefix: file names are appended to it by plain
# string concatenation, so the trailing slash is required.
data_path = "~/Documents/Zhou_Yu/DS/Galvanize/Capstone_data/"
# Stanford file list (.mat) that is handed to getBreedName_stanford.
data_path_stan = "/Users/zhouyu/Documents/Zhou_Yu/DS/Galvanize/Capstone_data/stanford/file_list.mat"
# Directory prefix for the breed-name text files written further down.
write_path = "/Users/zhouyu/Documents/Zhou_Yu/DS/Galvanize/Capstone/params/"
In [4]:
# 1. Breed table obtained from a website; its 'Breed' column is used below
#    to build the list of popular breed names.
df = pd.read_csv(data_path+"top_dog_breed.csv")
In [5]:
df.head()
Out[5]:
In [6]:
# 2. Stanford file list (.mat). Load it through the configured
#    data_path_stan instead of repeating the same literal path, so the two
#    cannot drift apart.
df_stanford = scipy.io.loadmat(data_path_stan)
In [7]:
# clean out the breed name
def getBreedName_stanford(file_path):
    """Collect the distinct breed folder names listed in a Stanford file list.

    Parameters
    ----------
    file_path : str
        Path to a MATLAB ``.mat`` file that contains a ``'file_list'`` entry.

    Returns
    -------
    set of str
        For every entry, the text between the ``<digits>-`` prefix and the
        last ``/``; for example ``'Chihuahua'`` for
        ``'n02085620-Chihuahua/n02085620_10074.jpg'``. Entries that do not
        follow this pattern are skipped.
    """
    # Raw string: the previous plain literal relied on invalid str escapes.
    breed_pattern = re.compile(r'\d+-(\S*)/')
    mat_content = scipy.io.loadmat(file_path)
    stanford_breed = set()
    for label in mat_content['file_list']:
        # The path string of each row is reached through label[0][0].
        matches = breed_pattern.findall(label[0][0])
        if matches:
            stanford_breed.add(matches[0])
    return stanford_breed
In [8]:
# Distinct breed folder names found in the Stanford file list.
stanford_breed = getBreedName_stanford(data_path_stan)  # author's note: total 120
In [9]:
# Lookup from the normalised breed name (lower case, '-' and '_' replaced by
# spaces) back to the spelling used in the Stanford data.
stanford_dict = defaultdict(str)
stanford_dict.update(
    (raw_name.replace('-', ' ').replace('_',' ').lower(), raw_name)
    for raw_name in stanford_breed
)
In [10]:
# Normalised Stanford breed names; despite the variable name this is a set.
stanford_breed_list = {
    raw_name.replace('-', ' ').replace('_',' ').lower()
    for raw_name in stanford_breed
}
In [11]:
stanford_breed_list
Out[11]:
In [12]:
stanford_dict
Out[12]:
In [13]:
#2.1. Naturally there is no intersection....
stanford_breed_list & set(df['Breed'].str.lower().values)
Out[13]:
In [14]:
'bulldog' in stanford_breed_list
Out[14]:
In [15]:
# From the Oxford database: a space-delimited annotation list. The first six
# lines are skipped and the four columns are named explicitly.
df_ox = pd.read_csv(
    data_path + "ox/ox_annotations/list.txt",
    skiprows=6,
    delimiter=" ",
    names=["image_ID", "class", "species", "breed"],
)
In [16]:
# Keep only the rows whose species code is 2 (presumably dogs, as the
# variable name suggests -- confirm against the list.txt header).
# .copy() makes this an independent frame, so adding a column to it later
# does not raise pandas' SettingWithCopyWarning.
df_ox_dogs = df_ox[df_ox['species']==2].copy()
df_ox_dogs.head()
Out[16]:
In [17]:
df_ox_dogs.shape
Out[17]:
In [18]:
def remove_digit(s):
    """Return ``s`` with every digit removed, underscores turned into spaces
    and surrounding whitespace stripped."""
    without_digits = ''.join(ch for ch in s if not ch.isdigit())
    return without_digits.replace('_', ' ').strip()
# Derive the breed name from the image id. assign() builds a new frame, so
# this never writes into a slice of df_ox (no SettingWithCopyWarning) and the
# cell gives the same result however often it is run.
df_ox_dogs = df_ox_dogs.assign(breedname=df_ox_dogs['image_ID'].apply(remove_digit))
In [19]:
df_ox_dogs['breedname'].head()
Out[19]:
In [21]:
df_ox_dogs['breedname'].value_counts()
Out[21]:
In [22]:
# Distinct breed names among the Oxford dog rows (a set, despite the name).
ox_breed_list = set(df_ox_dogs['breedname'].values)
In [23]:
# Lookup from the space-separated Oxford breed name to its underscore form.
ox_dict = defaultdict(str)
ox_dict.update((name, name.replace(' ', '_')) for name in ox_breed_list)
In [24]:
# Breed names present in both the Oxford and the Stanford name sets.
list_intersect = ox_breed_list & stanford_breed_list
In [25]:
ox_breed_list - list_intersect
Out[25]:
In [26]:
ox_dict
Out[26]:
In [27]:
stanford_breed_list - list_intersect
Out[27]:
In [28]:
# Lower-cased breed names from the first 100 rows of the website table.
top100 = df['Breed'].str.lower().values[:100]
In [29]:
def clean_top100(lst):
    """Strip parentheses and blank out non-ASCII runs in every breed name.

    Each run of one or more non-ASCII characters is replaced by a single
    space. A new list is returned; ``lst`` itself is left unchanged.
    """
    non_ascii_run = re.compile(r'[^\x00-\x7F]+')
    cleaned = []
    for word in lst:
        without_parens = word.replace('(','').replace(')','')
        cleaned.append(non_ascii_run.sub(' ', without_parens))
    return cleaned
In [30]:
# Replace top100 with its cleaned version (a list) and show the first ten.
top100 = clean_top100(top100)
top100[:10]
Out[30]:
In [31]:
# get popular score
def get_top_position(word, top100):
    """Find the entry of ``top100`` that is most similar to ``word``.

    Similarity is computed with ``breed_dist``. The result is the tuple
    ``(best_entry, best_score, best_index)``; when several entries share the
    highest score, the one with the lowest index is reported.
    """
    similarities = np.array(
        [breed_dist(word, candidate) for candidate in top100]
    )
    best = np.argmax(similarities)
    return (top100[best], similarities[best], best)
def breed_dist(b1, b2):
    """Jaccard similarity between the stemmed word sets of two breed names.

    Each name is split on whitespace and every word is stemmed on its own
    with the module-level ``snowball`` stemmer. Stemming the whole name in a
    single call treats it as one token, so only its tail would be stemmed
    and e.g. a plural first word would never match its singular form.

    Returns a float between 0 and 1, and 0.0 when neither name has a word.
    """
    words1 = set(snowball.stem(word) for word in b1.split())
    words2 = set(snowball.stem(word) for word in b2.split())
    union = words1 | words2
    if not union:
        # Both names are empty or whitespace only: avoid dividing by zero.
        return 0.0
    return len(words1 & words2) * 1.0 / len(union)
In [32]:
# Pick the Oxford breeds that correspond to a popular breed and write their
# underscore names to ox_breeds.txt. A breed is kept when its best match is
# one of the first 70 entries of top100 (rank < 70) and the similarity score
# is above 0.4.
count = 0
ox_image_list = set()
fname = write_path+'ox_breeds.txt'
with open(fname, 'w') as f:
    # sorted(): iterate in a reproducible order so the file does not depend
    # on set ordering.
    for ox_breed in sorted(ox_breed_list):
        # Score once and reuse the result for the check and the log line.
        best_match = get_top_position(ox_breed, top100)
        _, score, rank = best_match
        if rank < 70 and score > 0.4:
            # Single formatted string: prints the same text on Python 2 and 3.
            print("%s %s" % (ox_breed, best_match))
            ox_image_list.add(ox_breed)
            count += 1
            f.write(ox_dict[ox_breed])
            f.write('\n')
print("total top breed in oxford database only %d" % count)
In [34]:
# Pick the popular breeds that only the Stanford set provides and write
# their original Stanford names to stanford_breed.txt. A breed is kept when
# its best match is one of the first 70 entries of top100 (rank < 70) and
# the similarity score is above 0.4.
fname = write_path + 'stanford_breed.txt'
stan_only_breed = stanford_breed_list - ox_breed_list
stan_image_list = set()
count = 0
with open(fname, 'w') as f:
    # sorted(): iterate in a reproducible order so the file does not depend
    # on set ordering.
    for stan_breed in sorted(stan_only_breed):
        # Score once and reuse the result for the check and the log line.
        best_match = get_top_position(stan_breed, top100)
        _, score, rank = best_match
        if rank < 70 and score > 0.4:
            # Single formatted string: prints the same text on Python 2 and 3.
            print("%s %s" % (stan_breed, best_match))
            stan_image_list.add(stan_breed)
            count += 1
            f.write(stanford_dict[stan_breed])
            f.write('\n')
print("total top breed in stanford database only %d" % count)
In [28]:
# Directory holding the Oxford images (absolute path on the author's
# machine). The trailing slash matters: file names are appended to it by
# plain string concatenation.
ox_img_path = '/Users/zhouyu/Documents/Zhou_Yu/DS/Galvanize/Capstone_data/ox/ox_images/'
In [29]:
# Every entry of the image directory; find_ox_file_name narrows this down to
# the .jpg files of one breed.
ox_file_names = os.listdir(ox_img_path)
In [30]:
def find_ox_file_name(ox_breed, ox_file_names):
    """Return the ``.jpg`` file names that belong to one Oxford breed.

    Parameters
    ----------
    ox_breed : str
        Breed name in the form produced by ``remove_digit`` (words separated
        by spaces, no digits).
    ox_file_names : iterable of str
        Candidate file names.

    Returns
    -------
    list of str
        The matching names, unchanged and in their input order.
    """
    suffix = '.jpg'
    file_list = []
    for fi in ox_file_names:
        if not fi.endswith(suffix):
            continue
        # Cut the extension off as a suffix. str.strip('.jpg') removes any of
        # the characters '.', 'j', 'p', 'g' from BOTH ends, which mangled
        # breeds such as 'pug' or 'german shorthaired'.
        stem = fi[:-len(suffix)]
        if remove_digit(stem.lower()) == ox_breed:
            file_list.append(fi)
    return file_list
In [31]:
# Pick one selected breed and take up to four of its image paths as a sample.
# sorted(): the order of a set is arbitrary, so indexing list(set) would pick
# an unpredictable breed; sorting makes index 11 mean the same breed each run.
ox_list = sorted(ox_image_list)
test = find_ox_file_name(ox_list[11], ox_file_names)
test_subset = [ox_img_path + file_name for file_name in test[:4]]
In [32]:
# Show the sampled paths (call form prints the same text as the statement).
print(test_subset)
In [33]:
# use tensorflow to load and process pictures
# Hard-coded sample of four image paths. This overwrites any test_subset
# computed earlier, so the TensorFlow cells below can run without the
# breed-selection cells having been executed.
test_subset = ['/Users/zhouyu/Documents/Zhou_Yu/DS/Galvanize/Capstone_data/ox/ox_images/miniature_pinscher_1.jpg',
'/Users/zhouyu/Documents/Zhou_Yu/DS/Galvanize/Capstone_data/ox/ox_images/miniature_pinscher_10.jpg',
'/Users/zhouyu/Documents/Zhou_Yu/DS/Galvanize/Capstone_data/ox/ox_images/miniature_pinscher_100.jpg',
'/Users/zhouyu/Documents/Zhou_Yu/DS/Galvanize/Capstone_data/ox/ox_images/miniature_pinscher_101.jpg']
In [9]:
# test code from https://stackoverflow.com/questions/33648322/tensorflow-image-reading-display
# read single random image
# Read the sample images through a filename queue and keep, for each image,
# the decoded original and its 300x300 nearest-neighbour resize.
one_file_queue = tf.train.string_input_producer(test_subset[:4])
reader = tf.WholeFileReader()
key, value = reader.read(one_file_queue)
my_img = tf.image.decode_jpeg(value) # use png or jpg decoder based on your files.
# Named constant instead of the magic number 1 (same resize method).
resized = tf.image.resize_images(my_img, [300, 300],
                                 method=tf.image.ResizeMethod.NEAREST_NEIGHBOR)
init_op = tf.global_variables_initializer()
proc_images_to_show = []
ori_images_to_show = []
with tf.Session() as sess:
    sess.run(init_op)
    # Start populating the filename queue.
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(coord=coord)
    try:
        for i in range(4): # length of the filename list
            # Fetch both tensors in ONE run call. Evaluating them separately
            # dequeues a new file each time, which pairs an original with the
            # resize of a different image.
            original, processed = sess.run([my_img, resized])
            ori_images_to_show.append(original)
            proc_images_to_show.append(processed)
    finally:
        # Stop and join the reader threads even when a read fails.
        coord.request_stop()
        coord.join(threads)
In [10]:
# Top row: resized images; bottom row: originals. Each shape is printed as
# the image is drawn.
fig, axes = plt.subplots(2,4, figsize=(10,5))
for row_axes, row_images in zip(axes, (proc_images_to_show, ori_images_to_show)):
    for img, ax in zip(row_images, row_axes.flatten()):
        print(img.shape)
        ax.imshow(img)
        ax.axis('off')
plt.show()
In [19]:
# Try another approach, for reading images in batches, from https://zhuanlan.zhihu.com/p/27481108
def read_img(filenames, num_epochs, shuffle=True):
    """Build the graph ops that read one JPEG and return it as a 256x256 image.

    Parameters
    ----------
    filenames : list of str
        Paths of the JPEG files to queue.
    num_epochs : int or None
        Passed on to ``tf.train.string_input_producer``.
    shuffle : bool
        Whether the filename queue is shuffled. This is now honoured; it used
        to be ignored because the call hard-coded ``shuffle=True``.

    Returns
    -------
    Tensor
        The decoded 3-channel image, padded to a square whose side is the
        longer edge and then resized to 256x256 with nearest-neighbour
        interpolation.
    """
    filename_queue = tf.train.string_input_producer(
        filenames, num_epochs=num_epochs, shuffle=shuffle)
    reader = tf.WholeFileReader()
    key, value = reader.read(filename_queue)
    img = tf.image.decode_jpeg(value, channels=3)
    shape = tf.shape(img)
    height = shape[0]
    width = shape[1]
    # The target side is the longer edge, so the shorter dimension is padded
    # and nothing is cropped.
    long_edge = tf.maximum(height, width)
    img_padded = tf.image.resize_image_with_crop_or_pad(img, long_edge, long_edge)
    img = tf.image.resize_images(img_padded, size=(256, 256),
                                 method=tf.image.ResizeMethod.NEAREST_NEIGHBOR)
    return img
In [20]:
#create_tfrecord_start_time = time.time()
#convert_to_tfrecord()
#create_tfrecord_duration = time.time() - create_tfrecord_start_time
#print("Create TFrecord Duration: %.3f" % (create_tfrecord_duration))
# Read the sample images in shuffled batches of four for a single epoch,
# print each batch's shape and report how long reading took.
with tf.Session() as sess:
    min_after_dequeue = 1000
    capacity = min_after_dequeue + 3*4
    img = read_img(test_subset, 1, True)
    # img = read_tfrecord("training.tfrecords", 1, True)
    img_batch = tf.train.shuffle_batch([img], batch_size=4,
                                       num_threads=8,
                                       capacity=capacity,
                                       min_after_dequeue=min_after_dequeue)
    # Two initializers are run together: global and local variables.
    init = (tf.global_variables_initializer(),
            tf.local_variables_initializer())
    sess.run(init)
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(coord=coord)
    read_tfrecord_start_time = time.time()
    try:
        while not coord.should_stop():
            imgs = sess.run([img_batch])
            # `img` is deliberately rebound to the fetched batch here: the
            # plotting cell that follows reads it.
            for img in imgs:
                print(img.shape)
    except Exception as e:
        # 'as' form works on Python 2.6+ and 3. Hand the error to the
        # coordinator so the reader threads are asked to stop.
        coord.request_stop(e)
    finally:
        coord.request_stop()
    coord.join(threads)
    read_tfrecord_duration = time.time() - read_tfrecord_start_time
    print("Read TFrecord Duration: %.3f" % read_tfrecord_duration)
In [21]:
# Draw the images held in `img` on a 2x2 grid, printing each image's shape.
fig, axes = plt.subplots(2,2, figsize=(10,5))
grid_axes = axes.flatten()
for im, ax in zip(img, grid_axes):
    print(im.shape)
    ax.imshow(im)
    ax.axis('off')
plt.show()
In [ ]: