In [15]:
import io
import itertools

import boto3
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix
%matplotlib inline
In [2]:
# Load the breed label list from S3. Because `names=` is supplied, the first
# line of the file is read as data and the single column is called "dog_breed".
labels = pd.read_csv("s3://dogfaces/tensor_model/output_labels_20170907.txt", names=["dog_breed"])
# Preview the first rows.
labels.head()
Out[2]:
In [3]:
# Load the test-set results from S3: space-separated rows, named here as
# image_name / pred / actual.
df_conf = pd.read_csv("s3://dogfaces/tensor_model/test_result_20170907.txt", sep=" ", names=["image_name", "pred", "actual"])
In [4]:
# Preview the last rows of the test results.
df_conf.tail()
Out[4]:
In [5]:
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues,
                          annotate=False):
    """
    Print a confusion matrix and draw it on the current matplotlib figure.

    Parameters
    ----------
    cm : 2-D array-like
        Confusion matrix. The plot labels rows as "True label" and columns
        as "Predicted label".
    classes : sequence
        Tick labels, one per row/column of ``cm``.
    normalize : bool, optional
        When True, every row is divided by its row sum before printing and
        plotting. Rows whose sum is zero are left as all zeros instead of
        producing NaN/inf.
    title : str, optional
        Title of the plot.
    cmap : matplotlib colormap, optional
        Colormap passed to ``plt.imshow``.
    annotate : bool, optional
        When True, write each cell's value on top of the image. Off by
        default, which matches the previous behaviour where the annotation
        loop was disabled.
    """
    cm = np.asarray(cm)
    if normalize:
        row_totals = cm.sum(axis=1)[:, np.newaxis]
        # Divide empty rows by 1 instead of 0 so they stay all-zero.
        safe_totals = np.where(row_totals == 0, 1, row_totals)
        cm = cm.astype('float') / safe_totals
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')
    print(cm)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    if annotate:
        fmt = '.2f' if normalize else 'd'
        # Cells darker than half the maximum get white text for contrast.
        thresh = cm.max() / 2.
        for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
            plt.text(j, i, format(cm[i, j], fmt),
                     horizontalalignment="center",
                     color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    # Run the layout pass last so the axis labels are taken into account.
    plt.tight_layout()
In [6]:
# Pull the predicted and the actual label columns out of the test results.
y_pred, y_true = df_conf['pred'], df_conf['actual']
In [7]:
# Compute confusion matrix (sklearn argument order: true labels first, then predictions)
cnf_matrix = confusion_matrix(y_true, y_pred)
# Global NumPy print setting: show floats with 2 decimals. This affects the
# matrix that plot_confusion_matrix prints.
np.set_printoptions(precision=2)
# Plot non-normalized confusion matrix (currently disabled)
#plt.figure(figsize=(10,10))
#plot_confusion_matrix(cnf_matrix, classes=labels.dog_breed,
# title='Confusion matrix, without normalization')
#plt.show()
#Plot normalized confusion matrix
# NOTE(review): passing labels.dog_breed as tick labels assumes the matrix's
# class order and size match the label file -- confirm against the values
# stored in the pred/actual columns.
plt.figure(figsize=(20,10))
plot_confusion_matrix(cnf_matrix, classes=labels.dog_breed, normalize=True,
title='Normalized confusion matrix')
plt.show()
In [8]:
# Breed names as a plain array; getTopbreed indexes into this by position.
breed_names = labels["dog_breed"].values
In [9]:
# Load the labeled review pictures table from S3. Later cells read its
# `probability` and `pic_names` columns.
df_revpics = pd.read_csv("s3://dogfaces/reviews/labeled_pictures.csv")
# Preview the first rows.
df_revpics.head()
Out[9]:
In [10]:
def prob2num(row):
    """Parse a stringified probability list such as "[0.1, 0.9]" into floats.

    Parameters
    ----------
    row : str
        Comma-separated numbers, optionally wrapped in square brackets.

    Returns
    -------
    list of float
        A real list (not a lazy ``map`` object), so it can be indexed and
        handed to NumPy on both Python 2 and Python 3. An empty or
        bracket-only string gives an empty list.

    Raises
    ------
    ValueError
        If any comma-separated entry is not a valid float.
    """
    body = row.strip().lstrip('[').rstrip(']')
    if not body.strip():
        return []
    return [float(token) for token in body.split(',')]


def getTopbreed(prob, k, thres=0.1, breeds=None):
    """Return the breed with the k-th highest probability, or 'unknown'.

    Parameters
    ----------
    prob : str
        Stringified probability list, parsed with ``prob2num``.
    k : int
        Rank to look up; 1 is the most probable breed.
    thres : float, optional
        The k-th probability must be strictly greater than this value,
        otherwise 'unknown' is returned.
    breeds : sequence, optional
        Breed names indexed by position in the probability list. Defaults to
        the notebook-level ``breed_names``.

    Returns
    -------
    The selected entry of ``breeds``, or the string 'unknown' when the
    probability does not exceed ``thres`` or fewer than ``k`` probabilities
    are available.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1.
    """
    if k < 1:
        # argsort(...)[-0] would silently return the *least* likely breed.
        raise ValueError("k must be >= 1, got %r" % (k,))
    if breeds is None:
        breeds = breed_names
    prob_num = np.asarray(prob2num(prob), dtype=float)
    if k > prob_num.size:
        return 'unknown'
    # argsort is ascending, so the k-th largest sits k places from the end.
    prob_index = np.argsort(prob_num)[-k]
    if prob_num[prob_index] > thres:
        return breeds[prob_index]
    return 'unknown'
In [11]:
# Empty frame that the following cells fill with top1/top2/top3 columns.
df_a = pd.DataFrame()
In [12]:
# Most probable breed per picture, using getTopbreed's default threshold.
df_a['top1'] = df_revpics['probability'].apply(getTopbreed, args=(1,))
In [13]:
# Preview the first top-1 predictions.
df_a['top1'].head()
Out[13]:
In [14]:
# How often each breed (or 'unknown') is the top-1 prediction.
df_a['top1'].value_counts()
Out[14]:
In [96]:
# Second most probable breed per picture, with a threshold of 0.05.
df_a['top2'] = df_revpics['probability'].apply(getTopbreed, args=(2,), thres=0.05)
In [97]:
# How often each breed (or 'unknown') is the second-ranked prediction.
df_a['top2'].value_counts()
Out[97]:
In [99]:
# Third most probable breed per picture, with a threshold of 0.05.
df_a['top3'] = df_revpics['probability'].apply(getTopbreed, args=(3,), thres=0.05)
In [100]:
# How often each breed (or 'unknown') is the third-ranked prediction.
df_a['top3'].value_counts()
Out[100]:
In [101]:
# Sanity check: (rows, columns) of the top-k table.
df_a.shape
Out[101]:
In [102]:
# Put the picture names next to their top-k predictions. concat with axis=1
# aligns on the index, which both pieces inherit from df_revpics.
df_calc = pd.concat([df_revpics['pic_names'], df_a], axis=1)
In [103]:
# Look at 10 random pictures whose top-1 prediction fell below the threshold.
# No random_state is given, so the selection differs on every run.
df_calc[df_calc['top1']=='unknown'].sample(10)
Out[103]:
In [52]:
# S3 client handed to getReviewPicFromS3 in later cells.
s3 = boto3.client("s3")
In [53]:
def getReviewPicFromS3(pic_name, s3, bucket='dogfaces', prefix='reviews/review_pics/'):
    """Download one review picture from S3 and return its raw content.

    Parameters
    ----------
    pic_name : str
        File name of the picture; appended to ``prefix`` to form the key.
    s3 : object
        Client exposing ``get_object(Bucket=..., Key=...)``, e.g. the result
        of ``boto3.client("s3")``.
    bucket : str, optional
        Bucket to read from. Defaults to the previously hard-coded
        'dogfaces'.
    prefix : str, optional
        Key prefix in front of ``pic_name``. Defaults to the previously
        hard-coded 'reviews/review_pics/'.

    Returns
    -------
    Whatever ``read()`` on the response's 'Body' returns (the full object
    content).
    """
    pic_key = prefix + pic_name
    pic_object = s3.get_object(Bucket=bucket, Key=pic_key)
    body = pic_object['Body']
    try:
        return body.read()
    finally:
        # Close the response stream even if read() fails part-way.
        close = getattr(body, 'close', None)
        if close is not None:
            close()
In [61]:
# Smoke test: download one known picture and save it as photo.jpg in the
# current working directory.
test_name = "0090-en_us_1428965_photo.jpg"
pic_to_show = getReviewPicFromS3(test_name, s3)
with open('photo.jpg', 'wb') as f:
    f.write(pic_to_show)
In [146]:
# plot classified pictures
# Show up to 9 random pictures whose top-1 prediction is not 'unknown', each
# titled with its top-1 and top-2 breed. No random_state is set, so the
# selection differs on every run.
classified = df_calc[df_calc['top1'] != 'unknown']
# .sample(9) raises when fewer than 9 rows exist, so cap the request.
samples = classified.sample(min(9, len(classified)))
image_names = samples.pic_names.values
breeds_1 = samples.top1.values
breeds_2 = samples.top2.values
fig, axes = plt.subplots(3, 3, figsize=(10, 10))
for img_name, breed1, breed2, ax in zip(image_names, breeds_1, breeds_2, axes.flatten()):
    pic_to_show = getReviewPicFromS3(img_name, s3)
    # Decode straight from memory instead of round-tripping through
    # photo.jpg on disk; format='jpg' matches the old file extension.
    ax.imshow(plt.imread(io.BytesIO(pic_to_show), format='jpg'))
    ax.axis("off")
    ax.set_title(breed1 + "\n" + breed2)
# Blank out grid cells that received no picture.
for ax in axes.flatten()[len(samples):]:
    ax.axis("off")
plt.tight_layout()
plt.show()
In [58]:
# Load the reviews table from S3. The cells below use its `user_name` column.
df_users = pd.read_csv("s3://dogfaces/reviews/reviews.csv")
In [60]:
# Number of distinct user names (unique() counts a missing value as one entry).
len(df_users['user_name'].unique())
Out[60]:
In [61]:
# (rows, columns) of the reviews table, for comparison with the distinct-user count.
df_users.shape
Out[61]:
In [62]:
# Number of rows per user name, most frequent first.
df_users['user_name'].value_counts()
Out[62]:
In [80]:
# NOTE(review): looks like a leftover scratch cell checking how zip combines
# three sequences; nothing else in the visible notebook uses `a`. Consider
# deleting it.
a= [1,2,3]
zip(a,a,a)
Out[80]:
In [ ]: