In [121]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

import os

import numpy as np
import pandas as pd
import seaborn as sns
from bokeh.io import output_notebook
from PIL import Image as pil_image

# Plot styling for seaborn, and bokeh output routed into the notebook.
sns.set_style('whitegrid')
sns.set_context('notebook')
output_notebook()

import amazon_products

# Every input path is derived from the single data directory.
data_dir = 'amazon_products_data'
image_dir, full_file_name, train_file_name, test_file_name = (
    os.path.join(data_dir, entry)
    for entry in (
        'images',
        'amazon_products.csv',
        'amazon_products_train.csv',
        'amazon_products_test.csv',
    )
)

# Load the CSV referenced by full_file_name; the exploratory cells below use it.
df = pd.read_csv(full_file_name)


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
Loading BokehJS ...

Target Distribution


In [122]:
# Show the first ten rows of the loaded frame.
df.head(n=10)


Out[122]:
title image_file product_category
0 Coffee Retro Women Lady Weave Rivet Leather St... B00FDP6M9A.jpg Accessory
1 Fabric Fanny Pack- High Quality- Color Pattern... B0077DSN7U.jpg Clothing
2 Missoni Women's SM12 Ballet Flat,Beige,37.5 EU... B0050SMHQW.jpg Accessory
3 Brown Reindeer Hat Chenille X Small B007XF1WHK.jpg Clothing
4 ADJUSTABLE MULTI COLOR Cross Howlite Turquoise... B008TUQY1C.jpg Accessory
5 2 Pieces of Pink Pearl Beaded Head Chain B00AKJISCS.jpg Accessory
6 Lemon Beads B004KV3JR6.jpg Clothing
7 Sterling Silver Celtic Knot Ring B003TPHD1M.jpg Accessory
8 Quiksilver Men's All Time Long Sleeve Surf T-S... B00EKR9WD0.jpg Clothing
9 Red Tree Design Quartz Clock Pendant Pocket Watch B007RK16W2.jpg Accessory

In [112]:
# Horizontal bar chart with the number of rows per product category,
# saved to target_distribution.pdf.
colors = ["windows blue", "amber", "greyish", "light red"]
with sns.xkcd_palette(colors):
    ax = sns.countplot(y='product_category', data=df)

    # Display counts directly on the graph, just right of each bar.
    for p in ax.patches:
        bbox = p.get_bbox()
        # Right edge of the bar (its count) and the bar's vertical midpoint.
        # Plain Bbox attributes replace the former np.mean call, so this cell
        # no longer fails with NameError when numpy has not been imported.
        x = bbox.x1
        y = (bbox.y0 + bbox.y1) / 2
        ax.annotate('{:d}'.format(int(x)), (x + 50, y))

    sns.despine(left=True, bottom=True)

    ax.get_figure().savefig('target_distribution.pdf')


Simple TFIDF Feature Engineering


In [104]:
# Titles grouped by category; `title` is reused by the frequency-plot cell below.
title = df.groupby('product_category')['title']

# Word cloud over every title in the 'Clothing' group.
amazon_products.text_plots.word_cloud(
    list(title.get_group('Clothing')))


Out[104]:

In [114]:
from matplotlib import pyplot as plt

# One panel per product category. The panel count and color list used to be
# hard-coded for exactly two categories, which raised IndexError as soon as
# the data contained a third one; both now follow the number of groups.
n_groups = title.ngroups
fig, ax = plt.subplots(
    1, n_groups,
    sharey=False, sharex=True,
    figsize=(10 * n_groups, 8),
    squeeze=False)  # always a 2-D array of axes, even for a single group

palette_names = ('windows blue', 'amber', 'greyish', 'light red')
colors = [sns.xkcd_rgb[color_name] for color_name in palette_names]
for i, (name, group) in enumerate(title):
    amazon_products.text_plots.frequency_plot(
        group.tolist(),
        plot_n_words=20,
        # Cycle through the palette if there are more groups than colors.
        color=colors[i % len(colors)],
        yaxis_label='{} - word'.format(name),
        ax=ax.flatten()[i])

fig.savefig('word_counts.pdf')



In [109]:
# Pass the first 5000 titles, with their categories as labels, to the
# project's text_embedding helper.
emb = amazon_products.text_plots.text_embedding(
    df['title'].iloc[:5000],
    labels=df['product_category'].iloc[:5000])

In [118]:
from bokeh.plotting import show, output_file

# Interactive scatter of the two embedding components, colored by label,
# drawn while the two-color xkcd palette is active.
colors = ["windows blue", "amber"]
with sns.xkcd_palette(colors):
    p = amazon_products.bokeh_plots.scatter_plot(
        'component_1',
        'component_2',
        data=emb,
        hue='labels',
        hover_columns=['text'],
        table_column='text')

    show(p)



In [8]:
from bokeh.plotting import show, output_file

# To write the plot to a standalone page instead of the notebook, call
# output_file('test_bokeh.html') before show().

# word2vec_embedding over the first 5000 titles, with categories as labels.
w2v = amazon_products.word2vec_plots.word2vec_embedding(
    df['title'].iloc[:5000],
    labels=df['product_category'].iloc[:5000])

# Same scatter layout as the text-embedding plot, with a four-color palette.
colors = ["windows blue", "amber", "greyish", "light red"]
with sns.xkcd_palette(colors):
    p = amazon_products.bokeh_plots.scatter_plot(
        'component_1',
        'component_2',
        data=w2v,
        hue='labels',
        hover_columns=['text'],
        table_column='text')

    show(p)



In [38]:
# Load the image stored for row 130.
# NOTE(review): the return value is not the cell's last expression, so it is
# discarded; only the title below is displayed. Confirm whether load_image
# renders the picture itself or whether its result should be displayed.
amazon_products.image_utils.load_image(
    df.loc[130, 'image_file'],
    as_image=True,
    image_dir=image_dir)
df.loc[130, 'title']


Out[38]:
"Swiss Army Women's Garrison Collection Black Rubber Watch Silver Dial 241020"

In [39]:
# Call the project's mean_hsv helper on the image_file column of df.
mean_hsv = amazon_products.image_features.mean_hsv(
    'image_file',
    data=df,
    image_directory=image_dir,
    background='white',
    n_jobs=-1)

# Columns 0, 1 and 2 of the result are stored as the hue, saturation and
# value feature columns of df.
df['mean_hue'], df['mean_saturation'], df['mean_value'] = (
    mean_hsv[:, 0],
    mean_hsv[:, 1],
    mean_hsv[:, 2],
)
df.head(n=10)


Out[39]:
title image_file product_category mean_hue mean_saturation mean_value
0 Samsonite Cruisair Bold 26" Spinner - Silver B008O841LY.jpg Clothing 0.234798 0.034866 0.646103
1 Columbia Women's Sleet To Street Interchange J... B00E9SOH5I.jpg Clothing 0.659377 0.183517 0.250793
2 Hurley Baby-Boys Infant Griffin 2.0 Board Shor... B0081PD79W.jpg Clothing 0.839782 0.155021 0.234832
3 Elliott Lucca Lucca Smartphone Wristlet B006Q62SJI.jpg Clothing 0.548122 0.126736 0.275154
4 Muk Luks Unisex-baby Infant Monkey Hat B005NB6LNG.jpg Clothing 0.580469 0.259384 0.657213
5 Please Mum 3 Pack Terry Fash.sock - B.apple/om... B001JLBUAS.jpg Clothing 0.666088 0.272174 0.488102
6 SWISS MILITARY HANOWA NAVIGATOR STEEL/BLACK AL... B00DYXLXOW.jpg Watches 0.673705 0.093829 0.412216
7 Large Croco Purse Organizer With Floral lining B005HF4874.jpg Clothing 0.776477 0.339005 0.527542
8 Independent Trading Co Unisex Full Zip Hooded ... B003XXVG4K.jpg Clothing 0.694430 0.510251 0.347023
9 Johnny Cash - Fabulous Ladies T-Shirt - Large B000X9F8TG.jpg Clothing 0.841195 0.425981 0.691341

In [66]:
# Call column_to_sprites on a seeded sample of 500 rows, sorted by the
# mean_value column; the result is displayed as the cell output.
img = amazon_products.image_utils.column_to_sprites(
    data=df,
    image_column='image_file',
    image_directory=image_dir,
    sort_by='mean_value',
    n_samples=500,
    random_state=42,
    n_jobs=-1)
img


Out[66]:

In [64]:
# Write the object returned by column_to_sprites to test.png in the working directory.
img.save('test.png')

In [58]:
# Scratch arithmetic: 23 squared.
# NOTE(review): presumably a grid-size check for the sprite sheet; confirm or delete this cell.
23**2


Out[58]:
529

In [59]:
# Call image_histogram on a seeded sample of 5000 rows, with mean_hue as the
# x column and mean_saturation as the y column.
amazon_products.image_utils.image_histogram(
    'image_file',
    data=df,
    image_directory=image_dir,
    x_column='mean_hue',
    y_column='mean_saturation',
    fig_size=(1000, 300),
    n_samples=5000,
    random_state=123)


Out[59]:

In [3]:
from keras.applications import resnet50
from keras.utils import vis_utils
from IPython.display import SVG

# ResNet50 with ImageNet weights, including the top layers, for 224x224 RGB input.
model = resnet50.ResNet50(
    weights='imagenet',
    include_top=True,
    input_shape=(224, 224, 3))

# Inline alternative to the PNG export below:
# SVG(vis_utils.model_to_dot(model).create(prog='dot', format='svg'))

# Write the architecture diagram to model.png.
vis_utils.plot_model(model, to_file='model.png')

In [23]:
from amazon_products import resnet

train_df = pd.read_csv(train_file_name)
# The notebook only defines train_file_name and test_file_name; the
# dev_file_name this line used to read does not exist anywhere and raised
# NameError on a fresh kernel.
test_df = pd.read_csv(test_file_name)

# Vectorizer from the project's resnet module, configured with batches of
# 500 and caching enabled under 'resnet50'.
vec = resnet.ResNetVectorizer(
    batch_size=500,
    use_cache=True,
    image_dir=image_dir,
    cache_dir='resnet50')

# Transform the image file names of each split into feature arrays.
train_features = vec.transform(train_df['image_file'].values)
test_features = vec.transform(test_df['image_file'].values)

In [61]:
# Stack the train and test rows in the same order as the feature matrix.
# ignore_index=True gives a clean 0..n-1 index (the two frames otherwise
# contribute duplicate labels, which breaks index-aligned assignments later).
# image_file is kept because the image scatter plot further down reads
# df['image_file'].
df = pd.concat((train_df, test_df), ignore_index=True)[
    ['title', 'image_file', 'product_category']]

# One 'resnet_<i>' column per feature dimension, train rows first.
resnet_df = pd.DataFrame(
    np.vstack((train_features, test_features)),
    columns=['resnet_%i' % i for i in range(train_features.shape[1])])

# Attach labels and titles positionally, then persist. The CSV keeps the
# original column set: features, product_category, title.
resnet_df['product_category'] = df['product_category'].values
resnet_df['title'] = df['title'].values
resnet_df.to_csv('amazon_products_resnet_features.csv', index=False)

In [ ]:
from sklearn.decomposition import TruncatedSVD
from sklearn.manifold import TSNE

# `features` was never defined in the notebook, so this cell raised NameError
# on a fresh kernel. Rebuild it from the ResNet features in the same
# train-then-test order used for df.
# NOTE(review): assumes the ResNet image features are what was intended here;
# confirm.
features = np.vstack((train_features, test_features))

# Reduce the first 5000 rows to 500 SVD components, then to 2-D with t-SNE.
embedding = TruncatedSVD(n_components=500, random_state=42).fit_transform(features[:5000, :])
embedding = TSNE(n_components=2, random_state=42).fit_transform(embedding)

data = pd.DataFrame({'x': embedding[:, 0], 'y': embedding[:, 1]})
# Assign by position rather than by index label: df is a concatenation of two
# frames, so label alignment is unreliable, while row order matches `features`.
data['target'] = df['product_category'].values[:len(data)]

In [178]:
# Attach the file names of the first 5000 rows of df; the assignment aligns
# on the index.
data['image_file'] = df['image_file'].iloc[:5000]

# Call image_scatter_plot with the x / y columns of data; n_samples=None is
# passed through unchanged.
amazon_products.image_utils.image_scatter_plot(
    'image_file', 'x', 'y', data,
    image_directory=image_dir,
    fig_size=(2000, 2000),
    thumbnail_size=55,
    n_samples=None)


Out[178]:

In [68]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from amazon_products import resnet

encoder = LabelEncoder()

# Training split: pop the label column, keep the image file names as input.
train_df = pd.read_csv(train_file_name)
train_y = train_df.pop('product_category').values
train_X = train_df['image_file'].values

# Held-out split. It is read from test_file_name because the dev_file_name
# used here before is not defined anywhere in the notebook and raised
# NameError on a fresh kernel.
dev_df = pd.read_csv(test_file_name)
dev_y = dev_df.pop('product_category').values
dev_X = dev_df['image_file'].values

# Fit the label encoding on the training labels only, then reuse it.
train_y = encoder.fit_transform(train_y)
dev_y = encoder.transform(dev_y)

# Turn image file names into feature arrays with the project's ResNet
# vectorizer (caching enabled under 'resnet50').
vec = resnet.ResNetVectorizer(
    batch_size=500,
    use_cache=True,
    image_dir=image_dir,
    cache_dir='resnet50')
train_X = vec.transform(train_X)
dev_X = vec.transform(dev_X)

In [69]:
from sklearn.metrics import accuracy_score

# Seeded 500-tree random forest fitted on the training features.
estimator = RandomForestClassifier(
    n_estimators=500,
    n_jobs=-1,
    random_state=123)
estimator.fit(train_X, train_y)

# Accuracy of the predictions on the held-out features.
y_pred = estimator.predict(dev_X)
print(accuracy_score(dev_y, y_pred))


0.849281458114

In [75]:
from sklearn.metrics import confusion_matrix

# Pin the label set to every encoded class. Without `labels`, a class that
# appears in neither dev_y nor y_pred is dropped from the matrix, and the
# DataFrame construction fails on the shape mismatch with encoder.classes_.
class_ids = list(range(len(encoder.classes_)))
c_mat = pd.DataFrame(
    confusion_matrix(dev_y, y_pred, labels=class_ids),
    index=encoder.classes_,
    columns=encoder.classes_)

# Rows are the true classes and columns the predicted ones; label the axes so
# the figure can be read on its own. The trailing semicolon hides the repr.
heatmap_ax = sns.heatmap(c_mat, annot=True, fmt='d')
heatmap_ax.set_xlabel('Predicted category')
heatmap_ax.set_ylabel('True category');


Out[75]:
<matplotlib.axes._subplots.AxesSubplot at 0x1195c3390>

In [ ]: