In [121]:

    
%matplotlib inline
%load_ext autoreload
%autoreload 2

import pandas as pd
from PIL import Image as pil_image
from bokeh.io import output_notebook
import seaborn as sns
import os

sns.set_style('whitegrid')
sns.set_context('notebook')
output_notebook()

import amazon_products

data_dir = 'amazon_products_data'
image_dir = os.path.join(data_dir, 'images')
full_file_name = os.path.join(data_dir, 'amazon_products.csv')
train_file_name = os.path.join(data_dir, 'amazon_products_train.csv')
test_file_name = os.path.join(data_dir, 'amazon_products_test.csv')

df = pd.read_csv(full_file_name)









    



The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload






    





    
        
        Loading BokehJS ...

Target Distribution



In [122]:

    
df.head(10)









    Out[122]:






  
    
      
      title
      image_file
      product_category
    
  
  
    
      0
      Coffee Retro Women Lady Weave Rivet Leather St...
      B00FDP6M9A.jpg
      Accessory
    
    
      1
      Fabric Fanny Pack- High Quality- Color Pattern...
      B0077DSN7U.jpg
      Clothing
    
    
      2
      Missoni Women's SM12 Ballet Flat,Beige,37.5 EU...
      B0050SMHQW.jpg
      Accessory
    
    
      3
      Brown Reindeer Hat Chenille X Small
      B007XF1WHK.jpg
      Clothing
    
    
      4
      ADJUSTABLE MULTI COLOR Cross Howlite Turquoise...
      B008TUQY1C.jpg
      Accessory
    
    
      5
      2 Pieces of Pink Pearl Beaded Head Chain
      B00AKJISCS.jpg
      Accessory
    
    
      6
      Lemon Beads
      B004KV3JR6.jpg
      Clothing
    
    
      7
      Sterling Silver Celtic Knot Ring
      B003TPHD1M.jpg
      Accessory
    
    
      8
      Quiksilver Men's All Time Long Sleeve Surf T-S...
      B00EKR9WD0.jpg
      Clothing
    
    
      9
      Red Tree Design Quartz Clock Pendant Pocket Watch
      B007RK16W2.jpg
      Accessory



In [112]:

    
import seaborn as sns

colors = ["windows blue", "amber", "greyish", "light red"]
with sns.xkcd_palette(colors): 
    ax = sns.countplot(y='product_category', data=df)
    
    # display counts directly on graph
    for p in ax.patches:
        x=p.get_bbox().get_points()[1, 0]
        y=p.get_bbox().get_points()[:, 1]
        ax.annotate('{:d}'.format(int(x)), (x + 50, np.mean(y)))
            
    sns.despine(left=True, bottom=True)
    
    ax.get_figure().savefig('target_distribution.pdf')

Simple TFIDF Feature Engineering



In [104]:

    
#title = df.groupby('product_category')['title']
title = df.groupby('product_category')['title']
amazon_products.text_plots.word_cloud(title.get_group('Clothing').tolist())









    Out[104]:



In [114]:

    
from matplotlib import pyplot as plt

fig, ax = plt.subplots(1,2, sharey=False, sharex=True, figsize=(20, 8))

colors = [sns.xkcd_rgb['windows blue'], sns.xkcd_rgb['amber']]
for i, (name, group) in enumerate(title):
    amazon_products.text_plots.frequency_plot(
        group.tolist(), 
        plot_n_words=20,
        color=colors[i],
        yaxis_label='{} - word'.format(name),
        ax=ax.flatten()[i])
#fig.subplots_adjust(wspace=0.5)

fig.savefig('word_counts.pdf')



In [109]:

    
emb = amazon_products.text_plots.text_embedding(df['title'][:5000], labels=df['product_category'][:5000])



In [118]:

    
from bokeh.plotting import show, output_file

colors = ["windows blue", "amber"]
with sns.xkcd_palette(colors): 
    p = amazon_products.bokeh_plots.scatter_plot(
        'component_1', 'component_2', hue='labels', 
        data=emb, table_column='text', hover_columns=['text'])
    
    show(p)



In [8]:

    
from bokeh.plotting import show, output_file

#output_file('test_bokeh.html')

w2v = amazon_products.word2vec_plots.word2vec_embedding(df['title'][:5000], labels=df['product_category'][:5000])
colors = ["windows blue", "amber", "greyish", "light red"]
with sns.xkcd_palette(colors): 
    p = amazon_products.bokeh_plots.scatter_plot(
        'component_1', 'component_2', hue='labels', 
        data=w2v, table_column='text', hover_columns=['text'])
    
    show(p)



In [38]:

    
amazon_products.image_utils.load_image(df['image_file'][130], as_image=True, image_dir=image_dir)
df['title'][130]









    Out[38]:





"Swiss Army Women's Garrison Collection Black Rubber Watch Silver Dial 241020"



In [39]:

    
mean_hsv = amazon_products.image_features.mean_hsv('image_file',
                                                   data=df,
                                                   background='white',
                                                   image_directory=image_dir,
                                                   n_jobs=-1)
df['mean_hue'] = mean_hsv[:, 0]
df['mean_saturation'] = mean_hsv[:, 1]
df['mean_value'] = mean_hsv[:, 2]
df.head(10)









    Out[39]:






  
    
      
      title
      image_file
      product_category
      mean_hue
      mean_saturation
      mean_value
    
  
  
    
      0
      Samsonite Cruisair Bold 26&quot; Spinner - Silver
      B008O841LY.jpg
      Clothing
      0.234798
      0.034866
      0.646103
    
    
      1
      Columbia Women's Sleet To Street Interchange J...
      B00E9SOH5I.jpg
      Clothing
      0.659377
      0.183517
      0.250793
    
    
      2
      Hurley Baby-Boys Infant Griffin 2.0 Board Shor...
      B0081PD79W.jpg
      Clothing
      0.839782
      0.155021
      0.234832
    
    
      3
      Elliott Lucca Lucca Smartphone Wristlet
      B006Q62SJI.jpg
      Clothing
      0.548122
      0.126736
      0.275154
    
    
      4
      Muk Luks Unisex-baby Infant Monkey Hat
      B005NB6LNG.jpg
      Clothing
      0.580469
      0.259384
      0.657213
    
    
      5
      Please Mum 3 Pack Terry Fash.sock - B.apple/om...
      B001JLBUAS.jpg
      Clothing
      0.666088
      0.272174
      0.488102
    
    
      6
      SWISS MILITARY HANOWA NAVIGATOR STEEL/BLACK AL...
      B00DYXLXOW.jpg
      Watches
      0.673705
      0.093829
      0.412216
    
    
      7
      Large Croco Purse Organizer With Floral lining
      B005HF4874.jpg
      Clothing
      0.776477
      0.339005
      0.527542
    
    
      8
      Independent Trading Co Unisex Full Zip Hooded ...
      B003XXVG4K.jpg
      Clothing
      0.694430
      0.510251
      0.347023
    
    
      9
      Johnny Cash - Fabulous Ladies T-Shirt - Large
      B000X9F8TG.jpg
      Clothing
      0.841195
      0.425981
      0.691341



In [66]:

    
img = amazon_products.image_utils.column_to_sprites(
    image_column='image_file',
    sort_by='mean_value',
    data=df,
    image_directory=image_dir,
    n_samples=500,
    random_state=42,
    n_jobs=-1)
img









    Out[66]:



In [64]:

    
img.save('test.png')



In [58]:

    
23**2









    Out[58]:





529



In [59]:

    
amazon_products.image_utils.image_histogram(
    'image_file',
    x_column='mean_hue',
    y_column='mean_saturation',
    data=df,
    image_directory=image_dir,
    n_samples=5000,
    fig_size=(1000, 300),
    random_state=123)









    Out[59]:



In [3]:

    
from keras.applications import resnet50
from keras.utils import vis_utils
from IPython.display import SVG

model = resnet50.ResNet50(include_top=True,
                          weights='imagenet',
                          input_shape=(224, 224, 3))

#SVG(vis_utils.model_to_dot(model).create(prog='dot', format='svg'))

vis_utils.plot_model(model, to_file='model.png')



In [23]:

    
from amazon_products import resnet

train_df = pd.read_csv(train_file_name)
test_df = pd.read_csv(dev_file_name)

vec = resnet.ResNetVectorizer(
    batch_size=500, 
    use_cache=True, 
    image_dir=image_dir, 
    cache_dir='resnet50')

train_features = vec.transform(train_df['image_file'].values)
test_features = vec.transform(test_df['image_file'].values)



In [61]:

    
df = pd.concat((train_df, test_df))[['title', 'product_category']]
resnet_df= pd.DataFrame(np.vstack((train_features, test_features)), 
                        columns=['resnet_%i' %i for i in range(train_features.shape[1])])
resnet_df['product_category'] = df['product_category'].values
resnet_df['title'] = df['title'].values
resnet_df.to_csv('amazon_products_resnet_features.csv', index=False)
#df = pd.concat((resnet_features, df))
#df.head(1)['title']



In [ ]:

    
from sklearn.decomposition import TruncatedSVD
from sklearn.manifold import TSNE

embedding = TruncatedSVD(n_components=500, random_state=42).fit_transform(features[:5000, :])
embedding = TSNE(n_components=2, random_state=42).fit_transform(embedding)

data = pd.DataFrame({'x': embedding[:, 0], 'y': embedding[:, 1]})
data['target'] = df['product_category']



In [178]:

    
data['image_file'] = df['image_file'][:5000]
amazon_products.image_utils.image_scatter_plot(
    'image_file', 
    'x', 
    'y', 
    data, 
    image_directory=image_dir,
    thumbnail_size=55,
    fig_size=(2000, 2000),
    n_samples=None)









    Out[178]:



In [68]:

    
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from amazon_products import resnet

encoder = LabelEncoder()
train_df = pd.read_csv(train_file_name)
train_y = train_df.pop('product_category').values
train_X = train_df['image_file'].values

dev_df = pd.read_csv(dev_file_name)
dev_y = dev_df.pop('product_category').values
dev_X = dev_df['image_file'].values

train_y = encoder.fit_transform(train_y)
dev_y = encoder.transform(dev_y)

vec = resnet.ResNetVectorizer(
    batch_size=500, 
    use_cache=True, 
    image_dir=image_dir, 
    cache_dir='resnet50')
train_X = vec.transform(train_X)
dev_X = vec.transform(dev_X)



In [69]:

    
from sklearn.metrics import accuracy_score

estimator = RandomForestClassifier(n_estimators=500, random_state=123, n_jobs=-1)
estimator.fit(train_X, train_y)
y_pred = estimator.predict(dev_X)
print(accuracy_score(dev_y, y_pred))









    



0.849281458114



In [75]:

    
from sklearn.metrics import confusion_matrix

c_mat = pd.DataFrame(confusion_matrix(dev_y, y_pred), index=encoder.classes_, columns=encoder.classes_)


sns.heatmap(c_mat, annot=True, fmt='d')









    Out[75]:





<matplotlib.axes._subplots.AxesSubplot at 0x1195c3390>



In [ ]:

	title	image_file	product_category
0	Coffee Retro Women Lady Weave Rivet Leather St...	B00FDP6M9A.jpg	Accessory
1	Fabric Fanny Pack- High Quality- Color Pattern...	B0077DSN7U.jpg	Clothing
2	Missoni Women's SM12 Ballet Flat,Beige,37.5 EU...	B0050SMHQW.jpg	Accessory
3	Brown Reindeer Hat Chenille X Small	B007XF1WHK.jpg	Clothing
4	ADJUSTABLE MULTI COLOR Cross Howlite Turquoise...	B008TUQY1C.jpg	Accessory
5	2 Pieces of Pink Pearl Beaded Head Chain	B00AKJISCS.jpg	Accessory
6	Lemon Beads	B004KV3JR6.jpg	Clothing
7	Sterling Silver Celtic Knot Ring	B003TPHD1M.jpg	Accessory
8	Quiksilver Men's All Time Long Sleeve Surf T-S...	B00EKR9WD0.jpg	Clothing
9	Red Tree Design Quartz Clock Pendant Pocket Watch	B007RK16W2.jpg	Accessory

	title	image_file	product_category	mean_hue	mean_saturation	mean_value
0	Samsonite Cruisair Bold 26" Spinner - Silver	B008O841LY.jpg	Clothing	0.234798	0.034866	0.646103
1	Columbia Women's Sleet To Street Interchange J...	B00E9SOH5I.jpg	Clothing	0.659377	0.183517	0.250793
2	Hurley Baby-Boys Infant Griffin 2.0 Board Shor...	B0081PD79W.jpg	Clothing	0.839782	0.155021	0.234832
3	Elliott Lucca Lucca Smartphone Wristlet	B006Q62SJI.jpg	Clothing	0.548122	0.126736	0.275154
4	Muk Luks Unisex-baby Infant Monkey Hat	B005NB6LNG.jpg	Clothing	0.580469	0.259384	0.657213
5	Please Mum 3 Pack Terry Fash.sock - B.apple/om...	B001JLBUAS.jpg	Clothing	0.666088	0.272174	0.488102
6	SWISS MILITARY HANOWA NAVIGATOR STEEL/BLACK AL...	B00DYXLXOW.jpg	Watches	0.673705	0.093829	0.412216
7	Large Croco Purse Organizer With Floral lining	B005HF4874.jpg	Clothing	0.776477	0.339005	0.527542
8	Independent Trading Co Unisex Full Zip Hooded ...	B003XXVG4K.jpg	Clothing	0.694430	0.510251	0.347023
9	Johnny Cash - Fabulous Ladies T-Shirt - Large	B000X9F8TG.jpg	Clothing	0.841195	0.425981	0.691341