In [121]:
%matplotlib inline
%load_ext autoreload
%autoreload 2
import os

import pandas as pd
import seaborn as sns
from bokeh.io import output_notebook
from IPython.display import display
from PIL import Image as pil_image
# Route bokeh output into the notebook first, then apply the seaborn
# style and context for the matplotlib figures.
output_notebook()
sns.set_style('whitegrid')
sns.set_context('notebook')
import amazon_products
# Every input file is resolved against one data directory.
data_dir = 'amazon_products_data'
image_dir = os.path.join(data_dir, 'images')

# CSV paths for the full file and the files named for the train / test sets.
full_file_name, train_file_name, test_file_name = (
    os.path.join(data_dir, csv_name)
    for csv_name in (
        'amazon_products.csv',
        'amazon_products_train.csv',
        'amazon_products_test.csv',
    )
)

# The cells below work on the full file.
df = pd.read_csv(full_file_name)
In [122]:
# Preview the first ten rows of the loaded frame.
df.head(10)
Out[122]:
In [112]:
import seaborn as sns
# Horizontal count plot of `product_category`, annotated with the counts
# and saved to 'target_distribution.pdf'.
colors = ["windows blue", "amber", "greyish", "light red"]
with sns.xkcd_palette(colors):
    ax = sns.countplot(y='product_category', data=df)

    # Display counts directly on the graph. The bar geometry is read from the
    # bounding-box attributes rather than via `np.mean`, because numpy is
    # never imported in this notebook and the call raised NameError on a
    # fresh kernel.
    for p in ax.patches:
        bbox = p.get_bbox()
        count = bbox.x1                       # right edge of the bar
        bar_center = (bbox.y0 + bbox.y1) / 2  # vertical middle of the bar
        # The label sits 50 data units to the right of the bar end.
        ax.annotate('{:d}'.format(int(count)), (count + 50, bar_center))

    sns.despine(left=True, bottom=True)
    ax.get_figure().savefig('target_distribution.pdf')
In [104]:
# Titles grouped by product category; `title` is reused by later cells.
title = df.groupby('product_category')['title']

# Word cloud built from the titles of the 'Clothing' group only.
amazon_products.text_plots.word_cloud(
    title.get_group('Clothing').tolist()
)
Out[104]:
In [114]:
from matplotlib import pyplot as plt
# One word-frequency panel per product category, saved to 'word_counts.pdf'.
#
# The grid used to be fixed at two panels with two colours, so a third
# category raised IndexError inside the loop. It is now sized from the number
# of groups; with exactly two categories the figure is unchanged
# (1x2 panels, 20x8 inches, windows blue / amber).
n_groups = max(title.ngroups, 1)  # at least one panel so subplots() stays valid
fig, ax = plt.subplots(
    1, n_groups,
    sharey=False, sharex=True,
    figsize=(10 * n_groups, 8),
    squeeze=False)  # always an array, even for a single panel

# The four xkcd colour names used elsewhere in this notebook, cycled if there
# are more categories than colours.
color_names = ["windows blue", "amber", "greyish", "light red"]
colors = [sns.xkcd_rgb[color_name] for color_name in color_names]

for i, (name, group) in enumerate(title):
    amazon_products.text_plots.frequency_plot(
        group.tolist(),
        plot_n_words=20,
        color=colors[i % len(colors)],
        yaxis_label='{} - word'.format(name),
        ax=ax.flatten()[i])

fig.savefig('word_counts.pdf')
In [109]:
# Pass the first 5000 titles to text_embedding, with the matching
# product categories supplied as labels.
emb = amazon_products.text_plots.text_embedding(
    df['title'][:5000],
    labels=df['product_category'][:5000],
)
In [118]:
from bokeh.plotting import show, output_file
# Interactive bokeh scatter plot of `emb`, coloured by label using a
# two-colour xkcd palette.
colors = ["windows blue", "amber"]
with sns.xkcd_palette(colors):
    p = amazon_products.bokeh_plots.scatter_plot(
        'component_1',
        'component_2',
        hue='labels',
        data=emb,
        table_column='text',
        hover_columns=['text'],
    )
    show(p)
In [8]:
from bokeh.plotting import show, output_file
#output_file('test_bokeh.html')
# Pass the first 5000 titles to word2vec_embedding, with the matching
# product categories supplied as labels.
w2v = amazon_products.word2vec_plots.word2vec_embedding(
    df['title'][:5000],
    labels=df['product_category'][:5000],
)

# Interactive bokeh scatter plot of `w2v`, coloured by label using a
# four-colour xkcd palette.
colors = ["windows blue", "amber", "greyish", "light red"]
with sns.xkcd_palette(colors):
    p = amazon_products.bokeh_plots.scatter_plot(
        'component_1',
        'component_2',
        hue='labels',
        data=w2v,
        table_column='text',
        hover_columns=['text'],
    )
    show(p)
In [38]:
# Show one sample product: its image, then its title.
# Jupyter renders only the last expression of a cell, so the loaded image is
# passed to display() explicitly; as a bare statement its value was discarded.
# NOTE(review): if load_image already renders the image as a side effect,
# this will show it twice. Confirm against amazon_products.image_utils.
display(
    amazon_products.image_utils.load_image(
        df['image_file'][130], as_image=True, image_dir=image_dir))
df['title'][130]
Out[38]:
In [39]:
# Call image_features.mean_hsv on the `image_file` column of df.
mean_hsv = amazon_products.image_features.mean_hsv(
    'image_file',
    data=df,
    background='white',
    image_directory=image_dir,
    n_jobs=-1,
)

# Columns 0, 1 and 2 of the result are stored as hue, saturation and value.
# They are added to `df` in place because later cells sort on them.
hsv_columns = ('mean_hue', 'mean_saturation', 'mean_value')
for channel, column in enumerate(hsv_columns):
    df[column] = mean_hsv[:, channel]

df.head(10)
Out[39]:
In [66]:
# Call column_to_sprites on the `image_file` column, requesting 500 samples
# (fixed random_state) sorted by the `mean_value` column.
img = amazon_products.image_utils.column_to_sprites(
    data=df,
    image_column='image_file',
    image_directory=image_dir,
    sort_by='mean_value',
    n_samples=500,
    random_state=42,
    n_jobs=-1,
)
img
Out[66]: