In [118]:
# assuming the data was imported into the database named stores
from pymongo import MongoClient
import pandas as pd
from IPython.display import HTML, display
db = MongoClient()['stores']
TOTAL_NUMBER_OF_PRODUCTS = db.data.count()
display(HTML(
"products/records in the database: <b>{:,}</b>".format(
TOTAL_NUMBER_OF_PRODUCTS)))
In [119]:
# The number of uniq. values for given key in database
%matplotlib inline
from tabulate import tabulate
import matplotlib.pyplot as plt
plt.style.use('seaborn')
plt.rcParams["figure.figsize"] = (20,8)
fields = ['brand', 'category', 'size', 'code', 'color']
values_by_key = []
for field in fields:
res = db.data.distinct(field)
avg_items = round(TOTAL_NUMBER_OF_PRODUCTS / len(res))
values_by_key.append([field, int(len(res)), avg_items])
print(tabulate(values_by_key, headers=['Field', 'Uniq. Values', 'Avg. Products per Field'], tablefmt="simple"))
df_values_by_key = pd.DataFrame(values_by_key, index=fields, columns=['Field', 'Uniq', 'Avg'])
df_values_by_key.plot.bar()
Out[119]:
In [120]:
DISTRIBUTORS = list(db.data.distinct("source"))
results = db.data.aggregate(
[
{
"$group": {
"_id": "$source",
"count": {"$sum": 1},
}
},
{
"$sort": {
"count": -1,
}
}
]
)
PRODUCTS_PER_DISTRIBUTOR = [
(str(x['_id']), x['count'])
for x in list(results)
]
print(tabulate(PRODUCTS_PER_DISTRIBUTOR,
headers=['Distributor', 'Number of Products'],
tablefmt="simple"))
df_values_by_key = pd.DataFrame(PRODUCTS_PER_DISTRIBUTOR,
index=[x[0] for x in PRODUCTS_PER_DISTRIBUTOR],
columns=['Distributor', 'Products'])
df_values_by_key.iloc[::-1].plot.barh()
Out[120]:
In [121]:
results = db.data.aggregate(
[
{
"$group": {
"_id": "$brand",
"count": {"$sum": 1},
}
},
{
"$sort": {
"count": -1,
}
},
{
"$limit": 20
}
]
)
TOP_BRANDS = [(str(x['_id']), x['count']) for x in list(results)]
print(tabulate(TOP_BRANDS,
headers=['Brand', 'Number of Products'], tablefmt="simple"))
df_values_by_key = pd.DataFrame(TOP_BRANDS,
index=[x[0] for x in TOP_BRANDS],
columns=['Brand', 'Products'])
df_values_by_key.iloc[::-1].plot.barh()
Out[121]:
Brand field is empty for 126929 + 64153 products.
In [122]:
results = db.data.aggregate(
[
{
"$group": {
"_id": "$size",
"count": {"$sum": 1},
}
},
{
"$sort": {
"count": -1,
}
},
{
"$limit": 20
}
]
)
TOP_SIZES = [(str(x['_id']), x['count']) for x in list(results)]
print(tabulate(TOP_SIZES, headers=['Size', 'Number of Products'], tablefmt="simple"))
df_values_by_key = pd.DataFrame(TOP_SIZES,
index=[x[0] for x in TOP_SIZES],
columns=['Size', 'Products'])
df_values_by_key.iloc[::-1].plot.barh()
Out[122]:
In [123]:
results = db.data.aggregate(
[
{
"$group": {
"_id": "$category",
"count": {"$sum": 1},
}
},
{
"$sort": {
"count": -1,
}
},
{
"$limit": 20
}
]
)
TOP_CATEGORIES = [(str(x['_id']), x['count']) for x in list(results)]
print(tabulate(TOP_CATEGORIES, headers=['Category', 'Number of Products'], tablefmt="simple"))
df_values_by_key = pd.DataFrame(TOP_CATEGORIES,
index=[x[0] for x in TOP_CATEGORIES],
columns=['Category', 'Products'])
df_values_by_key.iloc[::-1].plot.barh()
Out[123]:
In [124]:
results = db.data.aggregate(
[
{
"$group": {
"_id": "$color",
"count": {"$sum": 1},
}
},
{
"$sort": {
"count": -1,
}
},
{
"$limit": 20
}
]
)
TOP_COLORS = [(str(x['_id']), x['count']) for x in list(results)]
print(tabulate(TOP_COLORS, headers=['Color', 'Number of Products'], tablefmt="simple"))
df_values_by_key = pd.DataFrame(TOP_COLORS,
index=[x[0] for x in TOP_COLORS],
columns=['Color', 'Products'])
df_values_by_key.iloc[::-1].plot.barh()
Out[124]:
In [125]:
results = db.data.aggregate(
[
{
"$group": {
"_id": "$code",
"count": {"$sum": 1},
}
},
{
"$sort": {
"count": -1,
}
},
{
"$limit": 20
}
]
)
TOP_CODES = [(str(x['_id']), x['count']) for x in list(results)]
print(tabulate(TOP_CODES, headers=['Code', 'Number of Products'], tablefmt="simple"))
df_values_by_key = pd.DataFrame(TOP_CATEGORIES,
index=[x[0] for x in TOP_CATEGORIES],
columns=['Code', 'Products'])
df_values_by_key.iloc[::-1].plot.barh()
Out[125]:
In [126]:
MEASUREMENT_PER_DISTRIBUTOR = db.data.aggregate(
[
{
"$group": {
"_id": "$source",
"brands": {"$addToSet": "$brand"},
"colors": {"$addToSet": "$color"},
"sizes": {"$addToSet": "$size"},
"categories": {"$addToSet": "$category"},
"codes": {"$addToSet": "$code"}
}
},
{
"$project": {
"_id": 1,
"brands": {"$size": "$brands"},
"colors": {"$size": "$colors"},
"sizes": {"$size": "$sizes"},
"categories": {"$size": "$categories"},
"codes": {"$size": "$codes"},
}
},
{
"$sort": {
"categories": -1,
"brands": -1,
"colors": -1,
"sizes": -1,
"codes": -1,
}
},
]
)
TABLE_MEASUREMENTS_PER_DISTRIBUTOR = [
(str(x['_id']), x['brands'], x['colors'],
x['sizes'], x['categories'], x['codes'])
for x in list(MEASUREMENT_PER_DISTRIBUTOR)
]
print(tabulate(TABLE_MEASUREMENTS_PER_DISTRIBUTOR,
headers=[
'Distributor', 'Brands', 'Colors',
'Sizes', 'Categories', 'Codes'],
tablefmt="simple"))
df_values_by_key = pd.DataFrame(TABLE_MEASUREMENTS_PER_DISTRIBUTOR,
index=[x[0] for x in TABLE_MEASUREMENTS_PER_DISTRIBUTOR],
columns=['Distributor', 'Brands', 'Colors', 'Sizes', 'Categories', 'Codes'])
df_values_by_key.iloc[::-1].plot.bar()
Out[126]:
In [203]:
import numpy as np
from sklearn.preprocessing import normalize
from plotly.offline import iplot, init_notebook_mode
from plotly.graph_objs import Bar, Layout, Figure
init_notebook_mode(connected=True)
def get_bar(df, name, color):
df_ = df[1:]
return Bar(
x=['Brand', 'Color', 'Size', 'Category', 'Code'],
y=df_,
name=name,
marker=dict(color=color))
data = []
colors = ['#ffcdd2', '#a2d5f2', '#235412', '#415220', '#59606d']
for dist in TABLE_MEASUREMENTS_PER_DISTRIBUTOR:
name = dist[0]
data.append(get_bar(dist, name, str(iter(colors))))
layout = Layout(title="Joined chart",
xaxis=dict(title='Measurement'),
yaxis=dict(title='Products'))
fig = Figure(data=data, layout=layout)
iplot(fig, filename='0.1-joined_chart')
Looking on chart above easy i guess i'll start with sizes. categories and brands.