In [98]:
%matplotlib inline
import numpy as np
import pandas as pd
from scipy.stats import entropy
from tabulate import tabulate
from pymongo import MongoClient
import matplotlib.pyplot as plt
plt.style.use('seaborn')
plt.rcParams["figure.figsize"] = (20,8)
db = MongoClient()['stores']
# total number of product documents (count_documents replaces the deprecated count())
TOTAL_NUMBER_OF_PRODUCTS = db.data.count_documents({})
results = db.data.aggregate(
[
{
"$group": {
"_id": "$size",
"count": {"$sum": 1},
}
},
{
"$sort": {
"count": -1,
}
}
]
)
ALL_SIZES = [(str(x['_id']), x['count']) for x in list(results)]
print('Number of uniq. sizes: {}'.format(len(ALL_SIZES)))
In [99]:
DISTRIBUTORS = list(db.data.distinct("source"))
results = db.data.aggregate(
[
{
"$group": {
"_id": "$source",
"sizes": {"$addToSet": "$size"},
}
},
{
"$project": {
"_id": 1,
"count": {"$size": "$sizes"}
}
},
{
"$sort": {
"count": -1,
}
}
]
)
SIZES_PER_DISTRIBUTOR = [
(str(x['_id']), x['count'])
for x in list(results)
]
print(tabulate(SIZES_PER_DISTRIBUTOR,
headers=['Distributor', 'Number of uniq. Sizes'],
tablefmt="simple"))
df_values_by_key = pd.DataFrame(SIZES_PER_DISTRIBUTOR,
index=[x[0] for x in SIZES_PER_DISTRIBUTOR],
columns=['Distributor', 'Sizes'])
df_values_by_key.iloc[::-1].plot.barh()
Out[99]:
Print a joint table of the most frequent sizes: 60 rows, three size/count pairs per row (180 sizes in total).
In [100]:
import operator
from functools import reduce

# reshape the flat (size, count) list into rows of three consecutive pairs,
# so the table prints three size columns side by side
all_sizes_table = []
number_of_sizes = 180
for sizes in zip(ALL_SIZES[0:number_of_sizes:3],
                 ALL_SIZES[1:number_of_sizes:3],
                 ALL_SIZES[2:number_of_sizes:3]):
    all_sizes_table.append(list(reduce(operator.add, sizes)))
print(
tabulate(
all_sizes_table[:60],
headers=3*['Size', 'Number of Products'],
tablefmt="simple"))
In [101]:
# calculate probability vector
p = [x[1] for x in ALL_SIZES]
size_prob_vector = np.array(p) / TOTAL_NUMBER_OF_PRODUCTS
# calculate entropy
first_entropy = entropy(size_prob_vector)
print("Data entropy:", first_entropy)
In [49]:
# create new collection
db.data.aggregate(
[
{
"$project": {
"_id": 1,
"source": 1,
"size": 1,
},
},
{
"$out": "size_mapping"
}
]
)
print('Collection "size_mapping" created')
# create indexes
db.size_mapping.create_index([("size", 1)])
db.size_mapping.create_index([("source", 1)])
print('Indexes "size", "source" for "size_mapping" created.')
print(list(db.size_mapping.find().limit(5)))
In [102]:
SIZES_LIST_PER_DISTRIBUTOR = db.size_mapping.aggregate(
[
{
"$group": {
"_id": "$source",
"sizes": {"$addToSet": "$size"},
},
},
{
"$project": {
"_id": 1,
"sizes": 1,
"number_of_sizes": {"$size": "$sizes"},
}
},
{
"$sort": {
"number_of_sizes": -1
}
}
]
)
TABLE_SIZES_LIST_PER_DISTRIBUTOR = [
(str(x['_id']), x['sizes'], x['number_of_sizes'])
for x in SIZES_LIST_PER_DISTRIBUTOR
]
In [103]:
for distr, sizes, num in TABLE_SIZES_LIST_PER_DISTRIBUTOR:
print('Sizes for: "{}"'.format(distr))
print(", ".join(sizes))
print(80*"-")
Since the number of unique sizes is low (1117), the task can be solved with trivial brute force, i.e. by mapping sizes through a lookup table.
While observing the data I noticed that sizes are defined for adults, youth, toddlers and babies (see the kid/toddler/baby size chart and the youth size chart), so products could also be tagged according to their size, as sketched after the mapping below:
TAG_FROM_SIZE = {
'adult': ['XS', 'S', 'M', 'L', 'XL', '2XL', '3XL', '4XL', '5XL', '6XL'],
'youth': ['YXS', 'YSM', 'YMD', 'YLG', 'YXL', '8H', '10H', '12H', '14H', '16H', '18H', '20H'],
'kid': []
}
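As an illustration only, a small helper could tag a product by its raw size string using the mapping above; `tag_for_size` is a hypothetical name, `TAG_FROM_SIZE` is assumed to be defined in a code cell, and since the `'kid'` rule is still empty, unmatched sizes fall back to `'undefined'`:

def tag_for_size(size, rules=TAG_FROM_SIZE):
    # normalize the raw size string before lookup
    size = size.strip().upper()
    for tag, sizes in rules.items():
        if size in sizes:
            return tag
    return 'undefined'  # no rule matched, e.g. the 'kid' list is not filled in yet

# tag_for_size('YXL') -> 'youth', tag_for_size('3XL') -> 'adult'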
In [50]:
SIZES_MAPPING = {
'ALL': [],
'NO SIZE': ['PLAIN', 'CONE', 'BLANKET'],
'ONE': ['OS', 'ONE SIZE', '1 SIZ', 'O/S'],
'XS': ['XXS', 'XX-SMALL', '2XS'],
'S': ['SMALL', 'S/M'],
'M': ['MEDIUM', 'S/M', 'M/L'],
'L': ['LARGE', 'L/XL', 'M/L'],
'XL': ['EXTRA', 'XLT', 'XT', 'L/XL'],
'2XL': ['2X', 'XXL', '2XT', '2XLL', '2X/', '2XLT'],
'3XL': ['3X', '3XT', '3XLL', '3XLT'],
'4XL': ['4X', '4XT', '4XLT'],
'5XL': ['5X', '5XT', '5XLT'],
'6XL': ['6X'],
}
def build_matching_table(matching_rules):
"""Build matching table from matching rules
:param matching_rules: matching rules used to build matching table
:type matching_rules: dict
:return: matching table `{'S/M: ['S', 'M'], '2X': ['2XL'], ...}`
:rtype: dict
"""
    matching_table = {}
    # transform matching rules into a "shortcut" -> {"group key", ...} table
    for key, values in matching_rules.items():
        if not values:  # skip undefined rules, i.e. "[]"
            continue
        # the group key always maps to itself
        # NOTE: set('ab') would be {'a', 'b'}, so `set(key)` cannot be used here
        matching_table.setdefault(key, set()).add(key)
        for value in values:
            matching_table.setdefault(value, set()).add(key)
    return matching_table
MATCHING_TABLE = build_matching_table(SIZES_MAPPING)
print(tabulate(MATCHING_TABLE.items(), headers=['From', 'To'], tablefmt="simple"))
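With these rules a shared size maps to every group that lists it, e.g. `MATCHING_TABLE['S/M']` is `{'S', 'M'}`, while a plain alias such as `'2X'` maps only to `{'2XL'}`.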
In [ ]:
# process data into the new table
# def get_groups(mtable, size):
#     """Get size groups for the given `size` according to the matching table
#     :param mtable: matching table built by `build_matching_table`
#     :type mtable: dict
#     :param size: size (case insensitive)
#     :type size: str
#     :return: list of size groups, or ``['UNDEFINED']`` if not found
#     :rtype: list
#     """
#     return list(mtable.get(size.upper(), ['UNDEFINED']))
# for k, v in MATCHING_TABLE.items():
# res = db.size_mapping.update_many(
# {"size": k},
# {"$set": {"size": get_groups(MATCHING_TABLE, k)}})
# print(res.raw_result)
Let's calculate the data entropy for the remapped results.
In [27]:
results = db.size_mapping.aggregate(
[
{
"$group": {
"_id": "$size",
"count": {"$sum": 1},
}
},
{
"$sort": {
"count": -1,
}
}
]
)
NEW_SIZES = [(str(x['_id']), x['count']) for x in list(results)]
print(
"\n" +
tabulate(NEW_SIZES[:20], headers=['Size', 'Number of Products'], tablefmt="orgtbl") +
"\n"
)
# calculate probability vector
p = []
for _, count in NEW_SIZES:
p.append(count)
size_prob_vector = np.array(p) / TOTAL_NUMBER_OF_PRODUCTS
# calculate entropy of the remapped sizes;
# a lower value than the initial entropy above means the size values
# have been consolidated into fewer, larger groups
new_entropy = entropy(size_prob_vector)
print("Data entropy:", new_entropy)
In [36]:
total_matched_products = sum(x[1] for x in NEW_SIZES[:11])
percent_from_db_total = round((total_matched_products / TOTAL_NUMBER_OF_PRODUCTS) * 100, 2)
print("Matched: {} Percent from total: {}".format(total_matched_products, percent_from_db_total))