In [98]:
    
%matplotlib inline
import numpy as np
import pandas as pd
from scipy.stats import entropy
from tabulate import tabulate
from pymongo import MongoClient
import matplotlib.pyplot as plt
plt.style.use('seaborn')
plt.rcParams["figure.figsize"] = (20,8)
db = MongoClient()['stores']
TOTAL_NUMBER_OF_PRODUCTS = db.data.count_documents({})  # count() is deprecated in recent pymongo
results = db.data.aggregate(
    [
        {
            "$group": {
                "_id": "$size",
                "count": {"$sum": 1},
            }
        },
        {
            "$sort": {
                "count": -1,
            }
        }
    ]
)
ALL_SIZES = [(str(x['_id']), x['count']) for x in list(results)]
print('Number of unique sizes: {}'.format(len(ALL_SIZES)))
    
    
In [99]:
    
DISTRIBUTORS = list(db.data.distinct("source"))
results = db.data.aggregate(
    [
        {
            "$group": {
                "_id": "$source",
                "sizes": {"$addToSet": "$size"},
            }
        },
        {
            "$project": {
                "_id": 1,
                "count": {"$size": "$sizes"}
            }
        },
        {
            "$sort": {
                "count": -1,
            }
        }
    ]
)
SIZES_PER_DISTRIBUTOR = [
    (str(x['_id']), x['count']) 
    for x in list(results)
]
print(tabulate(SIZES_PER_DISTRIBUTOR, 
               headers=['Distributor', 'Number of unique sizes'], 
               tablefmt="simple"))
df_values_by_key = pd.DataFrame(SIZES_PER_DISTRIBUTOR, 
                                index=[x[0] for x in SIZES_PER_DISTRIBUTOR], 
                                columns=['Distributor', 'Sizes'])
df_values_by_key.iloc[::-1].plot.barh()
    
    
    Out[99]:
    [bar chart: number of unique sizes per distributor]
Print a joint table with the first 180 sizes, three (size, count) pairs per row.
In [100]:
    
import operator
from functools import reduce  # reduce moved to functools in Python 3

all_sizes_table = []
number_of_sizes = 180
# fold the flat (size, count) list into rows of three pairs each
for sizes in zip(ALL_SIZES[0:number_of_sizes:3],
                 ALL_SIZES[1:number_of_sizes:3],
                 ALL_SIZES[2:number_of_sizes:3]):
    all_sizes_table.append(list(reduce(operator.add, sizes)))
print(
    tabulate(
        all_sizes_table[:60], 
        headers=3*['Size', 'Number of Products'], 
        tablefmt="simple"))
    
    
In [101]:
    
# calculate probability vector
p = [x[1] for x in ALL_SIZES]
size_prob_vector = np.array(p) / TOTAL_NUMBER_OF_PRODUCTS
# calculate entropy
first_entropy = entropy(size_prob_vector)
print("Data entropy:", first_entropy)
    
    
In [49]:
    
# create new collection
db.data.aggregate(
    [
        {
            "$project": {
                "_id": 1,
                "source": 1,
                "size": 1,
            },
        },
        {
            "$out": "size_mapping"
        }
        
    ]
)
print('Db "size_mapping" created')
# create indexes
db.size_mapping.create_index([("size", 1)])
db.size_mapping.create_index([("source", 1)])
print('Indexes "size", "source" for "size_mapping" created.')
print(list(db.size_mapping.find().limit(5)))
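# A quick sanity check that equality lookups on "size" will use the new
# index (assuming a MongoDB 3.x+ server; 'XL' is just an example value):
plan = db.size_mapping.find({"size": "XL"}).explain()
print(plan["queryPlanner"]["winningPlan"])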
    
    
In [102]:
    
SIZES_LIST_PER_DISTRIBUTOR = db.size_mapping.aggregate(
    [
        {
            "$group": {
                "_id": "$source",
                "sizes": {"$addToSet": "$size"},
            },
        },
        {
            "$project": {
                "_id": 1,
                "sizes": 1,
                "number_of_sizes": {"$size": "$sizes"},
            }
        },
        {
            "$sort": {
                "number_of_sizes": -1
            }
        }
        
    ]
)
TABLE_SIZES_LIST_PER_DISTRIBUTOR = [
    (str(x['_id']), x['sizes'], x['number_of_sizes']) 
    for x in SIZES_LIST_PER_DISTRIBUTOR
]
    
In [103]:
    
for distr, sizes, num in TABLE_SIZES_LIST_PER_DISTRIBUTOR:
    print('Sizes for: "{}"'.format(distr))
    print(", ".join(sizes))
    print(80*"-")
    
    
Since the number of sizes is low (1117 unique sizes), the task can be resolved with trivial brute force, i.e. map sizes using a mapping table.
While observing the data I noticed that sizes are defined for adult, youth, toddler and baby:
[size charts: kid/toddler/baby; youth]
I.e. we could tag products according to their size.
TAG_FROM_SIZE = {
    'adult': ['XS', 'S', 'M', 'L', 'XL', '2XL', '3XL', '4XL', '5XL', '6XL'],
    'youth': ['YXS', 'YSM', 'YMD', 'YLG', 'YXL', '8H', '10H', '12H', '14H', '16H', '18H', '20H'],
    'kid': []
}
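As a sketch of how such tagging could work (hypothetical helper, not run against the data; sizes not covered by any list fall through to 'UNDEFINED'):
def tag_from_size(size, tags=TAG_FROM_SIZE):
    """Return the age-group tags whose size lists contain `size`."""
    return [tag for tag, sizes in tags.items() if size in sizes] or ['UNDEFINED']
# e.g. tag_from_size('YXL') -> ['youth'], tag_from_size('M') -> ['adult']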
In [50]:
    
SIZES_MAPPING = {
    'ALL': [],
    'NO SIZE': ['PLAIN', 'CONE', 'BLANKET'],
    'ONE': ['OS', 'ONE SIZE', '1 SIZ', 'O/S'],
    'XS': ['XXS', 'XX-SMALL', '2XS'],
    'S': ['SMALL', 'S/M'],
    'M': ['MEDIUM', 'S/M', 'M/L'],
    'L': ['LARGE', 'L/XL', 'M/L'],
    'XL': ['EXTRA', 'XLT', 'XT', 'L/XL'],
    '2XL': ['2X', 'XXL', '2XT', '2XLL', '2X/', '2XLT'],
    '3XL': ['3X', '3XT', '3XLL', '3XLT'],
    '4XL': ['4X', '4XT', '4XLT'],
    '5XL': ['5X', '5XT', '5XLT'],
    '6XL': ['6X'],
}
def build_matching_table(matching_rules):
    """Build a matching table from matching rules.

    :param matching_rules: matching rules used to build the matching table
    :type matching_rules: dict
    :return: matching table, e.g. `{'S/M': {'S', 'M'}, '2X': {'2XL'}, ...}`
    :rtype: dict
    """
    matching_table = {}
    # invert the rules into a "shortcut" -> {"group_key", ...} table
    for key, values in matching_rules.items():
        if not values:  # skip undefined rules, i.e. "[]"
            continue
        # every group key also maps to itself
        # NOTE: set('ab') is {'a', 'b'}, so we can't write
        # matching_table[key] = set(key)
        matching_table.setdefault(key, set()).add(key)
        for value in values:
            matching_table.setdefault(value, set()).add(key)
    return matching_table
 
MATCHING_TABLE = build_matching_table(SIZES_MAPPING)
print(tabulate(MATCHING_TABLE.items(), headers=['From', 'To'], tablefmt="simple"))
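# a few sanity checks; the expected sets follow from the rules above
assert MATCHING_TABLE['S/M'] == {'S', 'M'}  # combined size maps to both groups
assert MATCHING_TABLE['2X'] == {'2XL'}      # shortcut maps to its group key
assert MATCHING_TABLE['XL'] == {'XL'}       # group key maps to itself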
    
    
In [ ]:
    
# process data into the new collection
# def get_groups(mtable, size):
#     """Get the size groups for the given `size` according to the matching table.
#     :param mtable: matching table built by `build_matching_table`
#     :type mtable: dict
#     :param size: size string as stored in the collection
#     :type size: str
#     :return: list of size groups, or ``['UNDEFINED']`` if not found
#     :rtype: list
#     """
#     return list(mtable.get(size, {'UNDEFINED'}))
# for k, v in MATCHING_TABLE.items():
#     res = db.size_mapping.update_many(
#         {"size": k}, 
#         {"$set": {"size": get_groups(MATCHING_TABLE, k)}})
#     print(res.raw_result)
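# Expected behavior of the fixed lookup (hypothetical examples):
#   get_groups(MATCHING_TABLE, 'S/M')     -> ['S', 'M']  (set order not guaranteed)
#   get_groups(MATCHING_TABLE, 'UNKNOWN') -> ['UNDEFINED']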
    
Let's calculate the data entropy for the results.
In [27]:
    
results = db.size_mapping.aggregate(
    [
        {
            "$group": {
                "_id": "$size",
                "count": {"$sum": 1},
            }
        },
        {
            "$sort": {
                "count": -1,
            }
        }
    ]
)
NEW_SIZES = [(str(x['_id']), x['count']) for x in list(results)]
print(
    "\n" + 
    tabulate(NEW_SIZES[:20], headers=['Size', 'Number of Products'], tablefmt="orgtbl") +
    "\n"
)
# calculate probability vector
p = [count for _, count in NEW_SIZES]
size_prob_vector = np.array(p) / TOTAL_NUMBER_OF_PRODUCTS
# calculate entropy after mapping (keep the baseline `first_entropy` intact)
new_entropy = entropy(size_prob_vector)
print("Data entropy:", new_entropy)
    
    
In [36]:
    
total_matched_products = sum(x[1] for x in NEW_SIZES[:11])
percent_from_db_total = round((total_matched_products / TOTAL_NUMBER_OF_PRODUCTS) * 100, 2)
print("Matched: {} Percent of total: {}%".format(total_matched_products, percent_from_db_total))
    
    