Group sizes with ML


In [11]:
# common imports
%matplotlib inline
import numpy as np
import pandas as pd
from tabulate import tabulate
from pymongo import MongoClient
import matplotlib.pyplot as plt
# The bundled seaborn style sheets were renamed to 'seaborn-v0_8' in
# matplotlib 3.6 and the old 'seaborn' name was removed in 3.8, so pick
# whichever name this installation provides (falling back to the default
# style rather than failing the whole cell).
_seaborn_style = next(
    (name for name in ('seaborn-v0_8', 'seaborn')
     if name in plt.style.available),
    'default',
)
plt.style.use(_seaborn_style)
# Wide default figure size (inches) for all plots in this notebook.
plt.rcParams["figure.figsize"] = (20, 8)

# Connect with PyMongo's default client settings and select the `stores` database.
db = MongoClient()['stores']

# Assumes there is a collection named `size_mapping`
# with fields `['_id', 'source', 'size']`.
# `Collection.count()` was deprecated in PyMongo 3.7 and removed in 4.0;
# `count_documents({})` is the exact-count replacement.
TOTAL_NUMBER_OF_RECORDS = db.size_mapping.count_documents({})
# Number of distinct values stored in the `size` field.
NUMBER_OF_UNIQ_SIZES = len(db.size_mapping.distinct("size"))

# Stage 1: one document per distinct size, with the set of sources that use
# it and the number of records carrying it.
group_by_size = {
    "$group": {
        "_id": "$size",
        "distributors": {"$addToSet": "$source"},
        "count": {"$sum": 1},
    }
}

# Stage 2: keep the grouped fields and add the number of distinct sources.
with_distributor_count = {
    "$project": {
        "_id": 1,
        "distributors_number": {"$size": "$distributors"},
        "distributors": 1,
        "count": 1,
    }
}

# Stage 3: most frequent sizes first.
most_frequent_first = {"$sort": {"count": -1}}

results = db.size_mapping.aggregate(
    [group_by_size, with_distributor_count, most_frequent_first]
)

# Flatten each aggregated document into a
# (size as str, record count, number of distributors, distributors) tuple.
# The cursor is consumed here, so `results` cannot be iterated a second time.
SIZES_WITH_DISTRIBUTORS = [
    (
        str(doc['_id']),
        doc['count'],
        doc['distributors_number'],
        doc['distributors'],
    )
    for doc in results
]
# Headline numbers for the whole collection.
summary_line = 'Records: {} Uniq. sizes: {}'.format(
    TOTAL_NUMBER_OF_RECORDS, NUMBER_OF_UNIQ_SIZES)
print(summary_line)

# Show the 20 most frequent sizes; the distributor list itself (4th tuple
# element) is dropped so that the table stays narrow.
# NOTE(review): the saved cell output lists 30 rows, which does not match this
# slice of 20 -- the output looks stale; re-run the cell to refresh it.
table = [row[:3] for row in SIZES_WITH_DISTRIBUTORS[:20]]
column_titles = ['Size', 'Count', '# of Distr.']
print(tabulate(table, headers=column_titles, tablefmt="simple"))


Records: 1561159 Uniq. sizes: 1117
Size        Count    # of Distr.
--------  -------  -------------
M          202965             20
L          202641             20
S          202589             20
XL         186166             20
2XL        152014             18
3XL        106966             19
XS          71104             18
4XL         50869             17
MEDIUM      31645              1
LARGE       31599              2
SMALL       31506              1
EXTRA       31098              1
2X          20091              9
3X          20085              8
5XL         17541             18
ONE         13786              7
XXL         11288              8
OS           7615              7
ONE SIZE     7269             10
XX           7184              1
6XL          6993             15
4X           6945              8
ALL          4087              2
L/XL         3901             14
XLT          3819             13
OSFA         3687              3
S/M          3490             15
LT           3355             13
2XLT         3004             12
3XLT         2782             11

In [ ]: