In [11]:
# common imports
%matplotlib inline
import numpy as np
import pandas as pd
from tabulate import tabulate
from pymongo import MongoClient
import matplotlib.pyplot as plt
# Plot styling for the notebook. Matplotlib 3.6 renamed its bundled seaborn
# style sheets to 'seaborn-v0_8*' and deprecated the old names, so the legacy
# name is used only when this installation still lists it.
SEABORN_STYLE = (
    'seaborn' if 'seaborn' in plt.style.available else 'seaborn-v0_8'
)
plt.style.use(SEABORN_STYLE)
# Default figure size (width, height) for every plot in the notebook.
plt.rcParams["figure.figsize"] = (20, 8)
# Handle to the `stores` database; MongoClient() is created with the
# driver's default connection settings.
db = MongoClient()['stores']
# assuming there is a collection named
# `size_mapping` with fields `['_id', 'source', 'size']`
# Total number of documents in `size_mapping`. `Collection.count()` was
# deprecated in PyMongo 3.7 and removed in 4.0; `count_documents({})` with an
# empty filter is the supported way to count every document.
TOTAL_NUMBER_OF_RECORDS = db.size_mapping.count_documents({})
# Number of distinct values found in the `size` field.
NUMBER_OF_UNIQ_SIZES = len(db.size_mapping.distinct("size"))
# Aggregation over `size_mapping`, producing one document per distinct `size`.

# Stage 1: group by `size`, collecting the set of `source` values seen for it
# and counting how many documents carry that size.
group_by_size = {
    "$group": {
        "_id": "$size",
        "distributors": {"$addToSet": "$source"},
        "count": {"$sum": 1},
    }
}

# Stage 2: keep the grouped fields and add the length of the `distributors`
# array as `distributors_number`.
with_distributor_count = {
    "$project": {
        "_id": 1,
        "distributors_number": {"$size": "$distributors"},
        "distributors": 1,
        "count": 1,
    },
}

# Stage 3: largest `count` first.
most_frequent_first = {
    "$sort": {
        "count": -1,
    }
}

results = db.size_mapping.aggregate(
    [group_by_size, with_distributor_count, most_frequent_first]
)
# Flatten every aggregation result into a tuple of
# (size as str, count, number of distributors, distributors).
# The `results` iterable is consumed directly by the comprehension, so no
# intermediate list copy is built; it keeps the order `results` yields.
SIZES_WITH_DISTRIBUTORS = [
    (
        str(doc['_id']),
        doc['count'],
        doc['distributors_number'],
        doc['distributors'],
    )
    for doc in results
]
# One-line summary of the collection: total documents and distinct sizes.
summary_line = 'Records: {} Uniq. sizes: {}'.format(
    TOTAL_NUMBER_OF_RECORDS,
    NUMBER_OF_UNIQ_SIZES,
)
print(summary_line)

# First 20 entries of SIZES_WITH_DISTRIBUTORS, keeping only the first three
# columns (size, count, number of distributors); the distributor list itself
# is left out of the printed table.
table = [
    (size, count, distributors_number)
    for size, count, distributors_number, _ in SIZES_WITH_DISTRIBUTORS[:20]
]
rendered_table = tabulate(
    table,
    headers=['Size', 'Count', '# of Distr.'],
    tablefmt="simple",
)
print(rendered_table)
In [ ]: