Aggregation (via pymongo)


In [ ]:
# import pymongo
from pymongo import MongoClient
from pprint import pprint

In [ ]:
# Create client
client = MongoClient('mongodb://localhost:32768')

In [ ]:
# Connect to database
db = client['fifa']

In [ ]:
# Get collection
my_collection = db['player']

Do An Aggregation

Basic process to convert pipelines from a JavaScript array to a Python list

  1. Convert all comments (from "//" to "#")
  2. Title-case all true/false to True/False
  3. Quote all operators and fields ($match --> '$match')
  4. Important: When using $sort operator in Python 2, wrap list with SON() method (from bson import SON)

Tips to avoid above process

  1. Use 1/0 for True/False
  2. Quote things in JavaScript ahead of time

Helper Functions


In [ ]:
def print_docs(pipeline, limit=5):

    pipeline.append({'$limit':limit})

    # Run Aggregation
    docs = my_collection.aggregate(pipeline)

    # Print Results
    for idx, doc in enumerate(docs):
#         print(type(doc))
        pprint(doc)
    #     print(f"#{idx + 1}: {doc}\n\n")

Aggregation


In [ ]:
# $match - Filter out Goalkeepers
match_a = {
    '$match': {
        'positionFull': {'$ne': 'Goalkeeper'}
    }
}

# Create Pipeline
pipeline = [
    match_a, 
]

# Fetch and Print the Results
print_docs(pipeline, limit=1)

In [ ]:
# $project - Keep only the fields we're interested in
project_a = {
    '$project': {
        '_id': True,  # Note: not required, _id is included by default
        'name': {'$concat': ['$firstName', ' ', '$lastName']},
        'pos': '$positionFull',  # Note: renaming
        'rating': True,
        'attributes': True
    }
}

# Create Pipeline
pipeline = [
    match_a, 
    project_a, 
]

# Fetch and Print the Results
print_docs(pipeline, limit=5)

In [ ]:
# $unwind - Convert N documents to 6*N documents (so we can do math on attributes)
unwind_a = {
    '$unwind': '$attributes'
}

# Create Pipeline
pipeline = [
    match_a, 
    project_a, 
    unwind_a, 
]

# Fetch and Print the Results
print_docs(pipeline, limit=5)

In [ ]:
# $group - $sum the value of the attributes (and pass the rest of the fields through the _id)
group_a = {
    '$group': {
        '_id': {
            'id': '$_id',
            'rating': '$rating',
            'name': '$name',
            'pos': '$pos'
        },
        "sum_attributes": {
            '$sum': "$attributes.value"
        }
    }
}

# Create Pipeline
pipeline = [
    match_a, 
    project_a, 
    unwind_a, 
    group_a, 
]

# Fetch and Print the Results
print_docs(pipeline, limit=5)

In [ ]:
# $project - Keep only the fields we're interested in
# Note: this is our second $project operator !!!
project_b = {
    '$project': {
        '_id': False,  # turn off _id

        'id': '$_id.id',
        'name': '$_id.name',
        'pos': '$_id.pos',
        'rating': '$_id.rating',

        'avg_attributes': {"$divide": ['$sum_attributes', 6]},
        'rating_attribute_difference': {"$subtract": [{"$divide": ['$sum_attributes', 6]}, '$_id.rating']}
    }
}

# Create Pipeline
pipeline = [
    match_a, 
    project_a, 
    unwind_a, 
    group_a, 
    project_b,
]

# Fetch and Print the Results
print_docs(pipeline, limit=5)

In [ ]:
# $match - Find anybody rated LESS than 75 that has a higher than 75 avg_attributes
# Note: this is our second $match operator !!!
match_b = {
    '$match': {
        'rating': {'$lt': 75},
        'avg_attributes': {'$gte': 75}
    }
}

# Create Pipeline
pipeline = [
    match_a, 
    project_a, 
    unwind_a, 
    group_a, 
    project_b, 
    match_b, 
]

# Fetch and Print the Results
print_docs(pipeline, limit=5)

In [ ]:
# $sort - Based on the amount of injustice
# Note: This step could be placed above previous "$match" step, but placing it here is more efficient with less
#       data to sort
sort_a = {
    '$sort': {
        'rating_attribute_difference': -1
    }
}

# Create Pipeline
pipeline = [
    match_a, 
    project_a, 
    unwind_a, 
    group_a, 
    project_b, 
    match_b, 
    sort_a,
]

# Fetch and Print the Results
print_docs(pipeline, limit=5)

Final Pipeline


In [ ]:
# Create Pipeline
pipeline = [match_a, project_a, unwind_a, group_a, project_b, match_b, sort_a]

# Run Aggregation
docs = my_collection.aggregate(pipeline)

# Print Results
for idx, doc in enumerate(docs):
    print(f"#{idx + 1}: {doc['name']}, a {doc['pos']}, is rated {doc['rating']} instead of {doc['avg_attributes']:.0f}")