In [ ]:
# import pymongo
from pymongo import MongoClient
from pprint import pprint
In [ ]:
# Create client
client = MongoClient('mongodb://localhost:32768')
In [ ]:
# Connect to database
db = client['fifa']
In [ ]:
# Get collection
my_collection = db['player']
In [ ]:
def print_docs(pipeline, limit=5):
pipeline.append({'$limit':limit})
# Run Aggregation
docs = my_collection.aggregate(pipeline)
# Print Results
for idx, doc in enumerate(docs):
# print(type(doc))
pprint(doc)
# print(f"#{idx + 1}: {doc}\n\n")
Aggregation
In [ ]:
# $match - Filter out Goalkeepers
match_a = {
'$match': {
'positionFull': {'$ne': 'Goalkeeper'}
}
}
# Create Pipeline
pipeline = [
match_a,
]
# Fetch and Print the Results
print_docs(pipeline, limit=1)
In [ ]:
# $project - Keep only the fields we're interested in
project_a = {
'$project': {
'_id': True, # Note: not required, _id is included by default
'name': {'$concat': ['$firstName', ' ', '$lastName']},
'pos': '$positionFull', # Note: renaming
'rating': True,
'attributes': True
}
}
# Create Pipeline
pipeline = [
match_a,
project_a,
]
# Fetch and Print the Results
print_docs(pipeline, limit=5)
In [ ]:
# $unwind - Convert N documents to 6*N documents (so we can do math on attributes)
unwind_a = {
'$unwind': '$attributes'
}
# Create Pipeline
pipeline = [
match_a,
project_a,
unwind_a,
]
# Fetch and Print the Results
print_docs(pipeline, limit=5)
In [ ]:
# $group - $sum the value of the attributes (and pass the rest of the fields through the _id)
group_a = {
'$group': {
'_id': {
'id': '$_id',
'rating': '$rating',
'name': '$name',
'pos': '$pos'
},
"sum_attributes": {
'$sum': "$attributes.value"
}
}
}
# Create Pipeline
pipeline = [
match_a,
project_a,
unwind_a,
group_a,
]
# Fetch and Print the Results
print_docs(pipeline, limit=5)
In [ ]:
# $project - Keep only the fields we're interested in
# Note: this is our second $project operator !!!
project_b = {
'$project': {
'_id': False, # turn off _id
'id': '$_id.id',
'name': '$_id.name',
'pos': '$_id.pos',
'rating': '$_id.rating',
'avg_attributes': {"$divide": ['$sum_attributes', 6]},
'rating_attribute_difference': {"$subtract": [{"$divide": ['$sum_attributes', 6]}, '$_id.rating']}
}
}
# Create Pipeline
pipeline = [
match_a,
project_a,
unwind_a,
group_a,
project_b,
]
# Fetch and Print the Results
print_docs(pipeline, limit=5)
In [ ]:
# $match - Find anybody rated LESS than 75 that has a higher than 75 avg_attributes
# Note: this is our second $match operator !!!
match_b = {
'$match': {
'rating': {'$lt': 75},
'avg_attributes': {'$gte': 75}
}
}
# Create Pipeline
pipeline = [
match_a,
project_a,
unwind_a,
group_a,
project_b,
match_b,
]
# Fetch and Print the Results
print_docs(pipeline, limit=5)
In [ ]:
# $sort - Based on the amount of injustice
# Note: This step could be placed above previous "$match" step, but placing it here is more efficient with less
# data to sort
sort_a = {
'$sort': {
'rating_attribute_difference': -1
}
}
# Create Pipeline
pipeline = [
match_a,
project_a,
unwind_a,
group_a,
project_b,
match_b,
sort_a,
]
# Fetch and Print the Results
print_docs(pipeline, limit=5)
In [ ]:
# Create Pipeline
pipeline = [match_a, project_a, unwind_a, group_a, project_b, match_b, sort_a]
# Run Aggregation
docs = my_collection.aggregate(pipeline)
# Print Results
for idx, doc in enumerate(docs):
print(f"#{idx + 1}: {doc['name']}, a {doc['pos']}, is rated {doc['rating']} instead of {doc['avg_attributes']:.0f}")