MongoDB in Python


In [40]:
from pymongo import MongoClient, IndexModel, ASCENDING, DESCENDING
from bson.son import SON

cl = MongoClient()

scratch_db = cl.scratch

In [6]:
first = scratch_db.zips.find().limit(10)
for item in first:
    print(item)


{'_id': '01001', 'city': 'AGAWAM', 'loc': [-72.622739, 42.070206], 'pop': 15338, 'state': 'MA'}
{'_id': '01002', 'city': 'CUSHMAN', 'loc': [-72.51565, 42.377017], 'pop': 36963, 'state': 'MA'}
{'_id': '01005', 'city': 'BARRE', 'loc': [-72.108354, 42.409698], 'pop': 4546, 'state': 'MA'}
{'_id': '01007', 'city': 'BELCHERTOWN', 'loc': [-72.410953, 42.275103], 'pop': 10579, 'state': 'MA'}
{'_id': '01008', 'city': 'BLANDFORD', 'loc': [-72.936114, 42.182949], 'pop': 1240, 'state': 'MA'}
{'_id': '01010', 'city': 'BRIMFIELD', 'loc': [-72.188455, 42.116543], 'pop': 3706, 'state': 'MA'}
{'_id': '01011', 'city': 'CHESTER', 'loc': [-72.988761, 42.279421], 'pop': 1688, 'state': 'MA'}
{'_id': '01012', 'city': 'CHESTERFIELD', 'loc': [-72.833309, 42.38167], 'pop': 177, 'state': 'MA'}
{'_id': '01013', 'city': 'CHICOPEE', 'loc': [-72.607962, 42.162046], 'pop': 23396, 'state': 'MA'}
{'_id': '01020', 'city': 'CHICOPEE', 'loc': [-72.576142, 42.176443], 'pop': 31495, 'state': 'MA'}

Index comparison

See how indexes affect queries First without then with


In [23]:
scratch_db.zips.drop_indexes()

count = scratch_db.zips.find().count()
city_count = scratch_db.zips.find({"city": "FLAGSTAFF"}).count()
city_explain = scratch_db.zips.find({"city": "FLAGSTAFF"}).explain()['executionStats']

print(count)
print(city_count)
print(city_explain)


29353
2
{'executionSuccess': True, 'nReturned': 2, 'executionTimeMillis': 9, 'totalKeysExamined': 0, 'totalDocsExamined': 29353, 'executionStages': {'stage': 'COLLSCAN', 'filter': {'city': {'$eq': 'FLAGSTAFF'}}, 'nReturned': 2, 'executionTimeMillisEstimate': 10, 'works': 29355, 'advanced': 2, 'needTime': 29352, 'needYield': 0, 'saveState': 229, 'restoreState': 229, 'isEOF': 1, 'invalidates': 0, 'direction': 'forward', 'docsExamined': 29353}, 'allPlansExecution': []}

In [38]:
scratch_db.zips.drop_indexes()
scratch_db.zips.create_index([("city", ASCENDING)])


count = scratch_db.zips.find().count()
city_count = scratch_db.zips.find({"city": "FLAGSTAFF"}).count()
city_explain = scratch_db.zips.find({"city": "FLAGSTAFF"}).explain()['executionStats']

print(count)
print(city_count)
print(city_explain)


29353
2
{'executionSuccess': True, 'nReturned': 2, 'executionTimeMillis': 0, 'totalKeysExamined': 2, 'totalDocsExamined': 2, 'executionStages': {'stage': 'FETCH', 'nReturned': 2, 'executionTimeMillisEstimate': 0, 'works': 3, 'advanced': 2, 'needTime': 0, 'needYield': 0, 'saveState': 0, 'restoreState': 0, 'isEOF': 1, 'invalidates': 0, 'docsExamined': 2, 'alreadyHasObj': 0, 'inputStage': {'stage': 'IXSCAN', 'nReturned': 2, 'executionTimeMillisEstimate': 0, 'works': 3, 'advanced': 2, 'needTime': 0, 'needYield': 0, 'saveState': 0, 'restoreState': 0, 'isEOF': 1, 'invalidates': 0, 'keyPattern': {'city': 1}, 'indexName': 'city_1', 'isMultiKey': False, 'multiKeyPaths': {'city': []}, 'isUnique': False, 'isSparse': False, 'isPartial': False, 'indexVersion': 2, 'direction': 'forward', 'indexBounds': {'city': ['["FLAGSTAFF", "FLAGSTAFF"]']}, 'keysExamined': 2, 'seeks': 1, 'dupsTested': 0, 'dupsDropped': 0, 'seenInvalidated': 0}}, 'allPlansExecution': []}

You can see with the index it's execution is a bit different.

Seeing the executionTimeMillis parameter shows that the second one is executed much faster.

This is because the index allow you to search the index instead of all the documents.

Some other information about the dataset


In [47]:
print("Amount of cities per state:")
pipeline = [
    {"$unwind": "$state"},
    {"$group": {"_id": "$state", "count": {"$sum": 1}}},
    {"$sort": SON([("count", -1), ("_id", -1)])}
    ]
results = scratch_db.zips.aggregate(pipeline)
for result in results:
    print("State %s:   %d" % tuple(result.values()))


Amount of cities per state:
State TX:   1671
State NY:   1595
State CA:   1516
State PA:   1458
State IL:   1237
State OH:   1007
State MO:   994
State IA:   922
State MN:   882
State MI:   876
State VA:   816
State KY:   809
State FL:   804
State WI:   716
State KS:   715
State NC:   705
State IN:   676
State WV:   656
State GA:   635
State OK:   586
State TN:   582
State AR:   578
State NE:   574
State AL:   567
State NJ:   540
State WA:   484
State MA:   474
State LA:   464
State MD:   420
State CO:   414
State ME:   410
State ND:   391
State SD:   384
State OR:   384
State MS:   363
State SC:   350
State MT:   314
State NM:   276
State AZ:   270
State CT:   263
State ID:   244
State VT:   243
State NH:   218
State UT:   205
State AK:   195
State WY:   140
State NV:   104
State HI:   80
State RI:   69
State DE:   53
State DC:   24

In [50]:
print("Amount of cities with fewer then 50 people")
lt = scratch_db.zips.find({"pop": {"$lt": 50}})
print("%d cities" % lt.count())
for city in lt.limit(10):
    print("%s: %d" % (city['city'], city['pop']))


Amount of cities with fewer then 50 people
356 cities
BUCKLAND: 16
CAMBRIDGE: 0
CLAYVILLE: 45
EAST HEBRON: 47
WEST NOTTINGHAM: 27
BUSTINS ISLAND: 0
CUSHING ISLAND: 28
FRYE: 28
RUMFORD POINT: 36
CUSHING: 12

Geolocation

Mongodb also has build in support for geolocation indexes

This allows for searching for example nearby shops for a given location


In [67]:
scratch_db.zips.create_index([("loc", "2dsphere")])
flagstaff = scratch_db.zips.find_one({"city": "FLAGSTAFF"})
nearby = scratch_db.zips.find({"loc": {
    "$near": {
        "$geometry": {
            'type': 'Point',
            'coordinates': flagstaff['loc']
        },
        "$maxDistance": 50000
    }
}})

for city in nearby:
    print(city['city'])


FLAGSTAFF
FLAGSTAFF
MORMON LAKE
SEDONA