This IPython Notebook provides an interactive way to follow along with and explore the numbered examples from Mining the Social Web (2nd Edition). The intent behind this notebook is to reinforce the concepts from the sample code in a fun, convenient, and effective way. This notebook assumes that you are reading along with the book and have the context of the discussion as you work through these exercises.
In the somewhat unlikely event that you've stumbled across this notebook outside of its context on GitHub, you can find the full source code repository at https://github.com/ptwobrussell/Mining-the-Social-Web-2nd-Edition.
You are free to use or adapt this notebook for any purpose you'd like. However, please respect the Simplified BSD License that governs its use.
In [ ]:
import os
import sys
import envoy
data_file = os.path.join(os.getcwd(), 'enron.mbox.json')
# Run a command just as you would in a terminal on the virtual machine to
# import the data file into MongoDB.
r = envoy.run('mongoimport --db enron --collection mbox ' + \
'--file %s' % data_file)
# Print its standard output
print r.std_out
print sys.stderr.write(r.std_err)
In [ ]:
import json
import pymongo # pip install pymongo
from bson import json_util # Comes with pymongo
# Connects to the MongoDB server running on
# localhost:27017 by default
client = pymongo.MongoClient()
# Get a reference to the enron database
db = client.enron
# Reference the mbox collection in the Enron database
mbox = db.mbox
# The number of messages in the collection
print "Number of messages in mbox:"
print mbox.count()
print
# Pick a message to look at...
msg = mbox.find_one()
# Display the message as pretty-printed JSON. The use of
# the custom serializer supplied by PyMongo is necessary in order
# to handle the date field that is provided as a datetime.datetime
# tuple.
print "A message:"
print json.dumps(msg, indent=1, default=json_util.default)
In [ ]:
import json
import pymongo # pip install pymongo
from bson import json_util # Comes with pymongo
from datetime import datetime as dt
client = pymongo.MongoClient()
db = client.enron
mbox = db.mbox
# Create a small date range here of one day
start_date = dt(2001, 4, 1) # Year, Month, Day
end_date = dt(2001, 4, 2) # Year, Month, Day
# Query the database with the highly versatile "find" command,
# just like in the MongoDB shell.
msgs = [ msg
for msg in mbox.find({"Date" :
{
"$lt" : end_date,
"$gt" : start_date
}
}).sort("date")]
# Create a convenience function to make pretty-printing JSON a little
# less cumbersome
def pp(o, indent=1):
print json.dumps(msgs, indent=indent, default=json_util.default)
print "Messages from a query by date range:"
pp(msgs)
In [ ]:
import json
import pymongo # pip install pymongo
from bson import json_util # Comes with pymongo
client = pymongo.MongoClient()
db = client.enron
mbox = db.mbox
senders = [ i for i in mbox.distinct("From") ]
receivers = [ i for i in mbox.distinct("To") ]
cc_receivers = [ i for i in mbox.distinct("Cc") ]
bcc_receivers = [ i for i in mbox.distinct("Bcc") ]
print "Num Senders:", len(senders)
print "Num Receivers:", len(receivers)
print "Num CC Receivers:", len(cc_receivers)
print "Num BCC Receivers:", len(bcc_receivers)
In [ ]:
senders = set(senders)
receivers = set(receivers)
cc_receivers = set(cc_receivers)
bcc_receivers = set(bcc_receivers)
# Find the number of senders who were also direct receivers
senders_intersect_receivers = senders.intersection(receivers)
# Find the senders that didn't receive any messages
senders_diff_receivers = senders.difference(receivers)
# Find the receivers that didn't send any messages
receivers_diff_senders = receivers.difference(senders)
# Find the senders who were any kind of receiver by
# first computing the union of all types of receivers
all_receivers = receivers.union(cc_receivers, bcc_receivers)
senders_all_receivers = senders.intersection(all_receivers)
print "Num senders in common with receivers:", len(senders_intersect_receivers)
print "Num senders who didn't receive:", len(senders_diff_receivers)
print "Num receivers who didn't send:", len(receivers_diff_senders)
print "Num senders in common with *all* receivers:", len(senders_all_receivers)
In [ ]:
# In a Mongo shell, you could try this query for the same effect:
# db.mbox.find({"To" : {"$regex" : /.*enron.com.*/i} },
# {"To" : 1, "_id" : 0})
senders = [ i
for i in mbox.distinct("From")
if i.lower().find("@enron.com") > -1 ]
receivers = [ i
for i in mbox.distinct("To")
if i.lower().find("@enron.com") > -1 ]
cc_receivers = [ i
for i in mbox.distinct("Cc")
if i.lower().find("@enron.com") > -1 ]
bcc_receivers = [ i
for i in mbox.distinct("Bcc")
if i.lower().find("@enron.com") > -1 ]
print "Num Senders:", len(senders)
print "Num Receivers:", len(receivers)
print "Num CC Receivers:", len(cc_receivers)
print "Num BCC Receivers:", len(bcc_receivers)
In [ ]:
import json
import pymongo # pip install pymongo
from bson import json_util # Comes with pymongo
client = pymongo.MongoClient()
db = client.enron
mbox = db.mbox
aliases = ["kenneth.lay@enron.com", "ken_lay@enron.com", "ken.lay@enron.com",
"kenneth_lay@enron.net", "klay@enron.com"] # More possibilities?
to_msgs = [ msg
for msg in mbox.find({"To" : { "$in" : aliases } })]
from_msgs = [ msg
for msg in mbox.find({"From" : { "$in" : aliases } })]
print "Number of message sent to:", len(to_msgs)
print "Number of messages sent from:", len(from_msgs)
In [ ]:
import json
import pymongo # pip install pymongo
from bson import json_util # Comes with pymongo
# The basis of our query
FROM = "kenneth.lay@enron.com"
client = pymongo.MongoClient()
db = client.enron
mbox = db.mbox
# Get the recipient lists for each message
recipients_per_message = db.mbox.aggregate([
{"$match" : {"From" : FROM} },
{"$project" : {"From" : 1, "To" : 1} },
{"$group" : {"_id" : "$From", "recipients" : {"$addToSet" : "$To" } } }
])['result'][0]['recipients']
# Collapse the lists of recipients into a single list
all_recipients = [recipient
for message in recipients_per_message
for recipient in message]
# Calculate the number of recipients per sent message and sort
recipients_per_message_totals = \
sorted([len(recipients)
for recipients in recipients_per_message])
# Demonstrate how to use $unwind followed by $group to collapse
# the recipient lists into a single list (with no duplicates
# per the $addToSet operator)
unique_recipients = db.mbox.aggregate([
{"$match" : {"From" : FROM} },
{"$project" : {"From" : 1, "To" : 1} },
{"$unwind" : "$To"},
{"$group" : {"_id" : "From", "recipients" : {"$addToSet" : "$To"}} }
])['result'][0]['recipients']
print "Num total recipients on all messages:", len(all_recipients)
print "Num recipients for each message:", recipients_per_message_totals
print "Num unique recipients", len(unique_recipients)
In [ ]:
import json
import pymongo # pip install pymongo
from bson import json_util # Comes with pymongo
client = pymongo.MongoClient()
db = client.enron
mbox = db.mbox
# Create an index if it doesn't already exist
mbox.ensure_index([("$**", "text")], name="TextIndex")
# Get the collection stats (collstats) on a collection
# named "mbox"
print json.dumps(db.command("collstats", "mbox"), indent=1)
# Use the db.command method to issue a "text" command
# on collection "mbox" with parameters, remembering that
# we need to use json_util to handle serialization of our JSON
print json.dumps(db.command("text", "mbox",
search="raptor",
limit=1),
indent=1, default=json_util.default)
In [ ]:
import json
import pymongo # pip install pymongo
from bson import json_util # Comes with pymongo
client = pymongo.MongoClient()
db = client.enron
mbox = db.mbox
results = mbox.aggregate([
{
# Create a subdocument called DateBucket with each date component projected
# so that these fields can be grouped on in the next stage of the pipeline
"$project" :
{
"_id" : 0,
"DateBucket" :
{
"year" : {"$year" : "$Date"},
"month" : {"$month" : "$Date"},
"day" : {"$dayOfMonth" : "$Date"},
"hour" : {"$hour" : "$Date"},
}
}
},
{
"$group" :
{
# Group by year and date by using these fields for the key.
"_id" : {"year" : "$DateBucket.year", "month" : "$DateBucket.month"},
# Increment the sum for each group by 1 for every document that's in it
"num_msgs" : {"$sum" : 1}
}
},
{
"$sort" : {"_id.year" : 1, "_id.month" : 1}
}
])
print results
In [ ]:
from prettytable import PrettyTable
pt = PrettyTable(field_names=['Year', 'Month', 'Num Msgs'])
pt.align['Num Msgs'], pt.align['Month'] = 'r', 'r'
[ pt.add_row([ result['_id']['year'], result['_id']['month'], result['num_msgs'] ])
for result in results['result'] ]
print pt
In [ ]: