You can download MongoDB for your platform from the official MongoDB page. You can also read specific MongoDB installation instructions.
You do not need to install MongoDB on your machine for most of the exercises in this course, however for best learning outcome we recommend that you do it. It's fast and easy!
MongoDB has a lot of drivers and client libraries. The one we will be using in this course is PyMongo. See the official documentation for PyMongo installation instructions.
In [1]:
import pprint
def get_client():
    """Return a MongoClient connected to the local MongoDB instance."""
    from pymongo import MongoClient
    client = MongoClient('mongodb://localhost:27017/')
    return client
def get_db():
    """Return the 'examples' database (MongoDB creates it lazily on first write)."""
    client = get_client()
    return client.examples
In [2]:
def add_city(db):
    """Insert a single sample city document into the cities collection."""
    city = {"name": "Chicago"}
    db.cities.insert_one(city)
def get_city(db):
    """Return an arbitrary single document from the cities collection."""
    doc = db.cities.find_one()
    return doc
In [3]:
db = get_db()
# Uncomment to insert the sample Chicago document.
#add_city(db)
# Dump every document currently in the cities collection.
for city in db.cities.find():
    pprint.pprint(city)
Every MongoDB document has an `_id` field. If we don't specify one, MongoDB will insert one for us.
In [4]:
#db.cities.insert_one({"name": "London", "isOld": True})
In [5]:
#db.cities.insert_one({"name": "New York", "isNew": True})
In [6]:
def find_in_cities(query):
    """Pretty-print every cities document matching query (uses the global db)."""
    matches = db.cities.find(query)
    for match in matches:
        pprint.pprint(match)
In [7]:
# Query for London documents flagged isOld=False.
find_in_cities({"name": "London", "isOld": False})
In [8]:
# Query for London documents flagged isOld=True.
find_in_cities({"name": "London", "isOld": True})
In [9]:
query = {"name": "London", "isOld": True}
# Projection: include only 'name' and suppress the automatically returned _id.
projection = {"_id": 0, "name": 1}
for city in db.cities.find(query, projection):
    pprint.pprint(city)
In [10]:
# Total number of documents in cities.
# NOTE(review): count() is deprecated in modern PyMongo in favor of
# count_documents({}) — confirm the driver version before changing.
db.cities.count()
Out[10]:
In [11]:
# Materialize the entire cities collection into a list.
# The original wrapped the cursor in a redundant generator expression
# (list(city for city in ...)); list() can consume the cursor directly.
list(db.cities.find())
Out[11]:
In [12]:
from pymongo import MongoClient
import csv
import json
import io
import re
import pprint
# Maps raw CSV column names to the cleaner field names used in the output
# documents; only columns that appear here are processed.
field_map = {
    "name" : "name",
    "bodyStyle_label" : "bodyStyle",
    "assembly_label" : "assembly",
    "class_label" : "class",
    "designer_label" : "designer",
    "engine_label" : "engine",
    "length" : "length",
    "height" : "height",
    "width" : "width",
    "weight" : "weight",
    "wheelbase" : "wheelbase",
    "layout_label" : "layout",
    "manufacturer_label" : "manufacturer",
    "modelEndYear" : "modelEndYear",
    "modelStartYear" : "modelStartYear",
    "predecessorLabel" : "predecessorLabel",
    "productionStartYear" : "productionStartYear",
    "productionEndYear" : "productionEndYear",
    "transmission" : "transmission"
}
# Set of recognized raw column names (membership tests in process_file_autos).
fields = field_map.keys()
def skip_lines(input_file, skip):
    """Advance the iterator input_file by skip items, discarding them."""
    for _ in range(skip):
        next(input_file)
def is_number(s):
    """Return True if s can be parsed as a float, False otherwise."""
    try:
        float(s)
    except ValueError:
        return False
    return True
def strip_automobile(v):
    """Replace any '(automobile)' disambiguation tag (with surrounding whitespace) by a single space."""
    pattern = re.compile(r"\s*\(automobile\)\s*")
    return pattern.sub(" ", v)
def strip_city(v):
    """Replace any '(city)' disambiguation tag (with surrounding whitespace) by a single space."""
    pattern = re.compile(r"\s*\(city\)\s*")
    return pattern.sub(" ", v)
def parse_array(v):
    """If v looks like '{a|b|c}', return the list of trimmed items; otherwise return v unchanged."""
    if v[0] == "{" and v[-1] == "}":
        inner = v.lstrip("{").rstrip("}")
        return [item.strip() for item in inner.split("|")]
    return v
def mm_to_meters(v):
    """Normalize a dimension value: values below 0.01 are scaled up by 1000.

    NOTE(review): despite the name, this scales *small* values up rather than
    converting mm down — presumably source values below 0.01 are in a larger
    unit (km?); confirm against the raw CSV data.
    """
    return v * 1000 if v < 0.01 else v
def clean_dimension(d, field, v):
    """Store a numeric dimension value in d under field; non-numbers are ignored.

    Weights are converted to tonnes (divide by 1000); other dimensions go
    through mm_to_meters().
    """
    if not is_number(v):
        return
    value = float(v)
    d[field] = value / 1000.0 if field == "weight" else mm_to_meters(value)
def clean_year(d, field, v):
    """Store the first four characters of v (the year part of a date string) in d under field."""
    d[field] = v[:4]
def parse_array2(v):
    """Like parse_array, but return (True, items) for '{a|b}' input and (False, v) otherwise."""
    if v[0] == "{" and v[-1] == "}":
        inner = v.lstrip("{").rstrip("}")
        items = [item.strip() for item in inner.split("|")]
        return (True, items)
    return (False, v)
def ensure_not_array(v):
    """Return a single scalar: the first element if v parses as a '{a|b}' array, else v itself.

    BUG FIX: the original called parse_array(), which returns a bare
    list-or-scalar, yet unpacked the result as an (is_array, value) tuple —
    corrupting the value (or raising) whenever it was reached.  parse_array2()
    is the tuple-returning variant this code clearly intended to use.
    """
    (is_array, v) = parse_array2(v)
    if is_array:
        return v[0]
    return v
def ensure_array(v):
    """Return v as a list: parsed items for '{a|b}' input, otherwise a one-element list."""
    (is_array, parsed) = parse_array2(v)
    return parsed if is_array else [parsed]
def ensure_float(v):
    """Return v as a float, or None if it does not parse as one."""
    try:
        return float(v)
    except ValueError:
        return None
def ensure_int(v):
    """Return int(v) when v parses as a number, else None.

    Mirrors the original exactly: a float-like string such as "2.5" passes
    the number check but still raises ValueError from int().
    """
    try:
        float(v)
    except ValueError:
        return None
    return int(v)
def ensure_year_array(val):
    """Parse val (a scalar or '{a|b}' array of date strings) into a list of int years.

    Each item's first four characters are taken as the year; zero years are
    dropped.  Non-numeric year prefixes raise ValueError, as in the original.
    """
    result = []
    for item in ensure_array(val):
        year = int(item[:4])
        if year:
            result.append(year)
    return result
def empty_val(val):
    """Return True if val is blank or the literal string 'NULL' after stripping."""
    stripped = val.strip()
    return stripped in ("NULL", "")
def years(row, start_field, end_field):
    """Return the inclusive list of years spanned by each (start, end) pair in row.

    Start and end values may be scalars or '{a|b}' arrays; both lists are
    sorted and paired positionally (extra entries on the longer side are
    ignored).  Returns [] when either field is empty/NULL.
    """
    start_raw = row[start_field]
    end_raw = row[end_field]
    if empty_val(start_raw) or empty_val(end_raw):
        return []
    # sorted([]) == [] so the unconditional sort matches the original's
    # guarded sort exactly.
    start_years = sorted(ensure_year_array(start_raw))
    end_years = sorted(ensure_year_array(end_raw))
    all_years = []
    if start_years and end_years:
        # zip pairs up to min(len(start_years), len(end_years)) entries.
        for start, end in zip(start_years, end_years):
            all_years.extend(range(start, end + 1))
    return all_years
def process_file_autos(input_file):
    """Read the autos CSV at input_file and return a list of cleaned auto dicts.

    Only columns in `fields` are kept.  Dimensions are grouped under
    'dimensions', and modelYears/productionYears are expanded into explicit
    year lists via years().  The first three data rows (metadata) are skipped.

    Fixes over the original: the file handle is now closed via 'with' (it was
    opened and leaked), and .items() replaces the Python-2-only .iteritems()
    (it works on both Python 2 and 3).
    """
    autos = []
    with open(input_file) as f:
        input_data = csv.DictReader(f)
        skip_lines(input_data, 3)
        for row in input_data:
            auto = {}
            model_years = {}
            production_years = {}
            dimensions = {}
            for field, val in row.items():
                # Ignore unknown columns and empty/NULL values.
                if field not in fields or empty_val(val):
                    continue
                # Label-style fields are normalized to lowercase.
                if field in ["bodyStyle_label", "class_label", "layout_label"]:
                    val = val.lower()
                val = strip_automobile(val)
                val = strip_city(val)
                val = val.strip()
                val = parse_array(val)
                if field in ["length", "width", "height", "weight", "wheelbase"]:
                    clean_dimension(dimensions, field_map[field], val)
                elif field in ["modelStartYear", "modelEndYear"]:
                    clean_year(model_years, field_map[field], val)
                elif field in ["productionStartYear", "productionEndYear"]:
                    clean_year(production_years, field_map[field], val)
                else:
                    auto[field_map[field]] = val
            if dimensions:
                auto['dimensions'] = dimensions
            auto['modelYears'] = years(row, 'modelStartYear', 'modelEndYear')
            auto['productionYears'] = years(row, 'productionStartYear', 'productionEndYear')
            autos.append(auto)
    return autos
In [13]:
def insert_autos(infile, db):
    """Parse the autos CSV at infile and bulk-insert all documents into db.autos."""
    autos = process_file_autos(infile)
    db.autos.insert_many(autos)
In [14]:
db = get_db()
# Uncomment to (re)load the autos CSV into the database.
#insert_autos('../autos.csv', db)
# Show one sample auto document.
db.autos.find_one()
Out[14]:
In [15]:
# Total number of auto documents.
# NOTE(review): count() is deprecated in modern PyMongo — confirm driver version.
db.autos.count()
Out[15]:
Documentation for mongoimport can be found here.
The following command is used
mongoimport --db dbname -c collectionname --file input-file.json
mongoimport --help
If no hostname and credentials are supplied, mongoimport
will try to connect to the default localhost:27017
In [16]:
# Materialize the entire autos collection into a list.
# The original wrapped the cursor in a redundant generator expression
# (list(city for city in ...)); list() can consume the cursor directly.
list(db.autos.find())
Out[16]:
In [17]:
# $gt on the nested field dimensions.weight: autos heavier than 2000 (tonnes scale).
query = {'dimensions.weight': {"$gt": 2000}}
db.autos.count(query)
Out[17]:
In [18]:
#using ASCII to query for string types
# $gte/$lte compare strings lexicographically, so this matches names
# ordered between "H" and "Z".
query = {'name': {"$gte": "H", "$lte": "Z"}}
print db.autos.count(query)
print db.autos.find_one(query)
These operators can also be used with datetime
objects
In [19]:
def printCountAndOne(collection, query):
    """Print a separator, the count of documents matching query, and one sample match."""
    print "***"
    print collection.count(query)
    pprint.pprint(collection.find_one(query))
In [20]:
# Total count, then counts/samples split by presence of the 'assembly' field.
print db.autos.count()
# $exists: 1 — documents that have an 'assembly' field.
printCountAndOne(db.autos, {'assembly': {"$exists": 1}})
# $exists: 0 — documents without an 'assembly' field.
printCountAndOne(db.autos, {'assembly': {"$exists": 0}})
In [21]:
# $regex: substring match — autos assembled somewhere containing "Japan".
printCountAndOne(db.autos, {"assembly": {"$regex": "Japan"}})
In [22]:
# $in: matches if productionYears contains ANY of the listed years.
printCountAndOne(db.autos, {"productionYears": {"$in": [1986,1987,1988, 1992]}})
In [23]:
# $all: matches only if productionYears contains EVERY listed year.
printCountAndOne(db.autos, {"productionYears": {"$all": [1986,1987,1988, 1992]}})
In [24]:
from random import randint
# Build a random test name, attach it to one matching auto, write it back,
# and verify the write by re-querying.
new_name = "test {}".format(randint(0,999))
print "new name is {}".format(new_name)
auto = db.autos.find_one({"productionYears": {"$in": [1986,1987,1988, 1992]}})
auto['name'] = new_name
# NOTE(review): save() is deprecated in modern PyMongo — replace_one() is the
# supported equivalent; confirm the driver version before changing.
db.autos.save(auto)
db.autos.find_one({"name": {"$eq": new_name}})
Out[24]:
In [25]:
# update_one modifies the first matching document; with no upsert=True,
# nothing is inserted when there is no match (the original comment claiming
# "update else insert" was wrong).
db.autos.update_one({"assembly": {"$regex": "Japan"}}, {"$set": {"test": "japanese"}})
print db.autos.count({"test": "japanese"})
#if exists then remove field else ignore
db.autos.update_one({"test": "japanese"}, {"$unset": {"test": ""}})
print db.autos.count({"test": "japanese"})
NOTE Need to remember that the second argument must have operator $set
or $unset
. If that is not there, then the entire document found by the first argument would be replaced by what is passed as the second argument.
In [26]:
# update_many applies the $set to every matching document, not just the first.
db.autos.update_many({"assembly": {"$regex": "Japan"}}, {"$set": {"test": "japanese"}})
print db.autos.count({"test": "japanese"})
In [27]:
# Uncomment to drop the whole collection.
#db.autos.drop()
query = {"test": "japanese"}
# Uncomment to delete every document matching query.
#print db.autos.delete_many(query)
print db.autos.count()
In this problem set you work with another type of infobox data, audit it, clean it, come up with a data model, insert it into MongoDB and then run some queries against your database. The set contains data about Arachnid class animals.
Your task in this exercise is to parse the file, process only the fields that are listed in the FIELDS dictionary as keys, and return a list of dictionaries of cleaned values.
The following things should be done:
[ { 'label': 'Argiope',
'uri': 'http://dbpedia.org/resource/Argiope_(spider)',
'description': 'The genus Argiope includes rather large and spectacular spiders that often ...',
'name': 'Argiope',
'synonym': ["One", "Two"],
'classification': {
'family': 'Orb-weaver spider',
'class': 'Arachnid',
'phylum': 'Arthropod',
'order': 'Spider',
'kingdom': 'Animal',
'genus': None
}
},
{ 'label': ... , }, ...
]
In [28]:
import codecs
import csv
import json
import pprint
import re
DATAFILE = 'arachnid.csv'
# Maps raw CSV column names to the cleaned output keys used by process_file.
FIELDS ={'rdf-schema#label': 'label',
         'URI': 'uri',
         'rdf-schema#comment': 'description',
         'synonym': 'synonym',
         'name': 'name',
         'family_label': 'family',
         'class_label': 'class',
         'phylum_label': 'phylum',
         'order_label': 'order',
         'kingdom_label': 'kingdom',
         'genus_label': 'genus'}
def remove_brackets_part(string):
    """Remove every parenthesized chunk from string and trim surrounding whitespace."""
    return re.sub(r"\([^)]*?\)", "", string).strip()
def process_file(filename, fields):
    """Parse the arachnid CSV and return a list of cleaned document dicts.

    Only columns named as keys of `fields` are kept; each value is copied to
    its mapped output key, stripped, and NULL-normalized.  Taxonomic fields
    are nested under 'classification' in the output.

    NOTE(review): written for Python 2 (reader.next(), dict.iteritems());
    under Python 3 these would be next(reader) and .items().
    """
    process_fields = fields.values()  # NOTE(review): computed but never used.
    data = []
    with open(filename, "r") as f:
        reader = csv.DictReader(f)
        # Skip the three metadata rows that follow the CSV header.
        for i in range(3):
            l = reader.next()
        for line in reader:
            # Copy each raw column to its cleaned output key and strip whitespace.
            for key, value in fields.iteritems():
                if key != value:
                    line[value] = line[key]
                if value in line:
                    line[value] = line[value].strip()
            # Drop parenthesized disambiguation, e.g. "Argiope (spider)" -> "Argiope".
            line['label'] = remove_brackets_part(line['label'])
            # Fall back to the label when the name is missing.
            # NOTE(review): the isalnum() test looks suspicious — it replaces
            # names that ARE purely alphanumeric with the label; confirm the
            # intended condition (checking for 'NULL' seems more likely).
            if ('name' not in line) or (line['name'] is None) or line['name'].isalnum():
                line['name'] = line['label']
            # synonym: 'NULL' -> None, otherwise parse a possible '{a|b}' array.
            if 'synonym' in line:
                if line['synonym'] == 'NULL':
                    line['synonym'] = None
                else:
                    line['synonym'] = parse_array(line['synonym'])
            # Normalize any remaining 'NULL' strings to None.
            # NOTE(review): `key not in line` is always False inside this loop.
            for key in line.keys():
                if (key not in line) or (line[key] == 'NULL'):
                    line[key] = None
            data.append({'label': line['label'],
                         'uri': line['uri'],
                         'description': line['description'],
                         'name': line['name'],
                         'synonym': line['synonym'],
                         'classification': {
                             'family': line['family'],
                             'class': line['class'],
                             'phylum': line['phylum'],
                             'order': line['order'],
                             'kingdom': line['kingdom'],
                             'genus': line['genus']
                         }
                        })
    return data
def parse_array(v):
    """Return v as a list: parsed '{a|b}' items, or a one-element list otherwise.

    Shadows the earlier parse_array definition: this variant always returns a
    list, wrapping non-array input in [v].
    """
    if v[0] == "{" and v[-1] == "}":
        inner = v.lstrip("{").rstrip("}")
        return [item.strip() for item in inner.split("|")]
    return [v]
In [29]:
def test():
    """Sanity-check process_file against known values from arachnid.csv."""
    data = process_file(DATAFILE, FIELDS)
    print "Your first entry:"
    pprint.pprint(data[0])
    # Expected cleaned form of the first CSV row.
    first_entry = {
        "synonym": None,
        "name": "Argiope",
        "classification": {
            "kingdom": "Animal",
            "family": "Orb-weaver spider",
            "order": "Spider",
            "phylum": "Arthropod",
            "genus": None,
            "class": "Arachnid"
        },
        "uri": "http://dbpedia.org/resource/Argiope_(spider)",
        "label": "Argiope",
        "description": "The genus Argiope includes rather large and spectacular spiders that often have a strikingly coloured abdomen. These spiders are distributed throughout the world. Most countries in tropical or temperate climates host one or more species that are similar in appearance. The etymology of the name is from a Greek name meaning silver-faced."
    }
    assert len(data) == 76
    assert data[0] == first_entry
    assert data[17]["name"] == "Ogdenia"
    assert data[48]["label"] == "Hydrachnidiae"
    assert data[14]["synonym"] == ["Cyrene Peckham & Peckham"]
In [30]:
# Run the process_file sanity checks.
test()
In [31]:
db = get_db()
# Load the pre-processed arachnid documents and bulk-insert them.
with open('arachnid.json') as f:
    data = json.loads(f.read())
# BUG FIX: the original called db.arachnid.insert(data, db) — the stray second
# argument was silently consumed as insert()'s 'manipulate' parameter, and
# insert() itself is deprecated.  insert_many() is the supported bulk API.
db.arachnid.insert_many(data)
For this exercise, the arachnid data is already in the database. You have been given the task of including 'binomialAuthority' information in the records. You will do this by processing the arachnid.csv to extract binomial authority data and then using this data to update the corresponding data base records.
The following things should be done in the function add_field:
The following should be done in the function update_db:
For item {'Argiope': 'Jill Ward'} in the data dictionary, the resulting document structure should look like this:
{ 'label': 'Argiope',
'uri': 'http://dbpedia.org/resource/Argiope_(spider)',
'description': 'The genus Argiope includes rather large and spectacular spiders that often ...',
'name': 'Argiope',
'synonym': ["One", "Two"],
'classification': {
'binomialAuthority' : 'Jill Ward',
'family': 'Orb-weaver spider',
'class': 'Arachnid',
'phylum': 'Arthropod',
'order': 'Spider',
'kingdom': 'Animal',
'genus': None
}
}
In [32]:
DATAFILE = 'arachnid.csv'
# Only the label and binomialAuthority columns are needed for this exercise.
FIELDS ={'rdf-schema#label': 'label',
         'binomialAuthority_label': 'binomialAuthority'}
def add_field(filename, fields):
    """
    Build a {label: binomialAuthority} dict from the arachnid CSV for use
    by update_db().

    Rows whose binomialAuthority_label is 'NULL' are skipped; labels have any
    parenthesized disambiguation removed via remove_brackets_part().

    Fixes over the original: the unused `process_fields` local is removed,
    and the Python-2-only reader.next() is replaced by next(reader), which
    works on both Python 2 and 3.
    """
    data = {}
    with open(filename, "r") as f:
        reader = csv.DictReader(f)
        # Skip the three metadata rows that follow the CSV header.
        for _ in range(3):
            next(reader)
        for line in reader:
            authority = line['binomialAuthority_label']
            if authority == 'NULL':
                continue
            label = remove_brackets_part(line['rdf-schema#label'])
            data[label] = authority
    return data
def update_db(data, db):
    """
    For each {label: authority} pair in data, set
    classification.binomialAuthority on every arachnid document whose label
    matches.

    Fix over the original: .items() replaces the Python-2-only .iteritems()
    (it works on both Python 2 and 3).
    """
    for label, authority in data.items():
        # Debug trace of each update performed.
        pprint.pprint("{}:{}".format(label, authority))
        db.arachnid.update_many({"label": {"$eq": label}}, {"$set": {"classification.binomialAuthority": authority}})
In [33]:
def testFinal():
    """End-to-end check: build the authority dict, apply it, and verify one record."""
    # Please change only the add_field and update_db functions!
    # Changes done to this function will not be taken into account
    # when doing a Test Run or Submit, they are just for your own reference
    # and as an example for running this code locally!
    data = add_field(DATAFILE, FIELDS)
    from pymongo import MongoClient
    client = MongoClient("mongodb://localhost:27017")
    db = client.examples
    update_db(data, db)
    # Spot-check a known record updated by update_db.
    updated = db.arachnid.find_one({'label': 'Opisthoncana'})
    assert updated['classification']['binomialAuthority'] == 'Embrik Strand'
    pprint.pprint(data)
testFinal()