In [1]:
"""
Your task is to successfully run the exercise to see how pymongo works
and how easy it is to start using it.
You don't actually have to change anything in this exercise,
but you can change the city name in the add_city function if you like.
Your code will be run against a MongoDB instance that we have provided.
If you want to run this code locally on your machine,
you have to install MongoDB (see Instructor comments for link to installation information)
and uncomment the get_db function.
"""
def get_db():
    """Return a handle to the ``examples`` database on the local MongoDB server.

    The client connects to ``localhost:27017``. ``examples`` is the database
    name; it will be created if it does not exist.
    """
    from pymongo import MongoClient

    return MongoClient('localhost:27017').examples
def add_city(db):
    """Insert one sample city document into the ``cities`` collection of ``db``."""
    sample_city = {"name" : "Chicago"}
    db.cities.insert(sample_city)
def get_city(db):
    """Return whatever ``find_one()`` yields for the ``cities`` collection of ``db``."""
    cities_collection = db.cities
    return cities_collection.find_one()
if __name__ == "__main__":
    # Local run: connect, store the sample city, then show one stored document.
    db = get_db()
    add_city(db)
    # Parenthesised single-argument print behaves the same on Python 2 and 3.
    print(get_city(db))
In [11]:
#!/usr/bin/env python
"""
Your task is to complete the 'porsche_query' function and in particular the query
to find all autos where the manufacturer field matches "Porsche".
Please modify only 'porsche_query' function, as only that will be taken into account.
Your code will be run against a MongoDB instance that we have provided.
If you want to run this code locally on your machine,
you have to install MongoDB and download and insert the dataset.
For instructions related to MongoDB setup and datasets please see Course Materials at
the following link:
https://www.udacity.com/wiki/ud032
"""
def get_db(db_name):
    """Return a handle to the database called ``db_name`` on ``localhost:27017``."""
    from pymongo import MongoClient

    return MongoClient('localhost:27017')[db_name]
def porsche_query():
    """Build the query document matching all autos manufactured by Porsche."""
    return {'manufacturer' : 'Porsche'}
def find_porsche(db, query):
    """Run ``query`` against the ``autos`` collection of ``db`` and return the result of ``find``."""
    autos_collection = db.autos
    return autos_collection.find(query)
if __name__ == "__main__":
    import pprint

    db = get_db('examples')
    query = porsche_query()
    p = find_porsche(db, query)
    # The original cell computed the cursor and imported pprint but never
    # displayed anything. Show the first few matches; slicing the cursor keeps
    # the output short and is safe when the query matches nothing.
    for auto in p[:3]:
        pprint.pprint(auto)
In [ ]:
from autos import process_file
def insert_autos(infile, db):
    """Parse ``infile`` with ``process_file`` and insert the result into ``db.autos``.

    The parsed data is sent to the ``autos`` collection with a single
    ``insert`` call. NOTE(review): ``process_file`` is presumably returning a
    list of dictionaries -- confirm against the ``autos`` module.
    """
    parsed_autos = process_file(infile)
    autos_collection = db.autos
    autos_collection.insert(parsed_autos)
if __name__ == "__main__":
    from pymongo import MongoClient

    # Load the small autos CSV into the local ``examples`` database and show
    # one stored document as a sanity check.
    client = MongoClient("mongodb://localhost:27017")
    db = client.examples
    insert_autos('autos-small.csv', db)
    print(db.autos.find_one())
In [6]:
#!/usr/bin/env python
""" Your task is to write a query that will return all cities
that were founded in the 21st century.
Please modify only 'range_query' function, as only that will be taken into account.
Your code will be run against a MongoDB instance that we have provided.
If you want to run this code locally on your machine,
you have to install MongoDB, download and insert the dataset.
For instructions related to MongoDB setup and datasets please see Course Materials.
"""
from datetime import datetime
def get_db():
    """Return a handle to the ``examples`` database on ``localhost:27017``."""
    from pymongo import MongoClient

    mongo_client = MongoClient('localhost:27017')
    return mongo_client.examples
def range_query():
    """Build the query matching cities whose ``foundingDate`` is on or after 2001-01-01.

    Dates in the query are given as ``datetime(year, month, day)`` values.
    """
    century_start = datetime(2001,1,1)
    return {"foundingDate" : {"$gte" : century_start}}
if __name__ == "__main__":
    import pprint

    db = get_db()
    query = range_query()
    cities = db.cities.find(query)
    total = cities.count()
    print("%s %s" % ("Found cities:", total))
    # Indexing an empty cursor raises IndexError, so only show the first
    # document when the query actually matched something.
    if total:
        pprint.pprint(cities[0])
In [ ]:
#!/usr/bin/env python
""" Your task is to write a query that will return all cars manufactured by "Ford Motor Company"
that are assembled in Germany, United Kingdom, or Japan.
Please modify only 'in_query' function, as only that will be taken into account.
Your code will be run against a MongoDB instance that we have provided.
If you want to run this code locally on your machine,
you have to install MongoDB, download and insert the dataset.
For instructions related to MongoDB setup and datasets please see Course Materials.
"""
def get_db():
    """Connect to MongoDB on ``localhost:27017`` and return its ``examples`` database."""
    from pymongo import MongoClient

    return MongoClient('localhost:27017').examples
def in_query():
    """Build the query for Ford Motor Company cars assembled in Germany, United Kingdom, or Japan."""
    assembly_countries = ["Germany", "United Kingdom", "Japan"]
    return {
        "manufacturer" : "Ford Motor Company",
        "assembly" : {"$in" : assembly_countries},
    }
if __name__ == "__main__":
    import pprint

    db = get_db()
    query = in_query()
    # Only name, manufacturer and assembly are projected; _id is suppressed.
    autos = db.autos.find(
        query,
        {"name":1, "manufacturer":1, "assembly": 1, "_id":0},
    )
    # Same text as the two-item print statement, written so it also runs on Python 3.
    print("%s %s" % ("Found autos:", autos.count()))
    for a in autos:
        pprint.pprint(a)
In [ ]:
#!/usr/bin/env python
""" Your task is to write a query that will return all cars with width dimension greater than 2.5
Please modify only 'dot_query' function, as only that will be taken into account.
Your code will be run against a MongoDB instance that we have provided.
If you want to run this code locally on your machine,
you have to install MongoDB, download and insert the dataset.
For instructions related to MongoDB setup and datasets please see Course Materials.
"""
def get_db():
    """Return the ``examples`` database from the MongoDB server at ``localhost:27017``."""
    from pymongo import MongoClient

    local_client = MongoClient('localhost:27017')
    return local_client.examples
def dot_query():
    """Build the query for documents whose nested ``dimensions.width`` is greater than 2.5."""
    width_condition = {"$gt" : 2.5}
    return {"dimensions.width" : width_condition}
if __name__ == "__main__":
    import pprint

    db = get_db()
    query = dot_query()
    cars = db.cars.find(query)
    total = cars.count()
    print("%s %s" % ("Found cars:", total))
    # Indexing an empty cursor raises IndexError, so only show the first
    # document when the query actually matched something.
    if total:
        pprint.pprint(cars[0])
In [72]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
In this problem set you work with another type of infobox data, audit it, clean it,
come up with a data model, insert it into a MongoDB and then run some queries against your database.
The set contains data about Arachnid class.
Your task in this exercise is to parse the file, process only the fields that are listed in the
FIELDS dictionary as keys, and return a dictionary of cleaned values.
The following things should be done:
- keys of the dictionary changed according to the mapping in FIELDS dictionary
- trim out redundant description in parentheses from the 'rdf-schema#label' field, like "(spider)"
- if 'name' is "NULL" or contains non-alphanumeric characters, set it to the same value as 'label'.
- if a value of a field is "NULL", convert it to None
- if there is a value in 'synonym', it should be converted to an array (list)
by stripping the "{}" characters and splitting the string on "|". Rest of the cleanup is up to you,
e.g. removing "*" prefixes, etc.
- strip leading and ending whitespace from all fields, if there is any
- the output structure should be as follows:
{ 'label': 'Argiope',
'uri': 'http://dbpedia.org/resource/Argiope_(spider)',
'description': 'The genus Argiope includes rather large and spectacular spiders that often ...',
'name': 'Argiope',
'synonym': ["One", "Two"],
'classification': {
'family': 'Orb-weaver spider',
'class': 'Arachnid',
'phylum': 'Arthropod',
'order': 'Spider',
'kingdom': 'Animal',
'genus': None
}
}
* Note that the value associated with the classification key is a dictionary with
taxonomic labels.
"""
import codecs
import csv
import json
import pprint
import re
# CSV file read by the test below.
DATAFILE = 'arachnid.csv'

# Maps each source CSV column to the key used in the cleaned output record.
FIELDS = {
    'rdf-schema#label': 'label',
    'URI': 'uri',
    'rdf-schema#comment': 'description',
    'synonym': 'synonym',
    'name': 'name',
    'family_label': 'family',
    'class_label': 'class',
    'phylum_label': 'phylum',
    'order_label': 'order',
    'kingdom_label': 'kingdom',
    'genus_label': 'genus',
}
def _clean_record(row, fields):
    """Turn one raw CSV row into a cleaned record using the ``fields`` mapping."""
    # Keep only the mapped columns, renamed, with surrounding whitespace removed.
    record = {}
    for source_key, target_key in fields.items():
        raw_value = row[source_key]
        record[target_key] = raw_value.strip() if raw_value is not None else None

    # Drop a trailing parenthesised description such as " (spider)". The
    # parentheses are escaped so multi-word labels ("Black widow") stay intact.
    label = record['label']
    if label is not None:
        label = re.sub(r'\s*\([^)]*\)\s*$', '', label)
        record['label'] = label

    # A missing, "NULL" or junk-looking name falls back to the cleaned label.
    name = record['name']
    if name is None or name == 'NULL' or re.match(r'\W|\w*\s\W', name):
        record['name'] = label

    # "NULL" placeholders become real None values.
    record = dict((key, None if value == 'NULL' else value)
                  for key, value in record.items())

    # Synonyms become a list: "{a|b}" is split on "|", a bare value is wrapped.
    # An empty cell is treated as "no synonym" instead of raising IndexError.
    synonym = record['synonym']
    if not synonym:
        record['synonym'] = None
    elif synonym.startswith('{'):
        bare = synonym.replace('{', '').replace('}', '').replace('*', '')
        record['synonym'] = [part.strip() for part in bare.split('|')]
    else:
        record['synonym'] = [synonym]

    # Move the taxonomic ranks into a nested 'classification' dictionary.
    ranks = ('family', 'class', 'phylum', 'order', 'kingdom', 'genus')
    classification = dict((rank, record.pop(rank)) for rank in ranks)
    record['classification'] = classification
    return record


def process_file(filename, fields):
    """Parse the CSV file ``filename`` and return a list of cleaned records.

    Args:
        filename: path of the CSV file to read.
        fields: mapping of source column name -> output key. It must produce
            the keys 'label', 'name', 'synonym' and the six taxonomic ranks
            ('family', 'class', 'phylum', 'order', 'kingdom', 'genus').

    Returns:
        A list with one dictionary per data row. Each has the renamed keys
        from ``fields``, "NULL" replaced by None, 'synonym' as a list or None,
        and the taxonomic ranks grouped under 'classification'.

    Raises:
        KeyError: if a column named in ``fields`` is absent from the file.
    """
    data = []
    with open(filename, "r") as f:
        reader = csv.DictReader(f)
        # The first three rows after the header are skipped, not data.
        for _ in range(3):
            next(reader, None)
        for row in reader:
            data.append(_clean_record(row, fields))
    return data
def parse_array(v):
    """Convert a "{a|b|c}" style string into a list of stripped items.

    Args:
        v: the raw string value.

    Returns:
        When ``v`` starts with "{" and ends with "}", the braces are removed
        and the rest is split on "|", with whitespace stripped from each item.
        Any other value, including the empty string, is returned unchanged as
        a one-element list.
    """
    # startswith/endswith are safe on an empty string, unlike v[0] / v[-1].
    if v.startswith("{") and v.endswith("}"):
        inner = v.lstrip("{").rstrip("}")
        return [item.strip() for item in inner.split("|")]
    return [v]
def test():
    """Process DATAFILE, print the first record and check it against the expected value."""
    expected_first = {
        "synonym": None,
        "name": "Argiope",
        "classification": {
            "kingdom": "Animal",
            "family": "Orb-weaver spider",
            "order": "Spider",
            "phylum": "Arthropod",
            "genus": None,
            "class": "Arachnid"
        },
        "uri": "http://dbpedia.org/resource/Argiope_(spider)",
        "label": "Argiope",
        "description": "The genus Argiope includes rather large and spectacular spiders that often have a strikingly coloured abdomen. These spiders are distributed throughout the world. Most countries in tropical or temperate climates host one or more species that are similar in appearance. The etymology of the name is from a Greek name meaning silver-faced."
    }

    data = process_file(DATAFILE, FIELDS)
    first_record = data[0]
    pprint.pprint(first_record)
    assert first_record == expected_first


if __name__ == "__main__":
    test()
In [75]:
import json
def insert_data(data, db):
    """Insert ``data`` into the ``arachnid`` collection of ``db`` with one ``insert`` call."""
    arachnid_collection = db.arachnid
    arachnid_collection.insert(data)
if __name__ == "__main__":
    from pymongo import MongoClient

    client = MongoClient("mongodb://localhost:27017")
    db = client.examples

    # Read the prepared JSON records, store them, then show one stored document.
    with open('arachnid.json') as f:
        data = json.load(f)
    insert_data(data, db)
    print(db.arachnid.find_one())
In [23]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
In this problem set you work with another type of infobox data, audit it, clean it,
come up with a data model, insert it into a MongoDB and then run some queries against your database.
The set contains data about Arachnid class.
The data is already in the database. But you have been given a task to also include 'binomialAuthority'
information in the data, so you have to go through the data and update the existing entries.
The following things should be done in the function add_field:
- process the csv file and extract 2 fields - 'rdf-schema#label' and 'binomialAuthority_label'
- clean up the 'rdf-schema#label' same way as in the first exercise - removing redundant "(spider)" suffixes
- return a dictionary, with 'label' being the key, and 'binomialAuthority_label' the value
- if 'binomialAuthority_label' is "NULL", skip the item
The following should be done in the function update_db:
- query the database by using the field 'label'
- update the data, by adding a new item under 'classification' with a key 'binomialAuthority'
The resulting data should look like this:
- the output structure should be as follows:
{ 'label': 'Argiope',
'uri': 'http://dbpedia.org/resource/Argiope_(spider)',
'description': 'The genus Argiope includes rather large and spectacular spiders that often ...',
'name': 'Argiope',
'synonym': ["One", "Two"],
'classification': {
'binomialAuthority': None,
'family': 'Orb-weaver spider',
'class': 'Arachnid',
'phylum': 'Arthropod',
'order': 'Spider',
'kingdom': 'Animal',
'genus': None
}
}
"""
import codecs
import csv
import json
import pprint
import re
# CSV file read by the test below.
DATAFILE = 'arachnid.csv'

# Maps each source CSV column to the key used while processing a row.
FIELDS = {
    'rdf-schema#label': 'label',
    'binomialAuthority_label': 'binomialAuthority',
}
def add_field(filename, fields):
    """Extract the binomial authority for each labelled row of a CSV file.

    Args:
        filename: path of the CSV file to read.
        fields: mapping of source column name -> working key. It must produce
            the keys 'label' and 'binomialAuthority'.

    Returns:
        A dictionary keyed by cleaned label. Each value is a list of authority
        strings: a "{a|b}" cell is split on "|", a plain cell becomes a
        one-element list. Rows whose authority is missing, empty or "NULL"
        are skipped.

    Raises:
        KeyError: if a column named in ``fields`` is absent from the file.
    """
    data = {}
    with open(filename, "r") as f:
        reader = csv.DictReader(f)
        # The first three rows after the header are skipped, not data.
        for _ in range(3):
            next(reader, None)
        for row in reader:
            # Keep only the mapped columns, renamed and whitespace-stripped.
            record = {}
            for source_key, target_key in fields.items():
                raw_value = row[source_key]
                record[target_key] = raw_value.strip() if raw_value is not None else None

            label = record['label']
            authority = record['binomialAuthority']
            # An empty or missing cell used to raise IndexError; skip it like "NULL".
            if not label or not authority or authority == 'NULL':
                continue

            # Drop a trailing parenthesised description such as " (spider)".
            # The parentheses are escaped so multi-word labels stay intact.
            label = re.sub(r'\s*\([^)]*\)\s*$', '', label)

            if authority.startswith('{'):
                bare = authority.replace('{', '').replace('}', '').replace('*', '')
                authorities = [part.strip() for part in bare.split('|')]
            else:
                authorities = [authority]

            # Discard blank and "NULL" entries so the first item is always usable.
            authorities = [item for item in authorities if item and item != 'NULL']
            if authorities:
                data[label] = authorities
    return data
def update_db(data, db):
    """Add ``classification.binomialAuthority`` to the matching arachnid documents.

    Args:
        data: dictionary of label -> authority. The authority may be a string,
            or a list/tuple whose first element is used. Empty sequences are
            skipped.
        db: database handle exposing an ``arachnid`` collection.
    """
    for label, authority in data.items():
        if isinstance(authority, (list, tuple)):
            if not authority:
                continue
            authority = authority[0]
        # The dotted path sets only the one nested key. Using $set on
        # "classification" itself would replace the whole sub-document and
        # drop family/class/phylum/order/kingdom/genus.
        db.arachnid.update(
            {"label" : label},
            {"$set" : {"classification.binomialAuthority" : authority}},
        )
def test():
    """Run add_field and update_db against the local database and print the extracted data.

    Please change only the add_field and update_db functions!
    Changes done to this function will not be taken into account
    when doing a Test Run or Submit, they are just for your own reference
    and as an example for running this code locally!
    """
    data = add_field(DATAFILE, FIELDS)

    from pymongo import MongoClient
    db = MongoClient("mongodb://localhost:27017").examples

    update_db(data, db)
    # The updated document is fetched but not inspected further here.
    updated = db.arachnid.find_one({'label': 'Opisthoncana'})
    pprint.pprint(data)


if __name__ == "__main__":
    test()