In [1]:
"""
Your task is to sucessfully run the exercise to see how pymongo works
and how easy it is to start using it.
You don't actually have to change anything in this exercise,
but you can change the city name in the add_city function if you like.

Your code will be run against a MongoDB instance that we have provided.
If you want to run this code locally on your machine,
you have to install MongoDB (see Instructor comments for link to installation information)
and uncomment the get_db function.
"""


def get_db():
    from pymongo import MongoClient
    client = MongoClient('localhost:27017')
    # 'examples' here is the database name. It will be created if it does not exist.
    db = client.examples
    return db

def add_city(db):
    db.cities.insert({"name" : "Chicago"})
    
def get_city(db):
    return db.cities.find_one()


if __name__ == "__main__":

    db = get_db() # uncomment this line if you want to run this locally
    add_city(db)
    print get_city(db)


{u'_id': ObjectId('54ca787cf4dc0b090c65c643'), u'name': u'Chicago'}

In [11]:
#!/usr/bin/env python
"""
Your task is to complete the 'porsche_query' function and in particular the query
to find all autos where the manufacturer field matches "Porsche".
Please modify only 'porsche_query' function, as only that will be taken into account.

Your code will be run against a MongoDB instance that we have provided.
If you want to run this code locally on your machine,
you have to install MongoDB and download and insert the dataset.
For instructions related to MongoDB setup and datasets please see Course Materials at
the following link:
https://www.udacity.com/wiki/ud032
"""


def get_db(db_name):
    from pymongo import MongoClient
    client = MongoClient('localhost:27017')
    db = client[db_name]
    return db


def porsche_query():
    # Please fill in the query to find all autos manuafactured by Porsche
    query = {'manufacturer' : 'Porsche'}
    return query


def find_porsche(db, query):
    return db.autos.find(query)


if __name__ == "__main__":

    db = get_db('examples')
    query = porsche_query()
    p = find_porsche(db, query)
    import pprint

In [ ]:
from autos import process_file


def insert_autos(infile, db):
    autos = process_file(infile)
    
    

    # Your code here. Insert the data in one command
    # autos will be a list of dictionaries, as in the example in the previous video
    # You have to insert data in a collection 'autos'
    
    db.autos.insert(autos)


  
if __name__ == "__main__":
    
    from pymongo import MongoClient
    client = MongoClient("mongodb://localhost:27017")
    db = client.examples

    insert_autos('autos-small.csv', db)
    print db.autos.find_one()

In [6]:
#!/usr/bin/env python
""" Your task is to write a query that will return all cities
that are founded in 21st century.
Please modify only 'range_query' function, as only that will be taken into account.

Your code will be run against a MongoDB instance that we have provided.
If you want to run this code locally on your machine,
you have to install MongoDB, download and insert the dataset.
For instructions related to MongoDB setup and datasets please see Course Materials.
"""
from datetime import datetime
    
def get_db():
    from pymongo import MongoClient
    client = MongoClient('localhost:27017')
    db = client.examples
    return db


def range_query():
    # You can use datetime(year, month, day) to specify date in the query
    query = {"foundingDate" : {"$gte" : datetime(2001,1,1)}}
    return query


if __name__ == "__main__":

    db = get_db()
    query = range_query()
    cities = db.cities.find(query)

    print "Found cities:", cities.count()
    import pprint
    pprint.pprint(cities[0])


 Found cities: 0
---------------------------------------------------------------------------
IndexError                                Traceback (most recent call last)
<ipython-input-6-d0a1eb37cc05> in <module>()
     32     print "Found cities:", cities.count()
     33     import pprint
---> 34     pprint.pprint(cities[0])

/usr/local/lib/python2.7/site-packages/pymongo/cursor.pyc in __getitem__(self, index)
    595             for doc in clone:
    596                 return doc
--> 597             raise IndexError("no such item for Cursor instance")
    598         raise TypeError("index %r cannot be applied to Cursor "
    599                         "instances" % index)

IndexError: no such item for Cursor instance

In [ ]:
#!/usr/bin/env python
""" Your task is to write a query that will return all cars manufactured by "Ford Motor Company"
that are assembled in Germany, United Kingdom, or Japan.
Please modify only 'in_query' function, as only that will be taken into account.

Your code will be run against a MongoDB instance that we have provided.
If you want to run this code locally on your machine,
you have to install MongoDB, download and insert the dataset.
For instructions related to MongoDB setup and datasets please see Course Materials.
"""

def get_db():
    from pymongo import MongoClient
    client = MongoClient('localhost:27017')
    db = client.examples
    return db


def in_query():
    # Write the query
    query = {"manufacturer" : "Ford Motor Company", "assembly" : {"$in" : ["Germany", "United Kingdom", "Japan"]}}
    
    return query


if __name__ == "__main__":

    db = get_db()
    query = in_query()
    autos = db.autos.find(query, {"name":1, "manufacturer":1, "assembly": 1, "_id":0})

    print "Found autos:", autos.count()
    import pprint
    for a in autos:
        pprint.pprint(a)

In [ ]:
#!/usr/bin/env python
""" Your task is to write a query that will return all cars with width dimension greater than 2.5
Please modify only 'dot_query' function, as only that will be taken into account.

Your code will be run against a MongoDB instance that we have provided.
If you want to run this code locally on your machine,
you have to install MongoDB, download and insert the dataset.
For instructions related to MongoDB setup and datasets please see Course Materials.
"""


def get_db():
    from pymongo import MongoClient
    client = MongoClient('localhost:27017')
    db = client.examples
    return db


def dot_query():
    query = {"dimensions.width" : {"$gt" : 2.5}}
    return query


if __name__ == "__main__":

    db = get_db()
    query = dot_query()
    cars = db.cars.find(query)

    print "Found cars:", cars.count()
    import pprint
    pprint.pprint(cars[0])

In [72]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
In this problem set you work with another type of infobox data, audit it, clean it, 
come up with a data model, insert it into a MongoDB and then run some queries against your database.
The set contains data about Arachnid class.
Your task in this exercise is to parse the file, process only the fields that are listed in the
FIELDS dictionary as keys, and return a dictionary of cleaned values. 

The following things should be done:
- keys of the dictionary changed according to the mapping in FIELDS dictionary
- trim out redundant description in parenthesis from the 'rdf-schema#label' field, like "(spider)"
- if 'name' is "NULL" or contains non-alphanumeric characters, set it to the same value as 'label'.
- if a value of a field is "NULL", convert it to None
- if there is a value in 'synonym', it should be converted to an array (list)
  by stripping the "{}" characters and splitting the string on "|". Rest of the cleanup is up to you,
  eg removing "*" prefixes etc
- strip leading and ending whitespace from all fields, if there is any
- the output structure should be as follows:
{ 'label': 'Argiope',
  'uri': 'http://dbpedia.org/resource/Argiope_(spider)',
  'description': 'The genus Argiope includes rather large and spectacular spiders that often ...',
  'name': 'Argiope',
  'synonym': ["One", "Two"],
  'classification': {
                    'family': 'Orb-weaver spider',
                    'class': 'Arachnid',
                    'phylum': 'Arthropod',
                    'order': 'Spider',
                    'kingdom': 'Animal',
                    'genus': None
                    }
}
  * Note that the value associated with the classification key is a dictionary with
    taxonomic labels.
"""
import codecs
import csv
import json
import pprint
import re

DATAFILE = 'arachnid.csv'
FIELDS ={'rdf-schema#label': 'label',
         'URI': 'uri',
         'rdf-schema#comment': 'description',
         'synonym': 'synonym',
         'name': 'name',
         'family_label': 'family',
         'class_label': 'class',
         'phylum_label': 'phylum',
         'order_label': 'order',
         'kingdom_label': 'kingdom',
         'genus_label': 'genus'}


def process_file(filename, fields):

    process_fields = fields.keys()
    data = []
    with open(filename, "r") as f:
        reader = csv.DictReader(f)
        for i in range(3):
            l = reader.next()

        for line in reader:
            
            # swap keys, strip whitespace
            for key,value in FIELDS.items():  
                line[value] = line.pop(key)
                line[value] = line[value].strip(' ')
            
            # discard unwanted pairs
            for key in line.keys():
                if key not in FIELDS.values():
                    del line[key]
            
            # strip junk from label and name
            line['label'] = re.sub(r' (.*)', '', line['label'])
            if line['name'] == 'NULL' or re.match(r'\W|\w*\s\W', line['name']):
                line['name'] = line['label']
            
            # replace 'NULL' with None
            for key,value in line.items():
                if value == 'NULL':
                    line[key] = None
            
            # turn synonym string into a list
            if line['synonym'] != None:
                if line['synonym'][0] != '{':
                    line['synonym'] = [line['synonym']]
                elif line['synonym'][0] == '{':
                    line['synonym'] = line['synonym'].replace('{', '').replace('}', '').replace('*', '').split('|')
                    line['synonym'] = [syn.strip(' ') for syn in line['synonym']]
            
            # combine certain pairs into classification
            line['classification'] = {}
            for key in ['family','class','phylum','order','kingdom','genus']:
                line['classification'][key] = line[key]
                del line[key]

            data.append(line)        
 
    return data


def parse_array(v):
    if (v[0] == "{") and (v[-1] == "}"):
        v = v.lstrip("{")
        v = v.rstrip("}")
        v_array = v.split("|")
        v_array = [i.strip() for i in v_array]
        return v_array
    return [v]


def test():
    data = process_file(DATAFILE, FIELDS)

    pprint.pprint(data[0])
    assert data[0] == {
                        "synonym": None, 
                        "name": "Argiope", 
                        "classification": {
                            "kingdom": "Animal", 
                            "family": "Orb-weaver spider", 
                            "order": "Spider", 
                            "phylum": "Arthropod", 
                            "genus": None, 
                            "class": "Arachnid"
                        }, 
                        "uri": "http://dbpedia.org/resource/Argiope_(spider)", 
                        "label": "Argiope", 
                        "description": "The genus Argiope includes rather large and spectacular spiders that often have a strikingly coloured abdomen. These spiders are distributed throughout the world. Most countries in tropical or temperate climates host one or more species that are similar in appearance. The etymology of the name is from a Greek name meaning silver-faced."
                    }


if __name__ == "__main__":
    test()


{'classification': {'class': 'Arachnid',
                    'family': 'Orb-weaver spider',
                    'genus': None,
                    'kingdom': 'Animal',
                    'order': 'Spider',
                    'phylum': 'Arthropod'},
 'description': 'The genus Argiope includes rather large and spectacular spiders that often have a strikingly coloured abdomen. These spiders are distributed throughout the world. Most countries in tropical or temperate climates host one or more species that are similar in appearance. The etymology of the name is from a Greek name meaning silver-faced.',
 'label': 'Argiope',
 'name': 'Argiope',
 'synonym': None,
 'uri': 'http://dbpedia.org/resource/Argiope_(spider)'}

In [75]:
import json

def insert_data(data, db):
    db.arachnid.insert(data)


if __name__ == "__main__":
    
    from pymongo import MongoClient
    client = MongoClient("mongodb://localhost:27017")
    db = client.examples

    with open('arachnid.json') as f:
        data = json.loads(f.read())
        insert_data(data, db)
        print db.arachnid.find_one()


{u'synonym': None, u'description': u'The genus Argiope includes rather large and spectacular spiders that often have a strikingly coloured abdomen. These spiders are distributed throughout the world. Most countries in tropical or temperate climates host one or more species that are similar in appearance. The etymology of the name is from a Greek name meaning silver-faced.', u'classification': {u'kingdom': u'Animal', u'family': u'Orb-weaver spider', u'order': u'Spider', u'phylum': u'Arthropod', u'genus': None, u'class': u'Arachnid'}, u'uri': u'http://dbpedia.org/resource/Argiope_(spider)', u'label': u'Argiope', u'_id': ObjectId('54d5a0012cc823199eb36f4b'), u'name': u'Argiope'}

In [23]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
In this problem set you work with another type of infobox data, audit it, clean it, 
come up with a data model, insert it into a MongoDB and then run some queries against your database.
The set contains data about Arachnid class.

The data is already in the database. But you have been given a task to also include 'binomialAuthority'
information in the data, so you have to go through the data and update the existing entries.

The following things should be done in the function add_field:
- process the csv file and extract 2 fields - 'rdf-schema#label' and 'binomialAuthority_label'
- clean up the 'rdf-schema#label' same way as in the first exercise - removing redundant "(spider)" suffixes
- return a dictionary, with 'label' being the key, and 'binomialAuthority_label' the value
- if 'binomialAuthority_label' is "NULL", skip the item

The following should be done in the function update_db:
- query the database by using the field 'label'
- update the data, by adding a new item under 'classification' with a key 'binomialAuthority'


The resulting data should look like this:
- the output structure should be as follows:
{ 'label': 'Argiope',
  'uri': 'http://dbpedia.org/resource/Argiope_(spider)',
  'description': 'The genus Argiope includes rather large and spectacular spiders that often ...',
  'name': 'Argiope',
  'synonym': ["One", "Two"],
  'classification': {
                    'binomialAuthority': None,
                    'family': 'Orb-weaver spider',
                    'class': 'Arachnid',
                    'phylum': 'Arthropod',
                    'order': 'Spider',
                    'kingdom': 'Animal',
                    'genus': None
                    }
}
"""
import codecs
import csv
import json
import pprint
import re

DATAFILE = 'arachnid.csv'
FIELDS ={'rdf-schema#label': 'label',
         'binomialAuthority_label': 'binomialAuthority'}


def add_field(filename, fields):

    process_fields = fields.keys()
    data = {}
    with open(filename, "r") as f:
        reader = csv.DictReader(f)
        for i in range(3):
            l = reader.next()
        for line in reader:
            
            # swap keys, strip whitespace
            for key,value in FIELDS.items():  
                line[value] = line.pop(key)
                line[value] = line[value].strip(' ')
            
            # discard unwanted pairs
            for key in line.keys():
                if key not in FIELDS.values():
                    del line[key]
            
            # strip junk from label and name
            line['label'] = re.sub(r' (.*)', '', line['label'])
            
            # turn binomialAuthority string into a list
            if line['binomialAuthority'] != None:
                if line['binomialAuthority'][0] != '{':
                    line['binomialAuthority'] = [line['binomialAuthority']]
                elif line['binomialAuthority'][0] == '{':
                    line['binomialAuthority'] = line['binomialAuthority'].replace('{', '').replace('}', '').replace('*', '').split('|')
                    line['binomialAuthority'] = [syn.strip(' ') for syn in line['binomialAuthority']]

            # no NULLs
            if line['binomialAuthority'][0] != 'NULL':
                data[line['label']] = line['binomialAuthority']


    return data


def update_db(data, db):
    for key,value in data.items():
        db.arachnid.update({"label" : key}, 
                           {"$set" : {"classification" : {"binomialAuthority" : value[0]}}})



def test():
    # Please change only the add_field and update_db functions!
    # Changes done to this function will not be taken into account
    # when doing a Test Run or Submit, they are just for your own reference
    # and as an example for running this code locally!
    
    data = add_field(DATAFILE, FIELDS)
    from pymongo import MongoClient
    client = MongoClient("mongodb://localhost:27017")
    db = client.examples

    update_db(data, db)

    updated = db.arachnid.find_one({'label': 'Opisthoncana'})
    pprint.pprint(data)



if __name__ == "__main__":
    test()


---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-23-5238060093b5> in <module>()
    104 
    105 if __name__ == "__main__":
--> 106     test()

<ipython-input-23-5238060093b5> in test()
     98 
     99     updated = db.arachnid.find_one({'label': 'Opisthoncana'})
--> 100     assert updated['classification']['binomialAuthority'] == 'Embrik Strand'
    101     pprint.pprint(data)
    102 

TypeError: 'NoneType' object has no attribute '__getitem__'