Retrieve JSON documents which are accessible via REST API and store them in mongodb.
In [ ]:
from pymongo import MongoClient
from urllib import urlopen
from jsonpath_rw import jsonpath, parse
from datetime import datetime
import json
import yaml
In [ ]:
with open('config.yaml') as yaml_file:
cfg = yaml.load(yaml_file)
rest_api = cfg['rest-api']
json_cfg = cfg['json-path']
mongo_db = cfg['mongo']
api_list_items = rest_api['url'] + rest_api['get_list']
api_get_item = rest_api['url'] + rest_api['get_detail']
item_id_field = cfg['json-path']['item_id']
item_list_path = parse(cfg['json-path']['item_list'])
In [ ]:
client = MongoClient(mongo_db['url'])
db = client[mongo_db['database']][mongo_db['collection']]
print "%d entries in database." % db.find().count()
In [ ]:
# functions for REST API calls
def get_item_list(offset = 0, limit = 100, url = api_list_items):
request = urlopen(url % (offset, limit))
return json.loads(request.read())
def get_item(id, url = api_get_item):
request = urlopen(url % id)
return json.loads(request.read())
In [ ]:
max_items = 10**6
limit=100
for offset in xrange(0, max_items, limit):
print ("%s - fetching items %s - %s" % (datetime.now().strftime("%Y-%m-%d %H:%M:%S"), offset, offset+limit))
# download item list
result = get_item_list(offset=offset, limit=limit)
item_list = [ item.value for item in item_list_path.find(result) ]
# stop if result list is empty
if len(item_list) == 0:
print "no more results returned"
break
# extract IDs and compare with items already in database
item_ids = [ item[item_id_field] for item in item_list ]
known_ids = [ item[item_id_field] for item in db.find( {item_id_field: { "$in": item_ids }} ) ]
new_ids = [ x for x in item_ids if x not in known_ids ]
print "-> got %d ids (%d known, %d new)" % (len(item_ids), len(known_ids), len(new_ids))
# fetch new items from REST API
items = []
for id in new_ids:
item = get_item(id)
items.append(item)
# insert new items in database
if len(items) != 0:
result = db.insert_many(items)