In [1]:
import json
from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk
# Elasticsearch client created with the library's default connection settings
# (no hosts or options are passed here).
es = Elasticsearch()
In [2]:
# Helpers borrowed from scrAPI that strip empty fields, making the data a little more readable
def strip_empty(document, required=tuple()):
    ''' Builds a copy of ``document`` with its empty fields removed

    Keys listed in ``required`` are copied over untouched, whatever their
    value. Every other value is cleaned with ``do_strip_empty`` and kept only
    if the cleaned result is truthy -- so ``None``, ``''``, empty containers
    and also ``0`` / ``False`` are dropped.

    The ``'otherProperties'`` key gets an extra pass: only entries whose
    ``'properties'`` value is truthy survive.

    :param document: mapping to clean (it is read, never modified)
    :param required: collection of keys that must always be kept as-is
    :return: a new dict holding the surviving fields
    '''
    cleaned = {}
    for key, raw in document.items():
        # Required fields bypass cleaning entirely.
        if key in required:
            cleaned[key] = raw
            continue

        stripped = do_strip_empty(raw)
        if key == 'otherProperties':
            stripped = [entry for entry in stripped if entry.get('properties')]
        if stripped:
            cleaned[key] = stripped
    return cleaned
def strip_list(l):
    ''' Cleans every item of a sequence and drops the ones left empty

    Each item is passed through ``do_strip_empty``; items whose cleaned value
    is falsy are discarded. The result is always a ``list``, even when the
    input is a tuple.
    '''
    cleaned_items = (do_strip_empty(item) for item in l)
    return [item for item in cleaned_items if item]
def do_strip_empty(value):
    ''' Filters empty values from container types

    Dispatches on the exact type of ``value`` (subclasses are not matched):
    a ``dict`` goes to ``strip_empty``, a ``list`` or ``tuple`` goes to
    ``strip_list``, and anything else is returned unchanged.
    '''
    kind = type(value)
    if kind is dict:
        return strip_empty(value)
    if kind is list or kind is tuple:
        return strip_list(value)
    # Scalars and any other type are passed through untouched.
    return value
In [3]:
def grid_actions(filename):
    '''Generator that yields elasticsearch actions from the input file

    The file must contain a JSON object whose ``'institutes'`` key holds the
    list of records. One action is yielded per record, targeting index
    ``'grid'`` / type ``'institutions'`` and using the record's ``'id'`` as
    the document id (a record without ``'id'`` raises ``KeyError``).

    The document body is the record with its empty fields stripped, plus a
    ``'num_types'`` field holding the number of entries under ``'types'``.

    :param filename: path of the JSON file to read
    :return: generator of action dicts suitable for ``elasticsearch.helpers.bulk``
    '''
    # Load everything up front so the file handle is released before the
    # (potentially slow) bulk indexing starts consuming the generator.
    with open(filename) as f:
        records = json.load(f)['institutes']

    for record in records:
        source = strip_empty(record)
        # Added *after* stripping: strip_empty drops falsy values, so a count
        # of 0 would otherwise vanish from documents that have no types.
        # `or []` also covers an explicit null in the input.
        source['num_types'] = len(record.get('types') or [])
        yield {
            '_index': 'grid',
            '_type': 'institutions',
            '_id': record['id'],
            '_source': source,
        }
In [4]:
# Send every action generated from 'grid_2015_10_09.json' to Elasticsearch
# through the bulk helper. stats_only=True is passed so the helper reports
# counts instead of per-document error details (per the elasticsearch.helpers
# docs -- confirm for the installed client version).
bulk(es, grid_actions('grid_2015_10_09.json'), stats_only=True)
Out[4]: