In [1]:
import json
from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk

# Shared client used by the bulk load below. No hosts are passed, so the
# client falls back to its own default connection settings
# (presumably a local node on localhost:9200 -- confirm for the installed version).
es = Elasticsearch()

In [2]:
# Some helpers stolen from scrAPI to make the data a little more readable
def strip_empty(document, required=tuple()):
    ''' Removes empty fields from the processed schema

    Returns a new dict; ``document`` itself is not modified.

    :param document: dict whose values are cleaned with ``do_strip_empty``
    :param required: keys that are copied through untouched, even when their
        value is empty
    :returns: dict with the required keys plus every other key whose cleaned
        value is truthy. Falsy scalars such as ``0``, ``False`` and ``''``
        count as empty and are dropped too, unless the key is required.
    '''
    new_doc = {}
    for key, value in document.items():
        if key in required:
            new_doc[key] = value
            continue

        cleaned = do_strip_empty(value)
        # Only a list of entries can be filtered. Anything else (None, a
        # scalar, ...) falls through to the truthiness check below instead
        # of raising while being iterated.
        if key == 'otherProperties' and isinstance(cleaned, list):
            # Entries that are not dicts cannot carry 'properties', so they
            # are treated as empty rather than crashing on ``.get``.
            cleaned = [
                prop for prop in cleaned
                if isinstance(prop, dict) and prop.get('properties')
            ]
        if cleaned:
            new_doc[key] = cleaned
    return new_doc


def strip_list(l):
    ''' Cleans every item of a sequence and keeps only the truthy results

    Each item is passed through do_strip_empty first; the result is always
    a list, whatever sequence type was passed in.
    '''
    kept = []
    for item in l:
        cleaned = do_strip_empty(item)
        if cleaned:
            kept.append(cleaned)
    return kept


def do_strip_empty(value):
    ''' Filters empty values from container types

    Dispatches on the exact type of ``value``: dicts go through strip_empty,
    lists and tuples through strip_list. Everything else, including
    subclasses of those types, is returned unchanged.
    '''
    kind = type(value)
    if kind is dict:
        return strip_empty(value)
    if kind is list or kind is tuple:
        return strip_list(value)
    return value

In [3]:
def grid_actions(filename):
    '''Generator that yields elasticsearch actions from the input file

    The file must hold a JSON object with an ``institutes`` list. Every
    record gets a ``num_types`` field (the length of its ``types`` list) and
    is emitted as one bulk action for the ``grid`` index, keyed by the
    record's ``id``.

    :param filename: path of the JSON file to read
    :returns: generator of action dicts with ``_index``, ``_type``, ``_id``
        and ``_source`` keys
    :raises KeyError: if the document has no ``institutes`` key or a record
        has no ``id``
    '''
    # json.load reads the whole document at once, so the file can be closed
    # before the first action is yielded instead of staying open for as long
    # as the consumer keeps the generator alive.
    with open(filename) as f:
        records = json.load(f)['institutes']

    for record in records:
        # A missing or null 'types' counts as zero types.
        record['num_types'] = len(record.get('types') or [])
        yield {
            '_index': 'grid',
            '_type': 'institutions',
            '_id': record['id'],
            # strip_empty drops falsy values; marking num_types as required
            # keeps a count of 0 in the indexed document.
            '_source': strip_empty(record, required=('num_types',))
        }

In [4]:
# Send every action generated from the dump to Elasticsearch. With
# stats_only=True the helper reports a (successes, failures) pair of counts
# rather than per-document error details.
bulk(es, grid_actions('grid_2015_10_09.json'), stats_only=True)


Out[4]:
(49942, 0)