In [1]:
%matplotlib inline
%pylab inline


Populating the interactive namespace from numpy and matplotlib

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
from pyelasticsearch import ElasticSearch, bulk_chunks
import pandas as pd

In [4]:
# Connection and naming settings shared by the cells below.
ES_HOST = "http://localhost:9200/"   # URL handed to the ElasticSearch client
INDEX_NAME = 'expo2009'              # index that is dropped, recreated and mapped
DOC_TYPE = 'flight'                  # document type passed to put_mapping

In [5]:
# Client handle used by every Elasticsearch call in this notebook.
es = ElasticSearch(urls=ES_HOST)

In [6]:
# Document count for the '*' query; no index is passed, so presumably this
# spans every index on the server rather than just INDEX_NAME -- confirm.
es.count('*')['count']


Out[6]:
12140825

In [ ]:
# init index
# Drop INDEX_NAME so that it can be recreated from scratch by the next cell.
# Deletion is best-effort: a failure (e.g. the index does not exist) is
# reported together with its cause instead of aborting the notebook.
try:
    es.delete_index(INDEX_NAME)
    print('Deleting %s' % (INDEX_NAME))
except Exception as exc:
    # Catch Exception rather than using a bare `except`, so that
    # KeyboardInterrupt / SystemExit still propagate.
    print('ERROR: Deleting %s failed! (%r)' % (INDEX_NAME, exc))

In [ ]:
# Create the index; its field mapping is installed separately via put_mapping.
es.create_index(index=INDEX_NAME)

In [ ]:
# https://pyelasticsearch.readthedocs.io/en/latest/api/#pyelasticsearch.ElasticSearch.put_mapping
# https://www.elastic.co/guide/en/elasticsearch/reference/current/null-value.html
#
# The mapping is assembled from per-type field lists instead of one long
# literal, so that each field appears exactly once and its type is obvious.

# Integer fields whose missing values are indexed as -1 (via `null_value`).
NULLABLE_DELAY_FIELDS = [
    'SecurityDelay', 'LateAircraftDelay', 'NASDelay',
    'CarrierDelay', 'WeatherDelay',
]

# Plain integer fields.
INTEGER_FIELDS = [
    'ArrTime', 'AirTime', 'DepTime', 'CRSElapsedTime', 'Distance',
    'CRSDepTime', 'DepDelay', 'TaxiIn', 'ArrDelay', 'TaxiOut',
    'ActualElapsedTime', 'CRSArrTime',
]

# String fields (note that the calendar parts Year/Month/... are mapped as strings).
STRING_FIELDS = [
    'FlightNum', 'Origin', 'Month', 'DayofMonth', 'DayOfWeek',
    'CancellationCode', 'Dest', 'UniqueCarrier', 'message', 'Year',
    'TailNum',
]

# Boolean flags.
BOOLEAN_FIELDS = ['Cancelled', 'Diverted']

# Every field gets its own definition dict (nothing is shared between fields).
properties = {}
properties.update(
    {name: {'type': 'integer', 'null_value': -1} for name in NULLABLE_DELAY_FIELDS})
properties.update({name: {'type': 'integer'} for name in INTEGER_FIELDS})
properties.update({name: {'type': 'string'} for name in STRING_FIELDS})
properties.update({name: {'type': 'boolean'} for name in BOOLEAN_FIELDS})
properties['@timestamp'] = {
    'format': 'strict_date_optional_time||epoch_millis',
    'type': 'date',
}

# Key the mapping by DOC_TYPE (not a hard-coded 'flight') so that it always
# agrees with the doc_type passed to put_mapping below.
mapping = {DOC_TYPE: {'properties': properties}}

es.put_mapping(index=INDEX_NAME, doc_type=DOC_TYPE, mapping=mapping)

In [10]:
# Document count for the '*' query; no index is passed, so presumably this
# spans every index on the server rather than just INDEX_NAME -- confirm.
es.count('*')['count']


Out[10]:
16922190

In [ ]:
# if the import fails, we can selectively remove entries

# GET expo2009/_search
# {
#   "query": {
#     "range": {
#         "@timestamp" : { "gte" : "2002-01-01T00:00:00" }
#     }
#   }
# }

# # https://www.elastic.co/guide/en/elasticsearch/reference/current/docs-delete-by-query.html

# POST expo2009/_delete_by_query
# {
#   "query": { 
#     "range": {
#         "@timestamp" : { "gte" : "2002-01-01T00:00:00" }
#     }
#   }
# }

In [ ]:
# curl -XPOST "http://localhost:9200/expo2009/_delete_by_query" -H 'Content-Type: application/json' -d'
# {
#   "query": { 
#     "range": {
#         "@timestamp" : { "gte" : "2002-01-01T00:00:00" }
#     }
#   }
# }'

In [ ]: