In [1]:
import json
import time
from elasticsearch import Elasticsearch

In [2]:
import pickle

In [ ]:
# Module-level Elasticsearch client; `insert` below reads this name when it
# indexes documents.
es = Elasticsearch(
    [
        {'host': 'teadaiegpu1.hopto.org', 'port': 9200},
    ]
)
def insert(paths, client=None, index='yahoomovie', doc_type='content'):
    """Index the error-free records of pickled crawl files into Elasticsearch.

    Each file in ``paths`` is unpickled into an iterable of dict records.
    Records that contain an ``'error'`` key are skipped.  For the remaining
    records a non-``None`` ``'IMDb'`` value is converted to ``float``, and the
    record is then sent to the cluster as one document.

    Parameters
    ----------
    paths : iterable of str
        Paths of the pickle files to load.
    client : object, optional
        Object exposing ``index(index=..., doc_type=..., body=...)``.
        Defaults to the module-level ``es`` client.
    index : str, optional
        Target index name (default ``'yahoomovie'``).
    doc_type : str, optional
        Document type passed to the client (default ``'content'``).

    Raises
    ------
    KeyError
        If a record without ``'error'`` has no ``'IMDb'`` key.
    ValueError, TypeError
        If an ``'IMDb'`` value cannot be converted by ``float``.

    Notes
    -----
    All records of a file are converted before the first one is indexed, so
    a conversion failure means nothing from that file is sent.
    """
    if client is None:
        # Fall back to the notebook-level client so plain `insert(paths)`
        # calls behave exactly as before.
        client = es

    for path in paths:
        with open(path, 'rb') as handle:
            # SECURITY: pickle.load can execute arbitrary code embedded in the
            # file; only feed this function files from a trusted source.
            contents = pickle.load(handle)
        # The file is closed at this point, so the handle is not held open
        # for the duration of the network calls below.

        noerror = []
        for record in contents:
            if 'error' in record:
                continue
            if record['IMDb'] is not None:
                record['IMDb'] = float(record['IMDb'])
            noerror.append(record)

        for document in noerror:
            client.index(index=index, doc_type=doc_type, body=document)

In [3]:
def _read_pickle(path):
    """Return the object stored in the pickle file at ``path``."""
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# The crawl results are stored as three pickle files, one per batch.
contents_1_2501 = _read_pickle('/home/yang/notebook/ptt-web-crawler/PttWebCrawler/yahoo_movie/contents_1_2501.pickle')
contents_2501_5001 = _read_pickle('/home/yang/notebook/ptt-web-crawler/PttWebCrawler/yahoo_movie/contents_2501_5001.pickle')
contents_5001_7242 = _read_pickle('/home/yang/notebook/ptt-web-crawler/PttWebCrawler/yahoo_movie/contents_5001_7242.pickle')

In [32]:
# Concatenate the three batches, in file order, into one sequence.
all_contents = (
    contents_1_2501
    + contents_2501_5001
    + contents_5001_7242
)

In [33]:
# Total number of records across the three batches, before any filtering.
len(all_contents)


Out[33]:
7241

In [34]:
# Keep only the records that have no 'error' key.  A non-None 'IMDb' value is
# converted to float; None is left as is.  The conversion is applied to a
# shallow copy so the records in `all_contents` stay exactly as loaded and
# this cell can be re-run without depending on earlier in-place edits.
noerror = []
for c in all_contents:
    if 'error' in c:
        continue
    record = c.copy()
    # Identity test: `is not None` is the correct way to compare against the
    # None singleton.
    if record['IMDb'] is not None:
        record['IMDb'] = float(record['IMDb'])
    noerror.append(record)

In [35]:
# Number of records remaining after dropping the ones that carry an 'error' key.
len(noerror)


Out[35]:
6165

In [41]:
# (Re)create the client, then send every cleaned record to the 'yahoomovie'
# index as its own document -- one `index` call per record.
es = Elasticsearch(
    [{'host': 'teadaiegpu1.hopto.org', 'port': 9200}]
)
for movie_doc in noerror:
    es.index(
        index='yahoomovie',
        doc_type='content',
        body=movie_doc,
    )

In [ ]: