In [1]:
import json
import time
from elasticsearch import Elasticsearch
In [2]:
import pickle
In [ ]:
# Notebook-wide Elasticsearch client; insert() below sends its documents through `es`.
es = Elasticsearch(
    [{'host': 'teadaiegpu1.hopto.org', 'port': 9200}]
)
def insert(paths, index='yahoomovie', doc_type='content', client=None):
    """Load pickled records from each path and index them into Elasticsearch.

    For every file in ``paths`` the pickled iterable of dict records is read,
    records containing an ``'error'`` key are dropped, a non-``None``
    ``'IMDb'`` value is converted to ``float`` (the record is updated in
    place), and each remaining record is sent as one document through
    ``client.index``.

    Args:
        paths: Iterable of filesystem paths to pickle files.
        index: Name of the target index.
        doc_type: Document type passed along with every document.
        client: Object exposing ``index(index=..., doc_type=..., body=...)``.
            Defaults to the module-level ``es`` client.

    Raises:
        KeyError: If a record without an ``'error'`` key has no ``'IMDb'`` key.
        ValueError: If an ``'IMDb'`` value cannot be converted to ``float``.
    """
    if client is None:
        # Fall back to the notebook-wide client so existing calls keep working.
        client = es
    for path in paths:
        with open(path, 'rb') as handle:
            # SECURITY: pickle.load can execute arbitrary code while
            # deserializing; only pass paths to files you trust.
            contents = pickle.load(handle)
        noerror = []
        for record in contents:
            if 'error' in record:
                continue
            # Identity comparison is the correct way to test for None.
            if record['IMDb'] is not None:
                record['IMDb'] = float(record['IMDb'])
            noerror.append(record)
        # Index only after the whole file has been filtered, so a malformed
        # record aborts before any document from this file is sent.
        for doc in noerror:
            client.index(index=index, doc_type=doc_type, body=doc)
In [3]:
# Directory that holds the pickled content batches loaded below.
YAHOO_MOVIE_DIR = '/home/yang/notebook/ptt-web-crawler/PttWebCrawler/yahoo_movie'


def _load_pickle(path):
    """Return the object stored in the pickle file at ``path``."""
    # SECURITY: pickle.load can execute arbitrary code while deserializing;
    # only load files you trust.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# One variable per batch file; the names mirror the file names.
contents_1_2501 = _load_pickle(YAHOO_MOVIE_DIR + '/contents_1_2501.pickle')
contents_2501_5001 = _load_pickle(YAHOO_MOVIE_DIR + '/contents_2501_5001.pickle')
contents_5001_7242 = _load_pickle(YAHOO_MOVIE_DIR + '/contents_5001_7242.pickle')
In [32]:
# Concatenate the three loaded batches, keeping them in file order.
all_contents = (
    contents_1_2501
    + contents_2501_5001
    + contents_5001_7242
)
In [33]:
# Total number of records across the three concatenated batches.
len(all_contents)
Out[33]:
In [34]:
# Keep only the records that do not contain an 'error' key. Where a kept
# record has a non-None 'IMDb' value it is converted to float; the record
# dict itself is updated, so all_contents sees the converted value too.
noerror = []
for c in all_contents:
    if 'error' in c:
        continue
    # Identity comparison is the correct way to test for None.
    if c['IMDb'] is not None:
        c['IMDb'] = float(c['IMDb'])
    noerror.append(c)
In [35]:
# Number of records kept after removing those that contain an 'error' key.
len(noerror)
Out[35]:
In [41]:
# Create the Elasticsearch client, then send every kept record as its own
# document (one index request per record).
es = Elasticsearch(
    [{'host': 'teadaiegpu1.hopto.org', 'port': 9200}]
)
for t in noerror:
    es.index(
        index='yahoomovie',
        doc_type='content',
        body=t,
    )
In [ ]: