In [7]:
from imdb import IMDb
from datetime import datetime
from elasticsearch import Elasticsearch
# Elasticsearch client created with no connection arguments.
es = Elasticsearch()
# IMDb access object; later cells use it to fetch movies by id.
ia = IMDb()
# Top 250 list; the indexing cell below reads entries from it by position.
listaPelis = ia.get_top250_movies()
# Bare expression on the last line so the notebook displays the list.
listaPelis
Out[7]:
Tarda bastante en ejecutarse (de 5 a 15 min); mete 250 películas en Elasticsearch.
Se ha quitado el parámetro `id=i` de la llamada a `es.index`.
In [8]:
# Index the parsed summary of each movie in listaPelis into 'prueba-index'.
# `separado` and `sep2` keep their names because the inspection cells below
# display them after this loop has run.
# NOTE(review): the range starts at 10, so the first ten entries of the list
# are not indexed by this loop - confirm that this is intended.
for i in range(10, 250):
    peli = listaPelis[i]
    # Fetch the movie by its id before asking for the text summary.
    peli2 = ia.get_movie(peli.movieID)
    string = peli2.summary()
    separado = string.split('\n')
    solucion = {}
    # The first two lines of the summary are skipped; every other line is
    # parsed as "Field: value". The lines are iterated directly so the outer
    # loop index `i` is no longer overwritten by an inner index.
    for linea in separado[2:]:
        if not linea.strip():
            # A blank line would otherwise create a field with an empty name.
            continue
        # Split on the first colon only, so colons inside the value survive
        # (joining the pieces back with '' used to drop them).
        sep2 = linea.split(':', 1)
        if len(sep2) == 1:
            # Avoids the failure when turning the split into a dict entry:
            # a line without any colon becomes a field with an empty value.
            sep2.append('')
        solucion[sep2[0]] = sep2[1]
    # No explicit id is passed to es.index.
    es.index(index='prueba-index', doc_type='text', body=solucion)
In [12]:
# Show the summary lines left over from the last movie handled by the indexing loop.
separado
Out[12]:
In [9]:
# Show the value part of the last "Field: value" pair parsed by the indexing loop.
sep2[1]
Out[9]:
In [21]:
import pandas as pd
# Collect the text summaries of ten consecutive IMDb ids and show them as a
# one-column frame.
# The ids are written as plain decimal integers: with a leading zero
# (0400000) Python 2 reads the literal as octal, which selected ids
# 131072..131079 (only eight movies), and Python 3 rejects it outright.
lista = []
for i in range(400000, 400010):
    peli = ia.get_movie(i)
    lista.append(peli.summary())
# Build the frame once, after every summary has been collected.
datos = pd.DataFrame(lista)
print(datos.values)
In [22]:
import pandas as pd
# For three consecutive IMDb ids, collect the movie object followed by the
# result of get_movie_plot, then show everything as a frame.
# The ids are plain decimal integers: the former literals with leading zeros
# (0005000) were octal in Python 2, i.e. ids 2560..2562, and are a syntax
# error in Python 3.
lista = []
for i in range(5000, 5003):
    lista.append(ia.get_movie(i))
    lista.append(ia.get_movie_plot(i))
# Build the frame in one step from the collected records instead of seeding an
# empty frame and calling DataFrame.append on it.
datos = pd.DataFrame(lista)
print(datos.values)
In [23]:
from datetime import datetime
from elasticsearch import Elasticsearch
es = Elasticsearch()

# The triple-quoted block below is an inert string: a disabled example that
# would index one sample document into "movies-index".
'''
doc = {
'prueba': 'Holi',
'text': 'A man throws away an old top hat and a tramp uses it to sole his boots.',
}
res = es.index(index="movies-index", doc_type='text', id=1, body=doc)
print(res['created'])
'''

# Read back a single document by id and show its stored source.
res = es.get(index="movies-index", doc_type='text', id=6)
print(res['_source'])

# Refresh the index, then list the "text" field of every hit of a match-all search.
es.indices.refresh(index="movies-index")
match_all = {"query": {"match_all": {}}}
res = es.search(index="movies-index", body=match_all)
found = res['hits']
print("Got %d Hits:" % found['total'])
for hit in found['hits']:
    source = hit["_source"]
    print("%(text)s" % source)
In [7]:
# Make sure ES is up and running before creating the client.
import requests
# The timeout keeps the cell from hanging indefinitely when nothing answers
# on the port; requests waits without limit when no timeout is given.
res = requests.get('http://localhost:9200', timeout=5)
print(res.content)
from elasticsearch import Elasticsearch
# Client bound explicitly to the local node that was just probed.
es = Elasticsearch([{'host': 'localhost', 'port': 9200}])
In [281]:
# List with the Top 250 movies.
top = ia.get_top250_movies()
# Walk the list and index each movie's data in Elasticsearch; the document id
# is the movie's position in the list.
# Iterating over the list itself (capped at 250 entries, as before) avoids the
# IndexError that a fixed range(0, 250) raises when fewer movies come back.
for i, movie in enumerate(top[:250]):
    es.index(index='films-index', doc_type='text', id=i, body=movie.data)
In [24]:
# Match-all search over 'films-index', printing a one-line digest per hit.
match_all = {"query": {"match_all": {}}}
res = es.search(index="films-index", body=match_all)
total = res['hits']['total']
print("Got %d Hits:" % total)
# TODO: modify so that it works
for hit in res['hits']['hits']:
    source = hit["_source"]
    print("%(kind)s %(title)s %(year)s %(rating)s" % source)
In [28]:
# Match-all search over 'prueba-index', printing the parsed summary fields of each hit.
match_all = {"query": {"match_all": {}}}
res = es.search(index="prueba-index", body=match_all)
total = res['hits']['total']
print("Got %d Hits:" % total)
# One template for all hits; each placeholder names a field of the stored document.
template = "%(Title)s %(Genres)s %(Director)s %(Cast)s %(Writer)s %(Country)s %(Language)s %(Rating)s %(Plot)s"
for hit in res['hits']['hits']:
    source = hit["_source"]
    print(template % source)
In [27]:
# Match-all search over 'prueba-index', printing only the title of each hit.
match_all = {"query": {"match_all": {}}}
res = es.search(index="prueba-index", body=match_all)
found = res['hits']
print("Got %d Hits:" % found['total'])
for hit in found['hits']:
    source = hit["_source"]
    print("%(Title)s" % source)
In [26]:
# Run a match-all search over 'prueba-index' and keep the full response in `res`.
res = es.search(index="prueba-index", body={"query": {"match_all": {}}})
# Bare expression so the notebook displays the raw response.
res
Out[26]:
In [56]:
# Search 'prueba-index' for movies directed by Christopher Nolan and ask for
# highlighting on the "Language" field.
# "highlight" is a sibling key of "query" in the request body; it used to be
# wrapped in a keyless `{ ... }` entry, which is not a valid dict literal and
# stopped the cell with a SyntaxError.
res = es.search(index="prueba-index", body={
    "query": {
        "match": {'Director': 'Christopher Nolan'}
    },
    "highlight": {
        "fields": {
            "Language": {}
        }
    }
})
# Bare expression so the notebook displays the raw response.
res
In [57]:
# Match query on the "Director" field, printing the title of every hit.
director_query = {"query": {"match": {'Director': 'Christophe Nola'}}}
res = es.search(index="prueba-index", body=director_query)
found = res['hits']
print("Got %d Hits:" % found['total'])
for hit in found['hits']:
    source = hit["_source"]
    print("%(Title)s" % source)
In [104]:
# Fuzzy multi_match of "Int" against the Plot and Title fields.
multi_match = {
    "query": "Int",
    "fields": ["Plot", "Title"],
    "fuzziness": "2"
}
bodyQuery = {"query": {"multi_match": multi_match}}

res = es.search(index="prueba-index", body=bodyQuery)
#print res
#print("Got %d Hits:" % res['hits']['total'])
# Print the title of each hit.
for hit in res['hits']['hits']:
    source = hit["_source"]
    print("%(Title)s" % source)
In [102]:
# Regexp query on the Title field.
title_pattern = {"Title": "wonder.*"}
bodyQuery = {"query": {"regexp": title_pattern}}

res = es.search(index="prueba-index", body=bodyQuery)
#print res
#print("Got %d Hits:" % res['hits']['total'])
# Print the title of each hit.
for hit in res['hits']['hits']:
    source = hit["_source"]
    print("%(Title)s" % source)
In [93]:
# Match query on Title plus a highlight section, assembled from named parts.
title_match = {
    "query": "wond",
    "operator": "and",
    "zero_terms_query": "all"
}
highlight_spec = {
    "fields": {
        "Title": {},
        "Plot": {"fragment_size": 150, "number_of_fragments": 3}
    },
    # Allows highlighting on fields that were not part of the query,
    # such as Plot in this example.
    "require_field_match": False
}
bodyQuery2 = {
    "query": {"match": {"Title": title_match}},
    "highlight": highlight_spec
}

res = es.search(index="prueba-index", body=bodyQuery2)
found = res['hits']
print("Got %d Hits:" % found['total'])
# [0] is used in the disabled line below because there is only 1 hit; with
# more hits the list would have more entries, and a for loop like the one
# used here would be needed to get the highlight of each of them.
#print res['hits']['hits'][0]['highlight']
for hit in found['hits']:
    print(hit)
In [114]:
def _match_clause(field, fuzziness="AUTO"):
    """Build one fuzzy `match` clause for `field`.

    Every clause shares the same search text, prefix length and operator;
    only the field name and the fuzziness differ between clauses.
    """
    return {
        "match": {
            field: {
                "query": "wonder" + ".*",
                "fuzziness": fuzziness,
                "prefix_length": 1,
                "operator": "and"
            }
        }
    }


# Bool query: a document matches when any of the per-field clauses matches.
bodyQuery2 = {
    "query": {
        "bool": {
            # One clause per field; "Plot" is the only one with a numeric fuzziness.
            "should": [
                _match_clause("Title"),
                _match_clause("Plot", fuzziness=2),
                _match_clause("Genres"),
                _match_clause("Director"),
                _match_clause("Writer"),
                _match_clause("Cast"),
                _match_clause("Country"),
                _match_clause("Language"),
                _match_clause("Rating"),
            ]
        }
    },
    "highlight": {
        "fields": {
            "Title": {},
            "Plot": {},
            "Director": {}
        },
        # Allows highlighting on fields that were not part of the query,
        # such as Plot in this example.
        "require_field_match": False
    }
}

# Disabled alternative body (single boosted match on Title), kept for reference:
#     "query": {
#         "match": {
#             "Title": {
#                 "query": buscado,
#                 "fuzziness": "AUTO",
#                 "boost" : 2.0,
#                 "prefix_length" : 1,
#                 "max_expansions": 100,
#                 #"minimum_should_match" : 10,
#                 "operator": "and"
#             }
#         }
#     },
#     "highlight": {
#         "fields": {
#             "Title": {},
#             "Plot": {"fragment_size": 300, "number_of_fragments": 3}
#         },
#         # Allows highlighting on fields that were not part of the query
#         "require_field_match": False
#     }

res = es.search(index="prueba-index", body=bodyQuery2)
print("Got %d Hits:" % res['hits']['total'])
# [0] is used in the disabled line below because there was only 1 hit; with
# more hits the list has more entries, and a loop is needed to get the
# highlight of each of them.
# print res['hits']['hits'][0]['highlight']
resultado = list(res['hits']['hits'])
# Print the title of every returned hit. The former lookup resultado[10] was
# out of range: a search that sets no `size` returns at most 10 hits, so the
# valid positions are 0..9.
for hit in resultado:
    print(hit['_source']['Title'])
In [25]:
# Delete the document with id 1 (type 'text') from 'prueba-index'.
# NOTE(review): the loop that fills 'prueba-index' passes no id to es.index,
# so confirm that a document with id 1 actually exists before running this.
es.delete(index='prueba-index', doc_type='text', id=1)