In [60]:
from sqlalchemy import create_engine, MetaData, Table
from sqlalchemy.orm import mapper, sessionmaker
import requests
import json
from elasticsearch import Elasticsearch
import re
# Name of the Elasticsearch index that the cells referencing `my_index` operate on.
my_index = 'zadolbali'
In [25]:
# Sanity check: print what the node answers on its root endpoint.
print(
    requests.get('http://localhost:9200').text
)

# Client handle used by the indexing and search cells.
es = Elasticsearch([dict(host='localhost', port=9200)])
In [26]:
def get_es_stats():
    """Print the `_cat` health, nodes, shards and indices tables of the local node, in that order."""
    cat_urls = (
        'http://localhost:9200/_cat/health?v',
        'http://localhost:9200/_cat/nodes?v',
        'http://localhost:9200/_cat/shards?v',
        'http://localhost:9200/_cat/indices?v',
    )
    for cat_url in cat_urls:
        print(requests.get(cat_url).text)


get_es_stats()
In [4]:
def delete_index(index):
    """Send an HTTP DELETE for `index` to the local node and return the response body as text."""
    target = 'http://localhost:9200/{0}?pretty'.format(index)
    response = requests.delete(target)
    return response.text


print(delete_index(my_index))
In [5]:
# Print the cluster tables again, now that the delete request has been sent.
get_es_stats()
In [6]:
def create_index(index, settings):
    """Create the index named `index` on the local Elasticsearch node.

    Args:
        index: Name of the index to create; interpolated into the request URL.
        settings: JSON-serialisable dict sent as the request body.

    Returns:
        The raw response body as text.
    """
    # Build the URL from the argument instead of a fixed index name, so the
    # function acts on the index the caller asked for.
    url = 'http://localhost:9200/{0}'.format(index)
    # Declare the body as JSON, as the mapping/settings helpers in this
    # notebook already do.
    headers = {'Content-type': 'application/json'}
    return requests.put(url, headers=headers, data=json.dumps(settings)).text
def setup_index_mapping(index, settings, doc_type='story'):
    """PUT a field mapping for `doc_type` documents of `index`.

    Args:
        index: Name of the index whose mapping is updated; interpolated into
            the request URL.
        settings: JSON-serialisable mapping body (e.g. a `properties` dict).
        doc_type: Document type segment of the URL. Defaults to 'story',
            the type used throughout this notebook.

    Returns:
        The raw response body as text.
    """
    headers = {'Content-type': 'application/json', 'Accept': 'text/plain'}
    # Build the URL from the arguments instead of a fixed index name.
    url = 'http://localhost:9200/{0}/_mappings/{1}?pretty'.format(index, doc_type)
    return requests.put(url, headers=headers, data=json.dumps(settings)).text
def get_index_state(index):
    """Print the `_settings` response and then the `_mapping` response for `index`."""
    url_templates = (
        'http://localhost:9200/{0}/_settings?pretty',
        'http://localhost:9200/{0}/_mapping?pretty',
    )
    for url_template in url_templates:
        print(requests.get(url_template.format(index)).text)
In [7]:
# Index-creation body: shard and replica counts for the new index.
create_settings = dict(
    settings=dict(
        index=dict(number_of_shards=5, number_of_replicas=1),
    ),
)
create_index(my_index, create_settings)
Out[7]:
In [8]:
# Explicit field mapping for story documents; title and text are analysed with 'russian'.
mapping_settings = dict(
    properties=dict(
        id=dict(type='integer'),
        title=dict(type='text', analyzer='russian'),
        text=dict(type='text', analyzer='russian'),
        published=dict(type='date', format='yyyyMMdd'),
        likes=dict(type='integer'),
        tags=dict(type='keyword'),
        url=dict(type='text'),
    ),
)
print(setup_index_mapping(my_index, mapping_settings))
In [9]:
# Print the index settings and mapping to confirm what the node actually stored.
get_index_state(my_index)
In [10]:
class Story(object):
    """Empty class that loadSession() maps onto the reflected `stories` table."""
    pass
def loadSession():
    """Open a SQLAlchemy session on the stories SQLite corpus.

    Reflects the `stories` table from '../corpus/stories.sqlite', maps it
    onto `Story` (only when `Story` has no mapper yet, so the function can be
    called more than once for the same class), and returns a new session.

    Returns:
        A new session bound to the SQLite engine.
    """
    # Local import: only the mapping guard needs it.
    from sqlalchemy import inspect

    dbPath = '../corpus/stories.sqlite'
    # echo=True makes the engine log the SQL statements it emits.
    engine = create_engine('sqlite:///%s' % dbPath, echo=True)
    bookmarks = Table('stories', MetaData(engine), autoload=True)
    # Mapping an already-mapped class raises, so only map on the first call.
    # inspect(..., raiseerr=False) returns None for an unmapped class.
    if inspect(Story, raiseerr=False) is None:
        mapper(Story, bookmarks)
    Session = sessionmaker(bind=engine)
    return Session()


session = loadSession()
In [56]:
# Load every Story row, then report how many there are and which attributes a row exposes.
stories = session.query(Story).all()
print(len(stories), dir(stories[0]), sep='\n')
In [12]:
# Story attributes: 'hrefs', 'id', 'likes', 'published', 'tags', 'text', 'title', 'url'
def index_data(index, docs=None):
    """Index story rows into `index` as `story` documents, one request per row.

    Args:
        index: Name of the target index.
        docs: Iterable of objects exposing id, title, text, published, likes,
            tags and url attributes. Defaults to the notebook-level `stories`.
    """
    if docs is None:
        docs = stories
    for story in docs:
        # A missing/empty tags value becomes an empty list instead of raising
        # or producing [''], and repeated spaces no longer yield empty tags.
        raw_tags = story.tags or ''
        tags = [tag for tag in raw_tags.split(' ') if tag]
        body = {
            'id': story.id,
            'title': story.title,
            'text': story.text,
            'published': story.published,
            'likes': story.likes,
            'tags': tags,
            'url': story.url
        }
        es.index(index=index, doc_type='story', id=story.id, body=body)
# Index all loaded stories into the index named by my_index.
index_data(my_index)
In [13]:
def run_easy_tests(index):
    """Print a fetch of document id 2 and a `match` search for the tag 'mail'.

    Args:
        index: Name of the index to query.

    Returns:
        None; both results are printed.
    """
    # Pass doc_type here as well, consistent with every other client call in
    # this notebook.
    print(es.get(index=index, doc_type='story', id=2))
    print(es.search(index=index, doc_type='story', body={'query': {'match': {'tags': 'mail'}}}))
In [14]:
# run_easy_tests prints its results itself and returns None, so it is called
# without an outer print() (which would only add a stray "None" line).
run_easy_tests(my_index)
In [15]:
def run_hard_tests(index):
    """Print `match` search results on the `text` field for 'страховая', then for 'страхов'."""
    for term in ('страховая', 'страхов'):
        query = {'query': {'match': {'text': term}}}
        print(es.search(index=index, doc_type='story', body=query))
In [16]:
# run_hard_tests prints its results itself and returns None, so it is called
# without an outer print() (which would only add a stray "None" line).
run_hard_tests(my_index)
In [17]:
# Drop and recreate the index, then close it before the analysis settings are changed.
print(delete_index(my_index))
print(create_index(my_index, create_settings))
# Build the URL from my_index so all three requests target the same index.
print(requests.post('http://localhost:9200/{0}/_close'.format(my_index)).text)
In [18]:
def setup_index_settings(index, settings):
    """PUT new index-level settings (e.g. analysis configuration) for `index`.

    Args:
        index: Name of the index to update; interpolated into the request URL.
        settings: JSON-serialisable settings body.

    Returns:
        The raw response body as text.
    """
    headers = {'Content-type': 'application/json', 'Accept': 'text/plain'}
    # Build the URL from the argument instead of a fixed index name.
    url = 'http://localhost:9200/{0}/_settings?pretty'.format(index)
    return requests.put(url, headers=headers, data=json.dumps(settings)).text
In [19]:
# Analysis settings taken from https://gist.github.com/svartalf/4465752
# Defines a custom 'ru' analyzer plus two token filters. Note that 'ru_stemming'
# is declared here but is not part of the 'ru' analyzer's filter chain.
index_settings = dict(
    analysis=dict(
        analyzer=dict(
            ru=dict(
                type='custom',
                tokenizer='standard',
                filter=['lowercase', 'russian_morphology', 'english_morphology', 'ru_stopwords'],
            ),
        ),
        filter=dict(
            ru_stopwords=dict(
                type='stop',
                stopwords=u'а,без,более,бы,был,была,были,было,быть,в,вам,вас,весь,во,вот,все,всего,всех,вы,где,да,даже,для,до,его,ее,если,есть,еще,же,за,здесь,и,из,или,им,их,к,как,ко,когда,кто,ли,либо,мне,может,мы,на,надо,наш,не,него,нее,нет,ни,них,но,ну,о,об,однако,он,она,они,оно,от,очень,по,под,при,с,со,так,также,такой,там,те,тем,то,того,тоже,той,только,том,ты,у,уже,хотя,чего,чей,чем,что,чтобы,чье,чья,эта,эти,это,я,a,an,and,are,as,at,be,but,by,for,if,in,into,is,it,no,not,of,on,or,such,that,the,their,then,there,these,they,this,to,was,will,with',
            ),
            ru_stemming=dict(
                type='snowball',
                language='Russian',
            ),
        ),
    ),
)
print(setup_index_settings(my_index, index_settings))
In [20]:
# Reopen the index; the URL is built from my_index so it targets the same
# index as the other helper calls in this cell.
print(requests.post('http://localhost:9200/{0}/_open'.format(my_index)).text)

# Same field mapping as before, except that title and text now use the
# custom 'ru' analyzer. This rebinds the name `mapping_settings`.
mapping_settings = {
    'properties': {
        'id': { 'type': 'integer' },
        'title': {
            'type': 'text',
            'analyzer': 'ru'
        },
        'text': {
            'type': 'text',
            'analyzer': 'ru'
        },
        'published': {
            'type': 'date',
            'format': 'yyyyMMdd'
        },
        'likes': { 'type': 'integer' },
        'tags': {
            'type': 'keyword'
        },
        'url': { 'type': 'text' }
    }
}
print(setup_index_mapping(my_index, mapping_settings))
In [21]:
# Index the stories again, this time into the recreated index.
index_data(my_index)
In [22]:
# Print the settings and mapping of the recreated index.
get_index_state(my_index)
In [23]:
# Re-run the same searches against the recreated index. run_hard_tests prints
# its results itself and returns None, so no outer print() is needed.
run_hard_tests(my_index)
In [63]:
# Terms aggregation over the `text` field: "size": 0 asks for no hits, and the
# aggregation asks for up to 100 buckets.
# NOTE(review): `text` is mapped as type 'text'; a terms aggregation on such a
# field may be rejected unless fielddata is enabled — confirm against the node.
print(requests.post(
    # Build the URL from my_index rather than a fixed index name.
    'http://localhost:9200/{0}/story/_search?pretty'.format(my_index),
    # Declare the body as JSON, as the mapping/settings helpers in this notebook do.
    headers={'Content-type': 'application/json'},
    data=json.dumps({
        "size": 0,
        "aggs" : {
            "genres" : {
                "terms" : {
                    "field" : "text",
                    "size" : 100
                }
            }
        }
    }),
).text)
In [78]:
# Concatenate all story texts into one space-separated string for the regex timings.
all_text = ' '.join([story.text for story in stories])
In [76]:
%%timeit
# Regex baseline: scan the concatenated corpus for the literal 'кошка'.
re.findall('кошка', all_text)
In [77]:
%%timeit
# Elasticsearch counterpart: match query for 'кошка' on the `text` field.
es.search(index='zadolbali', doc_type='story', body={'query': {'match': {'text': 'кошка'}}})
In [81]:
%%timeit
# Repeat of the regex timing for 'кошка'.
re.findall('кошка', all_text)
In [85]:
%%timeit
# Repeat of the Elasticsearch timing for 'кошка'.
es.search(index='zadolbali', doc_type='story', body={'query': {'match': {'text': 'кошка'}}})
In [86]:
%%timeit
# Regex timing for the single letter 'я'; as a regex this matches every
# occurrence of the character, not only the standalone word.
re.findall('я', all_text)
In [87]:
%%timeit
# Elasticsearch timing for a match query on 'я'.
es.search(index='zadolbali', doc_type='story', body={'query': {'match': {'text': 'я'}}})
In [86]:
%%timeit
# Regex timing for the literal 'страховая'.
re.findall('страховая', all_text)
In [87]:
%%timeit
es.search(index='zadolbali', doc_type='story', body={'query': {'match': {'text': 'я'}}})
In [99]:
%%timeit
# Regex timing for the literal 'душистый'.
re.findall('душистый', all_text)
In [100]:
%%timeit
# Elasticsearch timing for a match query on 'душистый'.
es.search(index='zadolbali', doc_type='story', body={'query': {'match': {'text': 'душистый'}}})
In [ ]: