In [22]:
from sqlalchemy import create_engine, MetaData, Table
from sqlalchemy.orm import mapper, sessionmaker
import requests
import json
from elasticsearch import Elasticsearch
# Name of the Elasticsearch index that the cells below create and populate.
my_index = 'zadolbali'
# Client for the node at localhost:9200; used later to index documents.
es = Elasticsearch(hosts=[{'host': 'localhost', 'port': 9200}])
def get_es_stats():
    """Print the `_cat` reports of the node at localhost:9200.

    The health, nodes, shards and indices reports are fetched one after
    another and printed verbatim, in that order. Nothing is returned.
    """
    cat_urls = (
        'http://localhost:9200/_cat/health?v',
        'http://localhost:9200/_cat/nodes?v',
        'http://localhost:9200/_cat/shards?v',
        'http://localhost:9200/_cat/indices?v',
    )
    for cat_url in cat_urls:
        report = requests.get(cat_url)
        print(report.text)
In [23]:
def create_index(index, settings):
    """Create the Elasticsearch index `index` on the node at localhost:9200.

    Args:
        index: Name of the index to create; used as the URL path.
        settings: JSON-serialisable dict sent as the request body.

    Returns:
        The response body as text. The HTTP status is not checked, so an
        error reply from the server is returned like any other body.
    """
    # Declare the body as JSON, as the other PUT helpers in this notebook do;
    # Elasticsearch 6.0+ rejects request bodies without a JSON Content-type.
    headers = {'Content-type': 'application/json'}
    # Build the URL from the argument rather than a fixed index name.
    url = 'http://localhost:9200/{0}'.format(index)
    return requests.put(url, headers=headers, data=json.dumps(settings)).text
def delete_index(index):
    """Send a DELETE for the index named `index` to the node at localhost:9200.

    Returns the response body as text; the HTTP status is not inspected.
    """
    target = 'http://localhost:9200/{0}?pretty'.format(index)
    reply = requests.delete(target)
    return reply.text
def setup_index_settings(index, settings):
    """PUT `settings` to the `_settings` endpoint of the index `index`.

    Args:
        index: Name of the index whose settings are updated.
        settings: JSON-serialisable dict sent as the request body.

    Returns:
        The response body as text. The HTTP status is not checked.
    """
    headers = {'Content-type': 'application/json', 'Accept': 'text/plain'}
    # Address the index that was asked for instead of a fixed name.
    url = 'http://localhost:9200/{0}/_settings?pretty'.format(index)
    return requests.put(url, headers=headers, data=json.dumps(settings)).text
def setup_index_mapping(index, settings, doc_type='story'):
    """PUT a mapping definition for one document type of the index `index`.

    Args:
        index: Name of the index that receives the mapping.
        settings: JSON-serialisable mapping definition sent as the body.
        doc_type: Document type segment of the URL. Defaults to 'story',
            the type this notebook indexes its documents under.

    Returns:
        The response body as text. The HTTP status is not checked.
    """
    headers = {'Content-type': 'application/json', 'Accept': 'text/plain'}
    # Build the URL from the arguments instead of a fixed index name.
    url = 'http://localhost:9200/{0}/_mappings/{1}?pretty'.format(index, doc_type)
    return requests.put(url, headers=headers, data=json.dumps(settings)).text
def get_index_state(index):
    """Print the settings of `index`, then its mapping.

    Both are fetched from the node at localhost:9200 and printed verbatim.
    Nothing is returned.
    """
    for url_template in (
        'http://localhost:9200/{0}/_settings?pretty',
        'http://localhost:9200/{0}/_mapping?pretty',
    ):
        print(requests.get(url_template.format(index)).text)
In [24]:
# Delete the index before it is created again further down; the response text
# is the cell's value.
delete_index(my_index)
Out[24]:
In [25]:
# Index-level settings sent with the create-index request.
create_settings = {
    'settings': {
        'index': {
            'number_of_shards': 5,
            'number_of_replicas': 1,
        }
    }
}
# Print the reply so a failed creation is visible instead of being discarded.
print(create_index(my_index, create_settings))
# Close the index: the analysis settings in the next cell are applied while it
# is closed, and it is reopened afterwards. The URL follows `my_index` so the
# cell keeps working if the index name changes.
print(requests.post('http://localhost:9200/{0}/_close'.format(my_index)).text)
In [26]:
# Analysis settings taken from https://gist.github.com/svartalf/4465752
# Defines one custom analyzer, 'ru', which the field mappings refer to by name.
index_settings = {
    'analysis': {
        'analyzer': {
            'ru': {
                'type': 'custom',
                'tokenizer': 'standard',
                # NOTE(review): 'russian_morphology' and 'english_morphology'
                # are not defined in the 'filter' section below - presumably
                # they come from an installed analysis plugin; confirm.
                'filter': ['lowercase', 'russian_morphology', 'english_morphology', 'ru_stopwords'],
            },
        },
        'filter': {
            # Stop filter with a comma-separated list of Russian and English words.
            'ru_stopwords': {
                'type': 'stop',
                'stopwords': u'а,без,более,бы,был,была,были,было,быть,в,вам,вас,весь,во,вот,все,всего,всех,вы,где,да,даже,для,до,его,ее,если,есть,еще,же,за,здесь,и,из,или,им,их,к,как,ко,когда,кто,ли,либо,мне,может,мы,на,надо,наш,не,него,нее,нет,ни,них,но,ну,о,об,однако,он,она,они,оно,от,очень,по,под,при,с,со,так,также,такой,там,те,тем,то,того,тоже,той,только,том,ты,у,уже,хотя,чего,чей,чем,что,чтобы,чье,чья,эта,эти,это,я,a,an,and,are,as,at,be,but,by,for,if,in,into,is,it,no,not,of,on,or,such,that,the,their,then,there,these,they,this,to,was,will,with',
            },
            # NOTE(review): 'ru_stemming' is defined here but is not listed in
            # the 'ru' analyzer's filter chain above, so nothing in this dict
            # uses it - confirm whether that is intended.
            'ru_stemming': {
                'type': 'snowball',
                'language': 'Russian',
            }
        },
    }
}
# Apply the analysis settings to the index and show the server's reply.
print(setup_index_settings(my_index, index_settings))
In [31]:
# Reopen the index (it was closed before the analysis settings were applied).
# The URL follows `my_index` so the cell keeps working if the name changes.
print(requests.post('http://localhost:9200/{0}/_open'.format(my_index)).text)

# Field mapping for the documents built from the story rows.
mapping_settings = {
    'properties': {
        'id': {'type': 'integer'},
        # Full-text field using the custom 'ru' analyzer, plus an exact-match
        # 'keyword' sub-field.
        'title': {
            'type': 'text',
            'analyzer': 'ru',
            'fields': {
                'keyword': {
                    'type': 'keyword'
                }
            }
        },
        # Same as 'title', with an extra 'length' sub-field that stores the
        # number of tokens produced by the standard analyzer.
        'text': {
            'type': 'text',
            'analyzer': 'ru',
            'fields': {
                'keyword': {
                    'type': 'keyword'
                },
                'length': {
                    'type': 'token_count',
                    'analyzer': 'standard'
                }
            }
        },
        # Dates are expected in the compact yyyyMMdd form.
        'published': {
            'type': 'date',
            'format': 'yyyyMMdd'
        },
        'likes': {'type': 'integer'},
        'tags': {
            'type': 'keyword'
        },
        'url': {'type': 'text'}
    }
}
print(setup_index_mapping(my_index, mapping_settings))
In [32]:
class Story(object):
    """Empty class that `loadSession` maps onto the `stories` table."""
def loadSession(dbPath='../corpus/stories.sqlite', echo=True):
    """Open a SQLAlchemy session on the stories SQLite database.

    Loads the `stories` table definition from the database and maps the
    module-level `Story` class onto it.

    Args:
        dbPath: Path of the SQLite file. Defaults to the location the
            notebook was written against.
        echo: Forwarded to `create_engine` as its `echo` flag. Defaults to
            True, the value the notebook has always used.

    Returns:
        A new session bound to the created engine.

    NOTE(review): `mapper(Story, ...)` runs on every call - presumably a
    second call without redefining `Story` fails because the class is
    already mapped; confirm before calling this more than once.
    """
    engine = create_engine('sqlite:///%s' % dbPath, echo=echo)
    stories_table = Table('stories', MetaData(engine), autoload=True)
    mapper(Story, stories_table)
    return sessionmaker(bind=engine)()
# Module-level session; the query cell below reads the stories through it.
session = loadSession()
In [33]:
# Load every row of the mapped table into memory as Story objects.
stories = session.query(Story).all()
print(len(stories))
# Show which attributes a mapped Story exposes. Skipped when the query
# returned nothing, where stories[0] would raise IndexError.
if stories:
    print(dir(stories[0]))
In [36]:
# Story attributes: 'hrefs', 'id', 'likes', 'published', 'tags', 'text', 'title', 'url'
# ('hrefs' is not sent to Elasticsearch).
def index_data(index, limit=None):
    """Index the loaded stories into `index`, one request per story.

    Reads the module-level `stories` list and sends each story through the
    module-level `es` client as a document of type 'story', using the story
    id as the document id.

    Args:
        index: Name of the target index.
        limit: Optional maximum number of stories to index, taken from the
            start of the list. None (the default) indexes all of them.

    Returns:
        None.
    """
    # stories[:None] is the whole list, so one loop serves both the full run
    # and a quick partial run.
    for story in stories[:limit]:
        body = {
            'id': story.id,
            'title': story.title,
            'text': story.text,
            # Sent unchanged. NOTE(review): presumably already in the
            # yyyyMMdd form the mapping expects - confirm.
            'published': story.published,
            'likes': story.likes,
            # Whitespace split that tolerates a missing value: no tags gives
            # [] instead of an error or an empty-string tag, and repeated
            # spaces do not produce empty tags.
            'tags': (story.tags or '').split(),
            'url': story.url
        }
        es.index(index=index, doc_type='story', id=story.id, body=body)
In [37]:
# Index every loaded story, then print the index's settings and mapping.
index_data(my_index)
get_index_state(my_index)
In [ ]: