In [22]:
from sqlalchemy import create_engine, MetaData, Table
from sqlalchemy.orm import mapper, sessionmaker
import requests
import json
from elasticsearch import Elasticsearch

# Name of the Elasticsearch index this notebook works with.
my_index = 'zadolbali'

# Elasticsearch client pointed at localhost:9200; index_data() sends documents through it.
es_hosts = [{'host': 'localhost', 'port': 9200}]
es = Elasticsearch(es_hosts)

def get_es_stats():
    """Print the health, nodes, shards and indices ``_cat`` tables of the local node.

    One GET request is issued per endpoint, in the order listed below, and each
    response body is printed as soon as it is received. Returns nothing.
    """
    cat_urls = (
        'http://localhost:9200/_cat/health?v',
        'http://localhost:9200/_cat/nodes?v',
        'http://localhost:9200/_cat/shards?v',
        'http://localhost:9200/_cat/indices?v',
    )
    for cat_url in cat_urls:
        response = requests.get(cat_url)
        print(response.text)

In [23]:
def create_index(index, settings):
    """Create the Elasticsearch index named ``index`` on the local node.

    Args:
        index: Name of the index to create; it becomes the URL path segment.
        settings: JSON-serialisable dict sent as the request body.

    Returns:
        The raw response body as text.
    """
    # Declare the JSON body explicitly, as the other PUT helpers in this notebook do.
    headers = {'Content-type': 'application/json'}
    # Build the URL from the argument so the helper honours the index it is given.
    url = 'http://localhost:9200/{0}'.format(index)
    return requests.put(url, headers=headers, data=json.dumps(settings)).text

def delete_index(index):
    """Send a DELETE for the index named ``index`` and return the response body text."""
    url = 'http://localhost:9200/{0}?pretty'.format(index)
    response = requests.delete(url)
    return response.text

def setup_index_settings(index, settings):
    """PUT ``settings`` to the ``_settings`` endpoint of the index named ``index``.

    Args:
        index: Name of the index whose settings are updated.
        settings: JSON-serialisable dict sent as the request body.

    Returns:
        The raw (pretty-printed) response body as text.
    """
    headers = {'Content-type': 'application/json', 'Accept': 'text/plain'}
    # Build the URL from the argument so the helper honours the index it is given.
    url = 'http://localhost:9200/{0}/_settings?pretty'.format(index)
    return requests.put(url, headers=headers, data=json.dumps(settings)).text

def setup_index_mapping(index, settings):
    """PUT a mapping for the ``story`` document type of the index named ``index``.

    Args:
        index: Name of the index that receives the mapping.
        settings: JSON-serialisable mapping dict sent as the request body.

    Returns:
        The raw (pretty-printed) response body as text.
    """
    headers = {'Content-type': 'application/json', 'Accept': 'text/plain'}
    # Build the URL from the argument so the helper honours the index it is given.
    url = 'http://localhost:9200/{0}/_mappings/story?pretty'.format(index)
    return requests.put(url, headers=headers, data=json.dumps(settings)).text

def get_index_state(index):
    """Print the settings, then the mapping, of the index named ``index``.

    Each is fetched with its own GET request and printed as returned. Returns nothing.
    """
    url_templates = (
        'http://localhost:9200/{0}/_settings?pretty',
        'http://localhost:9200/{0}/_mapping?pretty',
    )
    for template in url_templates:
        response = requests.get(template.format(index))
        print(response.text)

In [24]:
# Delete the index named by my_index; the returned response text is shown as the cell output.
delete_index(my_index)


Out[24]:
'{\n  "acknowledged" : true\n}\n'

In [25]:
# Index-level settings sent as the body of the create-index request.
create_settings = {
    'settings': {
        'index': {
            'number_of_shards': 5,
            'number_of_replicas': 1,
        },
    },
}

# Print the response so a failed creation is visible instead of silently discarded.
print(create_index(my_index, create_settings))

# Close the index before the analysis settings are applied in the next cell.
# The URL is built from my_index so this cell follows the configured index name.
print(requests.post('http://localhost:9200/{0}/_close'.format(my_index)).text)


{"acknowledged":true}

In [26]:
# Analysis settings adapted from https://gist.github.com/svartalf/4465752

index_settings = {
    'analysis': {
        'analyzer': {
            # Custom analyzer named 'ru'; the field mapping refers to it by this name.
            'ru': {
                'type': 'custom',
                'tokenizer': 'standard',
                # NOTE(review): 'russian_morphology' and 'english_morphology' are not defined
                # in this settings body, presumably they come from an analysis plugin
                # installed on the cluster; confirm.
                'filter': ['lowercase', 'russian_morphology', 'english_morphology', 'ru_stopwords'],
            },
        },
        'filter': {
            # Stop-word filter with a combined Russian and English word list.
            'ru_stopwords': {
                'type': 'stop',
                'stopwords': u'а,без,более,бы,был,была,были,было,быть,в,вам,вас,весь,во,вот,все,всего,всех,вы,где,да,даже,для,до,его,ее,если,есть,еще,же,за,здесь,и,из,или,им,их,к,как,ко,когда,кто,ли,либо,мне,может,мы,на,надо,наш,не,него,нее,нет,ни,них,но,ну,о,об,однако,он,она,они,оно,от,очень,по,под,при,с,со,так,также,такой,там,те,тем,то,того,тоже,той,только,том,ты,у,уже,хотя,чего,чей,чем,что,чтобы,чье,чья,эта,эти,это,я,a,an,and,are,as,at,be,but,by,for,if,in,into,is,it,no,not,of,on,or,such,that,the,their,then,there,these,they,this,to,was,will,with',
            },
            # NOTE(review): defined here but not listed in the 'ru' analyzer's filter chain above.
            'ru_stemming': {
                'type': 'snowball',
                'language': 'Russian',
            }
        },

    }
}


# Apply the analysis settings to the index and print the response body.
print(setup_index_settings(my_index, index_settings))


{
  "acknowledged" : true
}


In [31]:
# Re-open the index (an earlier cell closed it) before putting the mapping.
# The URL is built from my_index so this cell follows the configured index name.
print(requests.post('http://localhost:9200/{0}/_open'.format(my_index)).text)

# Field mapping for 'story' documents.
mapping_settings = {
    'properties': {
        'id': {'type': 'integer'},
        # Analysed with the custom 'ru' analyzer, plus an exact-value 'keyword' sub-field.
        'title': {
            'type': 'text',
            'analyzer': 'ru',
            'fields': {
                'keyword': {'type': 'keyword'},
            },
        },
        # Same as 'title', with an extra 'length' sub-field holding the number of
        # tokens produced by the standard analyzer.
        'text': {
            'type': 'text',
            'analyzer': 'ru',
            'fields': {
                'keyword': {'type': 'keyword'},
                'length': {
                    'type': 'token_count',
                    'analyzer': 'standard',
                },
            },
        },
        'published': {
            'type': 'date',
            'format': 'yyyyMMdd',
        },
        'likes': {'type': 'integer'},
        'tags': {'type': 'keyword'},
        'url': {'type': 'text'},
    },
}

print(setup_index_mapping(my_index, mapping_settings))


{"acknowledged":true}
{
  "acknowledged" : true
}


In [32]:
class Story:
    """Empty placeholder class that ``loadSession`` maps onto the ``stories`` table."""
 
def loadSession(dbPath='../corpus/stories.sqlite', echo=True):
    """Open a SQLAlchemy session on the stories SQLite database.

    Creates an engine for the SQLite file, reflects its ``stories`` table and
    maps that table onto the ``Story`` class.

    Args:
        dbPath: Path to the SQLite file. Defaults to the corpus location this
            notebook has always used.
        echo: Passed to ``create_engine``; the default ``True`` keeps the
            original behaviour of logging every SQL statement.

    Returns:
        A new session bound to the engine.
    """
    engine = create_engine('sqlite:///%s' % dbPath, echo=echo)

    # Reflect the existing table definition from the database.
    stories_table = Table('stories', MetaData(engine), autoload=True)
    # NOTE(review): the mapping is applied on every call; calling this function twice
    # with the same Story class presumably fails because it is already mapped. Confirm.
    mapper(Story, stories_table)

    session_factory = sessionmaker(bind=engine)
    return session_factory()

# Module-level session; the query cell below uses it to load Story rows.
session = loadSession()


2017-11-06 17:35:07,465 INFO sqlalchemy.engine.base.Engine SELECT CAST('test plain returns' AS VARCHAR(60)) AS anon_1
2017-11-06 17:35:07,466 INFO sqlalchemy.engine.base.Engine ()
2017-11-06 17:35:07,467 INFO sqlalchemy.engine.base.Engine SELECT CAST('test unicode returns' AS VARCHAR(60)) AS anon_1
2017-11-06 17:35:07,468 INFO sqlalchemy.engine.base.Engine ()
2017-11-06 17:35:07,471 INFO sqlalchemy.engine.base.Engine PRAGMA table_info("stories")
2017-11-06 17:35:07,476 INFO sqlalchemy.engine.base.Engine ()
2017-11-06 17:35:07,479 INFO sqlalchemy.engine.base.Engine SELECT sql FROM  (SELECT * FROM sqlite_master UNION ALL   SELECT * FROM sqlite_temp_master) WHERE name = 'stories' AND type = 'table'
2017-11-06 17:35:07,480 INFO sqlalchemy.engine.base.Engine ()
2017-11-06 17:35:07,481 INFO sqlalchemy.engine.base.Engine PRAGMA foreign_key_list("stories")
2017-11-06 17:35:07,483 INFO sqlalchemy.engine.base.Engine ()
2017-11-06 17:35:07,484 INFO sqlalchemy.engine.base.Engine SELECT sql FROM  (SELECT * FROM sqlite_master UNION ALL   SELECT * FROM sqlite_temp_master) WHERE name = 'stories' AND type = 'table'
2017-11-06 17:35:07,485 INFO sqlalchemy.engine.base.Engine ()
2017-11-06 17:35:07,487 INFO sqlalchemy.engine.base.Engine PRAGMA index_list("stories")
2017-11-06 17:35:07,488 INFO sqlalchemy.engine.base.Engine ()
2017-11-06 17:35:07,489 INFO sqlalchemy.engine.base.Engine PRAGMA index_list("stories")
2017-11-06 17:35:07,490 INFO sqlalchemy.engine.base.Engine ()
2017-11-06 17:35:07,492 INFO sqlalchemy.engine.base.Engine SELECT sql FROM  (SELECT * FROM sqlite_master UNION ALL   SELECT * FROM sqlite_temp_master) WHERE name = 'stories' AND type = 'table'
2017-11-06 17:35:07,493 INFO sqlalchemy.engine.base.Engine ()

In [33]:
# Load every row of the mapped table into memory as Story objects.
stories = session.query(Story).all()

# Sanity check: the row count, then the attribute names available on one loaded row.
story_count = len(stories)
print(story_count)
first_story = stories[0]
print(dir(first_story))


2017-11-06 17:35:08,681 INFO sqlalchemy.engine.base.Engine BEGIN (implicit)
2017-11-06 17:35:08,685 INFO sqlalchemy.engine.base.Engine SELECT stories.id AS stories_id, stories.title AS stories_title, stories.published AS stories_published, stories.tags AS stories_tags, stories.text AS stories_text, stories.likes AS stories_likes, stories.hrefs AS stories_hrefs, stories.url AS stories_url 
FROM stories
2017-11-06 17:35:08,688 INFO sqlalchemy.engine.base.Engine ()
23558
['__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_sa_class_manager', '_sa_instance_state', 'hrefs', 'id', 'likes', 'published', 'tags', 'text', 'title', 'url']

In [36]:
# Attributes available on a Story row:
# 'hrefs', 'id', 'likes', 'published', 'tags', 'text', 'title', 'url'
def index_data(index, limit=None):
    """Index the loaded stories into ``index`` as ``story`` documents.

    Reads the module-level ``stories`` list and sends one ``es.index`` request
    per story, using the story's ``id`` as the document id. The ``hrefs``
    attribute is not included in the document body.

    Args:
        index: Name of the target index.
        limit: Optional number of stories to index, taken from the start of
            the list with slice semantics (e.g. ``100`` for a quick trial run).
            ``None`` (the default) indexes every story.
    """
    for story in stories[:limit]:
        # Tags are split on single spaces. A missing value becomes an empty list,
        # and empty fragments (from an empty string or doubled spaces) are dropped
        # so no '' tag is indexed.
        raw_tags = story.tags or ''
        body = {
            'id': story.id,
            'title': story.title,
            'text': story.text,
            'published': story.published,
            'likes': story.likes,
            'tags': [tag for tag in raw_tags.split(' ') if tag],
            'url': story.url
        }
        es.index(index=index, doc_type='story', id=story.id, body=body)

In [37]:
# Index the loaded stories, then print the index's current settings and mapping.
index_data(my_index)
get_index_state(my_index)


{
  "zadolbali" : {
    "settings" : {
      "index" : {
        "number_of_shards" : "5",
        "provided_name" : "zadolbali",
        "creation_date" : "1509977827892",
        "analysis" : {
          "filter" : {
            "ru_stemming" : {
              "type" : "snowball",
              "language" : "Russian"
            },
            "ru_stopwords" : {
              "type" : "stop",
              "stopwords" : "а,без,более,бы,был,была,были,было,быть,в,вам,вас,весь,во,вот,все,всего,всех,вы,где,да,даже,для,до,его,ее,если,есть,еще,же,за,здесь,и,из,или,им,их,к,как,ко,когда,кто,ли,либо,мне,может,мы,на,надо,наш,не,него,нее,нет,ни,них,но,ну,о,об,однако,он,она,они,оно,от,очень,по,под,при,с,со,так,также,такой,там,те,тем,то,того,тоже,той,только,том,ты,у,уже,хотя,чего,чей,чем,что,чтобы,чье,чья,эта,эти,это,я,a,an,and,are,as,at,be,but,by,for,if,in,into,is,it,no,not,of,on,or,such,that,the,their,then,there,these,they,this,to,was,will,with"
            }
          },
          "analyzer" : {
            "ru" : {
              "filter" : [
                "lowercase",
                "russian_morphology",
                "english_morphology",
                "ru_stopwords"
              ],
              "type" : "custom",
              "tokenizer" : "standard"
            }
          }
        },
        "number_of_replicas" : "1",
        "uuid" : "dcAcE0VIRpuaQPloG-fTpw",
        "version" : {
          "created" : "5060399"
        }
      }
    }
  }
}

{
  "zadolbali" : {
    "mappings" : {
      "story" : {
        "properties" : {
          "id" : {
            "type" : "integer"
          },
          "likes" : {
            "type" : "integer"
          },
          "published" : {
            "type" : "date",
            "format" : "yyyyMMdd"
          },
          "tags" : {
            "type" : "keyword"
          },
          "text" : {
            "type" : "text",
            "fields" : {
              "keyword" : {
                "type" : "keyword"
              },
              "length" : {
                "type" : "token_count",
                "analyzer" : "standard"
              }
            },
            "analyzer" : "ru"
          },
          "title" : {
            "type" : "text",
            "fields" : {
              "keyword" : {
                "type" : "keyword"
              }
            },
            "analyzer" : "ru"
          },
          "url" : {
            "type" : "text"
          }
        }
      }
    }
  }
}


In [ ]: