In [1]:
from __future__ import print_function

import io
from os import listdir
from os import makedirs
from os.path import join, isdir

from whoosh.index import create_in
from whoosh.fields import *
In [2]:
# data_path: folder whose sub-folders hold the text files to index.
# index_path: folder in which the Whoosh index is created.
data_path, index_path = './data', './index'
In [3]:
# create_in() expects the target directory to exist already, so create it on
# a fresh checkout instead of failing with an I/O error.
if not isdir(index_path):
    makedirs(index_path)

# Index layout: the title and path are stored so they can be read back from
# search hits; the content is indexed for searching but not stored.
schema = Schema(title=TEXT(stored=True), path=ID(stored=True), content=TEXT)
ix = create_in(index_path, schema)
writer = ix.writer()
In [6]:
def _as_text(value):
    """Return ``value`` as text, decoding it from UTF-8 if it is a byte string.

    On Python 2 the file names handled below are byte strings and must be
    decoded before they are passed to the index writer; on Python 3 they are
    already text and have no ``decode`` method.
    """
    if isinstance(value, bytes):
        return value.decode('utf-8')
    return value


# Walk every sub-folder of data_path and add each file found in it to the
# index as one document (title = file name up to '.txt', path, content).
for folder in listdir(data_path):
    folder_path = join(data_path, folder)
    if not isdir(folder_path):
        continue
    for fl in listdir(folder_path):
        data_file_path = join(folder_path, fl)
        try:
            # Read inside a context manager so the handle is always closed.
            with io.open(data_file_path, 'r', encoding='utf-8') as data_file:
                content = data_file.read()
            writer.add_document(
                # partition() keeps the whole name when '.txt' is absent;
                # slicing with find() == -1 used to drop the last character.
                title=_as_text(fl.partition('.txt')[0]),
                path=_as_text(data_file_path),
                content=content,
            )
        except Exception as exc:
            # Best effort: report the file that could not be indexed, together
            # with the reason, and carry on with the remaining files.
            print(data_file_path, repr(exc))
    # Progress marker: name of the folder that has just been processed.
    print(folder, end=' ')
writer.commit()
In [7]:
from whoosh.qparser import QueryParser
In [17]:
# Search the "content" field and print the path of the best-ranked document.
with ix.searcher() as searcher:
    query = QueryParser("content", ix.schema).parse(u'Литва')
    results = searcher.search(query)
    # Indexing results[0] raises IndexError when nothing matched, so check
    # for an empty result set first.
    if results.is_empty():
        print('No documents matched the query')
    else:
        print(results[0]['path'])
In [ ]: