In [1]:
import pandas as pd

In [2]:
# Load the gzip-compressed corpus CSV into a DataFrame, decoding it as UTF-8.
df = pd.read_csv(
    'corpus/EC2000.csv.gz',
    encoding='utf-8',
)

In [3]:
from whoosh.index import create_in
from whoosh.fields import *

In [4]:
# Index schema: three fields, each declared with stored=True.
# `title` and `content` are TEXT fields, `path` is an ID field.
schema = Schema(
    title=TEXT(stored=True),
    path=ID(stored=True),
    content=TEXT(stored=True),
)

In [5]:
import os

# create_in() needs the target directory to exist already; create it on a
# fresh checkout so "Restart & Run All" works without a manual mkdir.
if not os.path.exists("index"):
    os.mkdir("index")

# Create the index described by `schema` inside the "index" directory.
ix = create_in("index", schema)

In [6]:
# Open a writer on the index; documents are added through it and become part
# of the index once commit() is called on it.
writer = ix.writer()

In [7]:
# Add every row of the corpus to the index writer.
#
# iterrows() replaces the per-label `df.ix[i]` lookup: `.ix` is deprecated and
# no longer exists in current pandas, and iterating yields each row directly.
# `to_text` is the interpreter's text type (`unicode` on Python 2, `str` on
# Python 3), so the cell runs on either.
# NOTE(review): missing values are converted as-is, so a NaN cell is indexed
# as the literal text "nan" -- confirm that is acceptable for this corpus.
to_text = type(u'')
for i, row in df.iterrows():
    writer.add_document(
        title=to_text(row['fName']),
        path=to_text(row['id']),
        content=to_text(row['text'])
    )

In [8]:
# Commit the documents added through `writer` to the index.
writer.commit()

In [9]:
# List the contents of the on-disk "index" directory (shell command).
!ls index


_MAIN_1.toc  MAIN_b4lsimdozg69n1xo.seg	MAIN_WRITELOCK

In [10]:
# Uncomment to delete everything under index/ before rebuilding the index.
#!rm -rf index/*

In [11]:
from whoosh.qparser import QueryParser

In [12]:
# Parse the search string "zoe" against the "content" field of the schema.
query = QueryParser("content", ix.schema).parse("zoe")

# Keep a named handle on the searcher instead of creating it anonymously, so
# it can be released with searcher.close() once `results` is no longer needed.
# It is not closed here because later cells still read from `results`.
searcher = ix.searcher()
results = searcher.search(query)

In [15]:
# Print the highlighted "content" excerpt of each hit in `results`.
# The loop variable stays `r` because a later cell reads it after the loop.
for r in results:
    excerpt = r.highlights('content', minscore=0)
    print(excerpt)


<b class="match term0">Zoe</b>'s Kitchen, Inc. (NYSE:ZOES)
Q2 2015 Earnings Call...by. Welcome to the <b class="match term0">Zoe</b>'s Kitchen Second Quarter...the values of living <b class="match term0">Zoe</b>'s. Hundreds of team members
While our Rachel <b class="match term0">Zoe</b> collection and our Beyond

In [14]:
# NOTE(review): `r` is not defined in this cell. It is the loop variable left
# over from the `for r in results` cell, i.e. the last hit that loop visited,
# so this cell only works after that loop has been run.
r.highlights(fieldname='content', minscore=0)


Out[14]:
u'While our Rachel <b class="match term0">Zoe</b> collection and our Beyond'

In [ ]: