ROUTE A
ROUTE B
In [1]:
class Concordance:
    """Placeholder for a concordance type; it defines no behavior yet."""
In [2]:
class Corpus:
    """Stub for a corpus type; nothing is implemented yet."""

    def build_concordance(self, terms, search_type):
        """Do nothing and return None (not implemented).

        NOTE(review): the leftover comment below suggests this is meant to
        return a concordance eventually; how ``terms`` and ``search_type``
        should be interpreted is not defined here -- TODO confirm.
        """
        # return concordance
        return None
Or use NLTK's built-in functions
In [3]:
import re
# Sample sentence that the cells below search.
test = 'Here is an important string; about an important topic.'
# \b anchors the pattern at a word boundary, so only words that begin
# with "an" are matched. The result is a one-shot iterator of matches.
results = re.compile(r'\ban').finditer(test)
In [4]:
def print_hits(results):
    """Print the matched text of every regex match in ``results``.

    Args:
        results: An iterable of ``re`` match objects, e.g. the iterator
            returned by ``re.finditer``. It is consumed by this call.
    """
    for hit in results:
        # Read the text from the match itself instead of slicing a global
        # string, so hits found in any text print correctly.
        print(hit.group())
# `results` comes from re.finditer, which returns a one-shot iterator; this
# call consumes it, so iterating `results` again afterwards yields nothing.
print_hits(results)
In [5]:
def print_concordance(test, results):
    """Print the whitespace-separated tokens of ``test`` as a list.

    Args:
        test: The text to tokenize with ``str.split()``.
        results: Unused; kept so existing calls keep working.
    """
    raw_text = test.split()
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print(raw_text)
# `results` is passed along, but this version of print_concordance never
# reads it; only the tokens of `test` are printed.
print_concordance(test, results)
In [6]:
def print_concordance(test, term):
    """Print the positions of the tokens in ``test`` that equal ``term``.

    ``test`` is split on whitespace, so the printed numbers are token
    positions rather than character offsets, and a token keeps any attached
    punctuation ('topic.' does not equal 'topic').

    Args:
        test: The text to search.
        term: The token to look for; compared with ``==`` against each token.
    """
    # FIXME: how would this handle phrases? A term containing whitespace can
    # never equal a single token, so it would produce no hits.
    raw_text = test.split()
    hits = [i for i, word in enumerate(raw_text) if word == term]
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print(hits)
# Print the token positions in `test` whose token is exactly 'an'.
print_concordance(test, 'an')
In [7]:
# TODO: do this on strings!
# Character offsets in `test` of every word that begins with "an".
indices = [m.start() for m in re.finditer(r'\ban', test)]
# Single-argument print(...) behaves the same on Python 2 and Python 3.
print(indices)
In [8]:
# Print each hit with up to 10 characters of context on either side,
# tab-separated: left context, the term, right context.
for i in indices:
    # The window is measured from the hit itself: max() stops the start
    # going negative, which would make the slice wrap around to the end.
    left = test[max(0, i - 10):i]
    right = test[i + len('an'):i + len('an') + 10]
    print(left + '\t' + 'an' + '\t' + right)
In [9]:
# Print each hit with up to 15 characters of context on either side,
# tab-separated: left context, the term, right context.
for i in indices:
    left = test[:i]
    right = test[i + len('an'):]
    # Slicing the already-cut halves keeps the window correct even when the
    # hit is closer than 15 characters to either end of the text.
    print(left[-15:] + '\t' + 'an' + '\t' + right[:15])
# TODO: whole words?
In [10]:
# Show the first 35 characters of `test`; as the last expression in a
# notebook cell, its value is echoed as the cell output.
test[:35]
Out[10]: