In [69]:
# Setup: the problem statement itself doubles as the sample input text
# that the rest of the notebook builds a concordance for.
import re
data = """
A "concordance" is an alphabetical list of the words present in a text with a count of how
often each word appears and citations of where each word appears in the text (e.g., page
number). Write a program -- in the programming language of your choice -- that will
generate a concordance of an arbitrary text document written in English: the text can be
read from stdin, and the program should output the concordance to stdout or a file. For
each word, it should print the count and the sorted list of citations, in this case the
zero-indexed sentence number in which that word occurs. You may assume that the input
contains only spaces, newlines, standard English letters, and standard English punctuation
marks.
"""
In [88]:
# Raw string avoids the invalid "\w" escape in a plain string literal,
# which is a SyntaxWarning on Python 3.12+ (the compiled pattern is identical).
# Kept characters: word characters, apostrophes, and periods (to preserve
# abbreviations such as "e.g.").
invalid_chars = re.compile(r"[^\w'.]")
def split_into_words_initial(input, sections='. '):
    """
    Split a text into sections (sentences) and each section into lowercase words.

    Yields one list of words per section, so consuming the generator gives
    [[word, ...], ...] with the outer sequence following the order of the
    sections in the text.

    :param input: the text to split (the name shadows the builtin ``input``;
        kept as-is for backward compatibility with existing callers)
    :param sections: separator marking the end of a section; the default
        '. ' approximates an English sentence boundary
    Note: hyphenated words like 'zero-indexed' are split into 'zero' and
    'indexed' because '-' is not in the kept character class. A real
    tokenizer (e.g. nltk.tokenize) would handle punctuation better; the
    regex substitution is a deliberate simple stand-in.
    """
    for section in input.lower().split(sections):
        # Replace every disallowed character with a space, then let
        # str.split() collapse runs of whitespace into word boundaries.
        yield invalid_chars.sub(' ', section).split()
In [89]:
def concordance(input):
    """
    Build a concordance: for every word in ``input``, record the zero-indexed
    number of each sentence (section) in which it occurs.

    The length of a word's citation list is its total occurrence count, and
    the entries are already in ascending order because the sections are
    visited front to back.

    :param input: a text
    :returns: a data structure of {word: [section_num, ...]}
    """
    citations = {}
    # Walk the sections in order; enumerate supplies the sentence number.
    for sentence_index, words in enumerate(split_into_words_initial(input)):
        for token in words:
            if token not in citations:
                citations[token] = []
            citations[token].append(sentence_index)
    return citations
# Build the concordance for the sample text defined in the setup cell.
concordance_result = concordance(data)
# Quick sanity check used during development:
# print(concordance_result)
# OK, this looks right; the next cell formats it for printing.
In [90]:
def printer(result):
    """
    Print the result of the concordance function to stdout.

    Each line shows the word left-justified in a 15-column field, followed by
    ``{count:citation,citation,...}``.

    :param result: result of the concordance function, i.e. a mapping of
        {word: [section_num, ...]}. This is not strictly enforced.
    """
    # Loop over the words in alphabetical order
    for word in sorted(result):
        # Renamed from ``concordance``: that local shadowed the module-level
        # function of the same name, a hidden-state hazard in a notebook.
        citations = result[word]
        total_occurrences = len(citations)
        sentence_numbers = ",".join(str(i) for i in citations)
        # {{ and }} are literal braces in str.format.
        print("{:15} {{{}:{}}}".format(word, total_occurrences, sentence_numbers))
# Render the final concordance to stdout.
printer(concordance_result)