In [151]:
import scala.collection.mutable
// Sample input text: the statement of the concordance exercise itself. It is passed to the
// functions defined in this file as demo data.
val data = """
A "concordance" is an alphabetical list of the words present in a text with a count of how
often each word appears and citations of where each word appears in the text (e.g., page
number). Write a program -- in the programming language of your choice -- that will
generate a concordance of an arbitrary text document written in English: the text can be
read from stdin, and the program should output the concordance to stdout or a file. For
each word, it should print the count and the sorted list of citations, in this case the
zero-indexed sentence number in which that word occurs. You may assume that the input
contains only spaces, newlines, standard English letters, and standard English punctuation
marks.
"""
In [167]:
// Matches every character that cannot be part of a word. Word characters, apostrophes
// (as in "don't") and periods (as in "e.g.") are kept; anything else is treated as a gap.
val invalid_chars = """[^\w'.]""".r
// Sentence boundary: one or more terminators (. ! ?) followed by any whitespace.
// Accepting any whitespace (not just a single space) means a sentence that ends at a line
// break is split exactly like one that ends before a space.
val sentence_sep = """[.!?]+\s+""".r
In [168]:
type ListOfLists = Array[Array[String]]

/**
  * Splits a text into sections (sentences) and each section into lower-cased words.
  *
  * The text is lower-cased, cut into sections with `sentence_sep`, and every character
  * matched by `invalid_chars` is turned into a space before the section is split on runs
  * of spaces.
  *
  * @param input the text to tokenise
  * @return one array per section, in text order, each holding that section's words in order;
  *         a section without any word yields an empty inner array
  */
def split_into_words_initial(input: String): ListOfLists = {
  val lowered = input.toLowerCase
  // The separator swallows the period that ends every sentence except the last one.
  // Cut off trailing whitespace and periods so the final word is indexed without its
  // full stop, consistently with the last word of all earlier sentences.
  val contentEnd = lowered.lastIndexWhere(c => !c.isWhitespace && c != '.') + 1
  val body = lowered.take(contentEnd)

  sentence_sep.split(body).map { sentence =>
    invalid_chars
      .replaceAllIn(sentence, " ")
      .split(" +")
      // A section that starts with a gap produces a leading empty token; drop it.
      .filter(_.nonEmpty)
  }
}
// Tokenise the sample text. The result is not bound to a name here.
split_into_words_initial(data)
In [170]:
// Qualified through the `mutable` import: the code below relies on `getOrElseUpdate`,
// which only the mutable map offers.
type MapOfLists = mutable.HashMap[String, mutable.ArrayBuffer[Int]]

/**
  * Records, for every word of a text, the sections in which it occurs.
  *
  * @param input a text
  * @return a map `word -> section numbers`; a section number is appended once per occurrence
  *         of the word, in text order, so the buffer's length is the word's total count
  */
def concordance(input: String): MapOfLists = {
  val result = new MapOfLists()
  for {
    (section, section_num) <- split_into_words_initial(input).zipWithIndex
    word <- section
  } {
    // Create the word's record on first sight, then append the current section number.
    result.getOrElseUpdate(word, mutable.ArrayBuffer.empty[Int]) += section_num
  }
  result
}
// Concordance of the sample text.
val concordance_result = concordance(data)
In [172]:
/**
  * Writes a concordance to stdout, one line per word, ordered alphabetically by word.
  *
  * Each line shows the word left-aligned in a 15-character column, followed by
  * `{count:citations}` where the citations are joined with commas.
  *
  * @param result result of the concordance function
  */
def printer(result: MapOfLists) = {
  val alphabetical = result.toSeq.sortBy { case (key, _) => key }
  alphabetical.foreach { case (word, record) =>
    val sentence_numbers = record.mkString(",")
    val total_occurrences = record.size
    println(f"$word%-15s - {$total_occurrences:$sentence_numbers}")
  }
}
// Print the concordance of the sample text to stdout.
printer(concordance_result)
In [ ]: