In [151]:
import scala.collection.mutable

val data = """
A "concordance" is an alphabetical list of the words present in a text with a count of how
often each word appears and citations of where each word appears in the text (e.g., page
number). Write a program -- in the programming language of your choice -- that will
generate a concordance of an arbitrary text document written in English: the text can be
read from stdin, and the program should output the concordance to stdout or a file. For
each word, it should print the count and the sorted list of citations, in this case the
zero-indexed sentence number in which that word occurs. You may assume that the input
contains only spaces, newlines, standard English letters, and standard English punctuation
marks.
"""


import scala.collection.mutable
data: String = """

A "concordance" is an alphabetical list of the words present in a text with a count of how
often each word appears and citations of where each word appears in the text (e.g., page
number). Write a program -- in the programming language of your choice -- that will
generate a concordance of an arbitrary text document written in English: the text can be
read from stdin, and the program should output the concordance to stdout or a file. For
each word, it should print the count and the sorted list of citations, in this case the
zero-indexed sentence number in which that word occurs. You may assume that the input
contains only spaces, newlines, standard English letters, and standard English punctuation
marks.

"""

In [167]:
// Regex to remove invalid chars from the text
val invalid_chars = """[^\w'.]""".r

// Split into sections
val sentence_sep = """\. """.r


invalid_chars: util.matching.Regex = [^\w'.]
sentence_sep: util.matching.Regex = \. 

In [168]:
type ListOfLists = Array[Array[String]]

/**
The result of this function will be a list of lists [ [word, ...], ... ]
The outer list will contain the 'sections' and each section will contain a list of words
*/
def split_into_words_initial(input: String): ListOfLists = {
    
    return sentence_sep.split(input.toLowerCase).map( { x => 
        invalid_chars
          .replaceAllIn(x," ")
          .split(" +")
          .map(_.trim) 
          .filter(_ != "")
    })

}

split_into_words_initial(data)


defined type ListOfLists
defined function split_into_words_initial
res122_2: ListOfLists = Array(
  Array(
    "a",
    "concordance",
    "is",
    "an",
    "alphabetical",
    "list",
    "of",
    "the",
    "words",
    "present",
    "in",
    "a",
    "text",
...

In [170]:
type MapOfLists = HashMap[String, mutable.ArrayBuffer[Int]]

/**
Count of how often each word appears and citations of where each word appears in the text
    
:param input: a text
:returns: a data structure of {word: [section_num, ... ] }
*/
def concordance(input: String): MapOfLists = {
    
    // Create an empty result value
    var result = new MapOfLists()
    
    val sections = split_into_words_initial(input)
    
    // Loop over section and its index
    for ((section, section_num) <- sections.zipWithIndex) {

        // Loop over the words in a section
        for (word <- section) {

            // Get the record of which sections the word is in
            var word_record = result.getOrElseUpdate(word, mutable.ArrayBuffer.empty[Int])
            // Append the current section to the array
            word_record += section_num
        }
    }        
    return result
}

val concordance_result = concordance(data)


defined type MapOfLists
defined function concordance
concordance_result: MapOfLists = Map(
  "marks." -> ArrayBuffer(3),
  "count" -> ArrayBuffer(0, 2),
  "text" -> ArrayBuffer(0, 0, 1, 1),
  "is" -> ArrayBuffer(0),
  "standard" -> ArrayBuffer(3, 3),
  "can" -> ArrayBuffer(1),
  "newlines" -> ArrayBuffer(3),
  "of" -> ArrayBuffer(0, 0, 0, 1, 1, 2),
  "input" -> ArrayBuffer(3),
  "concordance" -> ArrayBuffer(0, 1, 1),
  "programming" -> ArrayBuffer(1),
  "written" -> ArrayBuffer(1),
  "you" -> ArrayBuffer(3),
...

In [172]:
/**
Print the result of the concordance function to stdout. 

:param result: result of the concordance function
*/
def printer(result: MapOfLists) = {
    for((word, record) <- result.toSeq.sortBy(_._1)) {
        val total_occurrences = record.length
        val sentence_numbers = record mkString  ","
        println(f"$word%-15s - {$total_occurrences:$sentence_numbers}")
    }
}

printer(concordance_result)


a               - {6:0,0,0,1,1,1}
alphabetical    - {1:0}
an              - {2:0,1}
and             - {4:0,1,2,3}
appears         - {2:0,0}
arbitrary       - {1:1}
assume          - {1:3}
be              - {1:1}
can             - {1:1}
case            - {1:2}
choice          - {1:1}
citations       - {2:0,2}
concordance     - {3:0,1,1}
contains        - {1:3}
count           - {2:0,2}
document        - {1:1}
e.g.            - {1:0}
each            - {3:0,0,2}
english         - {3:1,3,3}
file            - {1:1}
for             - {1:2}
from            - {1:1}
generate        - {1:1}
how             - {1:0}
in              - {6:0,0,1,1,2,2}
indexed         - {1:2}
input           - {1:3}
is              - {1:0}
it              - {1:2}
language        - {1:1}
letters         - {1:3}
list            - {2:0,2}
marks.          - {1:3}
may             - {1:3}
newlines        - {1:3}
number          - {2:0,2}
occurs          - {1:2}
of              - {6:0,0,0,1,1,2}
often           - {1:0}
only            - {1:3}
or              - {1:1}
output          - {1:1}
page            - {1:0}
present         - {1:0}
print           - {1:2}
program         - {2:1,1}
programming     - {1:1}
punctuation     - {1:3}
read            - {1:1}
sentence        - {1:2}
should          - {2:1,2}
sorted          - {1:2}
spaces          - {1:3}
standard        - {2:3,3}
stdin           - {1:1}
stdout          - {1:1}
text            - {4:0,0,1,1}
that            - {3:1,2,3}
the             - {10:0,0,1,1,1,1,2,2,2,3}
this            - {1:2}
to              - {1:1}
where           - {1:0}
which           - {1:2}
will            - {1:1}
with            - {1:0}
word            - {4:0,0,2,2}
words           - {1:0}
write           - {1:1}
written         - {1:1}
you             - {1:3}
your            - {1:1}
zero            - {1:2}
defined function printer

In [ ]: