In [1]:
import string
from itertools import chain
from src import context_dictionary
from src.prepare_court_data import import_dataframe
from pyspark.sql.functions import udf, explode
from pyspark.sql.types import ArrayType, StringType, IntegerType, MapType
from nltk.tokenize import sent_tokenize, word_tokenize
# 'spark' is the SparkSession provided by the notebook environment; parquet is Spark's default load format
df_opinions_unparsed = spark.read.load('data/wash_state_1000_opinions.parquet')
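A quick, optional sanity check on the load before tokenizing (a sketch; printSchema and count are standard DataFrame methods, and the expected row count is only inferred from the file name):

df_opinions_unparsed.printSchema()  # expect a parsed_text column holding the raw opinion text
df_opinions_unparsed.count()        # should be on the order of the 1000 opinions the file name suggests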
Then build the lists of words for each sentence in each document. The end result, for each document, is a nested list: one entry per sentence, where each entry is that sentence's list of words. We chose the nested structure so we could preserve sentence boundaries when counting the number of times each word appears in the context of another word; the context never extends beyond the sentence boundary.
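As a quick illustration of the target structure, here is what the two NLTK tokenizers produce for a made-up two-sentence document (punctuation is still present here; the real cell below replaces it with spaces before word-tokenizing):

from nltk.tokenize import sent_tokenize, word_tokenize

sample = 'The court affirmed the ruling. The defendant appealed.'
[word_tokenize(s) for s in sent_tokenize(sample)]
# [['The', 'court', 'affirmed', 'the', 'ruling', '.'],
#  ['The', 'defendant', 'appealed', '.']]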
In [2]:
token_lists = udf(lambda doc: [
    word_tokenize(  # NLTK's word tokenizer is smarter than a plain split (it can separate contractions)
        sentence.translate(  # translate() rewrites characters according to a translation table
            str.maketrans(string.punctuation, ' '*len(string.punctuation))  # map every punctuation character to a space
        )
    )
    for sentence in sent_tokenize(doc.replace('\n', ' ').strip())],  # first split each document into sentences
    ArrayType(ArrayType(StringType())))  # declare a nested array of strings so Spark knows the return type
df_words = df_opinions_unparsed.withColumn('sents', token_lists('parsed_text'))
df_words.persist()  # cache the tokenized DataFrame so later steps do not re-run the UDF
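To make the sentence-boundary point concrete, here is a minimal pure-Python sketch of counting context co-occurrences over the nested lists. It is only an illustration of the idea, not the implementation in src.context_dictionary, and the window size of two words is an arbitrary choice:

from collections import Counter

def context_counts(sents, window=2):
    # Count how often each word appears within `window` positions of another word,
    # never crossing a sentence boundary.
    counts = Counter()
    for words in sents:  # one list of words per sentence
        for i, target in enumerate(words):
            lo, hi = max(0, i - window), min(len(words), i + window + 1)
            for j in range(lo, hi):
                if j != i:
                    counts[(target, words[j])] += 1
    return counts

# 'ruling' and 'defendant' never co-occur below because they sit in different sentences.
context_counts([['the', 'court', 'affirmed', 'the', 'ruling'],
                ['the', 'defendant', 'appealed']])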
In [ ]: