Set up the sample project for running our GloVe implementation

Import the packages required for running the code and load the sample data for testing the project.


In [1]:
import string
from itertools import chain
from src import context_dictionary
from src.prepare_court_data import import_dataframe
from pyspark.sql.functions import udf, explode
from pyspark.sql.types import ArrayType, StringType, IntegerType, MapType
from nltk.tokenize import sent_tokenize, word_tokenize

df_opinions_unparsed = spark.read.load('data/wash_state_1000_opinions.parquet')
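
As a quick sanity check (assuming, as the cell above does, that an active SparkSession is bound to `spark`), you can confirm the parquet sample loaded and that it exposes the `parsed_text` column used in the next cell:

df_opinions_unparsed.printSchema()   # expect a parsed_text string column, among others
print(df_opinions_unparsed.count())  # the file name suggests roughly 1,000 opinions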

Next, build the lists of words for each sentence in each document. The result for each document is a nested list: one inner list of words per sentence. We keep the nesting so sentence boundaries are preserved when counting how often each word appears in the context of another word; a context window never extends past the end of a sentence (see the sketch after the tokenization cell below).


In [2]:
token_lists = udf(
    lambda doc: [
        word_tokenize(                      # NLTK's word tokenizer is smarter (it can separate contractions)
            sentence.translate(
                str.maketrans(string.punctuation, ' ' * len(string.punctuation))  # map every punctuation character to a space
            )
        )
        for sentence in sent_tokenize(doc.replace('\n', ' ').strip())  # split each document into sentences first
    ],
    ArrayType(ArrayType(StringType())))     # declare a nested array of strings so Spark knows the return type
df_words = df_opinions_unparsed.withColumn('sents', token_lists('parsed_text'))
df_words.persist()                          # cache the tokenized DataFrame for reuse
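
The sentence-boundary constraint is easiest to see in plain Python. The sketch below is a simplified stand-in for the counting step, not the project's `context_dictionary` module; the helper name and the window size of 2 are illustrative only. It counts (word, context word) pairs within each inner sentence list, so a pair can never span two sentences.

from collections import Counter

def count_context_pairs(sentences, window=2):
    """Count (word, context_word) pairs, restricting the window to each sentence."""
    counts = Counter()
    for sentence in sentences:                       # each sentence is a list of tokens
        for i, word in enumerate(sentence):
            lo = max(0, i - window)
            hi = min(len(sentence), i + window + 1)
            for j in range(lo, hi):
                if j != i:
                    counts[(word, sentence[j])] += 1
    return counts

# Hand-tokenized example document: two sentences, shaped like the UDF output above.
doc_sents = [['the', 'court', 'denied', 'the', 'motion'],
             ['the', 'appeal', 'followed']]
pairs = count_context_pairs(doc_sents)
# 'motion' and 'appeal' never co-occur, because they sit in different sentences.
assert ('motion', 'appeal') not in pairs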
