In [1]:
from src.prepare_court_data import import_dataframe, reverse_stem
from src.ml_transformer import Stemming_Transformer
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import pyspark as ps
from pyspark.ml import Pipeline
from pyspark.ml.feature import Tokenizer, RegexTokenizer, StopWordsRemover, NGram, \
        CountVectorizer, IDF, Word2Vec
from pyspark.sql.functions import udf, col, explode, collect_list, to_date, concat
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, \
        FloatType, ArrayType, BooleanType

In [2]:
# Import the JSON objects from the tar file into Spark DataFrames
opinion_df = import_dataframe(spark, 'opinion')
docket_df = import_dataframe(spark, 'docket')
cluster_df = import_dataframe(spark, 'cluster')

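The import_dataframe calls above assume an active SparkSession already bound to the name spark, as the pyspark shell or a Spark-configured notebook kernel provides. If that is not the case, a session can be built explicitly; a minimal sketch, with a hypothetical app name and driver-memory setting:

from pyspark.sql import SparkSession

# Hypothetical settings; adjust master and memory for the local environment
spark = SparkSession.builder \
        .appName('court-opinions') \
        .master('local[*]') \
        .config('spark.driver.memory', '8g') \
        .getOrCreate()
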
In [3]:
# Set up pipeline stages for adding ML features: tokens, stems, n-grams, TF, TF-IDF, word2vec
# A plain Tokenizer would also work, but the RegexTokenizer below splits on non-word characters and drops tokens shorter than three characters
# tokenizer = Tokenizer(inputCol='parsed_text', outputCol='tokens')
tokenizer = RegexTokenizer(inputCol="parsed_text", outputCol="raw_tokens", pattern="\\W", minTokenLength=3)
remover = StopWordsRemover(inputCol=tokenizer.getOutputCol(), outputCol='tokens_stop')
stemmer = Stemming_Transformer(inputCol=remover.getOutputCol(), outputCol='tokens')
bigram = NGram(inputCol=stemmer.getOutputCol(), outputCol='bigrams', n=2)
trigram = NGram(inputCol=stemmer.getOutputCol(), outputCol='trigrams', n=3)
cv = CountVectorizer(inputCol=stemmer.getOutputCol(), outputCol='token_countvector', minDF=10.0)
idf = IDF(inputCol=cv.getOutputCol(), outputCol='token_idf', minDocFreq=10)
w2v_2d = Word2Vec(vectorSize=2, minCount=2, inputCol=stemmer.getOutputCol(), outputCol='word2vec_2d')
w2v_large = Word2Vec(vectorSize=250, minCount=2, inputCol=stemmer.getOutputCol(), outputCol='word2vec_large')

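As a quick sanity check of the first two stages (the project-specific Stemming_Transformer is left out here), the tokenizer and stop-word remover can be applied to a single toy row; a sketch, reusing the parsed_text column name and the stage objects defined above:

# Both stages are plain Transformers, so no fitting is needed for this check
sample = spark.createDataFrame([('The court affirmed the judgment of the district court.',)], ['parsed_text'])
remover.transform(tokenizer.transform(sample)) \
        .select('raw_tokens', 'tokens_stop').show(truncate=False)
# raw_tokens keeps lowercased words of three or more characters; tokens_stop also drops English stop words
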
In [4]:
# Assemble the pipeline; the bigram and trigram stages are defined above but left out of this run
pipe = Pipeline(stages=[tokenizer, remover, stemmer, cv, idf, w2v_2d, w2v_large])

In [5]:
# Use the pipeline to fit a model
model = pipe.fit(opinion_df)

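Fitting the CountVectorizer, IDF, and two Word2Vec stages over the full opinion corpus is expensive, so the fitted model can be persisted and reloaded later instead of refitting; a sketch, with a hypothetical output path:

from pyspark.ml import PipelineModel

# Hypothetical path; any filesystem URI Spark can write to works here
model.write().overwrite().save('models/opinion_feature_pipeline')
# In a later session: model = PipelineModel.load('models/opinion_feature_pipeline')
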
In [6]:
# Use the model to transform the data
df_transformed = model.transform(opinion_df)

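Because df_transformed is queried several times below, persisting it keeps Spark from recomputing the whole pipeline on every action; an optional step:

# Cache the transformed DataFrame (memory first, spilling to disk) for the similarity queries below
df_transformed.persist()
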
In [9]:
# Extract the word2vec vector for a specific document, then rank all other documents
# by squared distance or cosine similarity and show the nearest matches
ref_vec = df_transformed.filter(df_transformed.resource_id == '3990749').first()['word2vec_large']

In [13]:
# Squared Euclidean distance from each document's vector to the reference vector
udf_squared_distance = udf(lambda cell: float(ref_vec.squared_distance(cell)), FloatType())
df_transformed \
        .withColumn('squared_distance', udf_squared_distance(df_transformed.word2vec_large)) \
        .sort(col('squared_distance'), ascending=True) \
        .select('resource_id', 'squared_distance').show(10)


+-----------+----------------+
|resource_id|squared_distance|
+-----------+----------------+
|    3990749|             0.0|
|    3998122|      0.27114186|
|    4001633|      0.28030676|
|    3992923|      0.29765123|
|    3992058|       0.3019882|
|    3999305|      0.30707687|
|    4001545|       0.3082107|
|    3999645|      0.30963397|
|    3996555|      0.31466758|
|    3996670|      0.33596513|
+-----------+----------------+
only showing top 10 rows


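The same lookup can be wrapped in a hypothetical helper so other reference opinions can be ranked without repeating the boilerplate; a sketch built only from the columns and imports already used above:

def nearest_by_distance(df, resource_id, n=10):
    # Pull the reference document's vector, then rank every document by squared distance to it
    vec = df.filter(df.resource_id == resource_id).first()['word2vec_large']
    dist = udf(lambda cell: float(vec.squared_distance(cell)), FloatType())
    return df.withColumn('squared_distance', dist(df.word2vec_large)) \
            .sort(col('squared_distance'), ascending=True) \
            .select('resource_id', 'squared_distance') \
            .limit(n)

nearest_by_distance(df_transformed, '3990749').show()
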
In [14]:
# Cosine similarity: dot product of the two vectors divided by the product of their L2 norms
udf_cos_sim = udf(lambda cell: float(ref_vec.dot(cell) / (ref_vec.norm(2) * cell.norm(2))), FloatType())
df_transformed \
        .withColumn('cos_similarity', udf_cos_sim(df_transformed.word2vec_large)) \
        .sort(col('cos_similarity'), ascending=False) \
        .select('resource_id', 'cos_similarity').show(12)


+-----------+--------------+
|resource_id|cos_similarity|
+-----------+--------------+
|    3008802|           NaN|
|    2714947|           NaN|
|    3990749|           1.0|
|    4001570|     0.8799179|
|    3992923|      0.875615|
|    3992058|    0.87388444|
|    3996670|     0.8699875|
|    3999305|    0.85111225|
|    4000197|    0.84915984|
|    4001526|    0.84371835|
|    3998083|      0.843271|
|    4001806|    0.84148145|
+-----------+--------------+
only showing top 12 rows


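The NaN rows at the top of the cosine-similarity ranking most likely come from documents whose word2vec_large vector is all zeros (for example, opinions whose surviving tokens never met the Word2Vec minCount), which makes the denominator in the UDF zero. One way to keep such rows out of the ranking, sketched with the same column names:

from pyspark.sql.functions import isnan

df_transformed \
        .withColumn('cos_similarity', udf_cos_sim(df_transformed.word2vec_large)) \
        .filter(~isnan(col('cos_similarity'))) \
        .sort(col('cos_similarity'), ascending=False) \
        .select('resource_id', 'cos_similarity').show(10)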