In [1]:
from src.prepare_court_data import import_dataframe, reverse_stem
from src.ml_transformer import Stemming_Transformer
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import pyspark as ps
from pyspark.ml import Pipeline
from pyspark.ml.feature import Tokenizer, RegexTokenizer, StopWordsRemover, NGram, \
        CountVectorizer, IDF, Word2Vec
from pyspark.sql.functions import udf, col, explode, collect_list, to_date, concat
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, \
        FloatType, ArrayType, BooleanType

In [2]:
# Import json objects from tar file
opinion_df = import_dataframe(spark, 'opinion')
docket_df = import_dataframe(spark, 'docket')
cluster_df = import_dataframe(spark, 'cluster')

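A quick schema check (not part of the original run) can confirm the fields used later in this notebook; the column names below (resource_id, cluster_id) are the ones referenced in the cells that follow.

opinion_df.printSchema()
opinion_df.select('resource_id', 'cluster_id').show(3, truncate=False)
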
In [3]:
# Set up pipeline stages for adding ML features - tokens, stems, n-grams, term counts, TF-IDF, word2vec
# tokenizer = Tokenizer(inputCol='parsed_text', outputCol='tokens')
tokenizer = RegexTokenizer(inputCol="parsed_text", outputCol="raw_tokens", pattern="\\W", minTokenLength=3)
remover = StopWordsRemover(inputCol=tokenizer.getOutputCol(), outputCol='tokens_stop')
stemmer = Stemming_Transformer(inputCol=remover.getOutputCol(), outputCol='tokens')
bigram = NGram(inputCol=stemmer.getOutputCol(), outputCol='bigrams', n=2)
trigram = NGram(inputCol=stemmer.getOutputCol(), outputCol='trigrams', n=3)
cv = CountVectorizer(inputCol=stemmer.getOutputCol(), outputCol='token_countvector', minDF=10.0)
idf = IDF(inputCol=cv.getOutputCol(), outputCol='token_idf', minDocFreq=10)
w2v_2d = Word2Vec(vectorSize=2, minCount=2, inputCol=stemmer.getOutputCol(), outputCol='word2vec_2d')
w2v_large = Word2Vec(vectorSize=250, minCount=2, inputCol=stemmer.getOutputCol(), outputCol='word2vec_large')

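Stemming_Transformer is a project-specific class imported from src.ml_transformer, and its implementation is not shown in this notebook. As a rough sketch of the pattern it relies on, a custom stemming Transformer could be written like this, assuming it wraps NLTK's SnowballStemmer (the class and names below are hypothetical stand-ins, not the project's actual code):

from nltk.stem.snowball import SnowballStemmer
from pyspark.ml import Transformer
from pyspark.ml.param.shared import HasInputCol, HasOutputCol
from pyspark.sql.functions import udf
from pyspark.sql.types import ArrayType, StringType

class SnowballStemmingTransformer(Transformer, HasInputCol, HasOutputCol):
    """Hypothetical stand-in for Stemming_Transformer: stems each token with NLTK."""

    def __init__(self, inputCol=None, outputCol=None):
        super().__init__()
        self._set(inputCol=inputCol, outputCol=outputCol)

    def _transform(self, df):
        stemmer = SnowballStemmer('english')
        # map each array of tokens to an array of stems
        stem_udf = udf(lambda tokens: [stemmer.stem(t) for t in tokens],
                       ArrayType(StringType()))
        return df.withColumn(self.getOutputCol(), stem_udf(df[self.getInputCol()]))

The real Stemming_Transformer may differ (the reverse_stem import above suggests it also tracks a stem-to-word mapping); the sketch only illustrates the custom-Transformer pattern the pipeline depends on.
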
In [4]:
# Assemble the pipeline; the bigram and trigram stages defined above are not included in this run
pipe = Pipeline(stages=[tokenizer, remover, stemmer, cv, idf, w2v_2d, w2v_large])

In [5]:
# Use the pipeline to fit a model
model = pipe.fit(opinion_df)

In [6]:
# Use the model to transform the data
df_transformed = model.transform(opinion_df)

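Each stage appends its output column to the DataFrame, so the transformed frame now carries the token, count-vector, TF-IDF, and word2vec features alongside the original opinion fields. A quick peek (column names as configured above) might be:

df_transformed.select('resource_id', 'tokens', 'token_idf', 'word2vec_2d').show(3)
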
In [22]:
# Extract the word2vec vector for a specific document, then rank every other document by squared distance or cosine similarity and show the nearest ones
ref_vec = df_transformed.filter(df_transformed.resource_id == '1390131').first()['word2vec_large']

In [23]:
udf_squared_distance = udf(lambda cell: float(ref_vec.squared_distance(cell)), FloatType())
df_transformed \
        .withColumn('squared_distance', udf_squared_distance(df_transformed.word2vec_large)) \
        .sort(col('squared_distance'), ascending=True) \
        .select('resource_id', 'squared_distance').show(10)


+-----------+----------------+
|resource_id|squared_distance|
+-----------+----------------+
|    1390131|             0.0|
|    2612515|     0.047901332|
|    1450913|     0.051825076|
|    1162810|     0.055476364|
|    1393380|     0.055871524|
|    1233917|     0.057295434|
|    1219322|      0.05813922|
|    1441544|      0.05997757|
|    1423352|      0.06079702|
|    1185879|       0.0611139|
+-----------+----------------+
only showing top 10 rows


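DenseVector.squared_distance is just the squared Euclidean distance between the averaged word2vec document vectors. As a sanity check (not run in the original notebook), the same number can be recomputed with numpy for one of the nearby documents; it should come out close to the 0.0479 shown in the table above:

another_vec = df_transformed.filter(df_transformed.resource_id == '2612515').first()['word2vec_large']
np.sum((ref_vec.toArray() - another_vec.toArray()) ** 2)   # expected to be near the squared_distance above
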
In [24]:
udf_cos_sim = udf(lambda cell: float(ref_vec.dot(cell) / (ref_vec.norm(2) * cell.norm(2))), FloatType())
df_transformed \
        .withColumn('cos_similarity', udf_cos_sim(df_transformed.word2vec_large)) \
        .sort(col('cos_similarity'), ascending=False) \
        .select('resource_id', 'cos_similarity').show(12)


+-----------+--------------+
|resource_id|cos_similarity|
+-----------+--------------+
|    3008802|           NaN|
|    2714947|           NaN|
|    1390131|           1.0|
|    2612515|     0.9115429|
|    1450913|     0.9068901|
|    1162810|        0.8948|
|    1441544|     0.8940455|
|    1219322|     0.8889688|
|    1194652|    0.88873464|
|    1185879|     0.8885578|
|    1393380|    0.88777256|
|    1316809|     0.8869071|
+-----------+--------------+
only showing top 12 rows


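The NaN rows most likely come from documents whose averaged word2vec vector is all zeros (for example, no tokens survive the tokenizer's minimum length and the stop-word filter), so the cosine denominator is zero. One way to guard against that, sketched here rather than taken from the original notebook, is to return 0.0 whenever either vector has zero norm:

def safe_cos_sim(cell):
    # zero-norm document vectors (no usable tokens) would otherwise produce NaN
    denom = float(ref_vec.norm(2) * cell.norm(2))
    return float(ref_vec.dot(cell) / denom) if denom > 0.0 else 0.0

udf_safe_cos_sim = udf(safe_cos_sim, FloatType())
df_transformed \
        .withColumn('cos_similarity', udf_safe_cos_sim(df_transformed.word2vec_large)) \
        .sort(col('cos_similarity'), ascending=False) \
        .select('resource_id', 'cos_similarity').show(12)
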
In [15]:
df_transformed.count()


Out[15]:
26572

In [18]:
df_transformed.select('cluster_id').distinct().count()


Out[18]:
24369

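So 26,572 opinions fall into 24,369 distinct clusters, meaning some clusters group several opinions together. A quick way to look at those multi-opinion clusters (not run in the original notebook) would be:

df_transformed.groupBy('cluster_id').count() \
        .filter(col('count') > 1) \
        .orderBy('count', ascending=False) \
        .show(10)
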
In [21]:
# Count, for each opinion, how many times it is cited by other Washington opinions
df_citecount = df_transformed \
        .select(explode(df_transformed.opinions_cited).alias('cites')) \
        .groupBy('cites') \
        .count()
df_citecount.orderBy('count', ascending=False).show(truncate=False)


+----------------------------------------------------------+-----+
|cites                                                     |count|
+----------------------------------------------------------+-----+
|http://www.courtlistener.com/api/rest/v3/opinions/1390131/|229  |
|http://www.courtlistener.com/api/rest/v3/opinions/107252/ |203  |
|http://www.courtlistener.com/api/rest/v3/opinions/2624899/|175  |
|http://www.courtlistener.com/api/rest/v3/opinions/108111/ |126  |
|http://www.courtlistener.com/api/rest/v3/opinions/2612679/|117  |
|http://www.courtlistener.com/api/rest/v3/opinions/1407600/|108  |
|http://www.courtlistener.com/api/rest/v3/opinions/1370280/|102  |
|http://www.courtlistener.com/api/rest/v3/opinions/111170/ |98   |
|http://www.courtlistener.com/api/rest/v3/opinions/2594992/|92   |
|http://www.courtlistener.com/api/rest/v3/opinions/107729/ |92   |
|http://www.courtlistener.com/api/rest/v3/opinions/1116120/|80   |
|http://www.courtlistener.com/api/rest/v3/opinions/1194272/|77   |
|http://www.courtlistener.com/api/rest/v3/opinions/136995/ |77   |
|http://www.courtlistener.com/api/rest/v3/opinions/107359/ |76   |
|http://www.courtlistener.com/api/rest/v3/opinions/1159920/|74   |
|http://www.courtlistener.com/api/rest/v3/opinions/1427513/|74   |
|http://www.courtlistener.com/api/rest/v3/opinions/1119446/|71   |
|http://www.courtlistener.com/api/rest/v3/opinions/1160222/|70   |
|http://www.courtlistener.com/api/rest/v3/opinions/1123791/|69   |
|http://www.courtlistener.com/api/rest/v3/opinions/1145323/|69   |
+----------------------------------------------------------+-----+
only showing top 20 rows

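The cites column holds full CourtListener API URLs, while the opinions themselves are keyed by resource_id, so relating the two takes one more step. A possible follow-up, sketched under the assumption that the trailing number in each URL is the cited opinion's resource_id (as the 1390131 example above suggests), parses that id out and joins the counts back onto the opinions:

from pyspark.sql.functions import regexp_extract

# hypothetical follow-up: pull the numeric id out of each citation URL and join it back on resource_id
df_cited_ids = df_citecount.withColumn(
        'cited_resource_id', regexp_extract('cites', r'/opinions/(\d+)/$', 1))
df_transformed \
        .join(df_cited_ids, df_transformed.resource_id == df_cited_ids.cited_resource_id) \
        .select('resource_id', 'count') \
        .orderBy('count', ascending=False) \
        .show(10)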

In [ ]: