We would like to compare and contrast case clusterings based on the opinion text (natural language processing) and on the citation structure (network community detection).
Community detection on the network
Clustering on the opinion texts
Relational topic models (see the Chang and Blei paper) (TODO)
Borrowing some code from http://brandonrose.org/clustering
In [8]:
repo_directory = '/Users/iaincarmichael/Dropbox/Research/law/law-net/'
data_dir = '/Users/iaincarmichael/data/courtlistener/'
import sys
import numpy as np
import matplotlib.pyplot as plt
# graph package
import igraph as ig
# stats
import pandas as pd
from sklearn.cluster import KMeans
# our code
sys.path.append(repo_directory + 'code/')
sys.path.append(repo_directory + 'vertex_metrics_experiment/code/')
from bag_of_words import load_tf_idf
# which network to download data for
network_name = 'scotus' # 'federal', 'ca1', etc
# some sub directories that get used
raw_dir = data_dir + 'raw/'
subnet_dir = data_dir + network_name + '/'
text_dir = subnet_dir + 'textfiles/'
nlp_dir = subnet_dir + 'nlp/'
# jupyter notebook settings
%load_ext autoreload
%autoreload 2
%matplotlib inline
In [2]:
# load the graph
G = ig.Graph.Read_GraphML(subnet_dir + network_name +'_network.graphml')
In [106]:
# limit ourselves to cases up to and including 2015, since we are missing some text files from 2016
G = G.subgraph(G.vs.select(year_le=2015))
# undirected copy of the graph (as_undirected returns a new graph)
Gud = G.as_undirected()
# get the largest connected component
components = Gud.clusters(mode='STRONG')  # STRONG and WEAK coincide for undirected graphs
g = components.subgraphs()[np.argmax(components.sizes())]
# CL ids of cases in largest connected component
CLids = g.vs['name']
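As a quick sanity check, we can see what fraction of the cases survive the restriction to the largest connected component (this uses only the variables defined above):
In [ ]:
# what fraction of the cases are in the largest connected component?
print('%d of %d cases in the largest component (%.1f%%)' %
      (g.vcount(), Gud.vcount(), 100.0 * g.vcount() / Gud.vcount()))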
In [107]:
%%time
# greedy modularity maximization (fastgreedy)
cd_modularity = g.community_fastgreedy()
mod_clust = cd_modularity.as_clustering()
mod_clust.summary()
Out[107]:
In [108]:
graph_clusters = pd.Series(mod_clust.membership, index=g.vs['name'])
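Before comparing against the text clusters, it helps to glance at the community size distribution; greedy modularity methods often return a few large communities and a long tail of small ones. A quick look using the series we just built:
In [ ]:
# community sizes, largest first
community_sizes = graph_clusters.value_counts()
print(community_sizes.head(10))
# histogram of community sizes
community_sizes.hist(bins=50)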
In [109]:
# %time cd_walktrap = g.community_walktrap()
# wt_clust = cd_walktrap.as_clustering()
# wt_clust.summary()
In [6]:
tfidf_matrix, op_id_to_bow_id = load_tf_idf(nlp_dir)
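A quick look at the TF-IDF matrix dimensions (op_id_to_bow_id presumably maps each CourtListener opinion id to its row in the matrix):
In [ ]:
# one row per opinion, one column per term
print('tf-idf matrix: %d opinions x %d terms' % tfidf_matrix.shape)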
In [ ]:
%%time
# set number of clusters
num_clusters = 30
# run k-means (fix the seed so the clustering is reproducible)
km = KMeans(n_clusters=num_clusters, random_state=0)
km.fit(tfidf_matrix)
nlp_clusters = km.labels_.tolist()
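To sanity check the K-means clusters, we can print the highest-weight terms in each cluster centroid, as in the brandonrose.org tutorial. This is a sketch: it assumes we also have `vocab`, the list of terms aligned with the columns of tfidf_matrix (e.g. from the fitted TfidfVectorizer's get_feature_names()); load_tf_idf may or may not return it.
In [ ]:
# top terms per k-means cluster, ranked by centroid weight
# NOTE: `vocab` is assumed here: the term list aligned with the columns of tfidf_matrix
order_centroids = km.cluster_centers_.argsort()[:, ::-1]
for i in range(num_clusters):
    top_terms = [vocab[ind] for ind in order_centroids[i, :10]]
    print('cluster %d: %s' % (i, ', '.join(top_terms)))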
In [113]:
# opinion ids ordered by their row in the tf-idf matrix, so the index lines
# up with km.labels_ (assumes op_id_to_bow_id maps opinion id -> row index)
op_ids = sorted(op_id_to_bow_id, key=op_id_to_bow_id.get)
clusters = pd.DataFrame(index=op_ids, columns=['nlp', 'graph'])
# add in NLP clusters
clusters['nlp'] = nlp_clusters
# add in communities
clusters['graph'] = graph_clusters
# nodes outside the largest connected component were not assigned a
# community; put them all into one extra cluster
clusters['graph'].fillna(max(graph_clusters) + 1, inplace=True)
# cast community labels to int
clusters['graph'] = clusters['graph'].astype(int)
In [114]:
clusters
Out[114]:
In [115]:
# TODO: match clusters
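A simple starting point for matching the two labelings is a contingency table, plus label-permutation-invariant agreement scores (adjusted Rand index, normalized mutual information) from sklearn. A minimal sketch:
In [ ]:
from sklearn.metrics import adjusted_rand_score, normalized_mutual_info_score
# cross-tabulate NLP clusters (rows) against graph communities (columns)
print(pd.crosstab(clusters['nlp'], clusters['graph']))
# agreement scores that ignore the arbitrary cluster label ids
print('ARI: %.3f' % adjusted_rand_score(clusters['nlp'], clusters['graph']))
print('NMI: %.3f' % normalized_mutual_info_score(clusters['nlp'], clusters['graph']))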