In [1]:
repo_directory = '/Users/iaincarmichael/Dropbox/Research/law/law-net/'
data_dir = '/Users/iaincarmichael/data/courtlistener/'
import numpy as np
import sys
# graph package
import igraph as ig
# our code
sys.path.append(repo_directory + 'code/')
from setup_data_dir import setup_data_dir, make_subnetwork_directory
from pipeline.download_data import download_op_and_cl_files, download_master_edgelist, download_scdb
from helpful_functions import case_info
sys.path.append(repo_directory + 'vertex_metrics_experiment/code/')
from make_network_data import *
from make_graph import make_graph
from bag_of_words import make_tf_idf
from make_snapshots import make_snapshot_vertex_metrics, update_snapshot_vertex_metrics
# which network to download data for
network_name = 'scotus'  # e.g. 'federal', 'ca1'
# some sub directories that get used
raw_dir = data_dir + 'raw/'
subnet_dir = data_dir + network_name + '/'
text_dir = subnet_dir + 'textfiles/'
# jupyter notebook settings
%load_ext autoreload
%autoreload 2
%matplotlib inline
In [ ]:
setup_data_dir(data_dir)
In [ ]:
make_subnetwork_directory(data_dir, network_name)
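setup_data_dir and make_subnetwork_directory are project code; a minimal sketch of the layout they should produce, using only the directories referenced elsewhere in this notebook (the 'snapshots/' subfolder is an assumption about where the snapshot files land):
In [ ]:
import os
# hypothetical equivalent of the two setup calls above
for d in [raw_dir, subnet_dir, text_dir,
          subnet_dir + 'nlp/', subnet_dir + 'snapshots/']:
    if not os.path.exists(d):
        os.makedirs(d)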
In [ ]:
# download the opinion and cluster files for this network
download_op_and_cl_files(data_dir, network_name)
In [ ]:
# download the master citation edgelist
download_master_edgelist(data_dir)
In [ ]:
# download the Supreme Court Database (SCDB)
download_scdb(data_dir)
In [ ]:
# create the raw case metadata data frame in the raw/ folder
make_subnetwork_raw_case_metadata(data_dir, network_name)
In [ ]:
# create clean case metadata and edgelist from raw data
clean_metadata_and_edgelist(data_dir, network_name)
In [ ]:
make_graph(subnet_dir, network_name)
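make_graph is project code; a comparable construction with igraph might look like the sketch below, assuming the cleaning step wrote a CSV edgelist of (citing, cited) opinion ids (the filename 'edgelist.csv' is a guess):
In [ ]:
import pandas as pd
# hypothetical re-implementation: build a directed citation graph whose
# vertex names are opinion ids, then save it as GraphML
edges = pd.read_csv(subnet_dir + 'edgelist.csv')  # filename is an assumption
g = ig.Graph.TupleList(edges.itertuples(index=False), directed=True)
g.write_graphml(subnet_dir + network_name + '_network.graphml')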
In [ ]:
%%time
make_network_textfiles(data_dir, network_name)
In [2]:
%%time
make_tf_idf(text_dir, subnet_dir + 'nlp/')
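make_tf_idf lives in this repo's bag_of_words module; as a rough picture of what a tf-idf computation over the opinion text files involves, here is a hedged sketch with scikit-learn (not the project's actual implementation):
In [ ]:
import os
from sklearn.feature_extraction.text import TfidfVectorizer
# one document per opinion text file
files = [text_dir + f for f in os.listdir(text_dir) if f.endswith('.txt')]
vectorizer = TfidfVectorizer(input='filename')
tfidf_matrix = vectorizer.fit_transform(files)  # sparse, n_opinions x n_terms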
In [3]:
# load the graph
G = ig.Graph.Read_GraphML(subnet_dir + network_name + '_network.graphml')
In [4]:
G.summary()
Out[4]:
In [4]:
vertex_metrics = ['indegree', 'outdegree', 'degree',
                  'd_pagerank', 'u_pagerank', 'rev_pagerank',
                  'authorities', 'hubs',
                  'd_eigen', 'u_eigen',
                  'd_betweenness', 'u_betweenness',
                  'd_in_closeness', 'd_out_closeness',
                  'd_all_closeness', 'u_closeness']
# add recent citations
vertex_metrics += ['recentcite_' + str(t) for t in np.arange(1, 10 + 1)]
vertex_metrics += ['recentcite_' + str(t) for t in [15, 20, 25, 30, 35, 40]]
vertex_metrics += ['citerank_' + str(t) for t in [1, 2, 5, 10, 20, 50]]
vertex_metrics += ['polyrank_' + str(t) for t in [1, 2, 5, 10, 20, 50]]
vertex_metrics += ['pagerank_' + str(t * 10) for t in range(1, 9 + 1)]
vertex_metrics += ['num_words']
active_years = range(1900, 2015 + 1)
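make_snapshot_vertex_metrics computes these metrics for each snapshot year; the recentcite_t family presumably counts citations received from opinions decided in the previous t years. A hedged sketch of that one metric, assuming each vertex carries a 'year' attribute:
In [ ]:
def recent_cites(g, snapshot_year, t):
    """Hypothetical recentcite_t: for each vertex, the number of citations
    received from opinions decided within t years of snapshot_year."""
    counts = []
    for v in g.vs:
        citing = g.predecessors(v.index)  # vertices that cite v
        counts.append(sum(1 for u in citing
                          if snapshot_year - t < g.vs[u]['year'] <= snapshot_year))
    return counts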
In [5]:
%%time
make_snapshot_vertex_metrics(G, active_years, vertex_metrics, subnet_dir)
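The snapshot metrics are written to disk by the call above; assuming one CSV of vertex metrics per active year under a 'snapshots/' subfolder (both the folder and the filename pattern are guesses), a single snapshot could be inspected like this:
In [ ]:
import pandas as pd
# filename pattern is an assumption
snapshot_2000 = pd.read_csv(subnet_dir + 'snapshots/vertex_metrics_2000.csv',
                            index_col=0)
snapshot_2000.head()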
In [39]:
# metric batches that were added to existing snapshots in separate runs;
# uncomment the batch to compute before running the update cell below
# to_add = ['rev_pagerank', 'num_words']
# to_add += ['citerank_' + str(t) for t in [1, 2, 5, 10, 20, 50]]
In [40]:
# to_add = ['polyrank_' + str(t) for t in [1, 2, 5, 10, 20, 50]]
In [41]:
# to_add = ['d_in_closeness', 'd_out_closeness', 'd_all_closeness', 'd_eigen']
In [44]:
# to_add = ['pagerank_' + str(t * 10) for t in range(1, 9 + 1)]
In [51]:
%%time
# requires one of the to_add batches above to be uncommented first
update_snapshot_vertex_metrics(G, active_years, to_add, subnet_dir)
In [5]:
# compute each opinion's word count from its text file and store it
# as a vertex attribute
G.vs['num_words'] = [0] * len(G.vs)
for op_id in G.vs['name']:
    with open(text_dir + op_id + '.txt', 'r') as f:
        text = f.read()
    G.vs.find(name=op_id)['num_words'] = len(text.split())
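Since G.vs['name'] is iterated in vertex order, the counts can also be collected into a list and assigned to the attribute in one step, avoiding the per-vertex find() lookups:
In [ ]:
# equivalent word counts, assigned as a whole attribute list
word_counts = []
for op_id in G.vs['name']:
    with open(text_dir + op_id + '.txt', 'r') as f:
        word_counts.append(len(f.read().split()))
G.vs['num_words'] = word_counts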
In [6]:
# save the graph with the new num_words attribute
G.write_graphml(subnet_dir + network_name + '_network.graphml')