In [3]:
repo_directory = '/Users/iaincarmichael/Dropbox/Research/law/law-net/'

data_dir = '/Users/iaincarmichael/data/courtlistener/'

import numpy as np
import sys
import matplotlib.pyplot as plt
from scipy.stats import rankdata
import cPickle as pickle

# graph package
import igraph as ig

# our code
sys.path.append(repo_directory + 'code/')
from setup_data_dir import setup_data_dir, make_subnetwork_directory
from pipeline.download_data import download_bulk_resource, download_master_edgelist, download_scdb
from helpful_functions import case_info

sys.path.append(repo_directory + 'vertex_metrics_experiment/code/')
from rankscore_experiment_sort import *
from rankscore_experiment_LR import *
from rankscore_experiment_match import *

from make_tr_edge_df import *


# which network to download data for
network_name = 'scotus' # 'federal', 'ca1', etc


# some sub directories that get used
raw_dir = data_dir + 'raw/'
subnet_dir = data_dir + network_name + '/'
text_dir = subnet_dir + 'textfiles/'


# jupyter notebook settings
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [4]:
G = ig.Graph.Read_GraphML(subnet_dir + network_name +'_network.graphml')

parameters from make snapshots


In [7]:
vertex_metrics = ['indegree', 'outdegree', 'degree',
                  'd_pagerank','u_pagerank',
                  'authorities', 'hubs',
                  #'d_eigen', 'u_eigen', # d_eigen is being problematic
                  'u_eigen',
                  'd_betweenness', 'u_betweenness',
                  'd_closeness', 'u_closeness']

# add recent citations
vertex_metrics += ['recentcite_' + str(t) for t in np.arange(1, 10 + 1)]
vertex_metrics += ['recentcite_' + str(t) for t in [15, 20, 25, 30, 35, 40]]

vertex_metrics += ['age', 'similarity']

active_years = range(1900, 2015 + 1)

test parameters


In [9]:
test_seed = 4332,
num_test_cases = 10

test_cases = get_test_cases(G, active_years, num_test_cases, seed=test_seed)

rank by sorting


In [21]:
%%time 
rankloss_sort = get_rankscores_sort(G, test_cases, vertex_metrics, subnet_dir)


CPU times: user 3min 11s, sys: 32 s, total: 3min 43s
Wall time: 3min 44s

In [59]:
MRS_sort = rankloss_sort['MRS'].mean().sort_values()
RR_sort = rankloss_sort['RR'].mean().sort_values()
PAK100_sort = rankloss_sort['PAK100'].mean().sort_values()
PAK1000_sort = rankloss_sort['PAK1000'].mean().sort_values()

MRS_sort


Out[59]:
similarity       0.045186
age              0.203464
recentcite_7     0.209990
recentcite_6     0.210109
recentcite_8     0.210439
recentcite_5     0.213641
recentcite_9     0.213744
u_eigen          0.214163
recentcite_10    0.214385
recentcite_4     0.223274
hubs             0.224445
recentcite_15    0.226459
recentcite_3     0.234811
degree           0.235087
recentcite_20    0.235906
u_closeness      0.237946
outdegree        0.239978
recentcite_25    0.251798
recentcite_2     0.256311
recentcite_30    0.261893
u_pagerank       0.268186
recentcite_1     0.269283
recentcite_35    0.273660
recentcite_40    0.284319
authorities      0.288553
u_betweenness    0.295292
d_betweenness    0.308325
indegree         0.335238
d_pagerank       0.410735
d_closeness      0.538128
dtype: float64

In [25]:
# histogram of scores

# plt.figure(figsize=[20, 20])
# k = 1
# h = ceil(scores_sort.shape[1] / 4.0)
# for c in sort_mean.index:
#     plt.subplot(h, 4, k)
#     plt.hist(scores_sort[c])
#     plt.xlabel(c)
    
#     k += 1

Match


In [26]:
num_to_keep = 5000

In [27]:
%%time
rankloss_match = get_rankscores_match(G, test_cases, vertex_metrics, subnet_dir, num_to_keep)


CPU times: user 4min 21s, sys: 49.7 s, total: 5min 11s
Wall time: 5min 12s

In [60]:
MRS_match = rankloss_match['MRS'].mean().sort_values()
RR_match = rankloss_match['RR'].mean().sort_values()
PAK100_match = rankloss_match['PAK100'].mean().sort_values()
PAK1000_match = rankloss_match['PAK1000'].mean().sort_values()

MRS_match


Out[60]:
similarity       0.087850
age              0.272298
recentcite_6     0.285066
recentcite_5     0.286009
recentcite_7     0.288943
recentcite_8     0.290901
recentcite_4     0.291334
recentcite_3     0.296397
recentcite_9     0.297070
recentcite_10    0.301320
recentcite_2     0.312719
recentcite_1     0.314620
recentcite_15    0.317860
recentcite_20    0.333106
u_eigen          0.336644
hubs             0.338221
recentcite_25    0.352149
outdegree        0.358193
recentcite_30    0.364618
degree           0.367391
recentcite_35    0.376324
u_closeness      0.384502
recentcite_40    0.386866
u_pagerank       0.391528
authorities      0.403309
d_betweenness    0.414885
u_betweenness    0.421226
indegree         0.433837
d_pagerank       0.486673
d_closeness      0.594740
dtype: float64

make training data for logistic regression


In [25]:
# how many abset edges to add
num_absent_edges = len(G.es)
seed_edge_df = 32432

# how to normalize yearly metrics
metric_normalization = 'mean'

feature_transform = 'poly2'

In [26]:
%%time 

# make_tr_edge_df(G, subnet_dir,
#                 active_years, num_absent_edges,
#                 vertex_metrics, metric_normalization,
#                 seed=seed_edge_df)


CPU times: user 2 µs, sys: 3 µs, total: 5 µs
Wall time: 5.96 µs

rank by logistic regression


In [27]:
%%time
rankloss_LR, LogRegs = get_rankscores_LR(G, test_cases, vertex_metrics, subnet_dir,
                                         metric_normalization, feature_transform)


CPU times: user 20.6 s, sys: 3.21 s, total: 23.8 s
Wall time: 23.9 s

In [24]:
MRS_LR = rankloss_LR['MRS'].mean().sort_values()
RR_LR = rankloss_LR['RR'].mean().sort_values()
PAK100_LR = rankloss_LR['PAK100'].mean().sort_values()
PAK1000_LR = rankloss_LR['PAK1000'].mean().sort_values()

MRS_LR


Out[24]:
similarity    0.034436
age           0.035402
outdegree     0.046288
indegree      0.050183
dtype: float64

cache results


In [34]:
with open(subnet_dir + 'results/rankloss_sort.p', 'wb') as fp:
    pickle.dump(rankloss_sort, fp)
    
with open(subnet_dir + 'results/rankloss_match.p', 'wb') as fp:
    pickle.dump(rankloss_match, fp)

with open(subnet_dir + 'results/rankloss_LR.p', 'wb') as fp:
    pickle.dump(rankloss_LR, fp)
    
with open(subnet_dir + 'results/LogRegs.p', 'wb') as fp:
    pickle.dump(LogRegs, fp)

In [36]:
rankloss_sort = pickle.load( open( subnet_dir + 'results/rankloss_sort.p', "rb" ) )
rankloss_match = pickle.load( open( subnet_dir + 'results/rankloss_match.p', "rb" ) )
rankloss_LR = pickle.load( open( subnet_dir + 'results/rankloss_LR.p', "rb" ) )
LogRegs = pickle.load( open( subnet_dir + 'results/LogRegs.p', "rb" ) )

In [24]:
# scores_sort.to_csv(subnet_dir + 'results/scores_sort.csv', index=True)
# scores_search.to_csv(subnet_dir + 'results/scores_search.csv', index=True)
# scores_LR.to_csv(subnet_dir + 'results/scores_LR.csv', index=True)
# scores_LR_logloss.to_csv(subnet_dir + 'results/scores_LR_logloss.csv', index=True)

# with open(subnet_dir + 'results/LogRegs.p', 'wb') as fp:
#     pickle.dump(LogRegs, fp)

results


In [62]:
df_metric = pd.DataFrame(columns=['sort', 'match', 'LR'],
                         index = range(len(vertex_metrics)))

df_metric['sort'] = MRS_sort.index
df_metric['match'] = MRS_match.index
df_metric['LR'] = MRS_LR.index

df_metric

In [69]:
rankscores = pd.DataFrame(columns=['sort', 'match', 'LR'],
                         index = vertex_metrics)

rankscores['sort'] = MRS_sort
rankscores['match'] = MRS_match
rankscores['LR'] = MRS_LR

In [72]:
rankscores.sort_values(by='sort', ascending=True)


Out[72]:
sort match LR
similarity 0.045186 0.087850 0.954872
age 0.203464 0.272298 0.959504
recentcite_7 0.209990 0.288943 0.934804
recentcite_6 0.210109 0.285066 0.943219
recentcite_8 0.210439 0.290901 0.928158
recentcite_5 0.213641 0.286009 0.952080
recentcite_9 0.213744 0.297070 0.921747
u_eigen 0.214163 0.336644 0.850612
recentcite_10 0.214385 0.301320 0.915899
recentcite_4 0.223274 0.291334 0.957748
hubs 0.224445 0.338221 0.858392
recentcite_15 0.226459 0.317860 0.896261
recentcite_3 0.234811 0.296397 0.959609
degree 0.235087 0.367391 0.889295
recentcite_20 0.235906 0.333106 0.894102
u_closeness 0.237946 0.384502 0.580034
outdegree 0.239978 0.358193 0.898077
recentcite_25 0.251798 0.352149 0.897073
recentcite_2 0.256311 0.312719 0.956526
recentcite_30 0.261893 0.364618 0.896953
u_pagerank 0.268186 0.391528 0.588722
recentcite_1 0.269283 0.314620 0.944017
recentcite_35 0.273660 0.376324 0.899801
recentcite_40 0.284319 0.386866 0.903414
authorities 0.288553 0.403309 0.831034
u_betweenness 0.295292 0.421226 0.320712
d_betweenness 0.308325 0.414885 0.389026
indegree 0.335238 0.433837 0.912617
d_pagerank 0.410735 0.486673 0.630018
d_closeness 0.538128 0.594740 0.583389

In [73]:
rs_ranking = rankscores.apply(lambda c: rankdata(c))

In [74]:
rs_ranking.sort_values(by='sort')


Out[74]:
sort match LR
similarity 1.0 1.0 26.0
age 2.0 2.0 29.0
recentcite_7 3.0 5.0 22.0
recentcite_6 4.0 3.0 23.0
recentcite_8 5.0 6.0 21.0
recentcite_5 6.0 4.0 25.0
recentcite_9 7.0 9.0 20.0
u_eigen 8.0 15.0 8.0
recentcite_10 9.0 10.0 19.0
recentcite_4 10.0 7.0 28.0
hubs 11.0 16.0 9.0
recentcite_15 12.0 13.0 12.0
recentcite_3 13.0 8.0 30.0
degree 14.0 20.0 10.0
recentcite_20 15.0 14.0 11.0
u_closeness 16.0 22.0 3.0
outdegree 17.0 18.0 15.0
recentcite_25 18.0 17.0 14.0
recentcite_2 19.0 11.0 27.0
recentcite_30 20.0 19.0 13.0
u_pagerank 21.0 24.0 5.0
recentcite_1 22.0 12.0 24.0
recentcite_35 23.0 21.0 16.0
recentcite_40 24.0 23.0 17.0
authorities 25.0 25.0 7.0
u_betweenness 26.0 27.0 1.0
d_betweenness 27.0 26.0 2.0
indegree 28.0 28.0 18.0
d_pagerank 29.0 29.0 6.0
d_closeness 30.0 30.0 4.0

experiment function


In [ ]: