In [15]:
repo_directory = '/Users/iaincarmichael/Dropbox/Research/law/law-net/'

data_dir = '/Users/iaincarmichael/Data/courtlistener/'

import sys
import numpy as np
import igraph as ig

sys.path.append(repo_directory + 'vertex_metrics_experiment/code/')
from experiment_helper_functions import get_test_cases
from run_exper import *


# which network to download data for
network_name = 'scotus' # 'federal', 'ca1', etc


# some sub directories that get used
subnet_dir = data_dir + network_name + '/'
results_dir = subnet_dir + 'results/'


# jupyter notebook settings
%load_ext autoreload
%autoreload 2
%matplotlib inline


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload

In [6]:
G = ig.Graph.Read_GraphML(subnet_dir + network_name +'_network.graphml')

parameters from make snapshots


In [7]:
vertex_metrics = ['indegree', 'outdegree', 'degree',
                  'd_pagerank','u_pagerank',
                  'authorities', 'hubs',
                  #'d_eigen', 'u_eigen', # d_eigen is being problematic
                  'u_eigen',
                  'd_betweenness', 'u_betweenness',
                  'd_closeness', 'u_closeness']

# add recent citations
vertex_metrics += ['recentcite_' + str(t) for t in np.arange(1, 10 + 1)]
vertex_metrics += ['recentcite_' + str(t) for t in [15, 20, 25, 30, 35, 40]]
vertex_metrics += ['age', 'similarity']


vertex_metrics = ['age', 'similarity']
vertex_metrics += ['indegree', 'outdegree']

active_years = range(1900, 2015 + 1)

test parameters


In [10]:
test_seed = 4332,
num_test_cases = 10

test_cases = get_test_cases(G, active_years, num_test_cases, seed=test_seed)

In [11]:
name = 'test'

Sort


In [14]:
%%time
test_case_ids = [v.index for v in test_cases]


sort_params = {}
exper_params_sort = {'vertex_metrics': vertex_metrics,
                     'active_years': active_years,
                     'test_case_ids': test_case_ids,
                     'sort_params': sort_params}


run_sort(G, exper_params_sort, subnet_dir, name)


CPU times: user 13.4 s, sys: 3.04 s, total: 16.4 s
Wall time: 16.5 s

Match


In [ ]:
num_to_keep = 5000
match_params = {'num_to_keep': num_to_keep}

exper_params_match = {'vertex_metrics': vertex_metrics,
                     'active_years': active_years,
                     'test_case_ids': test_case_ids,
                     'match_params': match_params}     
        
run_match(G, exper_params_match, subnet_dir, name)

Logreg


In [ ]:
num_absent_edges = len(G.es)
seed_edge_df = 32432
metric_normalization = 'mean'
feature_transform = 'poly2'
make_tr_data = False

logreg_params = {'num_absent_edges': num_absent_edges,
                 'seed_edge_df': seed_edge_df,
                 'metric_normalization': metric_normalization,
                 'feature_transform': feature_transform,
                 'make_tr_data': make_tr_data}


exper_params_logreg = {'vertex_metrics': vertex_metrics,
                       'active_years': active_years,
                       'test_case_ids': test_case_ids,
                       'logreg_params': logreg_params}     
        

run_logreg(G, exper_params_logreg, subnet_dir, name)

load results


In [18]:
rankloss_sort = pickle.load(open( results_dir + 'sort/%s/rankloss_sort.p' % name, "rb" ) )
# rankloss_match = pickle.load(open( results_dir + 'match/%s/rankloss_match.p' % name, "rb" ) )
# rankloss_logreg = pickle.load(open( results_dir + 'logreg/%s/rankloss_logreg.p' % name, "rb" ) )

In [22]:
MRS_sort = rankloss_sort['MRS'].mean().sort_values()
RR_sort = rankloss_sort['RR'].mean().sort_values()
PAK100_sort = rankloss_sort['PAK100'].mean().sort_values()
PAK1000_sort = rankloss_sort['PAK1000'].mean().sort_values()

MRS_sort


Out[22]:
similarity    0.034478
age           0.184642
outdegree     0.282623
indegree      0.385061
dtype: float64

In [ ]: