In [1]:
repo_directory = '/Users/iaincarmichael/Dropbox/Research/law/law-net/'
data_dir = '/Users/iaincarmichael/data/courtlistener/'
import numpy as np
import pandas as pd  # needed for the results DataFrames below
import sys
import matplotlib.pyplot as plt
from scipy.stats import rankdata
import cPickle as pickle
# graph package
import igraph as ig
# our code
sys.path.append(repo_directory + 'code/')
from setup_data_dir import setup_data_dir, make_subnetwork_directory
from pipeline.download_data import download_bulk_resource, download_master_edgelist, download_scdb
from helpful_functions import case_info
sys.path.append(repo_directory + 'vertex_metrics_experiment/code/')
from rankscore_experiment_sort import *
from rankscore_experiment_LR import *
from rankscore_experiment_match import *
from make_tr_edge_df import *
# which network to download data for
network_name = 'federal' # 'federal', 'ca1', etc
# some sub directories that get used
raw_dir = data_dir + 'raw/'
subnet_dir = data_dir + network_name + '/'
text_dir = subnet_dir + 'textfiles/'
# jupyter notebook settings
%load_ext autoreload
%autoreload 2
%matplotlib inline
In [2]:
G = ig.Graph.Read_GraphML(subnet_dir + network_name +'_network.graphml')
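A quick sanity check on the loaded network; these are standard igraph calls, nothing specific to our pipeline:

print('vertices: %d, edges: %d' % (G.vcount(), G.ecount()))
print('directed: %s' % G.is_directed())
print('vertex attributes: %s' % G.vs.attributes())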
In [3]:
# vertex_metrics = ['indegree', 'outdegree', 'degree',
# 'd_pagerank','u_pagerank',
# 'authorities', 'hubs',
# #'d_eigen', 'u_eigen', # d_eigen is being problematic
# 'u_eigen',
# 'd_betweenness', 'u_betweenness',
# 'd_closeness', 'u_closeness']
# # add recent citations
# vertex_metrics += ['recentcite_' + str(t) for t in np.arange(1, 10 + 1)]
# vertex_metrics += ['recentcite_' + str(t) for t in [15, 20, 25, 30, 35, 40]]
vertex_metrics = ['indegree', 'outdegree']
# vertex_metrics += ['age', 'similarity']  # disabled for this run
active_years = range(1900, 2015 + 1)
In [4]:
test_seed = 4332
num_test_cases = 10
test_cases = get_test_cases(G, active_years, num_test_cases, seed=test_seed)
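get_test_cases comes from the experiment code imported above. Roughly, it should sample num_test_cases vertices whose decision year falls in active_years; a minimal sketch of that kind of sampling (the 'year' vertex attribute name is an assumption about the GraphML file):

import random

def sample_test_cases(G, active_years, num_cases, seed=None):
    # keep vertices whose (assumed) 'year' attribute is in the active range
    candidates = G.vs.select(year_in=set(active_years))
    random.seed(seed)
    return random.sample(list(candidates), num_cases)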
In [5]:
%%time
rankloss_sort = get_rankscores_sort(G, test_cases, vertex_metrics, subnet_dir)
In [6]:
MRS_sort = rankloss_sort['MRS'].mean().sort_values()
RR_sort = rankloss_sort['RR'].mean().sort_values()
PAK100_sort = rankloss_sort['PAK100'].mean().sort_values()
PAK1000_sort = rankloss_sort['PAK1000'].mean().sort_values()
MRS_sort
Out[6]:
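MRS, RR, PAK100 and PAK1000 are computed inside get_rankscores_sort; the names suggest mean relative rank, reciprocal rank and precision at K. A sketch of those standard definitions, assuming that for each test case we know the rank r (1 = best) of every actually-cited case among R ranked candidates:

def mean_rank_score(ranks, num_candidates):
    # average relative rank of the true citations (lower is better)
    return np.mean([float(r) / num_candidates for r in ranks])

def reciprocal_rank(ranks):
    # mean of 1/r over the true citations (higher is better)
    return np.mean([1.0 / r for r in ranks])

def precision_at_k(ranks, k):
    # fraction of true citations ranked in the top k (e.g. k = 100, 1000)
    return np.mean([1.0 if r <= k else 0.0 for r in ranks])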
In [7]:
# histogram of scores (legacy; refers to a scores DataFrame from an
# earlier version of this notebook)
# plt.figure(figsize=[20, 20])
# k = 1
# h = int(np.ceil(scores_sort.shape[1] / 4.0))
# for c in sort_mean.index:
#     plt.subplot(h, 4, k)
#     plt.hist(scores_sort[c])
#     plt.xlabel(c)
#     k += 1
In [8]:
num_to_keep = 5000
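In the match experiment, num_to_keep presumably caps the candidate pool before ranking: keep the 5000 cases most textually similar to the new case, then order only those by each vertex metric. A rough sketch of that two-stage scheme (snapshot_df and the 'similarity' column are assumptions):

def rank_within_matches(snapshot_df, metric, num_to_keep):
    # stage 1: retain the num_to_keep most similar candidate cases
    matched = snapshot_df.sort_values(by='similarity', ascending=False)
    matched = matched.iloc[:num_to_keep]
    # stage 2: rank the retained cases by the vertex metric
    return matched.sort_values(by=metric, ascending=False)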
In [9]:
%%time
rankloss_match = get_rankscores_match(G, test_cases, vertex_metrics, subnet_dir, num_to_keep)
In [10]:
MRS_match = rankloss_match['MRS'].mean().sort_values()
RR_match = rankloss_match['RR'].mean().sort_values()
PAK100_match = rankloss_match['PAK100'].mean().sort_values()
PAK1000_match = rankloss_match['PAK1000'].mean().sort_values()
MRS_match
Out[10]:
In [11]:
# how many absent edges to add
num_absent_edges = len(G.es)
seed_edge_df = 32432
# how to normalize yearly metrics
metric_normalization = 'mean'
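metric_normalization = 'mean' presumably rescales each metric within its snapshot year (e.g. dividing by that year's mean) so that values are comparable across time. A minimal sketch under that assumption; the data frame and its 'year' column are hypothetical:

def normalize_by_year_mean(df, metric_cols):
    # divide each metric by its mean within each snapshot year
    normalized = df.copy()
    for col in metric_cols:
        normalized[col] = df.groupby('year')[col].transform(
            lambda x: x / x.mean())
    return normalized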
In [30]:
%%time
# one-time step: build the training edge data frame used by the logistic
# regression below (left commented out so it is not re-run)
# make_tr_edge_df(G, subnet_dir,
#                 active_years, num_absent_edges,
#                 vertex_metrics, metric_normalization,
#                 seed=seed_edge_df)
In [31]:
%%time
rankloss_LR, LogRegs = get_rankscores_LR(G, test_cases, vertex_metrics, subnet_dir,
                                         metric_normalization)
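get_rankscores_LR fits a logistic regression on the training edge data frame built above (observed citations vs. sampled absent edges) and ranks candidate cases by predicted citation probability. A bare-bones sketch of that idea with scikit-learn; the feature and label column names are assumptions:

from sklearn.linear_model import LogisticRegression

def fit_citation_model(edge_df, feature_cols):
    # edge_df: one row per (citing, cited) pair; 'is_edge' marks real citations
    LR = LogisticRegression()
    LR.fit(edge_df[feature_cols], edge_df['is_edge'])
    return LR

def rank_candidates(LR, candidate_df, feature_cols):
    # score candidates by predicted citation probability, best first
    probs = LR.predict_proba(candidate_df[feature_cols])[:, 1]
    return candidate_df.assign(score=probs).sort_values(by='score',
                                                        ascending=False)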
In [61]:
MRS_LR = rankloss_LR['MRS'].mean().sort_values()
RR_LR = rankloss_LR['RR'].mean().sort_values()
PAK100_LR = rankloss_LR['PAK100'].mean().sort_values()
PAK1000_LR = rankloss_LR['PAK1000'].mean().sort_values()
MRS_LR
Out[61]:
In [34]:
with open(subnet_dir + 'results/rankloss_sort.p', 'wb') as fp:
pickle.dump(rankloss_sort, fp)
with open(subnet_dir + 'results/rankloss_match.p', 'wb') as fp:
pickle.dump(rankloss_match, fp)
with open(subnet_dir + 'results/rankloss_LR.p', 'wb') as fp:
pickle.dump(rankloss_LR, fp)
with open(subnet_dir + 'results/LogRegs.p', 'wb') as fp:
pickle.dump(LogRegs, fp)
In [36]:
with open(subnet_dir + 'results/rankloss_sort.p', 'rb') as fp:
    rankloss_sort = pickle.load(fp)
with open(subnet_dir + 'results/rankloss_match.p', 'rb') as fp:
    rankloss_match = pickle.load(fp)
with open(subnet_dir + 'results/rankloss_LR.p', 'rb') as fp:
    rankloss_LR = pickle.load(fp)
with open(subnet_dir + 'results/LogRegs.p', 'rb') as fp:
    LogRegs = pickle.load(fp)
In [24]:
# legacy exports from an earlier version of this notebook
# scores_sort.to_csv(subnet_dir + 'results/scores_sort.csv', index=True)
# scores_search.to_csv(subnet_dir + 'results/scores_search.csv', index=True)
# scores_LR.to_csv(subnet_dir + 'results/scores_LR.csv', index=True)
# scores_LR_logloss.to_csv(subnet_dir + 'results/scores_LR_logloss.csv', index=True)
# with open(subnet_dir + 'results/LogRegs.p', 'wb') as fp:
#     pickle.dump(LogRegs, fp)
In [62]:
df_metric = pd.DataFrame(columns=['sort', 'match', 'LR'],
                         index=range(len(vertex_metrics)))
df_metric['sort'] = MRS_sort.index
df_metric['match'] = MRS_match.index
df_metric['LR'] = MRS_LR.index
df_metric
In [69]:
rankscores = pd.DataFrame(columns=['sort', 'match', 'LR'],
                          index=vertex_metrics)
rankscores['sort'] = MRS_sort
rankscores['match'] = MRS_match
rankscores['LR'] = MRS_LR
In [72]:
rankscores.sort_values(by='sort', ascending=True)
Out[72]:
In [73]:
rs_ranking = rankscores.apply(lambda c: rankdata(c))
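rankdata assigns rank 1 to the smallest value and averages ties, so each column of rs_ranking gives the metrics' standing under that method. For example:

rankdata([0.1, 0.5, 0.1, 0.3])
# array([ 1.5,  4. ,  1.5,  3. ])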
In [74]:
rs_ranking.sort_values(by='sort')
Out[74]:
In [ ]: