In [9]:
# A `from __future__` import has to be the first statement of a module, so it
# goes ahead of the configuration (matters if this notebook is exported to a
# plain .py script).
from __future__ import division
import os
# Root of the law-net checkout. Set the LAW_NET_DIR environment variable to
# run on another machine; without it the original author's path is used.
# Joining with '' guarantees the trailing separator that the string
# concatenations below (top_directory + 'code/', ...) rely on.
top_directory = os.path.join(
    os.environ.get('LAW_NET_DIR',
                   '/Users/iaincarmichael/Dropbox/Research/law/law-net/'),
    '')
import sys
import time
from math import *
import copy
import cPickle as pickle
# data
import numpy as np
import pandas as pd
# viz
import matplotlib.pyplot as plt
# graph
import igraph as ig
# NLP
from nltk.corpus import stopwords
# our code
sys.path.append(top_directory + 'code/')
from load_data import load_and_clean_graph, case_info
from pipeline.download_data import download_bulk_resource
from pipeline.make_clean_data import *
from viz import print_describe
sys.path.append(top_directory + 'explore/vertex_metrics_experiment/code/')
from make_snapshots import *
from make_edge_df import *
from attachment_model_inference import *
from compute_ranking_metrics import *
from pipeline_helper_functions import *
from make_case_text_files import *
from bag_of_words import *
from similarity_matrix import *
# court under study, and the data directories derived from top_directory
court_name = 'scotus'
data_dir = top_directory + 'data/'
experiment_data_dir = data_dir + 'vertex_metrics_experiment/'
# jupyter notebook settings
# autoreload in mode 2: modules are re-imported before each cell executes, so
# edits to the project's .py files are picked up without a kernel restart
%load_ext autoreload
%autoreload 2
# render matplotlib figures inline in the notebook output
%matplotlib inline
In [10]:
# Graph for the configured court, as returned by load_and_clean_graph.
G = load_and_clean_graph(
    data_dir,
    court_name)
In [11]:
# Years 1900 through 2015 inclusive (range's upper bound is exclusive).
active_years = range(1900, 2016)
In [12]:
# seed_ranking is passed as `seed=` and R as the sixth positional argument of
# every compute_ranking_metrics_LR call below.
# NOTE(review): R looks like a sample count -- confirm against that function.
seed_ranking, R = 4343, 1000
In [57]:
# One row per experiment, keyed by a short label:
#   mean_score - mean of that run's rank scores
#   similarity - whether 'similarity' was among the model's columns
#   method     - label of the metric (or 'combined') the run was built on
results = pd.DataFrame(
    columns=['mean_score', 'similarity', 'method'])
In [97]:
# Model 1: the 'similarity' column on its own.
columns_to_use = ['similarity']
LogReg = fit_logistic_regression(
    experiment_data_dir,
    columns_to_use)
In [98]:
# Ranking metrics for the current LogReg / columns_to_use pair.
test_case_rank_scores_sim = compute_ranking_metrics_LR(
    G, LogReg, columns_to_use, experiment_data_dir, active_years, R,
    seed=seed_ranking, print_progress=True)
In [100]:
# Store the run's mean score, then summarise and plot the score distribution.
results.loc['sim', :] = [
    np.mean(test_case_rank_scores_sim),  # mean_score
    True,                                # similarity
    'similarity',                        # method
]
print_describe(test_case_rank_scores_sim)
plt.hist(test_case_rank_scores_sim)
plt.xlabel('rank scores')
plt.title('only similarity')
Out[100]:
In [94]:
# Model 2: every vertex metric, without 'similarity'.
columns_to_use = [
    'indegree',
    'decayed_indegree',
    's_pagerank',
    'hubs',
    'age',
]
LogReg = fit_logistic_regression(
    experiment_data_dir,
    columns_to_use)
In [43]:
# Ranking metrics for the current LogReg / columns_to_use pair.
# NOTE(review): this cell's execution count is lower than that of the fit cell
# directly above it, so the stored scores may predate that fit -- re-run the
# notebook top to bottom to confirm.
test_case_rank_scores_all = compute_ranking_metrics_LR(
    G, LogReg, columns_to_use, experiment_data_dir, active_years, R,
    seed=seed_ranking, print_progress=True)
In [58]:
# Store the run's mean score, then summarise and plot the score distribution.
results.loc['all', :] = [
    np.mean(test_case_rank_scores_all),  # mean_score
    False,                               # similarity
    'combined',                          # method
]
print_describe(test_case_rank_scores_all)
plt.hist(test_case_rank_scores_all)
plt.xlabel('rank scores')
plt.title('all metrics no similarity')
Out[58]:
In [16]:
# Model 3: 'indegree' on its own.
columns_to_use = ['indegree']
LogReg = fit_logistic_regression(
    experiment_data_dir,
    columns_to_use)
In [17]:
# Ranking metrics for the current LogReg / columns_to_use pair.
test_case_rank_scores_indeg = compute_ranking_metrics_LR(
    G, LogReg, columns_to_use, experiment_data_dir, active_years, R,
    seed=seed_ranking, print_progress=True)
In [59]:
# Store the run's mean score, then summarise and plot the score distribution.
results.loc['indeg', :] = [
    np.mean(test_case_rank_scores_indeg),  # mean_score
    False,                                 # similarity
    'indegree',                            # method
]
print_describe(test_case_rank_scores_indeg)
plt.hist(test_case_rank_scores_indeg)
plt.xlabel('rank scores')
plt.title('indegree, no similarity')
Out[59]:
In [19]:
# Model 4: 's_pagerank' on its own.
columns_to_use = ['s_pagerank']
LogReg = fit_logistic_regression(
    experiment_data_dir,
    columns_to_use)
In [20]:
# Ranking metrics for the current LogReg / columns_to_use pair.
test_case_rank_scores_pr = compute_ranking_metrics_LR(
    G, LogReg, columns_to_use, experiment_data_dir, active_years, R,
    seed=seed_ranking, print_progress=True)
In [60]:
# Store the run's mean score, then summarise and plot the score distribution.
results.loc['s_pagerank', :] = [
    np.mean(test_case_rank_scores_pr),  # mean_score
    False,                              # similarity
    'pagerank',                         # method
]
print_describe(test_case_rank_scores_pr)
plt.hist(test_case_rank_scores_pr)
plt.xlabel('rank scores')
plt.title('s_pagerank, no similarity')
Out[60]:
In [28]:
# Model 5: 'hubs' on its own.
columns_to_use = ['hubs']
LogReg = fit_logistic_regression(
    experiment_data_dir,
    columns_to_use)
In [29]:
# Ranking metrics for the current LogReg / columns_to_use pair.
test_case_rank_scores_hubs = compute_ranking_metrics_LR(
    G, LogReg, columns_to_use, experiment_data_dir, active_years, R,
    seed=seed_ranking, print_progress=True)
In [61]:
# Store the run's mean score, then summarise and plot the score distribution.
results.loc['hubs', :] = [
    np.mean(test_case_rank_scores_hubs),  # mean_score
    False,                                # similarity
    'hubs',                               # method
]
print_describe(test_case_rank_scores_hubs)
plt.hist(test_case_rank_scores_hubs)
plt.xlabel('rank scores')
plt.title('hubs, no similarity')
Out[61]:
In [53]:
# Model 6: every vertex metric plus 'similarity'.
columns_to_use = [
    'indegree',
    'decayed_indegree',
    's_pagerank',
    'hubs',
    'age',
    'similarity',
]
LogReg = fit_logistic_regression(
    experiment_data_dir,
    columns_to_use)
In [54]:
# Ranking metrics for the current LogReg / columns_to_use pair.
test_case_rank_scores_allsim = compute_ranking_metrics_LR(
    G, LogReg, columns_to_use, experiment_data_dir, active_years, R,
    seed=seed_ranking, print_progress=True)
In [62]:
# Store the run's mean score, then summarise and plot the score distribution.
results.loc['all_sim', :] = [
    np.mean(test_case_rank_scores_allsim),  # mean_score
    True,                                   # similarity
    'combined',                             # method
]
print_describe(test_case_rank_scores_allsim)
plt.hist(test_case_rank_scores_allsim)
plt.xlabel('rank scores')
plt.title('all metrics with similarity')
Out[62]:
In [22]:
# Model 7: 'indegree' together with 'similarity'.
columns_to_use = [
    'indegree',
    'similarity',
]
LogReg = fit_logistic_regression(
    experiment_data_dir,
    columns_to_use)
In [23]:
# Ranking metrics for the current LogReg / columns_to_use pair.
test_case_rank_scores_indegsim = compute_ranking_metrics_LR(
    G, LogReg, columns_to_use, experiment_data_dir, active_years, R,
    seed=seed_ranking, print_progress=True)
In [63]:
# Store the run's mean score, then summarise and plot the score distribution.
results.loc['indeg_sim', :] = [
    np.mean(test_case_rank_scores_indegsim),  # mean_score
    True,                                     # similarity
    'indegree',                               # method
]
print_describe(test_case_rank_scores_indegsim)
plt.hist(test_case_rank_scores_indegsim)
plt.xlabel('rank scores')
plt.title('indegree with similarity')
Out[63]:
In [25]:
# Model 8: 's_pagerank' together with 'similarity'.
columns_to_use = [
    's_pagerank',
    'similarity',
]
LogReg = fit_logistic_regression(
    experiment_data_dir,
    columns_to_use)
In [26]:
# Ranking metrics for the current LogReg / columns_to_use pair.
# print_progress is not passed here (most sibling cells pass True), so the
# function's own default applies.
test_case_rank_scores_prsim = compute_ranking_metrics_LR(
    G, LogReg, columns_to_use, experiment_data_dir, active_years, R,
    seed=seed_ranking)
In [64]:
# Store the run's mean score, then summarise and plot the score distribution.
results.loc['s_pagerank_sim', :] = [
    np.mean(test_case_rank_scores_prsim),  # mean_score
    True,                                  # similarity
    'pagerank',                            # method
]
print_describe(test_case_rank_scores_prsim)
plt.hist(test_case_rank_scores_prsim)
plt.xlabel('rank scores')
plt.title('s_pagerank with similarity')
Out[64]:
In [31]:
# Model 9: 'hubs' together with 'similarity'.
columns_to_use = [
    'hubs',
    'similarity',
]
LogReg = fit_logistic_regression(
    experiment_data_dir,
    columns_to_use)
In [32]:
# Ranking metrics for the current LogReg / columns_to_use pair.
# print_progress is not passed here (most sibling cells pass True), so the
# function's own default applies.
test_case_rank_scores_hubssim = compute_ranking_metrics_LR(
    G, LogReg, columns_to_use, experiment_data_dir, active_years, R,
    seed=seed_ranking)
In [65]:
# Store the run's mean score, then summarise and plot the score distribution.
results.loc['hubs_sim', :] = [
    np.mean(test_case_rank_scores_hubssim),  # mean_score
    True,                                    # similarity
    'hubs',                                  # method
]
print_describe(test_case_rank_scores_hubssim)
plt.hist(test_case_rank_scores_hubssim)
plt.xlabel('rank scores')
plt.title('hubs with similarity')
Out[65]:
In [102]:
# The rows were added one at a time to an initially empty frame, so the
# 'similarity' flags are presumably held as generic objects; cast the column
# to a real boolean dtype so it can be used (and negated with ~) as a row mask.
# The builtin `bool` is used because the `np.bool` alias is deprecated and has
# been removed from recent NumPy releases.
results['similarity'] = results['similarity'].astype(bool)
# all experiments, highest mean score first
results.sort_values(by='mean_score', ascending=False)
Out[102]:
In [90]:
# without similarity: rows whose 'similarity' flag is False, highest mean
# score first (the ~ mask needs 'similarity' to be a boolean column)
results.loc[~results['similarity']].sort_values(
    by='mean_score', ascending=False)
Out[90]:
In [92]:
# with similarity: rows whose 'similarity' flag is True, highest mean score
# first (the mask needs 'similarity' to be a boolean column)
results.loc[results['similarity']].sort_values(
    by='mean_score', ascending=False)
Out[92]: