In [3]:
# NOTE(review): hardcoded absolute local path — consider an env var or a
# path relative to the repo root so the notebook runs on other machines
top_directory = '/Users/iaincarmichael/Dropbox/Research/law/law-net/'

from __future__ import division

import os
import sys
import time
from math import *
import copy
import cPickle as pickle

# data
import numpy as np
import pandas as pd

# viz
import matplotlib.pyplot as plt


# graph
import igraph as ig


# NLP
from nltk.corpus import stopwords


# our code
sys.path.append(top_directory + 'code/')
from load_data import load_and_clean_graph, case_info
from pipeline.download_data import download_bulk_resource

sys.path.append(top_directory + 'explore/vertex_metrics_experiment/code/')
from make_snapshots import *
from make_edge_df import *
from attachment_model_inference import *
from compute_ranking_metrics import *
from pipeline_helper_functions import *
from make_case_text_files import *
from bag_of_words import *

# directory set up
# directory set up
data_dir = top_directory + 'data/'
# all precomputed artifacts for this experiment live here
experiment_data_dir = data_dir + 'vertex_metrics_experiment/'

# which court's citation network to analyze
court_name = 'scotus'

# jupyter notebook settings
%load_ext autoreload
%autoreload 2
%matplotlib inline


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload

In [9]:
# load the cleaned citation network for the chosen court
# (presumably an igraph Graph — G.vs / G.es are used below; vertices
# carry a 'name' attribute holding the CL id — TODO confirm in load_data)
G = load_and_clean_graph(data_dir, court_name)

Load the precomputed cosine similarity matrix and the CL id → matrix index map.


In [10]:
# load the precomputed pairwise cosine similarity matrix (sparse CSR)
similarity_matrix = load_sparse_csr(filename=experiment_data_dir + 'cosine_sims.npz')

In [11]:
# load the dict mapping CL ids to row/column indices of similarity_matrix
# NOTE(review): pickle.load on an untrusted file can execute arbitrary
# code — fine here since the file is produced by this pipeline
with open(experiment_data_dir + 'CLid_to_index.p', 'rb') as f:
    CLid_to_index = pickle.load(f)

Look at similarities


In [12]:
def get_similarities(similarity_matrix, CLid_A, CLid_B, CLid_to_index):
    """
    Returns the similarities for cases indexed by CL ids as a list

    Parameters
    ----------
    similarity_matrix: precomputed similarity matrix

    CLid_A, CLid_B: two lists of CL ids whose similarities we want

    CLid_to_index: dict that maps CL ids to similarity_matrix indices

    Output
    ------
    list of similarities, one per (CLid_A[i], CLid_B[i]) pair; entries
    are np.nan when either CL id is missing from CLid_to_index
    """
    if len(CLid_A) != len(CLid_B):
        raise ValueError('lists not the same length')

    similarities = []

    # walk the two id lists in parallel
    for clid_a, clid_b in zip(CLid_A, CLid_B):
        try:
            # convert CL ids to matrix indices
            row = CLid_to_index[clid_a]
            col = CLid_to_index[clid_b]
            similarities.append(similarity_matrix[row, col])
        except KeyError:
            # one of the CL ids is not in the similarity matrix
            similarities.append(np.nan)

    return similarities

In [ ]:
def save_similarity_matrix(experiment_data_dir, similarity_matrix, CLid_to_index):
    """
    Saves the similarity matrix and the CLid_to_index dict to
    experiment_data_dir.

    Parameters
    ----------
    experiment_data_dir: directory the files are written to

    similarity_matrix: sparse similarity matrix to save
        (saved as 'cosine_sims' via save_sparse_csr)

    CLid_to_index: dict mapping CL ids to similarity_matrix indices
        (pickled to 'CLid_to_index.p')
    """

    # save similarity matrix
    # bug fix: previously passed the undefined global name S here instead
    # of the similarity_matrix argument (NameError on a fresh kernel)
    save_sparse_csr(filename=experiment_data_dir + 'cosine_sims',
                    array=similarity_matrix)

    # save clid to index map
    with open(experiment_data_dir + 'CLid_to_index.p', 'wb') as fp:
        pickle.dump(CLid_to_index, fp)
        
        
def load_similarity_matrix(experiment_data_dir):
    """
    Load similarity matrix and CLid_to_index dict

    Parameters
    ----------
    experiment_data_dir: directory containing 'cosine_sims.npz' and
        'CLid_to_index.p'

    Output
    ------
    similarity_matrix, CLid_to_index
    """
    # file locations written by save_similarity_matrix
    sim_path = experiment_data_dir + 'cosine_sims.npz'
    map_path = experiment_data_dir + 'CLid_to_index.p'

    similarity_matrix = load_sparse_csr(filename=sim_path)

    with open(map_path, 'rb') as fp:
        CLid_to_index = pickle.load(fp)

    return similarity_matrix, CLid_to_index

In [13]:
# CL ids of the citing (source) and cited (target) case for every edge in G
CLid_ing = [G.vs[edge.source]['name'] for edge in G.es]
CLid_ed = [G.vs[edge.target]['name'] for edge in G.es]

In [ ]:
# time the similarity lookup for every edge in the graph
# bug fix: previously passed the undefined name S — the matrix loaded
# above is bound to similarity_matrix, so a fresh run raised NameError
start = time.time()
sims = get_similarities(similarity_matrix, CLid_ing, CLid_ed, CLid_to_index)
runtime = time.time() - start

Surgery: diagnose the mismatch between the graph's vertices and the similarity-matrix index.


In [14]:
# number of CL ids covered by the similarity matrix
# (the bare expression's value is discarded — only the print below shows)
len(CLid_to_index.keys())
map_clids = CLid_to_index.keys()

# NOTE(review): len(CLid_to_index) alone would suffice
print 'there are %d keys' % len(CLid_to_index.keys())


there are 33157 keys

In [15]:
# number of vertices (cases) in the citation graph
# (the bare expression's value is discarded — only the print below shows)
len(G.vs)

# CL ids of every vertex in the graph
G_clids = G.vs['name']

print 'there are %d vertices in the graph' % len(G.vs)


there are 33253 vertices in the graph

In [21]:
# CL ids that are in the graph but missing from the similarity matrix index
# (explains why get_similarities can hit KeyError / return nan)
set(G_clids).difference(set(map_clids))


Out[21]:
{'3177214',
 '3177215',
 '3177216',
 '3177217',
 '3177218',
 '3177219',
 '3181041',
 '3181042',
 '3181043',
 '3181499',
 '3181500',
 '3183097',
 '3183098',
 '3183099',
 '3183100',
 '3187262',
 '3187263',
 '3187264',
 '3187550',
 '3187592',
 '3187593',
 '3187594',
 '3189478',
 '3189871',
 '3191204',
 '3191205',
 '3191206',
 '3191207',
 '3195198',
 '3195550',
 '3195551',
 '3195996',
 '3195997',
 '3195998',
 '3197473',
 '3197852',
 '3199607',
 '3199608',
 '3203729',
 '3203730',
 '3203761',
 '3203762',
 '3203763',
 '3203764',
 '3203765',
 '3205023',
 '3205024',
 '3205025',
 '3205842',
 '3205872',
 '3205873',
 '3205874',
 '3206520',
 '3207964',
 '3207965',
 '3207966',
 '3207967',
 '3209792',
 '3209793',
 '3211607',
 '3211608',
 '3211609',
 '3212620',
 '3212621',
 '3212622',
 '3213977',
 '3213978',
 '3213979',
 '3214880',
 '3214881',
 '3214882',
 '3214883',
 '3214884',
 '3214885',
 '3214886',
 '3216493',
 '3216494',
 '3216495',
 '3216496',
 '3216497',
 '3217332',
 '3217333',
 '3217334',
 '3217528',
 '3217529',
 '3217582',
 '3217583',
 '4236644',
 '4238690',
 '4238691'}

In [27]:
# number of case text files on disk — compare against the 33157 keys in
# CLid_to_index (output shows 33158; the off-by-one is worth investigating)
len(os.listdir(experiment_data_dir + 'textfiles/'))


Out[27]:
33158

In [ ]: