See http://www.philippsinger.info/?p=464 for notes on working with large matrices.
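The notebook below compares two storage options for the similarity matrix: a dense numpy array downcast to float16 and a scipy CSR sparse matrix. A minimal toy sketch of the memory tradeoff (the matrix here is made up):

# toy memory comparison: dense float16 vs sparse CSR (illustrative only)
import numpy as np
from scipy.sparse import csr_matrix

X = np.random.rand(1000, 1000)
X[X < 0.9] = 0                       # sparsify: keep roughly 10% of the entries
print X.astype(np.float16).nbytes    # 2 bytes per entry instead of 8 for float64
print csr_matrix(X).data.nbytes      # CSR stores only the nonzero values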
In [1]:
    
top_directory = '/Users/iaincarmichael/Dropbox/Research/law/law-net/'
from __future__ import division
import os
import sys
import time
from math import *
import copy
import cPickle as pickle
# data
import numpy as np
import pandas as pd
# viz
import matplotlib.pyplot as plt
# graph
import igraph as ig
# NLP
from nltk.corpus import stopwords
# our code
sys.path.append(top_directory + 'code/')
from load_data import load_and_clean_graph, case_info
from pipeline.download_data import download_bulk_resource
from pipeline.make_clean_data import *
from viz import print_describe
sys.path.append(top_directory + 'explore/vertex_metrics_experiment/code/')
from make_snapshots import *
from make_edge_df import *
from attachment_model_inference import *
from compute_ranking_metrics import *
from pipeline_helper_functions import *
from make_case_text_files import *
from bag_of_words import *
from similarity_matrix import *
# directory set up
data_dir = top_directory + 'data/'
experiment_data_dir = data_dir + 'vertex_metrics_experiment/'
court_name = 'scotus'
# jupyter notebook settings
%load_ext autoreload
%autoreload 2
%matplotlib inline
    
In [2]:
    
G = load_and_clean_graph(data_dir, court_name)
    
In [3]:
    
active_years = range(1900, 2015 + 1)
    
In [149]:
    
# earlier version that returned (row, col) index pairs; superseded by the
# functions in the next cell, kept here for reference
# def get_similarity_index(CLid_pair, CLid_to_index):
#     """
#     Workhorse function for get_similarities
#     """
#     try:
#         ida = CLid_to_index[CLid_pair[0]]
#         idb = CLid_to_index[CLid_pair[1]]
#         return (ida, idb)
#     except KeyError:
#         return (np.nan, np.nan)
    
# def get_similarity_indices(Clid_pairs, CLid_to_index):
#     return zip(*[get_similarity_index(pair, CLid_to_index) for pair in Clid_pairs])
    
In [4]:
    
# these functions speed up CSR matrix slicing by an order of magnitude... though a plain numpy array also works fine
def get_similarity_index(clid, CLid_to_index):
    """
    Workhorse function for get_similarity_indices: maps a CL id to its
    similarity matrix index, or nan if the id is not in the index map
    """
    try:
        return CLid_to_index[clid]
    except KeyError:
        return np.nan
    
def get_similarity_indices(CLid_pairs, CLid_to_index):
    """
    Maps (citing, cited) CL id pairs to similarity matrix row/column
    indices; pairs containing an unknown CL id get nan
    """
    # split the pairs into lists of citing and cited CL ids
    CLids = zip(*CLid_pairs)
    citing_clids = list(CLids[0])
    cited_clids = list(CLids[1])

    # get similarity matrix indices
    idA = np.array([get_similarity_index(clid, CLid_to_index) for clid in citing_clids])
    idB = np.array([get_similarity_index(clid, CLid_to_index) for clid in cited_clids])

    # which pairs have no nans
    not_nan_indices = np.where(~(np.isnan(idA) | np.isnan(idB)))[0]
    similarities = np.array([np.nan] * len(idA))
    similarities[not_nan_indices] = 0.0

    # use the smaller index set as the rows so we slice fewer rows later
    # (this assumes the similarity matrix is symmetric)
    if len(set(idA[not_nan_indices])) <= len(set(idB[not_nan_indices])):
        row_indices = idA[not_nan_indices].astype(int)
        col_indices = idB[not_nan_indices].astype(int)
    else:
        col_indices = idA[not_nan_indices].astype(int)
        row_indices = idB[not_nan_indices].astype(int)

    return row_indices, col_indices, similarities
def get_similarities2(similarity_matrix, CLid_pairs, CLid_to_index):
    # row/column indices of the similarity matrix
    row_indices, col_indices, similarities = get_similarity_indices(CLid_pairs, CLid_to_index)

    # the rows we want to grab from the similarity matrix
    # (as an array so the np.where lookup below works)
    rows_to_get = np.array(list(set(row_indices)))

    # get the row-subsetted similarity matrix
    row_subsetted_matrix = similarity_matrix[rows_to_get, :]

    # map row indices in the original matrix to row indices in the subsetted matrix
    row_indices_subsetted = [np.where(rows_to_get == i)[0][0] for i in row_indices]

    # grab the similarities that we actually have
    if isinstance(row_subsetted_matrix, np.ndarray):
        sims = np.asarray(row_subsetted_matrix)[row_indices_subsetted, col_indices]
    else:
        sims = row_subsetted_matrix.toarray()[row_indices_subsetted, col_indices]

    # fill in the entries whose CL ids were both found
    similarities[~np.isnan(similarities)] = sims

    return similarities.tolist()
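As a quick sanity check, a toy call to get_similarities2 (the matrix, index map, and pairs below are made up); a pair containing an unknown CL id should come back as nan:

# toy usage of get_similarities2 (made-up data)
toy_matrix = np.array([[1.0, 0.5, 0.2],
                       [0.5, 1.0, 0.7],
                       [0.2, 0.7, 1.0]])
toy_index = {'a': 0, 'b': 1, 'c': 2}
toy_pairs = [('a', 'b'), ('a', 'c'), ('a', 'zzz')]
print get_similarities2(toy_matrix, toy_pairs, toy_index)
# expect [0.5, 0.2, nan]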
    
In [5]:
    
def get_similarity(similarity_matrix, CLid_pair, CLid_to_index):
    """
    Workhorse function for get_similarities1
    """
    try:
        ida = CLid_to_index[CLid_pair[0]]
        idb = CLid_to_index[CLid_pair[1]]
        return similarity_matrix[ida, idb]
    except KeyError:
        return np.nan

def get_similarities1(similarity_matrix, CLid_pairs, CLid_to_index):
    """
    Returns the similarities, as a list, for case pairs indexed by CL ids,
    looked up in a precomputed similarity matrix

    Parameters
    ----------
    similarity_matrix: precomputed similarity matrix

    CLid_pairs: list of CL id pairs whose similarities we want

    CLid_to_index: dict that maps CL ids to similarity_matrix indices
    """
    return [get_similarity(similarity_matrix, pair, CLid_to_index) for pair in CLid_pairs]
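The two implementations should agree; reusing the made-up toy data from the check above:

# both implementations should return the same list of similarities
print get_similarities1(toy_matrix, toy_pairs, toy_index)
print get_similarities2(toy_matrix, toy_pairs, toy_index)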
    
In [6]:
    
time1 = 0
time2 = 0
seed = 243
R = 100
    
In [7]:
    
# set up for timing the two get_similarities implementations

# get list of test cases
test_vertices = get_test_cases(G, active_years, R, seed)

# load snapshots
snapshots_dict = load_snapshots(experiment_data_dir)
    
In [11]:
    
similarity_matrix, CLid_to_index = load_similarity_matrix(experiment_data_dir)

# astype returns a copy, so assign the result to actually keep the float16 version
similarity_matrix = similarity_matrix.astype(np.float16)

# similarity_matrix = similarity_matrix.toarray()
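To confirm the float16 downcast helped, the footprint can be checked directly; note the attribute differs between dense and sparse (this check is a sketch, not part of the original pipeline):

# rough memory footprint of the similarity matrix
if hasattr(similarity_matrix, 'nbytes'):    # dense numpy array
    print similarity_matrix.nbytes / 1e9, 'GB'
else:                                       # scipy sparse: only stored values
    print similarity_matrix.data.nbytes / 1e9, 'GB'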
In [12]:
    
# run over the R test cases (some cases might not have any citations)
for i in range(R):
    # randomly selected test case
    test_case = test_vertices[i]

    # cases cited by the test case (ig indices converted to CL ids)
    cited_cases = get_cited_cases(G, test_case)

    # vertex metrics come from the year before the citing year
    snapshot_year = test_case['year'] - 1

    # grab data frame of vertex metrics for the test case's snapshot
    snapshot_df = snapshots_dict['vertex_metrics_' +
                                 str(int(snapshot_year))]

    # restrict ourselves to ancestors of the citing (ing) case,
    # i.e. cases from strictly before the ing year
    ancestors = [v.index for v in G.vs.select(year_le=snapshot_year)]

    # all possible edges from the ing case to previous cases
    edgelist = zip([test_case.index] * len(ancestors), ancestors)

    # CL ids of the cited (ed) and citing (ing) ends of each edge
    ed_CLids = [G.vs[edge[1]]['name'] for edge in edgelist]
    ing_CLids = [G.vs[edge[0]]['name'] for edge in edgelist]
    
    
    start = time.time()
    sims1 = get_similarities1(similarity_matrix, zip(ing_CLids, ed_CLids), CLid_to_index)
    time1 += (time.time() - start)
    
    start = time.time()
    sims2 = get_similarities2(similarity_matrix, zip(ing_CLids, ed_CLids), CLid_to_index)
    time2 += (time.time() - start)
    
In [10]:
    
print 'matrix time1: %d seconds' % time1
print 'matrix time2: %d seconds' % time2
    
    
In [13]:
    
print 'csr matrix time1: %d seconds' % time1
print 'csr matrix time2: %d seconds' % time2
    
    
In [11]:
    
columns_to_use = ['indegree', 'similarity']
R = 1000
seed_ranking = 3424
LogReg = fit_logistic_regression(experiment_data_dir, columns_to_use)
    
In [13]:
    
start = time.time()
compute_ranking_metrics_LR1(G, LogReg, columns_to_use, experiment_data_dir,
                            active_years, R, seed=seed_ranking, print_progress=True)
print 'new function took %d seconds for %d test cases' % (time.time() - start, R)
    
    
In [12]:
    
start = time.time()
compute_ranking_metrics_LR2(G, LogReg, columns_to_use, experiment_data_dir,
                            active_years, R, seed=seed_ranking, print_progress=True)
print 'new and improved function took %d seconds for %d test cases' % (time.time() - start, R)
    
    