When computing the rankings, group all test cases that fall in the same citing snapshot year and call get_edge_data once per group. It ends up not making it faster...
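
A minimal, generic sketch of the restructured loop (the grouping helper here is an illustrative stand-in; get_edge_data and score_ranking are the helpers used below):

from collections import defaultdict

def group_cases_by_year(test_cases):
    """Bucket test case vertices by their citing year (illustrative stand-in)."""
    groups = defaultdict(list)
    for case in test_cases:
        groups[case['year']].append(case)
    return groups

# for year, cases in group_cases_by_year(test_vertices).items():
#     edge_data = get_edge_data(...)    # one (expensive) call per snapshot year
#     for case in cases:
#         score = score_ranking(...)    # score each case against the shared edge_data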


In [1]:
top_directory = '/Users/iaincarmichael/Dropbox/Research/law/law-net/'

from __future__ import division

import os
import sys
import time
from math import *
import copy
import cPickle as pickle

# data
import numpy as np
import pandas as pd

# viz
import matplotlib.pyplot as plt


# graph
import igraph as ig


# NLP
from nltk.corpus import stopwords


# our code
sys.path.append(top_directory + 'code/')
from load_data import load_and_clean_graph, case_info
from pipeline.download_data import download_bulk_resource
from pipeline.make_clean_data import *
from viz import print_describe


sys.path.append(top_directory + 'explore/vertex_metrics_experiment/code/')
from make_snapshots import *
from make_edge_df import *
from attachment_model_inference import *
from compute_ranking_metrics import *
from pipeline_helper_functions import *
from make_case_text_files import *
from bag_of_words import *
from similarity_matrix import *

# directory set up
data_dir = top_directory + 'data/'
experiment_data_dir = data_dir + 'vertex_metrics_experiment/'

court_name = 'scotus'

# jupyter notebook settings
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
G = load_and_clean_graph(data_dir, court_name)

In [3]:
active_years = range(1900, 2015 + 1)

group by snapshot year


In [17]:
def compute_ranking_metrics_LR_group(G,
                               LogReg,
                               columns_to_use,
                               experiment_data_dir,
                               active_years,
                               R,
                               year_floor=1900,
                               seed=None,
                               print_progress=False):
    '''
    Computes the rank score metric for a given logistic regression object.

    Samples R test cases that have at least one citation. For each test case,
    ranks the test case's ancestors, then computes the rank score of the test
    case's actual citations.

    Parameters
    ------------
    G: network (so we can get each case's ancestor network)

    LogReg: a logistic regression object
    (i.e. the output of fit_logistic_regression)

    columns_to_use: list of column names of the edge metrics data frame that
    were used to fit the logistic regression

    experiment_data_dir: directory containing the vertex metric snapshots and
    similarity data needed for prediction

    active_years: years from which the test cases are sampled

    R: how many cases to compute ranking metrics for

    year_floor: sample only cases after this year

    seed: random seed for selecting cases whose ancestry to score

    print_progress: whether to print progress updates

    Output
    -------
    The list of rank scores, one per test case
    '''

    # ranking scores for each test case
    test_case_rank_scores = []
    
    # get list of test cases
    test_vertices = get_test_cases(G, active_years, R, seed=seed)

    # load snapshots
    snapshots_dict = load_snapshots(experiment_data_dir)

    # maybe load the similarities
    if 'similarity' in columns_to_use:
        similarity_matrix, CLid_to_index = load_similarity_matrix(experiment_data_dir)
    else:
        similarity_matrix = None
        CLid_to_index = None
        
        
    # organize test cases by citing snapshot year
    case_dict = get_test_cases_by_snapshot_dict(G, test_vertices, active_years)
    
    
    for year in case_dict.keys():
        
        # get vertex metrics in the year before the citing year
        snapshot_year = year - 1

        # grab data frame of vertex metrics for test case's snapshot
        snapshot_df = snapshots_dict['vertex_metrics_' +
                                     str(int(snapshot_year))]

        # build edgelist for all cases in given year
        edgelist = get_combined_edgelist(G, case_dict[year], snapshot_year)

        # grab edge data
        edge_data = get_edge_data(G, edgelist, snapshot_df, columns_to_use,
                                  similarity_matrix, CLid_to_index,
                                  edge_status=None)

            
        for test_case in case_dict[year]:

            # indices of edge_data 
            df_indices = [test_case['name'] + '_' + v['name']
                          for v in G.vs.select(year_le=snapshot_year)]

            # grab test case edges
            case_edge_data = edge_data.loc[df_indices]

            # rank ancestors
            ancestor_ranking = get_case_ranking_logreg(case_edge_data,
                                                       LogReg, columns_to_use)

            # get cited cases
            cited_cases = get_cited_cases(G, test_case)


            # compute rank score for cited cases
            score = score_ranking(cited_cases, ancestor_ranking)
            
            test_case_rank_scores.append(score)

    # return test_case_rank_scores, case_ranks, test_cases
    return test_case_rank_scores

In [18]:
def get_cited_cases(G, citing_vertex):
    """
    Returns the citations of a case whose cited year is strictly less than the citing year
    
    Parameters
    ----------
    G: igraph object
    
    citing_vertex: igraph vertex
    
    Output
    ------
    list of CL ids of cited cases
    """
    
    # get neighbors first as ig index
    all_citations = G.neighbors(citing_vertex.index, mode='OUT')

    # return CL ids of the cited cases
    # only keep cited cases whose year is strictly less than the citing year
    return [G.vs[ig_id]['name'] for ig_id in all_citations
            if G.vs[ig_id]['year'] < citing_vertex['year']]
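
A quick sanity check on a hypothetical toy graph (names and years invented for illustration): an out-edge pointing to a later case should be dropped by the strict year comparison.

toy = ig.Graph(directed=True)
toy.add_vertices(3)
toy.vs['name'] = ['a', 'b', 'c']
toy.vs['year'] = [1940, 1950, 1960]
toy.add_edges([(1, 0), (1, 2)])   # b cites a (earlier, kept) and c (later, dropped)

get_cited_cases(toy, toy.vs[1])   # expected: ['a']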

In [19]:
def get_test_cases_by_snapshot_dict(G, test_cases, active_years):
    """
    Organizes test cases by year

    list is igraph indices
    """
    # get the citing year of each edge
    case_years = [case['year'] for case in test_cases]

    # dict that organizes edges by ing snapshot year
    case_dict = {y: [] for y in active_years}
    for i in range(len(test_cases)):
        case_dict[case_years[i]].append(test_cases[i])

    # only return years with at least one case
    return {k : case_dict[k] for k in case_dict.keys() if len(case_dict[k]) > 1}
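
A similar hypothetical toy check (invented names and years) of the grouping:

toy = ig.Graph(directed=True)
toy.add_vertices(4)
toy.vs['name'] = ['a', 'b', 'c', 'd']
toy.vs['year'] = [1950, 1950, 1980, 1980]

get_test_cases_by_snapshot_dict(toy, list(toy.vs), range(1900, 2016))
# expected keys: 1950 (vertices a, b) and 1980 (vertices c, d)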

In [20]:
def contact_lists(LOL):
    """
    Concatenates a list of lists
    """
    if len(LOL) > 1:
        return LOL[0] + contact_lists(LOL[1:])
    else:
        return LOL[0]
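
One thing to note: contact_lists recurses once per sub-list, so a single year holding on the order of a thousand test cases could bump into Python's default recursion limit. An iterative equivalent using the standard library (a sketch, not used below):

import itertools

def concat_lists_iter(LOL):
    """Concatenates a list of lists without recursion."""
    return list(itertools.chain.from_iterable(LOL))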

In [21]:
def get_combined_edgelist(G, test_cases, snapshot_year):
    
    # build edgelist for all cases in the given year
    edgelists = []
    for test_case in test_cases:

        # restrict ourselves to ancestors of the citing
        # case strictly before the citing year
        ancestors = [v.index for v in G.vs.select(year_le=snapshot_year)]

        # append this test case's edgelist to the list of edgelists
        edgelists.append(zip([test_case.index] * len(ancestors), ancestors))

    return contact_lists(edgelists)
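
Since the grouped version is not faster, one suspect is that the ancestor scan G.vs.select(year_le=snapshot_year) still runs once per test case inside this loop even though it only depends on snapshot_year. A sketch of the same function with that scan hoisted out of the loop (behaviour otherwise unchanged):

def get_combined_edgelist_hoisted(G, test_cases, snapshot_year):
    # the ancestor set is the same for every test case in this snapshot year
    ancestors = [v.index for v in G.vs.select(year_le=snapshot_year)]

    # one (citing, ancestor) pair per test case / ancestor combination
    return [(test_case.index, a) for test_case in test_cases for a in ancestors]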

compare new vs old ranking metrics


In [28]:
columns_to_use = ['indegree', 'similarity']

R = 1000
seed_ranking = 3424

LogReg = fit_logistic_regression(experiment_data_dir, columns_to_use)

In [29]:
start = time.time()
compute_ranking_metrics_LR(G, LogReg, columns_to_use, experiment_data_dir,
                            active_years, R, seed=seed_ranking, print_progress=True)

print 'original function took %d seconds for %d test cases' % (time.time() - start, R)


(1/1000) at 16:01:09
(2/1000) at 16:01:10
(4/1000) at 16:01:10
(8/1000) at 16:01:11
(16/1000) at 16:01:13
(32/1000) at 16:01:21
(64/1000) at 16:01:31
(128/1000) at 16:01:53
(256/1000) at 16:02:29
(512/1000) at 16:03:32
original function took 262 seconds for 1000 test cases

In [30]:
start = time.time()
compute_ranking_metrics_LR_group(G, LogReg, columns_to_use, experiment_data_dir,
                            active_years, R, seed=seed_ranking, print_progress=True)

print 'new and improved function took %d seconds for %d test cases' % (time.time() - start, R)


new and improved function took 294 seconds for 1000 test cases

In [ ]: