In [ ]:


In [3]:
top_directory = '/Users/iaincarmichael/Dropbox/Research/law/law-net/'

from __future__ import division

import os
import sys
import time
from math import *
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import igraph as ig
import copy

# our code
sys.path.append(top_directory + 'code/')
from load_data import load_and_clean_graph, case_info

from make_snapshots import *
from make_edge_df import *
from attachment_model_inference import *
from compute_ranking_metrics import *
from pipeline_helper_functions import *


# directory set up
data_dir = top_directory + 'data/'
experiment_data_dir = top_directory + 'explore/vertex_metrics_experiment/experiment_data/'

court_name = 'scotus'

# jupyter notebook settings
%load_ext autoreload
%autoreload 2
%matplotlib inline


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload

In [2]:
G = load_and_clean_graph(data_dir, court_name)

In [3]:
year_interval = 10
snapshot_year_list = np.array([year for year in range(1760, 2021) if year % year_interval == 0])
metrics = ['indegree','pagerank']

In [82]:
run_transform_snaphots(experiment_data_dir)

In [ ]:


In [141]:
# get all present edges
edgelist_to_add = G.get_edgelist()
num_edges = len(edgelist_to_add)

# get the citing year of each edge
ing_years = [G.vs[edge[0]]['year'] for edge in edgelist_to_add]

# map the citing year to the snapshot year
snap_ing_years = [get_snapshot_year(y, snapshot_year_list) for y in ing_years]

# dict that organizes edges by ing snapshot year
edges_by_ing_snap_year_dict = {y: [] for y in snapshot_year_list}
for i in range(num_edges):
    sn_year = snap_ing_years[i]
    edge = edgelist_to_add[i]
    
    edges_by_ing_snap_year_dict[sn_year].append(edge)

In [ ]:


In [269]:


In [319]:
# 
edge_data = pd.DataFrame(columns=columns_to_use)
for sn_year in snapshot_year_list:
    # vertex metrcs in snapshot year
    ing_snap_df = snapshots_dict['vertex_metrics_' + str(sn_year)]
    
    # edges to add whos ing year is in the snapshot year
    edges = edges_by_ing_snap_year_dict[sn_year]
    
    sn_num_edges = len(edges)
    
    # CL ids of ed cases (indexes the snap_df rows)
    ed_CLids = [G.vs[edge[1]]['name'] for edge in edges]
    ing_CLids = [G.vs[edge[0]]['name'] for edge in edges]
    
    # ages 
    ed_year = np.array([G.vs[edge[1]]['year'] for edge in edges])
    ing_year = np.array([G.vs[edge[0]]['year'] for edge in edges])
    
    
    # get case similarities
    similarities = [0] * sn_num_edges
    for i in range(sn_num_edges):
        # similarities[i] = similarity_matrix.ix[ing_CLids[i], ed_CLids[i]]
        similarities[i] = 0
    
    
    # ed metrics in ing year 
    ed_metrics = ing_snap_df.loc[ed_CLids]
    
    # create edge data frame 
    sn_edge_data = pd.DataFrame()
    sn_edge_data['indegree'] = ed_metrics['indegree'].tolist()
    sn_edge_data['l_pagerank'] = ed_metrics['l_pagerank'].tolist()
    
    sn_edge_data['age'] = ing_year - ed_year
    sn_edge_data['similarity'] = similarities
    
    

    sn_edge_data.index = [str(edge[0]) + '_' + str(edge[1]) for edge in edges]
    sn_edge_data.index.name = 'CLids'
    
    # edge_data = pd.concat([edge_data, sn_edge_data], axis=1)
    edge_data = edge_data.append(sn_edge_data)

make functions


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [53]:
# load snapshot dataframes
snapshots_dict = load_snapshots(experiment_data_dir, train=True)

# similarity_matrix = pd.read_csv(experiment_data_dir + 'similarity_matrix.csv', index_col=0)
similarity_matrix = 0

# initialize edge data frame
colnames = copy.deepcopy(columns_to_use)
colnames.append('is_edge')
edge_data = pd.DataFrame(columns=colnames)

# get all present edges
present_edgelist = G.get_edgelist()

# organize edges by ing snapshot year
edges_by_ing_snap_year_dict =  get_edges_by_snapshot_dict(G, present_edgelist, snapshot_year_list)

In [54]:
# add present edge data
for sn_year in snapshot_year_list:
    # vertex metrcs in snapshot year
    snapshot_df = snapshots_dict['vertex_metrics_' + str(sn_year)]

    # edges to add whos ing year is in the snapshot year
    edges = edges_by_ing_snap_year_dict[sn_year]

    sn_edge_data = populate_edge_df(G, edges, snapshot_df, similarity_matrix, edge_status='present')
    edge_data.append(sn_edge_data)

In [55]:
edge_data


Out[55]:
indegree l_pagerank age similarity is_edge

In [57]:
sn_edge_data


Out[57]:
indegree l_pagerank age similarity is_edge
CLids
3489_14659 20 -10.430492 10 0 1
8626_19190 17 -9.202492 187 0 1
8626_19540 13 -9.225286 179 0 1
8626_20355 48 -9.056253 160 0 1
8626_20356 6 -10.638067 160 0 1
8626_20781 7 -10.305295 154 0 1
8626_20785 16 -10.275654 154 0 1
8626_20827 4 -10.828957 153 0 1
8626_20828 3 -11.313848 153 0 1
8626_20886 9 -10.623997 152 0 1
8626_21242 31 -9.353188 146 0 1
8626_21340 28 -9.381410 145 0 1
8626_21351 21 -9.922918 145 0 1
8626_21585 31 -9.152011 143 0 1
8626_21598 15 -9.867753 143 0 1
8626_21713 8 -10.006439 142 0 1
8626_21732 6 -10.711029 142 0 1
8626_21806 48 -8.502403 141 0 1
8626_21846 7 -10.889696 141 0 1
8626_21885 10 -10.282365 141 0 1
8626_22488 25 -9.215012 138 0 1
8626_22794 21 -9.804277 136 0 1
8626_23495 46 -8.152902 132 0 1
8626_23673 12 -10.277912 131 0 1
8626_24218 40 -8.583699 129 0 1
8626_24349 5 -10.130515 129 0 1
8626_24374 11 -10.576345 128 0 1
8626_24803 22 -8.852576 127 0 1
8626_25102 12 -9.774079 126 0 1
8626_25148 23 -9.690808 126 0 1
... ... ... ... ... ...
27902_10287 9 -11.079021 32 0 1
27902_10424 6 -11.136528 32 0 1
27902_10818 71 -9.948991 30 0 1
27902_11324 39 -10.493053 28 0 1
27902_11470 18 -10.910480 27 0 1
27902_11727 18 -11.046646 26 0 1
27902_12383 30 -10.593719 22 0 1
27902_12532 62 -10.205436 21 0 1
27902_12537 11 -11.183016 21 0 1
27902_12677 15 -10.828842 20 0 1
27902_13099 28 -10.634809 18 0 1
27902_13176 18 -10.996026 17 0 1
27902_13205 20 -10.842327 17 0 1
27902_13242 23 -10.757460 16 0 1
27902_13293 25 -10.949408 16 0 1
27902_13295 16 -11.127490 16 0 1
27902_13520 34 -10.322698 13 0 1
27902_14659 20 -10.430492 10 0 1
27902_15097 13 -11.056688 9 0 1
27902_16527 11 -11.280293 6 0 1
27902_16529 7 -10.813227 6 0 1
27902_17301 66 -10.249631 2 0 1
27902_20178 22 -8.595442 163 0 1
27902_26192 26 -9.889950 123 0 1
27902_26618 46 -8.759077 121 0 1
27902_29701 608 -7.738451 107 0 1
27902_29749 20 -9.988881 107 0 1
27902_32874 47 -9.569925 93 0 1
28342_15962 18 -10.232796 8 0 1
28342_16685 9 -10.629858 4 0 1

12284 rows × 5 columns


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [64]:
def make_edge_df(G, experiment_data_dir, snapshot_year_list, num_non_edges_to_add, columns_to_use, seed=None):
    
    # load snapshot dataframes
    snapshots_dict = load_snapshots(experiment_data_dir, train=True)
    
    # similarity_matrix = pd.read_csv(experiment_data_dir + 'similarity_matrix.csv', index_col=0)
    similarity_matrix = 0
    
    # initialize edge data frame
    colnames = copy.deepcopy(columns_to_use)
    colnames.append('is_edge')
    edge_data = pd.DataFrame(columns=colnames)
    
    # get all present edges
    present_edgelist = G.get_edgelist()

    # organize edges by ing snapshot year
    edges_by_ing_snap_year_dict =  get_edges_by_snapshot_dict(G, present_edgelist, snapshot_year_list)
    
    # add present edge data
    for sn_year in snapshot_year_list:
        # vertex metrcs in snapshot year
        snapshot_df = snapshots_dict['vertex_metrics_' + str(sn_year)]
        
        # edges to add whos ing year is in the snapshot year
        edges = edges_by_ing_snap_year_dict[sn_year]
    
        sn_edge_data = populate_edge_df(G, edges, snapshot_df, similarity_matrix, edge_status='present')
        edge_data = edge_data.append(sn_edge_data)
        
        
    # get a sample of non-present edges
    absent_edgelist = sample_non_edges(G, year_interval, num_non_edges_to_add,
                                       seed=seed)
    
    # organize edges by ing snapshot year
    edges_by_ing_snap_year_dict =  get_edges_by_snapshot_dict(G, absent_edgelist, snapshot_year_list)
    
    # add absent edge data
    for sn_year in snapshot_year_list:
        # vertex metrcs in snapshot year
        snapshot_df = snapshots_dict['vertex_metrics_' + str(sn_year)]
        
        # edges to add whos ing year is in the snapshot year
        edges = edges_by_ing_snap_year_dict[sn_year]
    
        sn_edge_data = populate_edge_df(G, edges, snapshot_df, similarity_matrix, edge_status='absent')
        edge_data = edge_data.append(sn_edge_data)
        
        
    # edge_data.to_csv(experiment_data_dir + 'edge_data.csv')
    
    return edge_data

In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [25]:
def get_edges_by_snapshot_dict(G, edgelist, snapshot_year_list):
    """
    Organizes edges by ing snapshot year
    
    """
    
    num_edges = len(edgelist)

     # get the citing year of each edge
    ing_years = [G.vs[edge[0]]['year'] for edge in edgelist]
    
    # map the citing year to the snapshot year
    snap_ing_years = [get_snapshot_year(y, snapshot_year_list) for y in ing_years]
    
    
     # dict that organizes edges by ing snapshot year
    edges_by_ing_snap_year_dict = {y: [] for y in snapshot_year_list}
    for i in range(num_edges):
        sn_year = snap_ing_years[i]
        edge = edgelist[i]

        edges_by_ing_snap_year_dict[sn_year].append(edge)
    
    return edges_by_ing_snap_year_dict

In [49]:
def get_edge_data(G, edges, snapshot_df, similarity_matrix, edge_status=None):

    
    num_edges = len(edges)
    
    # CL ids of ed cases (indexes the snap_df rows)
    ed_CLids = [G.vs[edge[1]]['name'] for edge in edges]
    ing_CLids = [G.vs[edge[0]]['name'] for edge in edges]
    
    # ages 
    ed_year = np.array([G.vs[edge[1]]['year'] for edge in edges])
    ing_year = np.array([G.vs[edge[0]]['year'] for edge in edges])
    
    
    # get case similarities
    similarities = [0] * num_edges
    for i in range(num_edges):
        # similarities[i] = similarity_matrix.ix[ing_CLids[i], ed_CLids[i]]
        similarities[i] = 0
    
   
    # ed metrics in ing year 
    ed_metrics = snapshot_df.loc[ed_CLids]
    
    # create edge data frame 
    edge_data = pd.DataFrame()
    edge_data['indegree'] = ed_metrics['indegree'].tolist()
    edge_data['l_pagerank'] = ed_metrics['l_pagerank'].tolist()
    
    edge_data['age'] = ing_year - ed_year
    edge_data['similarity'] = similarities
    
    # add edge status
    if edge_status == 'present':
        is_edge = [1] *num_edges
    elif edge_status == 'absent':
        is_edge = [0] *num_edges
    else:
        # TODO: check if edge is present
        is_edge = [-999] * num_edges
    
    edge_data['is_edge'] = is_edge
    

    edge_data.index = [str(edge[0]) + '_' + str(edge[1]) for edge in edges]
    edge_data.index.name = 'CLids'
    
    return edge_data

In [ ]:


In [12]:
columns_to_use = ['indegree', 'l_pagerank', 'age', 'similarity']
num_non_edges_to_add = 10 # len(G.es())
snapshot_year_list = np.array([year for year in range(1760, 2021) if year % 10 == 0])


make_edge_df(G, experiment_data_dir, snapshot_year_list, num_non_edges_to_add, columns_to_use, seed=None)

In [15]:
df = pd.read_csv(experiment_data_dir + 'edge_data.csv', index_col=0)

In [ ]: