notebook.community

Edit and run



In [ ]:



In [3]:

    
top_directory = '/Users/iaincarmichael/Dropbox/Research/law/law-net/'

from __future__ import division

import os
import sys
import time
from math import *
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import igraph as ig
import copy

# our code
sys.path.append(top_directory + 'code/')
from load_data import load_and_clean_graph, case_info

from make_snapshots import *
from make_edge_df import *
from attachment_model_inference import *
from compute_ranking_metrics import *
from pipeline_helper_functions import *


# directory set up
data_dir = top_directory + 'data/'
experiment_data_dir = top_directory + 'explore/vertex_metrics_experiment/experiment_data/'

court_name = 'scotus'

# jupyter notebook settings
%load_ext autoreload
%autoreload 2
%matplotlib inline









    



The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload



In [2]:

    
G = load_and_clean_graph(data_dir, court_name)



In [3]:

    
year_interval = 10
snapshot_year_list = np.array([year for year in range(1760, 2021) if year % year_interval == 0])
metrics = ['indegree','pagerank']



In [82]:

    
run_transform_snaphots(experiment_data_dir)



In [ ]:



In [141]:

    
# get all present edges
edgelist_to_add = G.get_edgelist()
num_edges = len(edgelist_to_add)

# get the citing year of each edge
ing_years = [G.vs[edge[0]]['year'] for edge in edgelist_to_add]

# map the citing year to the snapshot year
snap_ing_years = [get_snapshot_year(y, snapshot_year_list) for y in ing_years]

# dict that organizes edges by ing snapshot year
edges_by_ing_snap_year_dict = {y: [] for y in snapshot_year_list}
for i in range(num_edges):
    sn_year = snap_ing_years[i]
    edge = edgelist_to_add[i]
    
    edges_by_ing_snap_year_dict[sn_year].append(edge)



In [ ]:



In [269]:



In [319]:

    
# 
edge_data = pd.DataFrame(columns=columns_to_use)
for sn_year in snapshot_year_list:
    # vertex metrcs in snapshot year
    ing_snap_df = snapshots_dict['vertex_metrics_' + str(sn_year)]
    
    # edges to add whos ing year is in the snapshot year
    edges = edges_by_ing_snap_year_dict[sn_year]
    
    sn_num_edges = len(edges)
    
    # CL ids of ed cases (indexes the snap_df rows)
    ed_CLids = [G.vs[edge[1]]['name'] for edge in edges]
    ing_CLids = [G.vs[edge[0]]['name'] for edge in edges]
    
    # ages 
    ed_year = np.array([G.vs[edge[1]]['year'] for edge in edges])
    ing_year = np.array([G.vs[edge[0]]['year'] for edge in edges])
    
    
    # get case similarities
    similarities = [0] * sn_num_edges
    for i in range(sn_num_edges):
        # similarities[i] = similarity_matrix.ix[ing_CLids[i], ed_CLids[i]]
        similarities[i] = 0
    
    
    # ed metrics in ing year 
    ed_metrics = ing_snap_df.loc[ed_CLids]
    
    # create edge data frame 
    sn_edge_data = pd.DataFrame()
    sn_edge_data['indegree'] = ed_metrics['indegree'].tolist()
    sn_edge_data['l_pagerank'] = ed_metrics['l_pagerank'].tolist()
    
    sn_edge_data['age'] = ing_year - ed_year
    sn_edge_data['similarity'] = similarities
    
    

    sn_edge_data.index = [str(edge[0]) + '_' + str(edge[1]) for edge in edges]
    sn_edge_data.index.name = 'CLids'
    
    # edge_data = pd.concat([edge_data, sn_edge_data], axis=1)
    edge_data = edge_data.append(sn_edge_data)

make functions



In [ ]:



In [ ]:



In [ ]:



In [ ]:



In [ ]:



In [ ]:



In [53]:

    
# load snapshot dataframes
snapshots_dict = load_snapshots(experiment_data_dir, train=True)

# similarity_matrix = pd.read_csv(experiment_data_dir + 'similarity_matrix.csv', index_col=0)
similarity_matrix = 0

# initialize edge data frame
colnames = copy.deepcopy(columns_to_use)
colnames.append('is_edge')
edge_data = pd.DataFrame(columns=colnames)

# get all present edges
present_edgelist = G.get_edgelist()

# organize edges by ing snapshot year
edges_by_ing_snap_year_dict =  get_edges_by_snapshot_dict(G, present_edgelist, snapshot_year_list)



In [54]:

    
# add present edge data
for sn_year in snapshot_year_list:
    # vertex metrcs in snapshot year
    snapshot_df = snapshots_dict['vertex_metrics_' + str(sn_year)]

    # edges to add whos ing year is in the snapshot year
    edges = edges_by_ing_snap_year_dict[sn_year]

    sn_edge_data = populate_edge_df(G, edges, snapshot_df, similarity_matrix, edge_status='present')
    edge_data.append(sn_edge_data)



In [55]:

    
edge_data









    Out[55]:






  
    
      
      indegree
      l_pagerank
      age
      similarity
      is_edge



In [57]:

    
sn_edge_data









    Out[57]:






  
    
      
      indegree
      l_pagerank
      age
      similarity
      is_edge
    
    
      CLids
      
      
      
      
      
    
  
  
    
      3489_14659
      20
      -10.430492
      10
      0
      1
    
    
      8626_19190
      17
      -9.202492
      187
      0
      1
    
    
      8626_19540
      13
      -9.225286
      179
      0
      1
    
    
      8626_20355
      48
      -9.056253
      160
      0
      1
    
    
      8626_20356
      6
      -10.638067
      160
      0
      1
    
    
      8626_20781
      7
      -10.305295
      154
      0
      1
    
    
      8626_20785
      16
      -10.275654
      154
      0
      1
    
    
      8626_20827
      4
      -10.828957
      153
      0
      1
    
    
      8626_20828
      3
      -11.313848
      153
      0
      1
    
    
      8626_20886
      9
      -10.623997
      152
      0
      1
    
    
      8626_21242
      31
      -9.353188
      146
      0
      1
    
    
      8626_21340
      28
      -9.381410
      145
      0
      1
    
    
      8626_21351
      21
      -9.922918
      145
      0
      1
    
    
      8626_21585
      31
      -9.152011
      143
      0
      1
    
    
      8626_21598
      15
      -9.867753
      143
      0
      1
    
    
      8626_21713
      8
      -10.006439
      142
      0
      1
    
    
      8626_21732
      6
      -10.711029
      142
      0
      1
    
    
      8626_21806
      48
      -8.502403
      141
      0
      1
    
    
      8626_21846
      7
      -10.889696
      141
      0
      1
    
    
      8626_21885
      10
      -10.282365
      141
      0
      1
    
    
      8626_22488
      25
      -9.215012
      138
      0
      1
    
    
      8626_22794
      21
      -9.804277
      136
      0
      1
    
    
      8626_23495
      46
      -8.152902
      132
      0
      1
    
    
      8626_23673
      12
      -10.277912
      131
      0
      1
    
    
      8626_24218
      40
      -8.583699
      129
      0
      1
    
    
      8626_24349
      5
      -10.130515
      129
      0
      1
    
    
      8626_24374
      11
      -10.576345
      128
      0
      1
    
    
      8626_24803
      22
      -8.852576
      127
      0
      1
    
    
      8626_25102
      12
      -9.774079
      126
      0
      1
    
    
      8626_25148
      23
      -9.690808
      126
      0
      1
    
    
      ...
      ...
      ...
      ...
      ...
      ...
    
    
      27902_10287
      9
      -11.079021
      32
      0
      1
    
    
      27902_10424
      6
      -11.136528
      32
      0
      1
    
    
      27902_10818
      71
      -9.948991
      30
      0
      1
    
    
      27902_11324
      39
      -10.493053
      28
      0
      1
    
    
      27902_11470
      18
      -10.910480
      27
      0
      1
    
    
      27902_11727
      18
      -11.046646
      26
      0
      1
    
    
      27902_12383
      30
      -10.593719
      22
      0
      1
    
    
      27902_12532
      62
      -10.205436
      21
      0
      1
    
    
      27902_12537
      11
      -11.183016
      21
      0
      1
    
    
      27902_12677
      15
      -10.828842
      20
      0
      1
    
    
      27902_13099
      28
      -10.634809
      18
      0
      1
    
    
      27902_13176
      18
      -10.996026
      17
      0
      1
    
    
      27902_13205
      20
      -10.842327
      17
      0
      1
    
    
      27902_13242
      23
      -10.757460
      16
      0
      1
    
    
      27902_13293
      25
      -10.949408
      16
      0
      1
    
    
      27902_13295
      16
      -11.127490
      16
      0
      1
    
    
      27902_13520
      34
      -10.322698
      13
      0
      1
    
    
      27902_14659
      20
      -10.430492
      10
      0
      1
    
    
      27902_15097
      13
      -11.056688
      9
      0
      1
    
    
      27902_16527
      11
      -11.280293
      6
      0
      1
    
    
      27902_16529
      7
      -10.813227
      6
      0
      1
    
    
      27902_17301
      66
      -10.249631
      2
      0
      1
    
    
      27902_20178
      22
      -8.595442
      163
      0
      1
    
    
      27902_26192
      26
      -9.889950
      123
      0
      1
    
    
      27902_26618
      46
      -8.759077
      121
      0
      1
    
    
      27902_29701
      608
      -7.738451
      107
      0
      1
    
    
      27902_29749
      20
      -9.988881
      107
      0
      1
    
    
      27902_32874
      47
      -9.569925
      93
      0
      1
    
    
      28342_15962
      18
      -10.232796
      8
      0
      1
    
    
      28342_16685
      9
      -10.629858
      4
      0
      1
    
  

12284 rows × 5 columns



In [ ]:



In [ ]:



In [ ]:



In [ ]:



In [ ]:



In [ ]:



In [ ]:



In [ ]:



In [ ]:



In [ ]:



In [64]:

    
def make_edge_df(G, experiment_data_dir, snapshot_year_list, num_non_edges_to_add, columns_to_use, seed=None):
    
    # load snapshot dataframes
    snapshots_dict = load_snapshots(experiment_data_dir, train=True)
    
    # similarity_matrix = pd.read_csv(experiment_data_dir + 'similarity_matrix.csv', index_col=0)
    similarity_matrix = 0
    
    # initialize edge data frame
    colnames = copy.deepcopy(columns_to_use)
    colnames.append('is_edge')
    edge_data = pd.DataFrame(columns=colnames)
    
    # get all present edges
    present_edgelist = G.get_edgelist()

    # organize edges by ing snapshot year
    edges_by_ing_snap_year_dict =  get_edges_by_snapshot_dict(G, present_edgelist, snapshot_year_list)
    
    # add present edge data
    for sn_year in snapshot_year_list:
        # vertex metrcs in snapshot year
        snapshot_df = snapshots_dict['vertex_metrics_' + str(sn_year)]
        
        # edges to add whos ing year is in the snapshot year
        edges = edges_by_ing_snap_year_dict[sn_year]
    
        sn_edge_data = populate_edge_df(G, edges, snapshot_df, similarity_matrix, edge_status='present')
        edge_data = edge_data.append(sn_edge_data)
        
        
    # get a sample of non-present edges
    absent_edgelist = sample_non_edges(G, year_interval, num_non_edges_to_add,
                                       seed=seed)
    
    # organize edges by ing snapshot year
    edges_by_ing_snap_year_dict =  get_edges_by_snapshot_dict(G, absent_edgelist, snapshot_year_list)
    
    # add absent edge data
    for sn_year in snapshot_year_list:
        # vertex metrcs in snapshot year
        snapshot_df = snapshots_dict['vertex_metrics_' + str(sn_year)]
        
        # edges to add whos ing year is in the snapshot year
        edges = edges_by_ing_snap_year_dict[sn_year]
    
        sn_edge_data = populate_edge_df(G, edges, snapshot_df, similarity_matrix, edge_status='absent')
        edge_data = edge_data.append(sn_edge_data)
        
        
    # edge_data.to_csv(experiment_data_dir + 'edge_data.csv')
    
    return edge_data



In [ ]:



In [ ]:



In [ ]:



In [ ]:



In [ ]:



In [ ]:



In [ ]:



In [ ]:



In [ ]:



In [ ]:



In [25]:

    
def get_edges_by_snapshot_dict(G, edgelist, snapshot_year_list):
    """
    Organizes edges by ing snapshot year
    
    """
    
    num_edges = len(edgelist)

     # get the citing year of each edge
    ing_years = [G.vs[edge[0]]['year'] for edge in edgelist]
    
    # map the citing year to the snapshot year
    snap_ing_years = [get_snapshot_year(y, snapshot_year_list) for y in ing_years]
    
    
     # dict that organizes edges by ing snapshot year
    edges_by_ing_snap_year_dict = {y: [] for y in snapshot_year_list}
    for i in range(num_edges):
        sn_year = snap_ing_years[i]
        edge = edgelist[i]

        edges_by_ing_snap_year_dict[sn_year].append(edge)
    
    return edges_by_ing_snap_year_dict



In [49]:

    
def get_edge_data(G, edges, snapshot_df, similarity_matrix, edge_status=None):

    
    num_edges = len(edges)
    
    # CL ids of ed cases (indexes the snap_df rows)
    ed_CLids = [G.vs[edge[1]]['name'] for edge in edges]
    ing_CLids = [G.vs[edge[0]]['name'] for edge in edges]
    
    # ages 
    ed_year = np.array([G.vs[edge[1]]['year'] for edge in edges])
    ing_year = np.array([G.vs[edge[0]]['year'] for edge in edges])
    
    
    # get case similarities
    similarities = [0] * num_edges
    for i in range(num_edges):
        # similarities[i] = similarity_matrix.ix[ing_CLids[i], ed_CLids[i]]
        similarities[i] = 0
    
   
    # ed metrics in ing year 
    ed_metrics = snapshot_df.loc[ed_CLids]
    
    # create edge data frame 
    edge_data = pd.DataFrame()
    edge_data['indegree'] = ed_metrics['indegree'].tolist()
    edge_data['l_pagerank'] = ed_metrics['l_pagerank'].tolist()
    
    edge_data['age'] = ing_year - ed_year
    edge_data['similarity'] = similarities
    
    # add edge status
    if edge_status == 'present':
        is_edge = [1] *num_edges
    elif edge_status == 'absent':
        is_edge = [0] *num_edges
    else:
        # TODO: check if edge is present
        is_edge = [-999] * num_edges
    
    edge_data['is_edge'] = is_edge
    

    edge_data.index = [str(edge[0]) + '_' + str(edge[1]) for edge in edges]
    edge_data.index.name = 'CLids'
    
    return edge_data



In [ ]:



In [12]:

    
columns_to_use = ['indegree', 'l_pagerank', 'age', 'similarity']
num_non_edges_to_add = 10 # len(G.es())
snapshot_year_list = np.array([year for year in range(1760, 2021) if year % 10 == 0])


make_edge_df(G, experiment_data_dir, snapshot_year_list, num_non_edges_to_add, columns_to_use, seed=None)



In [15]:

    
df = pd.read_csv(experiment_data_dir + 'edge_data.csv', index_col=0)



In [ ]:

	indegree	l_pagerank	age	similarity	is_edge
CLids
3489_14659	20	-10.430492	10	0	1
8626_19190	17	-9.202492	187	0	1
8626_19540	13	-9.225286	179	0	1
8626_20355	48	-9.056253	160	0	1
8626_20356	6	-10.638067	160	0	1
8626_20781	7	-10.305295	154	0	1
8626_20785	16	-10.275654	154	0	1
8626_20827	4	-10.828957	153	0	1
8626_20828	3	-11.313848	153	0	1
8626_20886	9	-10.623997	152	0	1
8626_21242	31	-9.353188	146	0	1
8626_21340	28	-9.381410	145	0	1
8626_21351	21	-9.922918	145	0	1
8626_21585	31	-9.152011	143	0	1
8626_21598	15	-9.867753	143	0	1
8626_21713	8	-10.006439	142	0	1
8626_21732	6	-10.711029	142	0	1
8626_21806	48	-8.502403	141	0	1
8626_21846	7	-10.889696	141	0	1
8626_21885	10	-10.282365	141	0	1
8626_22488	25	-9.215012	138	0	1
8626_22794	21	-9.804277	136	0	1
8626_23495	46	-8.152902	132	0	1
8626_23673	12	-10.277912	131	0	1
8626_24218	40	-8.583699	129	0	1
8626_24349	5	-10.130515	129	0	1
8626_24374	11	-10.576345	128	0	1
8626_24803	22	-8.852576	127	0	1
8626_25102	12	-9.774079	126	0	1
8626_25148	23	-9.690808	126	0	1
...	...	...	...	...	...
27902_10287	9	-11.079021	32	0	1
27902_10424	6	-11.136528	32	0	1
27902_10818	71	-9.948991	30	0	1
27902_11324	39	-10.493053	28	0	1
27902_11470	18	-10.910480	27	0	1
27902_11727	18	-11.046646	26	0	1
27902_12383	30	-10.593719	22	0	1
27902_12532	62	-10.205436	21	0	1
27902_12537	11	-11.183016	21	0	1
27902_12677	15	-10.828842	20	0	1
27902_13099	28	-10.634809	18	0	1
27902_13176	18	-10.996026	17	0	1
27902_13205	20	-10.842327	17	0	1
27902_13242	23	-10.757460	16	0	1
27902_13293	25	-10.949408	16	0	1
27902_13295	16	-11.127490	16	0	1
27902_13520	34	-10.322698	13	0	1
27902_14659	20	-10.430492	10	0	1
27902_15097	13	-11.056688	9	0	1
27902_16527	11	-11.280293	6	0	1
27902_16529	7	-10.813227	6	0	1
27902_17301	66	-10.249631	2	0	1
27902_20178	22	-8.595442	163	0	1
27902_26192	26	-9.889950	123	0	1
27902_26618	46	-8.759077	121	0	1
27902_29701	608	-7.738451	107	0	1
27902_29749	20	-9.988881	107	0	1
27902_32874	47	-9.569925	93	0	1
28342_15962	18	-10.232796	8	0	1
28342_16685	9	-10.629858	4	0	1