In [17]:
    
import os
# Project locations. The defaults are the original author's machine; set the
# LAW_NET_REPO_DIR / COURTLISTENER_DATA_DIR environment variables to run the
# notebook elsewhere without editing this cell.
# os.path.join(path, '') guarantees a trailing separator, which the cells
# below rely on because they build paths by string concatenation.
repo_directory = os.path.join(
    os.environ.get('LAW_NET_REPO_DIR',
                   '/Users/iaincarmichael/Dropbox/Research/law/law-net/'), '')
data_dir = os.path.join(
    os.environ.get('COURTLISTENER_DATA_DIR',
                   '/Users/iaincarmichael/data/courtlistener/'), '')
import sys
import time
from math import *
import copy
import cPickle as pickle
# data
import numpy as np
import pandas as pd
# viz
import matplotlib.pyplot as plt
# graph
import igraph as ig
# our code
sys.path.append(repo_directory + 'code/')
from pipeline.download_data import *
from pipeline.make_raw_case_metadata import *
from helpful_functions import case_info
sys.path.append(repo_directory + 'vertex_metrics_experiment/code/')
from make_case_text_files import *
from custom_vertex_metrics import get_CiteRankPoly, get_CiteRank
# court
# Both names are set to 'scotus'; neither is read by the directory set up
# below, which spells out 'scotus/' literally.
court = 'scotus'
network_name = 'scotus'
# directory set up
# Paths are built by string concatenation, so data_dir must end with a
# path separator.
experiment_data_dir = data_dir + 'scotus/'
text_dir = experiment_data_dir + 'textfiles/'
# jupyter notebook settings
%load_ext autoreload
%autoreload 2
%matplotlib inline
    
    
In [18]:
    
# Load the citation network from the GraphML file in the experiment data
# directory. Later cells read a 'year' attribute from its vertices.
G = ig.Graph.Read_GraphML(experiment_data_dir + 'scotus_network.graphml')
    
In [17]:
    
def get_vertex_recent_citations(v, threshold_year):
    """
    Number of in-neighbors of v whose 'year' is at or after threshold_year

    Parameters
    ---------
    v: vertex exposing neighbors(mode="IN"); each neighbor is indexed by 'year'

    threshold_year: earliest year (inclusive) for a citation to be counted

    Output
    ------
    Returns the count as an int
    """
    recent_count = 0
    for citing_case in v.neighbors(mode="IN"):
        if citing_case['year'] >= threshold_year:
            recent_count += 1
    return recent_count
    
In [34]:
    
def get_recent_citations(G, current_year, threshold):
    """
    Recent citation count for every vertex of G

    A citation counts as recent when the citing vertex's 'year' is at or
    after current_year - threshold.

    Parameters
    ---------
    G: igraph object with 'year' vertex attributes

    current_year: year the look-back window is measured from

    threshold: length of the look-back window in years

    Output
    ------
    Returns a list of counts, one per vertex, in the order of G.vs
    """
    earliest_year = current_year - threshold
    counts = []
    for vertex in G.vs:
        counts.append(get_vertex_recent_citations(vertex, earliest_year))
    return counts
    
In [9]:
    
# Recent citation counts (last 10 years as of 2016) for every vertex.
# The original call used the name get_recent_degrees, which is not defined in
# this notebook; get_recent_citations has the same (G, current_year, threshold)
# signature.
rd = get_recent_citations(G, current_year=2016, threshold=10)
    
In [33]:
    
# Recent-citation counts for a single example case as the look-back window
# grows from 5 to 100 years in steps of 5.
thresholds = 5 * np.arange(1, 21)
current_year = 2016
i = 5320  # igraph index of the example case
# Only vertex i is needed, so count its citations directly instead of
# recomputing the metric for every vertex of G at each threshold.
rcs = [get_vertex_recent_citations(G.vs[i], current_year - T) for T in thresholds]
    
In [35]:
    
# Citations received by vertex i since each cutoff year
# (cutoff year = current_year - threshold).
plt.scatter(current_year - thresholds, rcs)
plt.xlabel('threshold year (current_year - threshold)')
plt.ylabel('citations since threshold year')
plt.title('recent citations of vertex %d' % i)
    
    Out[35]:
    
In [38]:
    
# Metric name passed to create_metric_column in the next cell.
# NOTE(review): presumably 'recentcite_<T>' selects recent citations with a
# T-year threshold -- confirm against create_metric_column.
metric ='recentcite_10'
    
In [39]:
    
# NOTE(review): create_metric_column is not defined in this notebook;
# presumably it comes from one of the star imports in the setup cell -- confirm.
c = create_metric_column(G, metric, year=2016)
    
In [105]:
    
# Scratch values for interactive experimentation: one example vertex and a
# candidate h value. No later cell visible here reads these module-level names
# (the functions below use their own local v / h).
v = G.vs[798]
h = 2
    
In [239]:
    
def get_vertex_recent_hindex(v):
    """
    h-index of a vertex, computed from the citation counts of the cases citing it

    The h-index is the largest h such that at least h of the cases citing v
    are themselves cited by at least h cases.

    The previous implementation never incremented h, so it returned 1 for
    every cited vertex and could loop MAXITER times doing no work.

    NOTE: despite "recent" in the name, no year filtering is applied
    (the original did not filter by year either).

    Parameters
    ---------
    v: vertex exposing neighbors(mode='IN'), whose neighbors expose the same

    Output
    ------
    Returns a non-negative int; 0 when v has no citations, or when none of
    the cases citing v has been cited itself
    """
    # cases citing v
    citations = v.neighbors(mode='IN')
    if len(citations) == 0:
        return 0

    # number of citations received by each citing case, largest first
    citer_counts = sorted((len(c.neighbors(mode='IN')) for c in citations),
                          reverse=True)

    # with counts sorted descending, the rank-th count is >= rank exactly when
    # at least `rank` citing cases have >= rank citations each
    h = 0
    for rank, count in enumerate(citer_counts, 1):
        if count < rank:
            break
        h = rank
    return h
    
In [240]:
    
def get_hindex(G):
    """
    h-index of every vertex of G

    Output
    ------
    Returns a list with one get_vertex_recent_hindex value per vertex,
    in the order of G.vs
    """
    return list(map(get_vertex_recent_hindex, G.vs))
    
In [241]:
    
# Compute the h-index for every vertex of G and report how long it takes.
%time hindx = get_hindex(G)
    
    
In [12]:
    
# Personalized PageRank with a uniform reset distribution (every vertex gets
# weight 1/n), alongside plain PageRank.
# NOTE(review): presumably a sanity check that the two agree -- confirm.
n = len(G.vs)
p = .85
reset = [1.0/n] * n
pp = G.personalized_pagerank(damping=p, reset=reset)
pr = G.pagerank()
    
In [17]:
    
    
In [35]:
    
# Module-level example values. get_CiteRank below does not read them: it takes
# half_life and p as arguments and derives its own current_year from the
# graph's 'year' attribute.
p = .85
current_year = 2016
half_life = 5.0
def get_CiteRank(G, half_life, p=.85):
    """
    Personalized PageRank whose reset distribution decays with vertex age

    A vertex's age is measured from the largest 'year' in G. Its reset
    weight is 2 ** (-age / half_life), normalized so the weights sum to one.

    Parameters
    ---------
    G: igraph object with 'year' vertex attributes

    half_life: age (in years) at which the reset weight halves

    p: damping factor passed to personalized_pagerank

    Output
    ------
    Returns whatever G.personalized_pagerank returns for that reset vector
    """
    case_years = np.array(G.vs['year'])

    # age of each case relative to the most recent year in the network
    case_ages = max(case_years) - case_years

    # exponential decay: the weight halves every half_life years
    decay = 2 ** (-case_ages / float(half_life))
    reset_probs = decay / decay.sum()

    return G.personalized_pagerank(damping=p, reset=reset_probs)
    
In [19]:
    
# CiteRank with a 5-year half-life and plain PageRank, both over the full
# network; the plotting cell below compares them.
cr = get_CiteRank(G, 5)
pr = G.pagerank()
    
In [43]:
    
# Scatter PageRank and CiteRank scores against case year on shared axes.
plt.figure(figsize=[10, 10])
n = len(G.vs)  # not referenced again in this cell
years = G.vs['year']
# plot pagerank vs citerank
# (only the first panel of the 2x2 subplot grid is drawn in this cell)
plt.subplot(2,2,1)
plt.scatter(years, pr, 
            color='blue',
            alpha=.2,
            label='pagerank')
plt.scatter(years, cr,
            alpha=.2,
            color='red',
            label='citerank')
# leave 20% headroom above the largest score
plt.ylim([0, 1.2 * max(max(cr), max(pr))])
plt.legend(loc='upper right')
plt.xlabel('year')
plt.ylabel('value')
    
    Out[43]: