In [4]:
repo_directory = '/Users/iaincarmichael/Dropbox/Research/law/law-net/'
data_dir = '/Users/iaincarmichael/Documents/courtlistener/data/'
import numpy as np
import sys
import matplotlib.pyplot as plt
from scipy.stats import rankdata
from collections import Counter
import discreteMarkovChain as dmc
from scipy.sparse import csr_matrix, dia_matrix
# graph package
import igraph as ig
# our code
sys.path.append(repo_directory + 'code/')
from setup_data_dir import setup_data_dir, make_subnetwork_directory
from pipeline.download_data import download_bulk_resource, download_master_edgelist, download_scdb
from helpful_functions import case_info
sys.path.append(repo_directory + 'vertex_metrics_experiment/code/')
from rankscore_experiment_sort import *
from rankscore_experiment_LR import *
from rankscore_experiment_search import *
from time_aware_pagerank import *
from make_tr_edge_df import *
# which network to download data for
network_name = 'scotus' # 'federal', 'ca1', etc
# some sub directories that get used
raw_dir = data_dir + 'raw/'
subnet_dir = data_dir + network_name + '/'
text_dir = subnet_dir + 'textfiles/'
# jupyter notebook settings
%load_ext autoreload
%autoreload 2
%matplotlib inline
In [5]:
# load scotus
G = ig.Graph.Read_GraphML(subnet_dir + network_name +'_network.graphml')
In [6]:
# get a small subgraph to work with
np.random.seed(754) # 234, 754
v = G.vs[np.random.choice(range(len(G.vs)))]
subset_ids = G.neighborhood(v.index, order=2)
g = G.subgraph(subset_ids)
print '%d nodes' % len(g.vs)
A = np.array(g.get_adjacency().data)
years = np.array(g.vs['year']).astype(int)
In [4]:
%%time
# A = np.array(G.get_adjacency().data)
# years = np.array(G.vs['year']).astype(int)
# np.save('scotus_adjmat', A)
A = np.load('scotus_adjmat.npy')
In [5]:
years = np.array(G.vs['year']).astype(int)
In [9]:
p = .85   # PageRank damping factor
qtv = .3  # probability a year node jumps to one of that year's cases
qvt = .5  # probability a case jumps to its year node
In [ ]:
%%time
# number of vertices in the graph
n = A.shape[0]
outdegrees = A.sum(axis=1)
# zero index the years
Y = np.array(years) - min(years)
m = max(Y) + 1
In [ ]:
%%time
# number of cases per year
cases_per_year = [0] * m
cases_per_year_counter = Counter(Y)
for k in cases_per_year_counter.keys():
    cases_per_year[k] = cases_per_year_counter[k]
In [ ]:
%%time
# PageRank transition matrix
# (see Murphy 17.37)
D = np.diag([0 if d == 0 else 1.0/d for d in outdegrees])
z = [1.0/n if d == 0 else (1.0 - p) / n for d in outdegrees]
# PR = p * np.dot(A.T, D) + np.outer([1] * n, z)
In [ ]:
%%time
PR = np.dot(A.T, D)
In [ ]:
%%time
PR = p * PR + np.outer([1] * n, z)
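As a quick sanity check (a sketch, not part of the original pipeline): every column of PR should now sum to 1, since column j spreads mass p over j's out-neighbors plus (1 - p)/n everywhere, or 1/n everywhere when j has no outgoing citations.
In [ ]:
# sanity check: PR should be column stochastic
print np.allclose(PR.sum(axis=0), 1)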
In [ ]:
%%time
# Time-Time transition matrix
# ones below diagonal
TT = np.zeros((m, m))
TT[1:m, :m-1] = np.diag([1] * (m - 1))
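To make the subdiagonal concrete, a throwaway illustration for m = 3: column j of TT has its single 1 in row j + 1, so (reading columns as the current state, as in the PageRank matrix above) the year-j node advances to year j + 1; the last column is all zeros, which the qtv_diag[-1] = 1 adjustment below compensates for.
In [ ]:
# throwaway illustration: TT for m = 3
TT_toy = np.zeros((3, 3))
TT_toy[1:3, :2] = np.diag([1, 1])
print TT_toy  # [[0 0 0], [1 0 0], [0 1 0]]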
In [ ]:
%%time
# Vertex-Time transition matrix
# i-th column is the Y[i]th basis vector
VT = np.zeros((m, n))
identity_m = np.eye(m) # for basis vectors
for i in range(n):
    VT[:, i] = identity_m[:, Y[i]]
In [ ]:
%%time
# Time-Vertex transition matrix
# VT transpose but entries are scaled by number of cases in the year
TV = np.zeros((n, m))
# 1 over number of cases per year
n_inv = [0 if cases_per_year[i] == 0 else 1.0/cases_per_year[i]
         for i in range(m)]
for i in range(n):
    TV[i, :] = identity_m[Y[i], :] * n_inv[Y[i]]
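A small sketch of how VT and TV relate, using three hypothetical cases with zero-indexed years [0, 0, 1]: VT sends each case deterministically to its year node, while TV sends a year node uniformly to that year's cases.
In [ ]:
# throwaway illustration with 3 cases in years [0, 0, 1]
Y_toy = np.array([0, 0, 1])
I2 = np.eye(2)
VT_toy = I2[:, Y_toy]                     # each column a basis vector
counts = np.bincount(Y_toy, minlength=2)  # cases per year: [2, 1]
TV_toy = I2[Y_toy, :] / counts[Y_toy][:, None]
print VT_toy  # [[1, 1, 0], [0, 0, 1]]
print TV_toy  # [[0.5, 0], [0.5, 0], [0, 1]]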
In [ ]:
%%time
# normalization matrix for TV
qtv_diag = [0 if cases_per_year[i] == 0 else qtv for i in range(m)]
qtv_diag[-1] = 1 # last column of TT is zeros
Qtv = np.diag(qtv_diag)
In [ ]:
%%time
# overall transition matrix
P = np.zeros((n + m, n + m))
P[:n, :n] = (1 - qvt) * PR # upper left
P[n:, :-m] = qvt * VT # lower left
P[:n, -m:] = np.dot(TV, Qtv) # upper right
P[-m:, -m:] = np.dot(TT, np.eye(m) - Qtv) # lower right
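With all four blocks in place, every column of P should sum to 1: a case splits mass (1 - qvt) / qvt between the PageRank walk and its year node, and a year node splits qtv / (1 - qtv) between that year's cases and the next year (the last year sends everything to its cases). A quick check, as a sketch:
In [ ]:
# sanity check: the full transition matrix should be column stochastic
print np.allclose(P.sum(axis=0), 1)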
In [ ]:
P_dense = P
In [7]:
%%time
# convert to sparse
A = csr_matrix(A, dtype=int)
In [8]:
%%time
# number of vertices in the graph
n = A.shape[0]
# surely there is a more elegant way to do this
# outdegrees = A.sum(axis=1)
outdegrees = A.sum(axis=1).reshape(-1).tolist()[0]
# zero index the years
Y = np.array(years) - min(years)
m = max(Y) + 1
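For the "more elegant way" flagged above, one equivalent alternative (an aside, not a behavior change) is to go through np.asarray:
In [ ]:
# an equivalent one-liner for the outdegree computation above
alt_outdegrees = np.asarray(A.sum(axis=1)).ravel()
print np.array_equal(alt_outdegrees, outdegrees)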
In [9]:
%%time
# number of cases per year
cases_per_year = [0] * m
cases_per_year_counter = Counter(Y)
for k in cases_per_year_counter.keys():
    cases_per_year[k] = cases_per_year_counter[k]
In [10]:
%%time
# PageRank transition matrix
# (see Murphy 17.37)
# D = dia_matrix([0 if d == 0 else 1.0/d for d in outdegrees],
#                shape=(n, n))
# TODO: csr or dia matrix?
D = csr_matrix(np.diag([0 if d == 0 else 1.0/d for d in outdegrees]))
z = [1.0/n if d == 0 else (1.0 - p) / n for d in outdegrees]
# PR = p * np.dot(A.T, D) + np.outer([1] * n, z)
In [11]:
%%time
# PR = np.dot(A.T, D)
PR = (1 - qvt) * p * (A.transpose() * D)
PR = PR.todense()
In [12]:
%%time
PR = PR + np.outer([1 - qvt] * n, z)
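Note that this sparse path folds the (1 - qvt) factor into PR up front (in both the link-following term and the teleportation term), so the assembly cells below insert PR and VT without rescaling. Its columns should therefore sum to 1 - qvt rather than 1; a quick check, as a sketch:
In [ ]:
# columns of the pre-scaled PR sum to 1 - qvt, not 1
print np.allclose(PR.sum(axis=0), 1 - qvt)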
In [13]:
%%time
# Time-Time transition matrix
# ones below diagonal
TT = np.zeros((m, m))
TT[1:m, :m-1] = np.diag([1] * (m - 1))
# TODO: csr or dia matrix
TT = dia_matrix(TT)
# TT = csr_matrix(TT)
In [14]:
%%time
# Vertex-Time transition matrix
# i-th column is the Y[i]th basis vector
VT = np.zeros((m, n))
identity_m = np.eye(m) # for basis vectors
for i in range(n):
    VT[:, i] = qvt * identity_m[:, Y[i]]
In [15]:
%%time
# Time-Vertex transition matrix
# VT transpose but entries are scaled by number of cases in the year
TV = np.zeros((n, m))
# 1 over number of cases per year
n_inv = [0 if cases_per_year[i] == 0 else 1.0/cases_per_year[i]
         for i in range(m)]
for i in range(n):
    TV[i, :] = identity_m[Y[i], :] * n_inv[Y[i]]
TV = csr_matrix(TV)
In [16]:
%%time
# normalization matrix for TV
qtv_diag = [0 if cases_per_year[i] == 0 else qtv for i in range(m)]
qtv_diag[-1] = 1 # last column of TT is zeros
# Qtv = dia_matrix(np.diag(qtv_diag))
Qtv = csr_matrix(np.diag(qtv_diag))
In [17]:
%%time
# overall transition matrix
P = np.zeros((n + m, n + m))
In [18]:
%%time
# upper left
# P[:n, :n] = (1 - qvt) * PR
P[:n, :n] = PR
In [19]:
%%time
# lower left
# P[n:, :-m] = qvt * VT
P[n:, :-m] = VT
In [21]:
%%time
# upper right
P[:n, -m:] = (TV * Qtv).todense()
In [22]:
%%time
# lower right
P[-m:, -m:] = (TT * dia_matrix(np.eye(m) - Qtv)).todense()
In [7]:
def get_time_aware_pagerank_matrix_sparse(A, years, p, qtv, qvt):
    A = csr_matrix(A, dtype=int)
    # number of vertices in the graph
    n = A.shape[0]
    # surely there is a more elegant way to do this
    outdegrees = A.sum(axis=1).reshape(-1).tolist()[0]
    # zero index the years
    Y = np.array(years) - min(years)
    m = max(Y) + 1
    # number of cases per year
    cases_per_year = [0] * m
    cases_per_year_counter = Counter(Y)
    for k in cases_per_year_counter.keys():
        cases_per_year[k] = cases_per_year_counter[k]
    # PageRank transition matrix
    # (see Murphy 17.37)
    # TODO: csr or dia matrix?
    D = csr_matrix(np.diag([0 if d == 0 else 1.0/d for d in outdegrees]))
    z = [1.0/n if d == 0 else (1.0 - p) / n for d in outdegrees]
    PR = (1 - qvt) * p * (A.transpose() * D)
    PR = PR.todense()
    PR = PR + np.outer([1 - qvt] * n, z)
    # Time-Time transition matrix
    # ones below diagonal
    TT = np.zeros((m, m))
    TT[1:m, :m-1] = np.diag([1] * (m - 1))
    TT = dia_matrix(TT)
    # Vertex-Time transition matrix
    # i-th column is the Y[i]th basis vector
    VT = np.zeros((m, n))
    identity_m = np.eye(m)  # for basis vectors
    for i in range(n):
        VT[:, i] = qvt * identity_m[:, Y[i]]
    # Time-Vertex transition matrix
    # VT transpose but entries are scaled by number of cases in the year
    TV = np.zeros((n, m))
    # 1 over number of cases per year
    n_inv = [0 if cases_per_year[i] == 0 else 1.0/cases_per_year[i]
             for i in range(m)]
    for i in range(n):
        TV[i, :] = identity_m[Y[i], :] * n_inv[Y[i]]
    TV = csr_matrix(TV)
    # normalization matrix for TV
    qtv_diag = [0 if cases_per_year[i] == 0 else qtv for i in range(m)]
    qtv_diag[-1] = 1  # last column of TT is zeros
    # Qtv = dia_matrix(np.diag(qtv_diag))
    Qtv = csr_matrix(np.diag(qtv_diag))
    # overall transition matrix
    P = np.zeros((n + m, n + m))
    P[:n, :n] = PR  # upper left
    P[n:, :-m] = VT  # lower left
    P[:n, -m:] = (TV * Qtv).todense()  # upper right
    P[-m:, -m:] = (TT * dia_matrix(np.eye(m) - Qtv)).todense()  # lower right
    return P.T
In [12]:
%%time
P = get_time_aware_pagerank_matrix_sparse(A, years, p, qtv, qvt)
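The function returns P.T, so the matrix handed back is the transpose of the column-stochastic matrix assembled inside; its rows should sum to 1. A quick check, as a sketch:
In [ ]:
# rows of the returned (transposed) matrix should sum to 1
print np.allclose(P.sum(axis=1), 1)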
In [ ]:
%%time
P_dense = get_time_aware_pagerank_matrix(A, years, p, qtv, qvt)
In [19]:
diff = P - P_dense
diff.sum()
Out[19]:
In [19]:
sum(P[:, 0])
Out[19]:
In [13]:
%%time
mc = dmc.markovChain(P.T)
mc.computePi('linear')
dmc_linear = mc.pi
In [14]:
%%time
mc = dmc.markovChain(P.T)
mc.computePi('krylov')
dmc_krylov = mc.pi
In [15]:
%%time
mc = dmc.markovChain(P.T)
mc.computePi('eigen')
dmc_eigen = mc.pi
In [16]:
%%time
leading_eig = get_leading_evector(P)
leading_eig = leading_eig/sum(leading_eig)
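All four routes to the stationary distribution should agree up to numerical tolerance (and up to the flattening/normalization of the eigenvector); a quick consistency check, as a sketch:
In [ ]:
# the linear, krylov and eigen solvers and the direct leading
# eigenvector should all give the same stationary distribution
print np.allclose(dmc_linear, dmc_krylov)
print np.allclose(dmc_linear, dmc_eigen)
print np.allclose(dmc_linear, np.ravel(leading_eig))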