In [ ]:
import numpy as np


def make_get_similarity(similarity_matrix, CLid_to_index):
    """
    Helper function for get_similarities.
    """
    def f(CLid_pair):
        try:
            ida = CLid_to_index[CLid_pair[0]]
            idb = CLid_to_index[CLid_pair[1]]

            return similarity_matrix[ida, idb]
        except KeyError:
            # at least one of the CL ids is not in the index map
            return np.nan
    return f


def get_similarities(similarity_matrix, CLid_pairs, CLid_to_index):
    """
    Returns the similarities of cases indexed by CL id, as a list.

    Parameters
    ----------
    similarity_matrix: precomputed similarity matrix

    CLid_pairs: list of (CLid_A, CLid_B) pairs whose similarities we want

    CLid_to_index: dict that maps CL ids to similarity_matrix indices
    """
    get_similarity = make_get_similarity(similarity_matrix, CLid_to_index)

    return [get_similarity(pair) for pair in CLid_pairs]
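A minimal usage sketch (toy similarity matrix and made-up CL ids, not real data):
In [ ]:
# toy 3 x 3 similarity matrix and CL id to index map
sim = np.array([[1.0, 0.8, 0.1],
                [0.8, 1.0, 0.3],
                [0.1, 0.3, 1.0]])
CLid_to_index = {'111': 0, '222': 1, '333': 2}

# unknown CL ids (e.g. '999') come back as nan
get_similarities(sim, [('111', '222'), ('111', '999')], CLid_to_index)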
In [ ]:
# this ended up being a lot slower than sampling an arbitrary pair of
# vertices and then checking the conditions
import random


def sample_absent_edges(G, num_samples, active_years, seed=None):
    if seed is not None:
        random.seed(seed)

    # citing cases must be in active_years
    possible_ing_cases = set(G.vs.select(year_ge=min(active_years),
                                         year_le=max(active_years)))

    samples = set()
    present_edges = set(G.get_edgelist())

    while len(samples) < num_samples:
        # sample a citing case
        ing_vertex = random.sample(possible_ing_cases, 1)[0]

        # get the possible cited cases
        # cited cases must be strictly before the citing case
        possible_ed_cases = G.vs.select(year_le=ing_vertex['year'] - 1)

        # sample a cited case
        ed_vertex = random.sample(possible_ed_cases, 1)[0]

        absent_edge = (ing_vertex.index, ed_vertex.index)
        if (absent_edge not in present_edges) and (absent_edge not in samples):
            samples.add(absent_edge)

    return samples
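A quick usage sketch on a toy directed igraph citation graph with a 'year' vertex attribute (made up for illustration):
In [ ]:
import igraph as ig

# toy citation graph: edges point from the citing case to the cited case
G = ig.Graph(directed=True)
G.add_vertices(4)
G.vs['year'] = [1950, 1960, 1970, 1980]
G.add_edges([(3, 0), (2, 1)])

# sample 3 vertex pairs that are not edges, with the citing case between 1960 and 1980
sample_absent_edges(G, num_samples=3, active_years=range(1960, 1981), seed=27)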
This could be helpful: http://stackoverflow.com/questions/31784011/scikit-learn-fitting-data-into-chunks-vs-fitting-it-all-at-once
In [ ]:
from math import ceil


class textfile_chunks:
    """
    Iterates over the text files in paths in chunks of chunk_size files,
    yielding a dict that maps each file path to its normalized text.
    """
    def __init__(self, paths, chunk_size):
        self.i = 0
        self.paths = paths
        self.chunk_size = chunk_size

        self.num_files = len(paths)
        self.num_chunks = int(ceil(float(self.num_files) / self.chunk_size))

    def __iter__(self):
        return self

    def next(self):
        if self.i < self.num_chunks:
            # file paths in the current chunk
            start = self.i * self.chunk_size
            end = min(start + self.chunk_size, self.num_files)
            file_paths = self.paths[start:end]

            # read in the files and put them into a dict
            files = {}
            for path in file_paths:
                with open(path, 'r') as f:
                    files[path] = text_normalization(f.read())

            self.i += 1
            return files
        else:
            raise StopIteration()
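A hedged usage sketch: the file paths below are hypothetical and text_normalization is assumed to be defined elsewhere in the project.
In [ ]:
# hypothetical file paths
chunks = textfile_chunks(['opinion1.txt', 'opinion2.txt', 'opinion3.txt'],
                         chunk_size=2)

# first chunk: a dict mapping the first two paths to their normalized text
first_chunk = chunks.next()
first_chunk.keys()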
In [ ]:
import numpy as np
from collections import Counter


def get_time_aware_pagerank_matrix(A, years, p, qtv, qvt):
    """
    Returns the transition matrix of the time-aware PageRank random walk
    defined as follows. This method does not take advantage of sparse
    matrices for the intermediate computations.

    Create the bipartite time-as-a-node graph F:
    - include time as a node, i.e. the vertices are V(G) U V(G).years
    - F contains a copy of G
    - there is an edge from each vertex to AND from its year
    - edges go from each year to the following year

    When the random walk is at a vertex of G:
    - with probability qvt it transitions to the time node
    - with probability 1 - qvt it does a PageRank move

    When the random walk is at a time node:
    - with probability qtv it transitions to a vertex in G (of the corresponding year)
    - with probability 1 - qtv it moves to the next year

    Parameters
    ----------
    A: adjacency matrix of the original graph where Aij = 1
    iff there is an edge from i to j

    years: the year assigned to each node

    p: PageRank parameter

    qtv: probability of transitioning from a time node to a vertex of the original graph

    qvt: probability of transitioning from a vertex to its time node

    Output
    ------
    P: the transition matrix
    """
    # number of vertices in the graph
    n = A.shape[0]
    outdegrees = A.sum(axis=1)

    # zero index the years
    Y = np.array(years) - min(years)

    # number of years
    m = max(Y) + 1

    # number of cases per year
    cases_per_year = [0] * m
    cases_per_year_counter = Counter(Y)
    for k in cases_per_year_counter.keys():
        cases_per_year[k] = cases_per_year_counter[k]

    # PageRank transition matrix
    # (see Murphy 17.37)
    D = np.diag([0 if d == 0 else 1.0/d for d in outdegrees])
    z = [1.0/n if d == 0 else (1.0 - p) / n for d in outdegrees]
    PR = p * np.dot(A.T, D) + np.outer([1] * n, z)

    # Time-Time transition matrix
    # ones below the diagonal
    TT = np.zeros((m, m))
    TT[1:m, :m-1] = np.diag([1] * (m - 1))

    # Vertex-Time transition matrix
    # i-th column is the Y[i]-th basis vector
    VT = np.zeros((m, n))
    identity_m = np.eye(m)  # for the basis vectors
    for i in range(n):
        VT[:, i] = identity_m[:, Y[i]]

    # Time-Vertex transition matrix
    # VT transpose, but the entries are scaled by the number of cases in the year
    TV = np.zeros((n, m))

    # 1 over the number of cases per year
    n_inv = [0 if cases_per_year[i] == 0 else 1.0/cases_per_year[i]
             for i in range(m)]
    for i in range(n):
        TV[i, :] = identity_m[Y[i], :] * n_inv[Y[i]]

    # normalization matrix for TV
    qtv_diag = [0 if cases_per_year[i] == 0 else qtv for i in range(m)]
    qtv_diag[-1] = 1  # last column of TT is zeros
    Qtv = np.diag(qtv_diag)

    # overall transition matrix
    P = np.zeros((n + m, n + m))

    P[:n, :n] = (1 - qvt) * PR  # upper left: PageRank moves within G
    P[n:, :n] = qvt * VT  # lower left: vertex to its time node
    P[:n, n:] = np.dot(TV, Qtv)  # upper right: time node to vertices of that year
    P[n:, n:] = np.dot(TT, np.eye(m) - Qtv)  # lower right: year to following year

    return P


def get_time_aware_pagerank(A, years, p, qtv, qvt):
    P = get_time_aware_pagerank_matrix(A, years, p, qtv, qvt)

    # number of vertices and number of years
    n = A.shape[0]
    m = P.shape[0] - n

    # get the PageRank values
    leading_eig = get_leading_evector(P)
    ta_pr = leading_eig[:n]
    pr_years = leading_eig[-m:]

    # normalize to probability vectors
    return ta_pr / sum(ta_pr), pr_years / sum(pr_years)


def get_leading_evector(M):
    evals, evecs = np.linalg.eig(M)
    # evals, evecs = sp.linalg.eig(M)

    # eigenvector of the largest (real) eigenvalue
    # there really has to be a more elegant way to do this
    return np.real(evecs[:, np.argmax(np.real(evals))].reshape(-1))
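A small sanity check on a toy graph (made-up adjacency matrix and years): the transition matrix should be column-stochastic, and the two returned PageRank vectors are probability distributions over cases and over years.
In [ ]:
# toy citation graph: case 2 cites cases 0 and 1, case 1 cites case 0
A_toy = np.array([[0, 0, 0],
                  [1, 0, 0],
                  [1, 1, 0]])
years_toy = [2000, 2001, 2001]

# every column of the transition matrix should sum to 1
P_toy = get_time_aware_pagerank_matrix(A_toy, years_toy, p=0.85, qtv=0.3, qvt=0.2)
col_sums_ok = np.allclose(P_toy.sum(axis=0), 1)

# both returned vectors are probability vectors
case_pr, year_pr = get_time_aware_pagerank(A_toy, years_toy, p=0.85, qtv=0.3, qvt=0.2)
col_sums_ok, case_pr.sum(), year_pr.sum()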
In [ ]:
import numpy as np
import scipy as sp


def get_mean_rankscore(relevant, pi):
    """
    Returns the mean rank score for a ranking.

    The rank score of a ranked case is defined to be
        rank_score = rank / num_ancestors
    and the mean rank score is the mean of the rank scores of the cited cases.

    Between 0 and 1. Smaller values are better. A random ranking gives .5.
    See Zanin et al. (preferential attachment, aging, ...).

    Parameters
    ----------
    relevant: list of cases that were cited

    pi: ranking of ancestors

    Output
    ------
    average of the cited cases' rank scores
    """
    rank_scores = []

    # number of ancestors
    num_items = len(pi)

    # compute the rank score of each case the test case actually cited
    for r in relevant:
        # rank (1-indexed) of the cited case in pi
        rank = np.where(pi == r)[0][0] + 1

        # score the ranking
        rank_score = float(rank) / num_items
        rank_scores.append(rank_score)

    return np.mean(rank_scores)


def get_reciprocal_rank(relevant, pi):
    """
    Returns the reciprocal rank 1 / r(c) where r(c)
    is the best (smallest) rank among the cited cases.

    Between 0 and 1, larger is better.

    Parameters
    ----------
    relevant: list of cases that were cited

    pi: ranking of ancestors

    Output
    ------
    reciprocal rank
    """
    ranks = []

    # find the rank of each case the test case actually cited
    for r in relevant:
        # rank (1-indexed) of the cited case in pi
        rank = np.where(pi == r)[0][0] + 1
        ranks.append(rank)

    return 1.0 / min(ranks)


def get_precision_at_K(relevant, pi, K):
    """
    Returns the precision at K
        P@K = (number of cited cases in the top K positions of pi) / K

    Parameters
    ----------
    relevant: list of cases that were cited

    pi: ranking of ancestors

    K: number of top positions to consider

    Output
    ------
    precision at K
    """
    top_k = set(pi[:K])
    num_relevant_in_top_k = len([1 for r in relevant if r in top_k])
    return float(num_relevant_in_top_k) / K


def get_error_rate(predictions):
    """
    Returns the error rate of the 0-1 classification predictions.

    Parameters
    ----------
    predictions: pd df with columns 'pred_prob' and 'y'
    """
    cutoff = 0.5

    # get the lists of predicted probabilities and citation indicators
    y_act = predictions['y'].tolist()
    y_pred = [1 if p > cutoff else 0 for p in predictions['pred_prob']]

    # fraction of predictions that disagree with the true labels
    return np.mean([1 if y_act[i] != y_pred[i] else 0
                    for i in range(len(y_act))])


def get_logloss(predictions):
    """
    Returns the log loss for the 0-1 classification predictions.

    Parameters
    ----------
    predictions: pd df with columns 'pred_prob' and 'y'
    """
    # get the lists of predicted probabilities and citation indicators
    y_act = predictions['y'].tolist()
    prob = predictions['pred_prob'].tolist()
    return logloss(y_act, prob)


def logloss(act, pred):
    """
    Returns the log loss.

    Parameters
    ----------
    act: list of 0-1 labels

    pred: list of predicted probabilities
    """
    epsilon = 1e-15

    # clip the predicted probabilities away from 0 and 1
    pred = np.maximum(epsilon, pred)
    pred = np.minimum(1 - epsilon, pred)

    ll = sum(act * np.log(pred) + np.subtract(1, act) * np.log(np.subtract(1, pred)))
    ll = ll * -1.0 / len(act)
    return ll
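A toy example (made-up case ids) illustrating the three ranking metrics:
In [ ]:
# ranking of 5 ancestor cases, best-ranked first;
# the test case actually cited 'b' and 'e'
pi = np.array(['a', 'b', 'c', 'd', 'e'])
relevant = ['b', 'e']

(get_mean_rankscore(relevant, pi),     # mean of 2/5 and 5/5 = 0.7
 get_reciprocal_rank(relevant, pi),    # 1/2, since 'b' has the best rank (2)
 get_precision_at_K(relevant, pi, 3))  # 1/3, only 'b' is in the top 3
And a quick check of the classification metrics on a made-up pandas DataFrame of predictions:
In [ ]:
import pandas as pd

# made-up predicted probabilities and citation indicators
predictions = pd.DataFrame({'pred_prob': [0.9, 0.2, 0.6, 0.4],
                            'y': [1, 0, 0, 1]})

# two of the four thresholded predictions are wrong, so the error rate is 0.5
get_error_rate(predictions), get_logloss(predictions)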