In [ ]:
import numpy as np


def make_get_similarity(similarity_matrix, CLid_to_index):
    """
    Helper function for get_similarities.
    """
    def f(CLid_pair):
        try:
            ida = CLid_to_index[CLid_pair[0]]
            idb = CLid_to_index[CLid_pair[1]]

            return similarity_matrix[ida, idb]
        except KeyError:
            # at least one of the CL ids is not in the index map
            return np.nan
    return f


def get_similarities(similarity_matrix, CLid_pairs, CLid_to_index):
    """
    Returns the similarities of cases indexed by CL id, as a list.

    Parameters
    ----------
    similarity_matrix: precomputed similarity matrix

    CLid_pairs: list of (CLid_A, CLid_B) pairs whose similarities we want

    CLid_to_index: dict that maps CL ids to similarity_matrix indices
    """
    get_similarity = make_get_similarity(similarity_matrix, CLid_to_index)

    return [get_similarity(pair) for pair in CLid_pairs]
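A minimal usage sketch (toy similarity matrix and made-up CL ids, not real data):
In [ ]:
# toy 3 x 3 similarity matrix and CL id to index map
sim = np.array([[1.0, 0.8, 0.1],
                [0.8, 1.0, 0.3],
                [0.1, 0.3, 1.0]])
CLid_to_index = {'111': 0, '222': 1, '333': 2}

# unknown CL ids (e.g. '999') come back as nan
get_similarities(sim, [('111', '222'), ('111', '999')], CLid_to_index)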
In [ ]:
# this ended up being a lot slower than sampling an arbitrary pair of
# vertices and then checking the conditions
import random


def sample_absent_edges(G, num_samples, active_years, seed=None):
    if seed is not None:
        random.seed(seed)

    # citing cases must be in active_years
    possible_ing_cases = set(G.vs.select(year_ge=min(active_years),
                                         year_le=max(active_years)))

    samples = set()
    present_edges = set(G.get_edgelist())

    while len(samples) < num_samples:
        # sample a citing case
        ing_vertex = random.sample(possible_ing_cases, 1)[0]

        # get the possible cited cases
        # cited cases must be strictly before the citing case
        possible_ed_cases = G.vs.select(year_le=ing_vertex['year'] - 1)

        # sample a cited case
        ed_vertex = random.sample(possible_ed_cases, 1)[0]

        absent_edge = (ing_vertex.index, ed_vertex.index)
        if (absent_edge not in present_edges) and (absent_edge not in samples):
            samples.add(absent_edge)

    return samples
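A quick usage sketch on a toy directed igraph citation graph with a 'year' vertex attribute (made up for illustration):
In [ ]:
import igraph as ig

# toy citation graph: edges point from the citing case to the cited case
G = ig.Graph(directed=True)
G.add_vertices(4)
G.vs['year'] = [1950, 1960, 1970, 1980]
G.add_edges([(3, 0), (2, 1)])

# sample 3 vertex pairs that are not edges, with the citing case between 1960 and 1980
sample_absent_edges(G, num_samples=3, active_years=range(1960, 1981), seed=27)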
This could be helpful: http://stackoverflow.com/questions/31784011/scikit-learn-fitting-data-into-chunks-vs-fitting-it-all-at-once
In [ ]:
from math import ceil


class textfile_chunks:
    """
    Iterates over the text files in paths in chunks of chunk_size files,
    yielding a dict that maps each file path to its normalized text.
    """
    def __init__(self, paths, chunk_size):
        self.i = 0
        self.paths = paths
        self.chunk_size = chunk_size

        self.num_files = len(paths)
        self.num_chunks = int(ceil(float(self.num_files) / self.chunk_size))

    def __iter__(self):
        return self

    def next(self):
        if self.i < self.num_chunks:
            # file paths in the current chunk
            start = self.i * self.chunk_size
            end = min(start + self.chunk_size, self.num_files)
            file_paths = self.paths[start:end]

            # read in the files and put them into a dict
            files = {}
            for path in file_paths:
                with open(path, 'r') as f:
                    files[path] = text_normalization(f.read())

            self.i += 1
            return files
        else:
            raise StopIteration()
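A hedged usage sketch: the file paths below are hypothetical and text_normalization is assumed to be defined elsewhere in the project.
In [ ]:
# hypothetical file paths
chunks = textfile_chunks(['opinion1.txt', 'opinion2.txt', 'opinion3.txt'],
                         chunk_size=2)

# first chunk: a dict mapping the first two paths to their normalized text
first_chunk = chunks.next()
first_chunk.keys()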
In [ ]:
import numpy as np
from collections import Counter


def get_time_aware_pagerank_matrix(A, years, p, qtv, qvt):
    """
    Returns the transition matrix of the time-aware PageRank random walk
    defined as follows. This method does not take advantage of sparse
    matrices for the intermediate computations.

    Create the bipartite time-as-a-node graph F:
    - include time as a node, i.e. the vertices are V(G) U V(G).years
    - F contains a copy of G
    - there is an edge from each vertex to AND from its year
    - edges go from each year to the following year

    When the random walk is at a vertex of G:
    - with probability qvt it transitions to the time node
    - with probability 1 - qvt it does a PageRank move

    When the random walk is at a time node:
    - with probability qtv it transitions to a vertex in G (of the corresponding year)
    - with probability 1 - qtv it moves to the next year

    Parameters
    ----------
    A: adjacency matrix of the original graph where Aij = 1
    iff there is an edge from i to j

    years: the year assigned to each node

    p: PageRank parameter

    qtv: probability of transitioning from a time node to a vertex of the original graph

    qvt: probability of transitioning from a vertex to its time node

    Output
    ------
    P: the transition matrix
    """
    # number of vertices in the graph
    n = A.shape[0]
    outdegrees = A.sum(axis=1)

    # zero index the years
    Y = np.array(years) - min(years)

    # number of years
    m = max(Y) + 1

    # number of cases per year
    cases_per_year = [0] * m
    cases_per_year_counter = Counter(Y)
    for k in cases_per_year_counter.keys():
        cases_per_year[k] = cases_per_year_counter[k]

    # PageRank transition matrix
    # (see Murphy 17.37)
    D = np.diag([0 if d == 0 else 1.0/d for d in outdegrees])
    z = [1.0/n if d == 0 else (1.0 - p) / n for d in outdegrees]
    PR = p * np.dot(A.T, D) + np.outer([1] * n, z)

    # Time-Time transition matrix
    # ones below the diagonal
    TT = np.zeros((m, m))
    TT[1:m, :m-1] = np.diag([1] * (m - 1))

    # Vertex-Time transition matrix
    # i-th column is the Y[i]-th basis vector
    VT = np.zeros((m, n))
    identity_m = np.eye(m)  # for the basis vectors
    for i in range(n):
        VT[:, i] = identity_m[:, Y[i]]

    # Time-Vertex transition matrix
    # VT transpose, but the entries are scaled by the number of cases in the year
    TV = np.zeros((n, m))

    # 1 over the number of cases per year
    n_inv = [0 if cases_per_year[i] == 0 else 1.0/cases_per_year[i]
             for i in range(m)]
    for i in range(n):
        TV[i, :] = identity_m[Y[i], :] * n_inv[Y[i]]

    # normalization matrix for TV
    qtv_diag = [0 if cases_per_year[i] == 0 else qtv for i in range(m)]
    qtv_diag[-1] = 1  # last column of TT is zeros
    Qtv = np.diag(qtv_diag)

    # overall transition matrix
    P = np.zeros((n + m, n + m))

    P[:n, :n] = (1 - qvt) * PR  # upper left: PageRank moves within G
    P[n:, :n] = qvt * VT  # lower left: vertex to its time node
    P[:n, n:] = np.dot(TV, Qtv)  # upper right: time node to vertices of that year
    P[n:, n:] = np.dot(TT, np.eye(m) - Qtv)  # lower right: year to following year

    return P


def get_time_aware_pagerank(A, years, p, qtv, qvt):
    P = get_time_aware_pagerank_matrix(A, years, p, qtv, qvt)

    # number of vertices and number of years
    n = A.shape[0]
    m = P.shape[0] - n

    # get the PageRank values
    leading_eig = get_leading_evector(P)
    ta_pr = leading_eig[:n]
    pr_years = leading_eig[-m:]

    # normalize to probability vectors
    return ta_pr / sum(ta_pr), pr_years / sum(pr_years)


def get_leading_evector(M):
    evals, evecs = np.linalg.eig(M)
    # evals, evecs = sp.linalg.eig(M)

    # eigenvector of the largest (real) eigenvalue
    # there really has to be a more elegant way to do this
    return np.real(evecs[:, np.argmax(np.real(evals))].reshape(-1))
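A small sanity check on a toy graph (made-up adjacency matrix and years): the transition matrix should be column-stochastic, and the two returned PageRank vectors are probability distributions over cases and over years.
In [ ]:
# toy citation graph: case 2 cites cases 0 and 1, case 1 cites case 0
A_toy = np.array([[0, 0, 0],
                  [1, 0, 0],
                  [1, 1, 0]])
years_toy = [2000, 2001, 2001]

# every column of the transition matrix should sum to 1
P_toy = get_time_aware_pagerank_matrix(A_toy, years_toy, p=0.85, qtv=0.3, qvt=0.2)
col_sums_ok = np.allclose(P_toy.sum(axis=0), 1)

# both returned vectors are probability vectors
case_pr, year_pr = get_time_aware_pagerank(A_toy, years_toy, p=0.85, qtv=0.3, qvt=0.2)
col_sums_ok, case_pr.sum(), year_pr.sum()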
In [ ]:
import numpy as np
import scipy as sp


def get_mean_rankscore(relevant, pi):
    """
    Returns the mean rank score for a ranking.

    The rank score of a ranked case is defined to be
        rank_score = rank / num_ancestors
    and the mean rank score is the mean of the rank scores of the cited cases.

    Between 0 and 1. Smaller values are better. A random ranking gives .5.
    See Zanin et al. (preferential attachment, aging, ...).

    Parameters
    ----------
    relevant: list of cases that were cited

    pi: ranking of ancestors

    Output
    ------
    average of the cited cases' rank scores
    """
    rank_scores = []

    # number of ancestors
    num_items = len(pi)

    # compute the rank score of each case the test case actually cited
    for r in relevant:
        # rank (1-indexed) of the cited case in pi
        rank = np.where(pi == r)[0][0] + 1

        # score the ranking
        rank_score = float(rank) / num_items
        rank_scores.append(rank_score)

    return np.mean(rank_scores)


def get_reciprocal_rank(relevant, pi):
    """
    Returns the reciprocal rank 1 / r(c) where r(c)
    is the best (smallest) rank among the cited cases.

    Between 0 and 1, larger is better.

    Parameters
    ----------
    relevant: list of cases that were cited

    pi: ranking of ancestors

    Output
    ------
    reciprocal rank
    """
    ranks = []

    # find the rank of each case the test case actually cited
    for r in relevant:
        # rank (1-indexed) of the cited case in pi
        rank = np.where(pi == r)[0][0] + 1
        ranks.append(rank)

    return 1.0 / min(ranks)


def get_precision_at_K(relevant, pi, K):
    """
    Returns the precision at K
        P@K = (number of cited cases in the top K positions of pi) / K

    Parameters
    ----------
    relevant: list of cases that were cited

    pi: ranking of ancestors

    K: number of top positions to consider

    Output
    ------
    precision at K
    """
    top_k = set(pi[:K])
    num_relevant_in_top_k = len([1 for r in relevant if r in top_k])
    return float(num_relevant_in_top_k) / K


def get_error_rate(predictions):
    """
    Returns the error rate of the 0-1 classification predictions.

    Parameters
    ----------
    predictions: pd df with columns 'pred_prob' and 'y'
    """
    cutoff = 0.5

    # get the lists of predicted probabilities and citation indicators
    y_act = predictions['y'].tolist()
    y_pred = [1 if p > cutoff else 0 for p in predictions['pred_prob']]

    # fraction of predictions that disagree with the true labels
    return np.mean([1 if y_act[i] != y_pred[i] else 0
                    for i in range(len(y_act))])


def get_logloss(predictions):
    """
    Returns the log loss for the 0-1 classification predictions.

    Parameters
    ----------
    predictions: pd df with columns 'pred_prob' and 'y'
    """
    # get the lists of predicted probabilities and citation indicators
    y_act = predictions['y'].tolist()
    prob = predictions['pred_prob'].tolist()
    return logloss(y_act, prob)


def logloss(act, pred):
    """
    Returns the log loss.

    Parameters
    ----------
    act: list of 0-1 labels

    pred: list of predicted probabilities
    """
    epsilon = 1e-15

    # clip the predicted probabilities away from 0 and 1
    pred = np.maximum(epsilon, pred)
    pred = np.minimum(1 - epsilon, pred)

    ll = sum(act * np.log(pred) + np.subtract(1, act) * np.log(np.subtract(1, pred)))
    ll = ll * -1.0 / len(act)
    return ll
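A toy example (made-up case ids) illustrating the three ranking metrics:
In [ ]:
# ranking of 5 ancestor cases, best-ranked first;
# the test case actually cited 'b' and 'e'
pi = np.array(['a', 'b', 'c', 'd', 'e'])
relevant = ['b', 'e']

(get_mean_rankscore(relevant, pi),     # mean of 2/5 and 5/5 = 0.7
 get_reciprocal_rank(relevant, pi),    # 1/2, since 'b' has the best rank (2)
 get_precision_at_K(relevant, pi, 3))  # 1/3, only 'b' is in the top 3
And a quick check of the classification metrics on a made-up pandas DataFrame of predictions:
In [ ]:
import pandas as pd

# made-up predicted probabilities and citation indicators
predictions = pd.DataFrame({'pred_prob': [0.9, 0.2, 0.6, 0.4],
                            'y': [1, 0, 0, 1]})

# two of the four thresholded predictions are wrong, so the error rate is 0.5
get_error_rate(predictions), get_logloss(predictions)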