In [15]:
repo_directory = '/Users/iaincarmichael/Dropbox/Research/law/law-net/'
data_dir = '/Users/iaincarmichael/data/courtlistener/'
import sys
import numpy as np
import matplotlib.pyplot as plt
import cPickle as pickle
from collections import OrderedDict
# our code
sys.path.append(repo_directory + 'code/')
sys.path.append(repo_directory + 'vertex_metrics_experiment/code/')
from pipeline_helper_functions import save_sparse_csr, load_sparse_csr
# which network to download data for
network_name = 'scotus' # 'federal', 'ca1', etc
# some sub directories that get used
raw_dir = data_dir + 'raw/'
subnet_dir = data_dir + network_name + '/'
text_dir = subnet_dir + 'textfiles/'
nlp_dir = subnet_dir + 'nlp/'
# jupyter notebook settings
%load_ext autoreload
%autoreload 2
%matplotlib inline
In [10]:
tfidf_matrix = load_sparse_csr(nlp_dir + 'tfidf_matrix.npz')

with open(nlp_dir + 'op_id_to_bow_id.p', 'rb') as f:
    op_id_to_bow_id = pickle.load(f)

with open(nlp_dir + 'vocab.p', 'rb') as f:
    vocab = pickle.load(f)
In [37]:
class TF(object):
    """
    Stores the tf-idf matrix, the opinion id to row index map, and the
    vocabulary for a subnetwork, and summarizes sets of opinions.
    """
    def __init__(self, nlp_dir):
        self.tfidf_matrix = load_sparse_csr(nlp_dir + 'tfidf_matrix.npz')

        with open(nlp_dir + 'op_id_to_bow_id.p', 'rb') as f:
            self.op_id_to_bow_id = pickle.load(f)

        with open(nlp_dir + 'vocab.p', 'rb') as f:
            self.vocab = np.array(pickle.load(f))

    def top_k_words(self, opinions, num_words):
        """
        Summarizes a set of opinions by returning the words that appear in
        these opinions with the highest tf-idf scores.

        Parameters
        ----------
        opinions: list of opinion ids
        num_words: number of words to return as the summary

        Output
        ------
        a list of the words with the highest tf-idf scores among the given opinions
        """
        # op_id_to_bow_id maps opinion id -> row index of the tf-idf matrix
        n = num_words

        # get the row indices corresponding to the opinions
        row_indices = []
        for each_opinion in opinions:
            row_indices.append(self.op_id_to_bow_id[each_opinion])

        # construct a matrix whose rows are the opinions in the cluster
        new_matrix = self.tfidf_matrix[row_indices, :]

        # flatten the matrix into a list of (row, col, value) tuples,
        # sorted by tf-idf value in descending order
        sorted_matrix = sort_coo(new_matrix)

        # get the unique column indices, preserving the sort order
        column_ind = [x[1] for x in sorted_matrix]
        column_ind = f7(column_ind)

        # look up the words for the top column indices
        top_words = self.vocab[column_ind].tolist()[:n]
        return top_words
In [38]:
blah = TF(nlp_dir)
In [39]:
blah.top_k_words(['98286', '105366'], 10)
Out[39]:
In [17]:
import numpy as np
import re
from collections import OrderedDict
def get_top_n_clusters(n, total_number_clusters, graph_clusters):
    """
    For modularity/walktrap clusterings: prints a summary of the top n
    clusters by size and returns an OrderedDict of those clusters
    (key = cluster label, value = list of opinion ids).

    Parameters
    ----------
    n: number of top clusters to keep
    total_number_clusters: total number of clusters from the clustering algorithm
    graph_clusters: pd.Series mapping opinion id -> cluster label
    """
    clusters_size = []
    for i in range(0, total_number_clusters + 1):
        cluster_i = graph_clusters[graph_clusters == i].index.tolist()  # opinions in cluster i
        clusters_size.append((i, len(cluster_i)))  # (cluster label, size of cluster)

    # descending sort by size of cluster
    clusters_size = sorted(clusters_size, key=lambda x: x[1], reverse=True)

    # labels of the n biggest clusters
    biggest_clusters = [c[0] for c in clusters_size][0:n]

    # summarize the n biggest clusters
    for i in clusters_size[0:n]:
        print "cluster", i[0], ":", i[1], "opinions"

    clusters_dict = OrderedDict()
    for i in clusters_size[0:n]:
        cluster_i = graph_clusters[graph_clusters == i[0]].index.tolist()
        clusters_dict[i[0]] = cluster_i

    return clusters_dict, biggest_clusters
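# hypothetical usage sketch (not from the original notebook): a stand-in pd.Series
# mapping opinion id -> cluster label, e.g. from a walktrap/modularity run
import pandas as pd
_demo_clusters = pd.Series([0, 0, 1, 2, 2, 2], index=['a', 'b', 'c', 'd', 'e', 'f'])
_demo_dict, _demo_labels = get_top_n_clusters(2, 3, _demo_clusters)
# prints "cluster 2 : 3 opinions" and "cluster 0 : 2 opinions";
# _demo_labels == [2, 0] and _demo_dict[2] == ['d', 'e', 'f']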
def sort_coo(m):  # helper function
    '''
    Flattens a sparse matrix into a list of (row_index, column_index, tf_idf_value)
    tuples, sorted by tf-idf value in descending order.
    '''
    m = m.tocoo()
    list_of_tuples = []
    for i, j, k in zip(m.row, m.col, m.data):
        list_of_tuples.append((i, j, k))
    # sort by tf-idf values (descending)
    return sorted(list_of_tuples, key=lambda x: x[2], reverse=True)
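# hypothetical sanity check (not from the original notebook): tuples come back
# sorted by value, descending
from scipy.sparse import csr_matrix
_demo = csr_matrix(np.array([[0.0, 0.2], [0.5, 0.0]]))
assert sort_coo(_demo) == [(1, 0, 0.5), (0, 1, 0.2)]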
def f7(seq):  # helper function
    '''
    Returns the unique elements of seq, preserving their first-seen order.
    '''
    seen = set()
    seen_add = seen.add
    return [x for x in seq if not (x in seen or seen_add(x))]
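# hypothetical sanity check: f7 deduplicates while preserving first-seen order
assert f7([3, 1, 3, 2, 1]) == [3, 1, 2]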
def all_opinions(file_paths):  # helper function
    '''
    Get the list of all opinion ids from the (.txt) file paths.
    '''
    all_opinions = []
    for i in file_paths:
        num = re.search(r'(\d+)', i)
        all_opinions.append(num.group())

    # sort the ids numerically, then convert back to strings
    all_opinions = map(int, all_opinions)
    all_opinions.sort()
    all_opinions = map(str, all_opinions)
    return all_opinions
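# hypothetical sanity check, reusing the opinion ids from the earlier cell:
# ids are extracted from the file names and sorted numerically
assert all_opinions(['textfiles/105366.txt', 'textfiles/98286.txt']) == ['98286', '105366']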
####################### Summarize Cluster 1 #######################
def top_k_words(opinions, num_words, tfidf_matrix, op_id_to_bow_id, vocab):
    """
    Summarizes a set of opinions by returning the words that appear in these
    opinions with the highest tf-idf scores.

    Parameters
    ----------
    opinions: list of opinion ids
    num_words: number of words to return as the summary
    tfidf_matrix: the tf-idf matrix of all SCOTUS opinions
    op_id_to_bow_id: dict that maps opinion ids to rows of the tf-idf matrix
    vocab: list of the words corresponding to the columns of the tf-idf matrix

    Output
    ------
    a list of the words with the highest tf-idf scores among the given opinions
    """
    vocab = np.array(vocab)
    n = num_words

    # get the row indices corresponding to the opinions
    row_indices = []
    for each_opinion in opinions:
        row_indices.append(op_id_to_bow_id[each_opinion])

    # construct a matrix whose rows are the opinions in the cluster
    new_matrix = tfidf_matrix[row_indices, :]

    # flatten the matrix into a list of tuples, sorted descending by tf-idf value
    sorted_matrix = sort_coo(new_matrix)

    # get the unique column indices, preserving the sort order
    column_ind = [x[1] for x in sorted_matrix]
    column_ind = f7(column_ind)

    # look up the words for the top column indices
    top_words = vocab[column_ind].tolist()[:n]
    return top_words
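# same computation as TF.top_k_words above; mirrors the call in cell In [39],
# using the objects loaded in cell In [10]
print top_k_words(['98286', '105366'], 10, tfidf_matrix, op_id_to_bow_id, vocab)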
####################### Summarize Cluster 2 #######################
def top_k_words_from_mean_vector(opinions, num_words, tfidf_matrix, op_id_to_bow_id, vocab):
    '''
    Computes the mean tf-idf vector of the cluster and returns the top K words
    from this mean vector.
    '''
    vocab = np.array(vocab)
    n = num_words

    # get the row indices corresponding to the opinions
    row_indices = []
    for each_opinion in opinions:
        row_indices.append(op_id_to_bow_id[each_opinion])

    # construct a matrix whose rows are the opinions in the cluster
    new_matrix = tfidf_matrix[row_indices, :]

    # mean of each column (axis=0); the result is a 1 x 567570 row matrix
    mean_matrix = new_matrix.mean(axis=0)

    # column indices sorted by mean tf-idf value, descending
    column_ind = np.argsort(mean_matrix, axis=1)[:, ::-1]

    # look up the words for the top column indices
    top_words = vocab[column_ind].tolist()[0][:n]
    return top_words
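# mean-vector variant on the same two opinions (hypothetical usage, not from the
# original notebook; objects loaded in cell In [10])
print top_k_words_from_mean_vector(['98286', '105366'], 10, tfidf_matrix, op_id_to_bow_id, vocab)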
####################### Summarize Cluster 3 #######################
def top_k_words_from_difference(opinions, all_opinions, num_words, tfidf_matrix, op_id_to_bow_id, vocab):
    '''
    Computes the mean tf-idf vector of the cluster and of the complement of the
    cluster, takes the difference mu_cluster - mu_complement, and returns the
    top K words in this difference.
    '''
    vocab = np.array(vocab)
    n = num_words

    # get the row indices corresponding to the opinions in the cluster
    row_indices = []
    for each_opinion in opinions:
        row_indices.append(op_id_to_bow_id[each_opinion])

    # construct a matrix whose rows are the opinions in the cluster
    cluster_matrix = tfidf_matrix[row_indices, :]

    # mean of each column (axis=0); the result is a 1 x 567570 row matrix
    mean_matrix = cluster_matrix.mean(axis=0)

    # complement of the cluster (all the other opinions)
    opinions_compl = [x for x in all_opinions if x not in opinions]

    # get the row indices corresponding to the complement of the cluster
    row_indices_compl = []
    for each_opinion in opinions_compl:
        row_indices_compl.append(op_id_to_bow_id[each_opinion])

    # construct a matrix whose rows are the opinions in the complement
    compl_matrix = tfidf_matrix[row_indices_compl, :]
    mean_matrix_compl = compl_matrix.mean(axis=0)

    # mu_cluster - mu_complement
    final_mean_matrix = mean_matrix - mean_matrix_compl

    # column indices sorted by the difference, descending
    column_ind = np.argsort(final_mean_matrix, axis=1)[:, ::-1]

    # look up the words for the top column indices
    top_words = vocab[column_ind].tolist()[0][:n]
    return top_words
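# hedged sketch: contrast the cluster against the rest of the corpus; assumes a
# hypothetical `file_paths` list of .txt paths so all_opinions() can enumerate ids
# all_ids = all_opinions(file_paths)
# print top_k_words_from_difference(['98286', '105366'], all_ids, 10,
#                                   tfidf_matrix, op_id_to_bow_id, vocab)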
####################### Summarize Cluster 4 #######################
def document_closest_to_mean(opinions, tfidf_matrix, op_id_to_bow_id):
    '''
    Computes the mean tf-idf vector of the cluster and returns the opinion in
    the cluster closest to the mean.
    '''
    # get the row indices corresponding to the opinions
    row_indices = []
    for each_opinion in opinions:
        row_indices.append(op_id_to_bow_id[each_opinion])

    # construct a matrix whose rows are the opinions in the cluster
    new_matrix = tfidf_matrix[row_indices, :]

    # mean of each column (axis=0); the result is a 1 x 567570 row matrix
    mean_matrix = new_matrix.mean(axis=0)

    # convert the row matrix to a flat vector
    mean_vector = np.squeeze(np.asarray(mean_matrix))

    # Euclidean distance between the mean vector and each row vector in the cluster
    euc_dist = {}
    for i in row_indices:
        row_vector = np.squeeze(np.asarray(tfidf_matrix[i].toarray()))
        euc_dist[i] = np.linalg.norm(mean_vector - row_vector)

    # row index closest to the mean vector (minimum Euclidean distance)
    row_index_close = min(euc_dist, key=euc_dist.get)

    # map the row index back to its opinion id
    for opinion, row_index in op_id_to_bow_id.iteritems():
        if row_index == row_index_close:
            return opinion
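In [ ]:
# hypothetical usage (not from the original notebook): return the opinion whose
# tf-idf vector is closest to the mean of the two-opinion "cluster" used above
print document_closest_to_mean(['98286', '105366'], tfidf_matrix, op_id_to_bow_id)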
In [ ]: