In [84]:
top_directory = '/Users/iaincarmichael/Dropbox/Research/law/law-net/'

import os
import sys
import time
from math import *
import copy
import cPickle as pickle
from glob import glob
import re

# data
import numpy as np
import pandas as pd

# viz
import matplotlib.pyplot as plt


# graph
import igraph as ig


# our code
sys.path.append(top_directory + 'code/')
from pipeline.download_data import *
from pipeline.make_raw_case_metadata import *
from load_data import case_info

sys.path.append(top_directory + 'explore/vertex_metrics_experiment/code/')
from make_case_text_files import *
from bag_of_words import *
from similarity_matrix import *
from make_snapshots import *



# directory set up
data_dir = '/Users/iaincarmichael/Documents/courtlistener/data/'
experiment_data_dir = data_dir + 'federal/'
text_dir = experiment_data_dir + 'textfiles/'

# courts
courts = ['scotus', 'cafc', 'cadc']
courts += ['ca' + str(i+1) for i in range(11)]


# jupyter notebook settings
%load_ext autoreload
%autoreload 2
%matplotlib inline



make iterator


In [39]:
class textfile_chunks:
    """Iterate over the text files, yielding them in chunks of chunk_size documents."""

    def __init__(self, paths, chunk_size):
        self.i = 0  # index of the current chunk

        self.paths = paths
        self.chunk_size = chunk_size
        self.num_files = len(paths)

        self.num_chunks = int(ceil(float(self.num_files) / self.chunk_size))

    def __iter__(self):
        return self

    def next(self):
        if self.i < self.num_chunks:

            # file paths in the current chunk
            start = self.i * self.chunk_size
            end = min(start + self.chunk_size, self.num_files)
            file_paths = self.paths[start:end]

            # read in the files and map each path to its normalized text
            files = {}
            for path in file_paths:
                with open(path, 'r') as f:
                    text = f.read()
                files[path] = text_normalization(text)

            self.i += 1

            return files

        else:
            raise StopIteration()

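The chunked reader isn't used in the run below (the flat iterator is), but as a quick sanity check it can be exercised on its own. A minimal sketch, assuming text_dir already contains the per-case .txt files:

In [ ]:
# sketch: pull the first chunk of normalized texts and check its size
# (assumes text_dir already contains the per-case .txt files)
chunk_paths = glob(text_dir + '*.txt')
chunker = textfile_chunks(chunk_paths, 3)

first_chunk = chunker.next()
print 'first chunk has %d documents' % len(first_chunk)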
In [30]:
class textfile_iter:
    """Iterate over the text files, yielding one normalized document at a time."""

    def __init__(self, paths):
        self.i = 0  # index of the current file
        self.paths = paths
        self.num_files = len(paths)

    def __iter__(self):
        return self

    def next(self):
        if self.i < self.num_files:
            with open(self.paths[self.i], 'r') as f:
                text = f.read()
            self.i += 1

            return text_normalization(text)

        else:
            raise StopIteration()

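Both iterators lean on text_normalization, which is imported from the project's bag_of_words module and not reproduced here. Purely as an illustration of the kind of cleaning such a function typically performs (this is a hypothetical stand-in, not the project's implementation):

In [ ]:
# hypothetical sketch only -- the real text_normalization lives in
# explore/vertex_metrics_experiment/code/bag_of_words.py
def text_normalization_sketch(text):
    # lowercase, replace anything that isn't a letter with a space,
    # then collapse runs of whitespace
    text = text.lower()
    text = re.sub(r'[^a-z\s]', ' ', text)
    return re.sub(r'\s+', ' ', text).strip()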
run bag of words


In [29]:
file_paths = glob(text_dir + '*.txt')
# text_chunker = textfile_chunks(file_paths, 3)

tf_iter = textfile_iter(file_paths)

In [31]:
bag_of_words = CountVectorizer()

%time BOW = bag_of_words.fit_transform(tf_iter)

vocab = bag_of_words.get_feature_names()

# map each CourtListener opinion id (the numeric part of the file name) to its row index in BOW
CLid_to_index = {re.search(r'(\d+)\.txt', path).group(1): i
                 for i, path in enumerate(file_paths)}


CPU times: user 59min 31s, sys: 6min 25s, total: 1h 5min 56s
Wall time: 1h 14min 16s
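
fit_transform makes a single pass over the iterator, so the corpus never has to be held in memory as raw strings; the result BOW is a scipy sparse document-term matrix whose rows follow the order of file_paths (which is what makes the CLid_to_index mapping above valid). A few quick sanity checks:

In [ ]:
# sanity checks on the fitted bag-of-words matrix
print BOW.shape           # (number of opinions, vocabulary size)
print len(vocab)          # should equal BOW.shape[1]
print len(CLid_to_index)  # should equal BOW.shape[0]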

In [92]:
# save data

save_sparse_csr(experiment_data_dir + 'bag_of_words_matrix', BOW)

with open(experiment_data_dir + 'CLid_to_index.p', 'wb') as fp:
    pickle.dump(CLid_to_index, fp)
    
with open(experiment_data_dir + 'vocab.p', 'wb') as fp:
    pickle.dump(vocab, fp)
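
To pick things up in a later session, the saved artifacts can be reloaded. A sketch under two assumptions: that the project code provides a load_sparse_csr counterpart to save_sparse_csr, and that save_sparse_csr writes an .npz file (as the usual numpy-based recipe does):

In [ ]:
# sketch: reload the saved artifacts
# load_sparse_csr and the '.npz' suffix are assumptions about the project's save helper
BOW = load_sparse_csr(experiment_data_dir + 'bag_of_words_matrix.npz')

with open(experiment_data_dir + 'CLid_to_index.p', 'rb') as fp:
    CLid_to_index = pickle.load(fp)

with open(experiment_data_dir + 'vocab.p', 'rb') as fp:
    vocab = pickle.load(fp)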
