In [84]:
top_directory = '/Users/iaincarmichael/Dropbox/Research/law/law-net/'
import os
import sys
import time
from math import *
import copy
import cPickle as pickle
from glob import glob
import re
# data
import numpy as np
import pandas as pd
# viz
import matplotlib.pyplot as plt
# graph
import igraph as ig
# our code
sys.path.append(top_directory + 'code/')
from pipeline.download_data import *
from pipeline.make_raw_case_metadata import *
from load_data import case_info
sys.path.append(top_directory + 'explore/vertex_metrics_experiment/code/')
from make_case_text_files import *
from bag_of_words import *
from similarity_matrix import *
from make_snapshots import *
# directory set up
data_dir = '/Users/iaincarmichael/Documents/courtlistener/data/'
experiment_data_dir = data_dir + 'federal/'
text_dir = experiment_data_dir + 'textfiles/'
# courts
courts = ['scotus', 'cafc', 'cadc']
courts += ['ca' + str(i+1) for i in range(11)]
# jupyter notebook settings
%load_ext autoreload
%autoreload 2
%matplotlib inline
In [39]:
class textfile_chunks:
    """Iterate over the text files in chunks, yielding a dict of normalized texts per chunk."""

    def __init__(self, paths, chunk_size):
        self.i = 0
        self.paths = paths
        self.chunk_size = chunk_size

        self.num_files = len(paths)
        self.num_chunks = int(ceil(float(self.num_files) / self.chunk_size))

    def __iter__(self):
        return self

    def next(self):
        if self.i < self.num_chunks:
            # file paths in the current chunk (advance by chunk_size, not by 1)
            start = self.i * self.chunk_size
            end = min(start + self.chunk_size, self.num_files)
            file_paths = self.paths[start:end]

            # read in the files and put the normalized text into a dict keyed by path
            files = {}
            for path in file_paths:
                with open(path, 'r') as f:
                    files[path] = text_normalization(f.read())

            self.i += 1
            return files
        else:
            raise StopIteration()
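A quick sanity check of the chunked reader (a minimal sketch; the chunk size of 100 is an arbitrary placeholder, and the loop only counts the files in each chunk):
In [ ]:
# hypothetical usage of textfile_chunks: walk the corpus 100 files at a time
chunker = textfile_chunks(glob(text_dir + '*.txt'), 100)
for chunk in chunker:
    print(len(chunk))  # number of normalized files in this chunk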
In [30]:
class textfile_iter:
    """Iterate over the text files one at a time, yielding normalized text."""

    def __init__(self, paths):
        self.i = 0
        self.paths = paths
        self.num_files = len(paths)

    def __iter__(self):
        return self

    def next(self):
        if self.i < self.num_files:
            with open(self.paths[self.i], 'r') as f:
                text = f.read()
            self.i += 1
            return text_normalization(text)
        else:
            raise StopIteration()
In [29]:
file_paths = glob(text_dir + '*.txt')
# text_chunker = textfile_chunks(file_paths, 3)
tf_iter = textfile_iter(file_paths)
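Before fitting anything it can help to eyeball the normalization; a fresh iterator is created below so tf_iter itself is not consumed (this assumes text_dir contains at least one .txt file):
In [ ]:
# peek at the first 500 characters of the first normalized opinion
print(textfile_iter(file_paths).next()[:500])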
In [31]:
# CountVectorizer comes from scikit-learn; import it explicitly in case
# bag_of_words does not re-export it
from sklearn.feature_extraction.text import CountVectorizer

bag_of_words = CountVectorizer()
%time BOW = bag_of_words.fit_transform(tf_iter)
vocab = bag_of_words.get_feature_names()

# map each CourtListener opinion id (the .txt file name) to its row index in BOW
CLid_to_index = {re.search(r'(\d+)\.txt', file_paths[i]).group(1): i for i in range(len(file_paths))}
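With the mapping in hand, the BOW row of a single opinion can be pulled out by its CourtListener id (the id below is made up for illustration):
In [ ]:
# hypothetical lookup: term counts for one opinion by CourtListener id
clid = '108713'  # placeholder id; any key of CLid_to_index works
if clid in CLid_to_index:
    row = BOW[CLid_to_index[clid], :]  # 1 x |vocab| sparse row of term counts
    print(row.sum())                   # total token count for that opinion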
In [92]:
# save data
save_sparse_csr(experiment_data_dir + 'bag_of_words_matrix', BOW)
with open(experiment_data_dir + 'CLid_to_index.p', 'wb') as fp:
pickle.dump(CLid_to_index, fp)
with open(experiment_data_dir + 'vocab.p', 'wb') as fp:
pickle.dump(vocab, fp)
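For later sessions, the pickled objects reload directly; the sketch below also assumes a load_sparse_csr counterpart to save_sparse_csr exists in the project code (and that it knows any file extension the saver appended):
In [ ]:
# reload the saved bag-of-words data in a fresh session
with open(experiment_data_dir + 'CLid_to_index.p', 'rb') as fp:
    CLid_to_index = pickle.load(fp)
with open(experiment_data_dir + 'vocab.p', 'rb') as fp:
    vocab = pickle.load(fp)
# assumed helper paired with save_sparse_csr in the project's code
BOW = load_sparse_csr(experiment_data_dir + 'bag_of_words_matrix')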