In [11]:
# NOTE(review): hardcoded absolute paths — this notebook only runs on the
# author's machine; consider a configurable DATA_DIR
repo_directory = '/Users/iaincarmichael/Dropbox/Research/law/law-net/'
data_dir = '/Users/iaincarmichael/data/courtlistener/'

# scientific stack
import numpy as np
import os
import sys
import matplotlib.pyplot as plt
from scipy.stats import rankdata
import cPickle as pickle  # Python 2 pickle; plain `pickle` on Python 3
from collections import Counter
import pandas as pd

# graph package
import igraph as ig

# our code
sys.path.append(repo_directory + 'code/')
from setup_data_dir import setup_data_dir, make_subnetwork_directory
from pipeline.download_data import download_bulk_resource, download_master_edgelist, download_scdb
from helpful_functions import case_info
from viz import print_describe
# NOTE(review): star import brings get_SLR (used below) into the namespace;
# an explicit `from stats.linear_model import get_SLR` would be clearer
from stats.linear_model import *

sys.path.append(repo_directory + 'vertex_metrics_experiment/code/')

from custom_vertex_metrics import *


# which network to download data for
network_name = 'scotus' # 'federal', 'ca1', etc


# some sub directories that get used
raw_dir = data_dir + 'raw/'
subnet_dir = data_dir + network_name + '/'
text_dir = subnet_dir + 'textfiles/'


# jupyter notebook settings
%load_ext autoreload
%autoreload 2
%matplotlib inline


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload

In [12]:
# load the citation network for the chosen subnetwork
# NOTE: igraph warns (see output below) that the graphml file already has an
# 'id' vertex attribute so it cannot add its own vertex ids — benign here
G = ig.Graph.Read_GraphML(subnet_dir + network_name +'_network.graphml')


/Users/iaincarmichael/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:1: RuntimeWarning: Could not add vertex ids, there is already an 'id' vertex attribute at foreign-graphml.c:443
  if __name__ == '__main__':

Text length vs. outdegree


In [13]:
# pull vertex attributes and degrees out of the graph as numpy arrays
num_words = np.array(G.vs['num_words'])

outdegrees = np.array(G.outdegree())

# consistency fix: these two were left as plain lists while the two above
# were wrapped in np.array; arrays support vectorized ops and boolean masks
indegrees = np.array(G.indegree())

years = np.array(G.vs['year'])

In [14]:
# three panels: text length vs. outdegree, text length vs. indegree,
# and text length over time
plt.figure(figsize=[12, 4])

plt.subplot(1,3,1)
plt.scatter(num_words, outdegrees)
plt.xlabel('num words')
plt.ylabel('outdegree')

plt.subplot(1,3,2)
plt.scatter(num_words, indegrees)
plt.xlabel('num words')
plt.ylabel('indegree')

plt.subplot(1,3,3)
plt.scatter(years, num_words)
# bug fix: ylabel was called twice here, leaving the x axis unlabeled
# and overwriting the 'year' label
plt.xlabel('year')
plt.ylabel('num words')


Out[14]:
<matplotlib.text.Text at 0x120308790>

In [15]:
# simple linear regression of outdegree on text length
# (get_SLR comes from stats.linear_model via the star import above;
# it prints an OLS summary and presumably draws the fit — confirm there)
get_SLR(num_words, outdegrees, xlabel='num words', ylabel='outdegree')


                            OLS Regression Results                            
==============================================================================
Dep. Variable:                      Y   R-squared:                       0.363
Model:                            OLS   Adj. R-squared:                  0.363
Method:                 Least Squares   F-statistic:                 1.591e+04
Date:                Sat, 18 Feb 2017   Prob (F-statistic):               0.00
Time:                        15:14:21   Log-Likelihood:            -1.0368e+05
No. Observations:               27885   AIC:                         2.074e+05
Df Residuals:                   27883   BIC:                         2.074e+05
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [95.0% Conf. Int.]
------------------------------------------------------------------------------
const          1.9941      0.078     25.439      0.000         1.840     2.148
X              0.0014   1.11e-05    126.132      0.000         0.001     0.001
==============================================================================
Omnibus:                    12103.348   Durbin-Watson:                   1.670
Prob(Omnibus):                  0.000   Jarque-Bera (JB):          1373509.235
Skew:                           1.109   Prob(JB):                         0.00
Kurtosis:                      37.311   Cond. No.                     9.27e+03
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 9.27e+03. This might indicate that there are
strong multicollinearity or other numerical problems.
Out[15]:
<statsmodels.regression.linear_model.RegressionResultsWrapper at 0x11bbf3190>

In [16]:
# remove cases with extremes
out_deg_upper = np.percentile(outdegrees, 99)
out_deg_lower = np.percentile(outdegrees, 0)

num_words_upper = np.percentile(num_words, 99)
num_words_lower = np.percentile(num_words, 0)

od_to_keep = (out_deg_lower <= outdegrees) & (outdegrees <= out_deg_upper)
nw_to_keep = (num_words_lower <= num_words) & (num_words <= num_words_upper)
to_keep =  od_to_keep & nw_to_keep

# remove cases that have zero out-degree
get_SLR(num_words[to_keep], outdegrees[to_keep], xlabel='num words', ylabel='outdegree')


                            OLS Regression Results                            
==============================================================================
Dep. Variable:                      Y   R-squared:                       0.368
Model:                            OLS   Adj. R-squared:                  0.368
Method:                 Least Squares   F-statistic:                 1.595e+04
Date:                Sat, 18 Feb 2017   Prob (F-statistic):               0.00
Time:                        15:14:23   Log-Likelihood:                -95606.
No. Observations:               27435   AIC:                         1.912e+05
Df Residuals:                   27433   BIC:                         1.912e+05
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [95.0% Conf. Int.]
------------------------------------------------------------------------------
const          1.2585      0.069     18.171      0.000         1.123     1.394
X              0.0015    1.2e-05    126.307      0.000         0.001     0.002
==============================================================================
Omnibus:                     5810.031   Durbin-Watson:                   1.629
Prob(Omnibus):                  0.000   Jarque-Bera (JB):            25119.152
Skew:                           0.983   Prob(JB):                         0.00
Kurtosis:                       7.255   Cond. No.                     8.39e+03
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 8.39e+03. This might indicate that there are
strong multicollinearity or other numerical problems.
Out[16]:
<statsmodels.regression.linear_model.RegressionResultsWrapper at 0x11baf0950>

In [17]:
# regression of indegree on text length; axis labels added for
# consistency with the outdegree regressions above
get_SLR(num_words, indegrees, xlabel='num words', ylabel='indegree')


                            OLS Regression Results                            
==============================================================================
Dep. Variable:                      Y   R-squared:                       0.124
Model:                            OLS   Adj. R-squared:                  0.124
Method:                 Least Squares   F-statistic:                     3944.
Date:                Sat, 18 Feb 2017   Prob (F-statistic):               0.00
Time:                        15:14:24   Log-Likelihood:            -1.1029e+05
No. Observations:               27885   AIC:                         2.206e+05
Df Residuals:                   27883   BIC:                         2.206e+05
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [95.0% Conf. Int.]
------------------------------------------------------------------------------
const          4.3593      0.099     43.888      0.000         4.165     4.554
X              0.0009   1.41e-05     62.805      0.000         0.001     0.001
==============================================================================
Omnibus:                    25994.942   Durbin-Watson:                   1.829
Prob(Omnibus):                  0.000   Jarque-Bera (JB):          1716859.925
Skew:                           4.355   Prob(JB):                         0.00
Kurtosis:                      40.441   Cond. No.                     9.27e+03
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 9.27e+03. This might indicate that there are
strong multicollinearity or other numerical problems.
Out[17]:
<statsmodels.regression.linear_model.RegressionResultsWrapper at 0x11bc4b510>

Text length over time


In [ ]:


In [18]:
# text length over time (axis labels added — the plot was unlabeled)
plt.scatter(years, num_words)
plt.xlabel('year')
plt.ylabel('num words')


Out[18]:
<matplotlib.collections.PathCollection at 0x11cb8b750>

In [19]:
# regress text length on year; labels added for consistency with the
# other get_SLR calls in this notebook
get_SLR(years, num_words, xlabel='year', ylabel='num words')


                            OLS Regression Results                            
==============================================================================
Dep. Variable:                      Y   R-squared:                       0.062
Model:                            OLS   Adj. R-squared:                  0.062
Method:                 Least Squares   F-statistic:                     1828.
Date:                Sat, 18 Feb 2017   Prob (F-statistic):               0.00
Time:                        15:14:28   Log-Likelihood:            -2.7821e+05
No. Observations:               27885   AIC:                         5.564e+05
Df Residuals:                   27883   BIC:                         5.564e+05
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [95.0% Conf. Int.]
------------------------------------------------------------------------------
const       -4.86e+04   1244.086    -39.067      0.000      -5.1e+04 -4.62e+04
X             27.6915      0.648     42.759      0.000        26.422    28.961
==============================================================================
Omnibus:                    30916.869   Durbin-Watson:                   1.735
Prob(Omnibus):                  0.000   Jarque-Bera (JB):          6204170.596
Skew:                           5.403   Prob(JB):                         0.00
Kurtosis:                      75.270   Cond. No.                     7.66e+04
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 7.66e+04. This might indicate that there are
strong multicollinearity or other numerical problems.
Out[19]:
<statsmodels.regression.linear_model.RegressionResultsWrapper at 0x120495990>

In [20]:
def get_year_aggregate(years, x, fcn):
    """Aggregate the values of x within each year.

    Parameters
    ----------
    years: iterable of year labels, one per observation
    x: iterable of values, aligned element-wise with years
    fcn: function applied to the list of values in each year
        (e.g. np.median, np.mean)

    Returns
    -------
    pd.Series of aggregated values, indexed by year in ascending order
    """
    # group values by year in one pass (zip instead of indexing by position)
    by_year = {}
    for y, val in zip(years, x):
        by_year.setdefault(y, []).append(val)

    year_agg_dict = {y: fcn(vals) for y, vals in by_year.items()}
    # sort the index explicitly: dict ordering is not guaranteed, and the
    # time-series plots below rely on a chronological index
    return pd.Series(year_agg_dict).sort_index()

# per-year medians of each quantity, used by the time-series plots below
in_year_median = get_year_aggregate(years, indegrees, np.median)

nw_year_median = get_year_aggregate(years, num_words, np.median)

od_year_median = get_year_aggregate(years, outdegrees, np.median)

In [24]:
# side-by-side time series of median text length (in thousands) and
# median outdegree.
# NOTE(review): both subplots currently draw identical data — perhaps the
# left panel was meant to show yearly means; left as-is, but the 'mean'
# y-label was wrong (the series plotted are medians) and is fixed below
plt.figure(figsize=[8, 4])
plt.subplot(1,2,1)
plt.plot(nw_year_median.index, nw_year_median/1000, label='num words')
plt.plot(od_year_median.index, od_year_median, label='out degree')
plt.ylabel('median')
plt.xlabel('year')
plt.legend(loc='upper right')

plt.subplot(1,2,2)
plt.plot(nw_year_median.index, nw_year_median/1000, label='num words')
plt.plot(od_year_median.index, od_year_median, label='out degree')

plt.ylabel('median')
plt.xlabel('year')
plt.legend(loc='upper right')


Out[24]:
<matplotlib.legend.Legend at 0x11bc25c90>

In [28]:
# per-year medians of text length (in thousands), outdegree, and indegree
plt.figure(figsize=[8, 8])
plt.scatter(nw_year_median.index, nw_year_median/1000,
            label='num words/1000', color='blue', marker='*')
plt.scatter(od_year_median.index, od_year_median,
            label='out degree',  color='red', marker='s')
# typo fix: label previously read 'indegree degree'
plt.scatter(in_year_median.index, in_year_median,
            label='in degree',  color='green', marker='o')
plt.ylabel('median')
plt.xlabel('year')
plt.legend(loc='upper right')
plt.xlim([1800, 2017])
plt.ylim([0, 30])


Out[28]:
(0, 30)

In [45]:
# same series as above, drawn as dotted lines with letter markers
plt.figure(figsize=[8, 8])
plt.plot(nw_year_median.index, nw_year_median/1000,
            label='num words/1000', color='black', marker='$n$', alpha=.7, linewidth=1, linestyle=':')
plt.plot(od_year_median.index, od_year_median,
            label='out degree',  color='black', marker='$o$', alpha=.7, linestyle=':')
# typo fix: label previously read 'indegree degree'
plt.plot(in_year_median.index, in_year_median,
            label='in degree',  color='black', marker='$i$', alpha=.7, linestyle=':')
plt.ylabel('median')
plt.xlabel('year')
plt.legend(loc='upper right')
plt.xlim([1800, 2017])
plt.ylim([0, 30])


Out[45]:
(0, 30)

In [58]:
# three stacked panels: median text length, outdegree, and indegree per year
plt.figure(figsize=[6, 9])
plt.subplot(3,1,1)
plt.plot(nw_year_median.index, nw_year_median/1000,
         color='black', marker='.', linestyle=':')
# vertical guides at 1953 and 1969 — presumably marking the Warren Court
# era (1953-1969); TODO confirm the intended annotation
plt.axvline(1953, color='black', alpha=.5)
plt.axvline(1969, color='black', alpha=.5)
plt.ylabel('median text length')
plt.xlim([1800, 2017])
plt.ylim([0, 30])

plt.subplot(3,1,2)
plt.plot(od_year_median.index, od_year_median,
         color='black', marker='.', linestyle=':')
plt.axvline(1953, color='black', alpha=.5)
plt.axvline(1969, color='black', alpha=.5)
plt.ylabel('median outdegree')
plt.xlim([1800, 2017])
plt.ylim([0, 30])

plt.subplot(3,1,3)
plt.plot(in_year_median.index, in_year_median,
         color='black', marker='.', linestyle=':')
plt.axvline(1953, color='black', alpha=.5)
plt.axvline(1969, color='black', alpha=.5)
plt.ylabel('median indegree')
plt.xlabel('year')
plt.xlim([1800, 2017])
plt.ylim([0, 30])


Out[58]:
(0, 30)

In [27]:
# interactive docstring lookup — scratch cell; remove before sharing
?plt.scatter

In [99]:
import networkx as nx

In [96]:
# load the same network with networkx (for algorithms igraph does not provide)
Gnx = nx.read_graphml(subnet_dir + network_name +'_network.graphml')

In [103]:
%%time
# Katz centrality via networkx; took ~18.5 s wall time on the full scotus
# network (see timing output below)
katz = nx.katz_centrality(Gnx)


CPU times: user 18.3 s, sys: 208 ms, total: 18.5 s
Wall time: 18.5 s

In [ ]: