In [1]:
repo_directory = '/Users/iaincarmichael/Dropbox/Research/law/law-net/'
data_dir = '/Users/iaincarmichael/data/courtlistener/'

import numpy as np
import os
import sys
import matplotlib.pyplot as plt
from scipy.stats import rankdata
import cPickle as pickle
from collections import Counter
import pandas as pd

# graph package
import igraph as ig

# our code
sys.path.append(repo_directory + 'code/')
from setup_data_dir import setup_data_dir, make_subnetwork_directory
from pipeline.download_data import download_bulk_resource, download_master_edgelist, download_scdb
from helpful_functions import case_info
from viz import print_describe
from stats.linear_model import *

sys.path.append(repo_directory + 'vertex_metrics_experiment/code/')

from custom_vertex_metrics import *


# which network to download data for
network_name = 'federal' # 'federal', 'ca1', etc


# some sub directories that get used
raw_dir = data_dir + 'raw/'
subnet_dir = data_dir + network_name + '/'
text_dir = subnet_dir + 'textfiles/'


# jupyter notebook settings
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
G = ig.Graph.Read_GraphML(subnet_dir + network_name +'_network.graphml')
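
A quick sanity check that the subnetwork loaded as expected; ig.summary prints the vertex and edge counts and the available attributes:

In [ ]:
# vertex/edge counts plus vertex and edge attribute names
ig.summary(G)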

text length vs outdegree
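
Later cells use outdegrees and num_words, which were computed in a cell left empty here. A minimal sketch of plausible definitions, assuming each opinion's text is stored as text_dir/<vertex name>.txt (both the file layout and the name attribute are assumptions):

In [ ]:
# numpy arrays so the elementwise boolean masks used below work
outdegrees = np.array(G.outdegree())

# hypothetical: word count of each opinion's text file
num_words = np.array([len(open(text_dir + v['name'] + '.txt').read().split())
                      for v in G.vs])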


In [4]:
indegrees = G.indegree()
years = G.vs['year']

plt.scatter(years, indegrees)
plt.xlabel('year')
plt.ylabel('indegree')


Out[4]:
<matplotlib.text.Text at 0x11d1f4450>

In [5]:
def get_year_aggregate(years, x, fcn):
    by_year = {y: [] for y in set(years)}
    for i in range(len(years)):
        by_year[years[i]].append(x[i])
    
    year_agg_dict = {y: fcn(by_year[y]) for y in by_year.keys()}
    return pd.Series(year_agg_dict)

in_year_median = get_year_aggregate(years, indegrees, np.median)
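
The same per-year aggregation is a one-liner with pandas groupby (an equivalent sketch):

In [ ]:
# group indegrees by year, then take the per-year median
in_year_median_alt = pd.Series(indegrees, index=years).groupby(level=0).median()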

In [7]:
plt.plot(in_year_median.index, in_year_median)
plt.xlim([1800, 2016])


Out[7]:
(1800, 2016)

In [8]:
in_year_median


Out[8]:
1792.0    0.0
1793.0    1.0
1794.0    2.0
1795.0    0.0
1796.0    0.0
1797.0    0.0
1798.0    0.0
1799.0    0.0
1800.0    0.0
1801.0    3.0
1803.0    0.0
1804.0    1.0
1805.0    0.0
1806.0    0.0
1807.0    0.0
1808.0    0.0
1809.0    0.0
1810.0    0.0
1812.0    0.0
1813.0    0.0
1814.0    0.0
1815.0    0.0
1816.0    1.0
1817.0    1.0
1818.0    0.0
1819.0    0.0
1820.0    0.0
1821.0    1.0
1822.0    2.0
1823.0    2.0
         ... 
1987.0    0.0
1988.0    0.0
1989.0    0.0
1990.0    0.0
1991.0    0.0
1992.0    0.0
1993.0    0.0
1994.0    0.0
1995.0    0.0
1996.0    0.0
1997.0    0.0
1998.0    0.0
1999.0    0.0
2000.0    0.0
2001.0    0.0
2002.0    0.0
2003.0    0.0
2004.0    0.0
2005.0    0.0
2006.0    0.0
2007.0    0.0
2008.0    0.0
2009.0    0.0
2010.0    0.0
2011.0    0.0
2012.0    0.0
2013.0    0.0
2014.0    0.0
2015.0    0.0
2016.0    0.0
dtype: float64
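
The long run of zero medians in recent years reflects citation lag: recently decided opinions have had little time to accumulate citations, so the median indegree in those years is zero.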

In [9]:
indegrees


Out[9]:
[67,
 53,
 282,
 4,
 13,
 1,
 45,
 4,
 ...]

In [14]:
plt.figure(figsize=[12, 4])

plt.subplot(1,3,1)
plt.scatter(num_words, outdegrees)
plt.xlabel('num words')
plt.ylabel('outdegree')

plt.subplot(1,3,2)
plt.scatter(num_words, indegrees)
plt.xlabel('num words')
plt.ylabel('indegree')

plt.subplot(1,3,3)
plt.scatter(years, num_words)
plt.xlabel('year')
plt.ylabel('num words')


Out[14]:
<matplotlib.text.Text at 0x120308790>
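
Degree and word counts are heavy-tailed, so a few extreme cases dominate these scatters; log-transformed axes spread out the bulk of the data (a sketch, using log1p to keep the zero counts plottable):

In [ ]:
# log1p handles the zero-degree and zero-word cases
plt.scatter(np.log1p(num_words), np.log1p(outdegrees))
plt.xlabel('log(1 + num words)')
plt.ylabel('log(1 + outdegree)')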

In [15]:
get_SLR(num_words, outdegrees, xlabel='num words', ylabel='outdegree')


                            OLS Regression Results                            
==============================================================================
Dep. Variable:                      Y   R-squared:                       0.363
Model:                            OLS   Adj. R-squared:                  0.363
Method:                 Least Squares   F-statistic:                 1.591e+04
Date:                Sat, 18 Feb 2017   Prob (F-statistic):               0.00
Time:                        15:14:21   Log-Likelihood:            -1.0368e+05
No. Observations:               27885   AIC:                         2.074e+05
Df Residuals:                   27883   BIC:                         2.074e+05
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [95.0% Conf. Int.]
------------------------------------------------------------------------------
const          1.9941      0.078     25.439      0.000         1.840     2.148
X              0.0014   1.11e-05    126.132      0.000         0.001     0.001
==============================================================================
Omnibus:                    12103.348   Durbin-Watson:                   1.670
Prob(Omnibus):                  0.000   Jarque-Bera (JB):          1373509.235
Skew:                           1.109   Prob(JB):                         0.00
Kurtosis:                      37.311   Cond. No.                     9.27e+03
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 9.27e+03. This might indicate that there are
strong multicollinearity or other numerical problems.
Out[15]:
<statsmodels.regression.linear_model.RegressionResultsWrapper at 0x11bbf3190>

In [16]:
# trim extremes: drop cases above the 99th percentile of outdegree or
# num words (the 0th-percentile lower bounds keep everything)
out_deg_upper = np.percentile(outdegrees, 99)
out_deg_lower = np.percentile(outdegrees, 0)

num_words_upper = np.percentile(num_words, 99)
num_words_lower = np.percentile(num_words, 0)

od_to_keep = (out_deg_lower <= outdegrees) & (outdegrees <= out_deg_upper)
nw_to_keep = (num_words_lower <= num_words) & (num_words <= num_words_upper)
to_keep =  od_to_keep & nw_to_keep

# re-fit the regression on the trimmed data
get_SLR(num_words[to_keep], outdegrees[to_keep], xlabel='num words', ylabel='outdegree')


                            OLS Regression Results                            
==============================================================================
Dep. Variable:                      Y   R-squared:                       0.368
Model:                            OLS   Adj. R-squared:                  0.368
Method:                 Least Squares   F-statistic:                 1.595e+04
Date:                Sat, 18 Feb 2017   Prob (F-statistic):               0.00
Time:                        15:14:23   Log-Likelihood:                -95606.
No. Observations:               27435   AIC:                         1.912e+05
Df Residuals:                   27433   BIC:                         1.912e+05
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [95.0% Conf. Int.]
------------------------------------------------------------------------------
const          1.2585      0.069     18.171      0.000         1.123     1.394
X              0.0015    1.2e-05    126.307      0.000         0.001     0.002
==============================================================================
Omnibus:                     5810.031   Durbin-Watson:                   1.629
Prob(Omnibus):                  0.000   Jarque-Bera (JB):            25119.152
Skew:                           0.983   Prob(JB):                         0.00
Kurtosis:                       7.255   Cond. No.                     8.39e+03
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 8.39e+03. This might indicate that there are
strong multicollinearity or other numerical problems.
Out[16]:
<statsmodels.regression.linear_model.RegressionResultsWrapper at 0x11baf0950>

In [17]:
get_SLR(num_words, indegrees)


                            OLS Regression Results                            
==============================================================================
Dep. Variable:                      Y   R-squared:                       0.124
Model:                            OLS   Adj. R-squared:                  0.124
Method:                 Least Squares   F-statistic:                     3944.
Date:                Sat, 18 Feb 2017   Prob (F-statistic):               0.00
Time:                        15:14:24   Log-Likelihood:            -1.1029e+05
No. Observations:               27885   AIC:                         2.206e+05
Df Residuals:                   27883   BIC:                         2.206e+05
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [95.0% Conf. Int.]
------------------------------------------------------------------------------
const          4.3593      0.099     43.888      0.000         4.165     4.554
X              0.0009   1.41e-05     62.805      0.000         0.001     0.001
==============================================================================
Omnibus:                    25994.942   Durbin-Watson:                   1.829
Prob(Omnibus):                  0.000   Jarque-Bera (JB):          1716859.925
Skew:                           4.355   Prob(JB):                         0.00
Kurtosis:                      40.441   Cond. No.                     9.27e+03
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 9.27e+03. This might indicate that there are
strong multicollinearity or other numerical problems.
Out[17]:
<statsmodels.regression.linear_model.RegressionResultsWrapper at 0x11bc4b510>
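
Because both variables are heavily right-skewed, these OLS fits are sensitive to the extreme cases; a rank-based measure such as Spearman correlation is a robust complement (a sketch using scipy):

In [ ]:
from scipy.stats import spearmanr

# rank correlation is insensitive to the heavy right tails
rho_out, p_out = spearmanr(num_words, outdegrees)
rho_in, p_in = spearmanr(num_words, indegrees)
print 'num words vs outdegree: rho = %1.3f' % rho_out
print 'num words vs indegree: rho = %1.3f' % rho_in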

text length over time


In [18]:
plt.scatter(years, num_words)


Out[18]:
<matplotlib.collections.PathCollection at 0x11cb8b750>

In [19]:
get_SLR(years, num_words)


                            OLS Regression Results                            
==============================================================================
Dep. Variable:                      Y   R-squared:                       0.062
Model:                            OLS   Adj. R-squared:                  0.062
Method:                 Least Squares   F-statistic:                     1828.
Date:                Sat, 18 Feb 2017   Prob (F-statistic):               0.00
Time:                        15:14:28   Log-Likelihood:            -2.7821e+05
No. Observations:               27885   AIC:                         5.564e+05
Df Residuals:                   27883   BIC:                         5.564e+05
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [95.0% Conf. Int.]
------------------------------------------------------------------------------
const       -4.86e+04   1244.086    -39.067      0.000      -5.1e+04 -4.62e+04
X             27.6915      0.648     42.759      0.000        26.422    28.961
==============================================================================
Omnibus:                    30916.869   Durbin-Watson:                   1.735
Prob(Omnibus):                  0.000   Jarque-Bera (JB):          6204170.596
Skew:                           5.403   Prob(JB):                         0.00
Kurtosis:                      75.270   Cond. No.                     7.66e+04
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 7.66e+04. This might indicate that there are
strong multicollinearity or other numerical problems.
Out[19]:
<statsmodels.regression.linear_model.RegressionResultsWrapper at 0x120495990>
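
Taken at face value, the slope says opinions grow by roughly 28 words per year on average, though with R-squared around 0.06 the year explains little of the case-to-case variation in length.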

In [20]:
# get_year_aggregate was already defined above (In [5]); reuse it here

in_year_median = get_year_aggregate(years, indegrees, np.median)

nw_year_median = get_year_aggregate(years, num_words, np.median)

od_year_median = get_year_aggregate(years, outdegrees, np.median)

In [24]:
plt.figure(figsize=[8, 4])
plt.subplot(1,2,1)
plt.plot(nw_year_median.index, nw_year_median/1000, label='num words/1000')
plt.plot(od_year_median.index, od_year_median, label='out degree')
plt.ylabel('median')
plt.xlabel('year')
plt.legend(loc='upper right')

plt.subplot(1,2,2)
plt.plot(nw_year_median.index, nw_year_median/1000, label='num words/1000')
plt.plot(od_year_median.index, od_year_median, label='out degree')

plt.ylabel('median')
plt.xlabel('year')
plt.legend(loc='upper right')


Out[24]:
<matplotlib.legend.Legend at 0x11bc25c90>

In [28]:
plt.figure(figsize=[8, 8])
plt.scatter(nw_year_median.index, nw_year_median/1000,
            label='num words/1000', color='blue', marker='*')
plt.scatter(od_year_median.index, od_year_median,
            label='out degree', color='red', marker='s')
plt.scatter(in_year_median.index, in_year_median,
            label='in degree', color='green', marker='o')
plt.ylabel('median')
plt.xlabel('year')
plt.legend(loc='upper right')
plt.xlim([1800, 2017])
plt.ylim([0, 30])


Out[28]:
(0, 30)

In [45]:
plt.figure(figsize=[8, 8])
plt.plot(nw_year_median.index, nw_year_median/1000,
            label='num words/1000', color='black', marker='$n$', alpha=.7, linewidth=1, linestyle=':')
plt.plot(od_year_median.index, od_year_median,
            label='out degree', color='black', marker='$o$', alpha=.7, linestyle=':')
plt.plot(in_year_median.index, in_year_median,
            label='in degree', color='black', marker='$i$', alpha=.7, linestyle=':')
plt.ylabel('median')
plt.xlabel('year')
plt.legend(loc='upper right')
plt.xlim([1800, 2017])
plt.ylim([0, 30])


Out[45]:
(0, 30)

In [58]:
plt.figure(figsize=[6, 9])
plt.subplot(3,1,1)
plt.plot(nw_year_median.index, nw_year_median/1000,
         color='black', marker='.', linestyle=':')
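# vertical lines at 1953 and 1969 presumably bracket the Warren Court era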
plt.axvline(1953, color='black', alpha=.5)
plt.axvline(1969, color='black', alpha=.5)
plt.ylabel('median text length (1000s of words)')
plt.xlim([1800, 2017])
plt.ylim([0, 30])

plt.subplot(3,1,2)
plt.plot(od_year_median.index, od_year_median,
         color='black', marker='.', linestyle=':')
plt.axvline(1953, color='black', alpha=.5)
plt.axvline(1969, color='black', alpha=.5)
plt.ylabel('median outdegree')
plt.xlim([1800, 2017])
plt.ylim([0, 30])

plt.subplot(3,1,3)
plt.plot(in_year_median.index, in_year_median,
         color='black', marker='.', linestyle=':')
plt.axvline(1953, color='black', alpha=.5)
plt.axvline(1969, color='black', alpha=.5)
plt.ylabel('median indegree')
plt.xlabel('year')
plt.xlim([1800, 2017])
plt.ylim([0, 30])


Out[58]:
(0, 30)

In [99]:
import networkx as nx

In [96]:
Gnx = nx.read_graphml(subnet_dir + network_name +'_network.graphml')

In [103]:
%%time
katz = nx.katz_centrality(Gnx)


CPU times: user 18.3 s, sys: 208 ms, total: 18.5 s
Wall time: 18.5 s
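
nx.katz_centrality solves x = alpha * A^T x + beta by fixed-point iteration and requires alpha below 1/lambda_max of the adjacency matrix. For a (nearly) acyclic citation network lambda_max sits at or near zero, so the default alpha=0.1 converges easily, but the knobs are worth knowing on a graph this size (a sketch of the signature):

In [ ]:
# smaller alpha discounts long citation chains more heavily; raise max_iter
# or loosen tol if the fixed-point iteration fails to converge
katz = nx.katz_centrality(Gnx, alpha=0.05, beta=1.0, max_iter=5000, tol=1e-08)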
