In [45]:
repo_directory = '/Users/iaincarmichael/Dropbox/Research/law/law-net/'

data_dir = '/Users/iaincarmichael/Documents/courtlistener/data/'

import numpy as np
import pandas as pd
import sys
import matplotlib.pyplot as plt


# graph package
import igraph as ig

# our code
sys.path.append(repo_directory + 'code/')
from setup_data_dir import setup_data_dir, make_subnetwork_directory
from pipeline.download_data import download_bulk_resource, download_master_edgelist, download_scdb
from helpful_functions import case_info

from stats.viz import *
from stats.dim_reduction import *
from stats.linear_model import *

sys.path.append(repo_directory + 'vertex_metrics_experiment/code/')
from rankscore_experiment_sort import *
from rankscore_experiment_LR import *
from make_tr_edge_df import *


# which network to download data for
network_name = 'scotus' # 'federal', 'ca1', etc


# some sub directories that get used
raw_dir = data_dir + 'raw/'
subnet_dir = data_dir + network_name + '/'
text_dir = subnet_dir + 'textfiles/'


# jupyter notebook settings
%load_ext autoreload
%autoreload 2
%matplotlib inline



In [2]:
G = ig.Graph.Read_GraphML(subnet_dir + network_name +'_network.graphml')

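A quick sanity check on the loaded network (a minimal sketch; G.summary() is igraph's one-line description of a graph):

In [ ]:
print(G.summary())
print('%d vertices, %d edges' % (G.vcount(), G.ecount()))
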
in-degree vs out-degree

How does the number of cases an opinion cites (out-degree) relate to the number of citations it receives (in-degree)?


In [52]:
outdegs = np.array(G.outdegree())
indegs = np.array(G.indegree())


# to_keep = (indegs >= 1) & (outdegs >= 1)
# indegs = indegs[to_keep]
# outdegs = outdegs[to_keep]

In [53]:
plt.figure(figsize=[8,4])
plt.subplot(1,2,1)
plt.scatter(outdegs, indegs)
plt.xlabel('out degree')
plt.ylabel('in degree')
plt.xlim([0, 1.2*max(outdegs)])
plt.ylim([0, 1.2*max(indegs)])

plt.subplot(1,2,2)
# add 1 before taking logs to avoid log(0) for zero-degree vertices
plt.scatter(np.log(outdegs + 1), np.log(indegs + 1))
plt.xlabel('log(out degree + 1)')
plt.ylabel('log(in degree + 1)')


Out[53]:
<matplotlib.text.Text at 0x11a23aad0>

In [54]:
get_SLR(outdegs, indegs)


                            OLS Regression Results                            
==============================================================================
Dep. Variable:                      Y   R-squared:                       0.209
Model:                            OLS   Adj. R-squared:                  0.209
Method:                 Least Squares   F-statistic:                     7355.
Date:                Fri, 30 Dec 2016   Prob (F-statistic):               0.00
Time:                        15:26:35   Log-Likelihood:            -1.0887e+05
No. Observations:               27885   AIC:                         2.177e+05
Df Residuals:                   27883   BIC:                         2.178e+05
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [95.0% Conf. Int.]
------------------------------------------------------------------------------
const          4.2557      0.087     49.123      0.000         4.086     4.426
X              0.4935      0.006     85.764      0.000         0.482     0.505
==============================================================================
Omnibus:                    26819.505   Durbin-Watson:                   1.869
Prob(Omnibus):                  0.000   Jarque-Bera (JB):          2056505.697
Skew:                           4.532   Prob(JB):                         0.00
Kurtosis:                      44.083   Cond. No.                         18.2
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
Out[54]:
<statsmodels.regression.linear_model.RegressionResultsWrapper at 0x117ac5050>

In [56]:
get_SLR(np.log(outdegs + 1), np.log(indegs + 1))


                            OLS Regression Results                            
==============================================================================
Dep. Variable:                      Y   R-squared:                       0.290
Model:                            OLS   Adj. R-squared:                  0.290
Method:                 Least Squares   F-statistic:                 1.140e+04
Date:                Fri, 30 Dec 2016   Prob (F-statistic):               0.00
Time:                        15:27:02   Log-Likelihood:                -38137.
No. Observations:               27885   AIC:                         7.628e+04
Df Residuals:                   27883   BIC:                         7.629e+04
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [95.0% Conf. Int.]
------------------------------------------------------------------------------
const          0.8027      0.009     85.451      0.000         0.784     0.821
X              0.5108      0.005    106.792      0.000         0.501     0.520
==============================================================================
Omnibus:                      113.537   Durbin-Watson:                   1.812
Prob(Omnibus):                  0.000   Jarque-Bera (JB):              114.843
Skew:                           0.156   Prob(JB):                     1.15e-25
Kurtosis:                       3.044   Cond. No.                         3.82
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
Out[56]:
<statsmodels.regression.linear_model.RegressionResultsWrapper at 0x11a479810>

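get_SLR comes from our stats.linear_model module; assuming it wraps statsmodels' OLS with an intercept added to X, the log-log fit above is roughly equivalent to:

In [ ]:
import statsmodels.api as sm

# hypothetical equivalent of get_SLR: regress Y on X with an intercept
X = sm.add_constant(np.log(outdegs + 1))
Y = np.log(indegs + 1)
print(sm.OLS(Y, X).fit().summary())
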
other metrics

Compute a collection of directed and undirected vertex metrics and compare them.


In [33]:
%time d_pagerank = G.pagerank()


CPU times: user 47 ms, sys: 6.85 ms, total: 53.8 ms
Wall time: 55.5 ms

In [29]:
%time u_pagerank = G.as_undirected().pagerank()


CPU times: user 498 ms, sys: 50.8 ms, total: 548 ms
Wall time: 565 ms

In [21]:
%time d_betweenness = G.betweenness(directed=True)


CPU times: user 1min 7s, sys: 987 ms, total: 1min 8s
Wall time: 1min 9s

In [26]:
%time u_betweenness = G.as_undirected().betweenness(directed=False)


CPU times: user 6min 45s, sys: 8.38 s, total: 6min 53s
Wall time: 7min 8s

In [22]:
%time d_closeness = G.closeness(mode="IN", normalized=True)


CPU times: user 43.2 s, sys: 1.34 s, total: 44.6 s
Wall time: 46.1 s

In [25]:
%time u_closeness = G.as_undirected().closeness(normalized=True)


CPU times: user 3min 27s, sys: 3.69 s, total: 3min 30s
Wall time: 3min 41s

In [30]:
%time d_eigen = G.eigenvector_centrality()


CPU times: user 1min 32s, sys: 896 ms, total: 1min 33s
Wall time: 1min 34s

In [31]:
%time u_eigen = G.as_undirected().eigenvector_centrality()


CPU times: user 493 ms, sys: 14.9 ms, total: 508 ms
Wall time: 513 ms

In [120]:
%time hubs = G.hub_score()


CPU times: user 339 ms, sys: 6.08 ms, total: 345 ms
Wall time: 349 ms

In [121]:
%time authorities = G.authority_score()


CPU times: user 316 ms, sys: 2.48 ms, total: 319 ms
Wall time: 319 ms

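As a side note, G.as_undirected() rebuilds the undirected copy of the graph on every call above; if the timings matter, the copy can be built once and reused (a sketch):

In [ ]:
Gu = G.as_undirected()  # build the undirected copy once and reuse it

u_pagerank = Gu.pagerank()
u_eigen = Gu.eigenvector_centrality()
u_closeness = Gu.closeness(normalized=True)
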
In [176]:
indegree = G.indegree()
outdegree = G.outdegree()
degree = G.degree()

In [177]:
df = pd.DataFrame(index=G.vs['name'])

df['year'] = G.vs['year']

df['indegree'] = indegree
df['outdegree'] = outdegree
df['degree'] = degree
df['d_pagerank'] = d_pagerank
df['u_pagerank'] = u_pagerank
df['d_betweenness'] = d_betweenness
df['u_betweenness'] = u_betweenness
df['d_closeness'] = d_closeness
df['u_closeness'] = u_closeness
df['d_eigen'] = d_eigen
df['u_eigen'] = u_eigen
df['hubs'] = hubs
df['authorities'] = authorities

In [178]:
# scale each column by its standard deviation before plotting
plot_scatter_matrix(df.apply(lambda c: c/np.std(c)))

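The scatter matrix is easier to digest next to a numeric summary; Spearman rank correlations (built into pandas) make the monotone relationships explicit:

In [ ]:
df.corr(method='spearman').round(2)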


In [179]:
U, D, V = get_PCA(df, scale=True)

In [180]:
plot_scores(U,
            start=1,
            n_comp=5,
            title='')

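get_PCA is from our stats.dim_reduction module; assuming it standardizes the columns (scale=True) and then takes an SVD, the scores U plotted above could be reproduced with numpy directly:

In [ ]:
# hypothetical equivalent of get_PCA(df, scale=True)
X = (df - df.mean()) / df.std()      # center and scale each column
U, D, Vt = np.linalg.svd(X.values, full_matrices=False)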

issue area

Classify cases as procedural, substantive, or other using the SCDB issueArea variable.


In [146]:
# map types to issues
type_to_issue = {'procedural': [1, 4, 6, 9],
                 'substantive': [2, 3, 5, 7, 8, 12, 14],
                 'other': [10, 11, 13, 0]}

# map issues to type
issue_to_type = {i: '' for i in range(14 + 1)}  # issue codes run 0-14
for t in type_to_issue.keys():
    for i in type_to_issue[t]:
        issue_to_type[i] = t

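A quick check (a sketch) that every issueArea code that actually appears in the graph is covered by the mapping:

In [ ]:
observed = set(int(i) for i in G.vs['issueArea'])
print(observed - set(issue_to_type.keys()))  # should be empty
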
In [182]:
# create type
G.vs['issueArea'] = [int(i) for i in G.vs['issueArea']]
G.vs['type'] = [issue_to_type[i] for i in G.vs['issueArea']]

# add to data frame
df['issueArea'] = G.vs['issueArea']
df['type'] = G.vs['type']

In [183]:
# get type subsets
df_sub = df[df['type'] == 'substantive']
df_pro = df[df['type'] == 'procedural']
df_oth = df[df['type'] == 'other']

print 'num substantive: %d' % df_sub.shape[0]
print 'num procedural: %d' % df_pro.shape[0]
print 'num other: %d' % df_oth.shape[0]


num substantive: 16891
num procedural: 9733
num other: 1261

In [184]:
df.columns


Out[184]:
Index([u'year', u'indegree', u'outdegree', u'degree', u'd_pagerank',
       u'u_pagerank', u'd_betweenness', u'u_betweenness', u'd_closeness',
       u'u_closeness', u'd_eigen', u'u_eigen', u'hubs', u'authorities',
       u'issueArea', u'type'],
      dtype='object')

In [185]:
metric = 'indegree'

bins = np.linspace(min(df[metric]), max(df[metric]), 100)

# substantive
plt.hist(df_sub[metric],
         bins=bins,
         color='red',
         alpha=.5,  # translucent so the overlapping histograms stay visible
         label='substantive (mean: %1.5f)' % np.mean(df_sub[metric]))

# procedural
plt.hist(df_pro[metric],
         bins=bins,
         color='blue',
         alpha=.5,
         label='procedural (mean: %1.5f)' % np.mean(df_pro[metric]))

# other
plt.hist(df_oth[metric],
         bins=bins,
         color='green',
         alpha=.5,
         label='other (mean: %1.5f)' % np.mean(df_oth[metric]))

# plt.xlim([0, .2])
# plt.ylim([0, 2000])

plt.xlabel(metric)
plt.legend(loc='upper right')


Out[185]:
<matplotlib.legend.Legend at 0x135c8a750>

In [207]:
# compare the type breakdown of the top R cases (by the current metric)
# to the breakdown over all cases
R = 100

top_cases = df.sort_values(by=metric, ascending=False).iloc[0:R]['type']
top_breakdown = top_cases.value_counts(normalize=True)
all_breakdown = df['type'].value_counts(normalize=True)

top_breakdown - all_breakdown


Out[207]:
substantive    0.004262
procedural     0.010959
other         -0.015221
Name: type, dtype: float64

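The same top-R comparison for a few other metrics (a sketch; reuses R and all_breakdown from above):

In [ ]:
for m in ['indegree', 'd_pagerank', 'authorities', 'hubs']:
    top_m = df.sort_values(by=m, ascending=False).iloc[0:R]['type']
    print(m)
    print(top_m.value_counts(normalize=True) - all_breakdown)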