In [1]:

    
%load_ext autoreload



In [2]:

    
autoreload 2



In [3]:

    
%matplotlib inline



In [4]:

    
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd

import pyreto

Forest fires



In [5]:

    
fire_size = pd.read_csv("http://tuvalu.santafe.edu/~aaronc/powerlaws/data/fires.txt", names=['acres'])



In [6]:

    
fire_size.describe()









    Out[6]:






  
    
      
      acres
    
  
  
    
      count
      203785.000000
    
    
      mean
      89.563111
    
    
      std
      2098.732181
    
    
      min
      0.100000
    
    
      25%
      0.100000
    
    
      50%
      0.200000
    
    
      75%
      2.000000
    
    
      max
      412050.000000



In [7]:

    
# check that I get same estimate for alpha given reported xmin...
desired_alpha, desired_xmin = 2.2, 6324
result1 = pyreto.distributions.Pareto.fit(fire_size.acres, xmin=desired_xmin)



In [9]:

    
# check that I get the same estimates for both alpha and xmin using brute force minimization
result2 = pyreto.distributions.Pareto.fit(fire_size.acres, xmin=None, quantile=0.999, method='brute')



In [10]:

    
np.testing.assert_almost_equal(result2.params['alpha'], desired_alpha, decimal=1)



In [11]:

    
np.testing.assert_almost_equal(result2.xmin, desired_xmin, decimal=1)



In [12]:

    
# check that I get the same estimates for both alpha and xmin using bounded minimization
result3 = pyreto.distributions.Pareto.fit(fire_size.acres, xmin=None, quantile=0.999, method='bounded')



In [13]:

    
np.testing.assert_almost_equal(result3.params['alpha'], desired_alpha, decimal=1)



In [14]:

    
np.testing.assert_almost_equal(result3.xmin, desired_xmin, decimal=1)









    



---------------------------------------------------------------------------
AssertionError                            Traceback (most recent call last)
<ipython-input-14-b77cff7068bf> in <module>()
----> 1 np.testing.assert_almost_equal(result3.xmin, desired_xmin, decimal=1)

/Users/drpugh/anaconda/envs/pyreto-dev/lib/python3.5/site-packages/numpy/testing/utils.py in assert_almost_equal(actual, desired, decimal, err_msg, verbose)
    511         pass
    512     if round(abs(desired - actual), decimal) != 0:
--> 513         raise AssertionError(_build_err_msg())
    514 
    515 

AssertionError: 
Arrays are not almost equal to 1 decimals
 ACTUAL: 7149.9999509662648
 DESIRED: 6324



In [36]:

    
pvalue, Ds = pyreto.distributions.Pareto.test_goodness_of_fit(42, result3, fire_size.acres, method='bounded')



In [38]:

    
# pareto distribution should be rejected...
assert pvalue <= 0.10

Weblinks



In [20]:

    
weblinks_histogram = pd.read_csv('http://tuvalu.santafe.edu/~aaronc/powerlaws/data/weblinks.hist', sep='\t')



In [21]:

    
weblinks_histogram.describe()









    Out[21]:






  
    
      
      degree
      frequency
    
  
  
    
      count
      1.448000e+04
      1.448000e+04
    
    
      mean
      1.549990e+04
      1.910143e+04
    
    
      std
      3.657173e+04
      1.022556e+06
    
    
      min
      0.000000e+00
      1.000000e+00
    
    
      25%
      3.619750e+03
      1.000000e+00
    
    
      50%
      7.468500e+03
      2.000000e+00
    
    
      75%
      1.475050e+04
      1.100000e+01
    
    
      max
      1.199466e+06
      1.066498e+08



In [22]:

    
# convert histogram data into degree series..
raw_counts = np.repeat(weblinks_histogram.degree.values, weblinks_histogram.frequency.values)
weblinks = pd.Series(raw_counts, name='count')



In [23]:

    
weblinks.describe()









    Out[23]:





count    2.765887e+08
mean     7.990814e+00
std      3.054431e+02
min      0.000000e+00
25%      1.000000e+00
50%      1.000000e+00
75%      4.000000e+00
max      1.199466e+06
Name: count, dtype: float64



In [10]:

    
# check that I get same estimate for alpha given reported xmin...
desired_alpha, desired_xmin = 2.336, 3684
result1 = pyreto.distributions.Pareto.fit(weblinks, xmin=desired_xmin)



In [ ]:

    
np.testing.assert_almost_equal(result1.params['alpha'], desired_alpha, decimal=3)



In [14]:

    
# check that I get the same estimates for both alpha and xmin using bounded minimization
result2 = pyreto.distributions.Pareto.fit(weblinks, xmin=None, quantile=0.9999, method='bounded')









    



---------------------------------------------------------------------------
AssertionError                            Traceback (most recent call last)
<ipython-input-14-f7e786fc530c> in <module>()
      1 # check that I get the same estimates for both alpha and xmin using bounded minimization
      2 result2 = pyreto.distributions.Pareto.fit(weblinks, xmin=None, quantile=0.9999, method='bounded')
----> 3 test_scaling_exponent_estimation(desired_alpha, result2, decimal=3)

<ipython-input-4-6899f550d2ad> in test_scaling_exponent_estimation(desired_alpha, result, decimal)
      1 def test_scaling_exponent_estimation(desired_alpha, result, decimal):
----> 2     np.testing.assert_almost_equal(result.params['alpha'], desired_alpha, decimal=decimal)
      3 
      4 
      5 def test_scaling_threshold_estimation(desired_xmin, result, tol):

/Users/drpugh/anaconda/envs/pyreto-dev/lib/python3.5/site-packages/numpy/testing/utils.py in assert_almost_equal(actual, desired, decimal, err_msg, verbose)
    511         pass
    512     if round(abs(desired - actual), decimal) != 0:
--> 513         raise AssertionError(_build_err_msg())
    514 
    515 

AssertionError: 
Arrays are not almost equal to 3 decimals
 ACTUAL: 2.3261940228265701
 DESIRED: 2.336



In [ ]:

    
np.testing.assert_almost_equal(result2.params['alpha'], desired_alpha, decimal=3)



In [17]:

    
test_scaling_threshold_estimation(desired_xmin, result2, decimal=1)









    



---------------------------------------------------------------------------
AssertionError                            Traceback (most recent call last)
<ipython-input-17-f11ed5f3e503> in <module>()
----> 1 test_scaling_threshold_estimation(desired_xmin, result2, decimal=1)

<ipython-input-16-aac319d7d71d> in test_scaling_threshold_estimation(desired_xmin, result, decimal)
      4 
      5 def test_scaling_threshold_estimation(desired_xmin, result, decimal):
----> 6     np.testing.assert_almost_equal(result.xmin, desired_xmin, decimal=decimal)

/Users/drpugh/anaconda/envs/pyreto-dev/lib/python3.5/site-packages/numpy/testing/utils.py in assert_almost_equal(actual, desired, decimal, err_msg, verbose)
    511         pass
    512     if round(abs(desired - actual), decimal) != 0:
--> 513         raise AssertionError(_build_err_msg())
    514 
    515 

AssertionError: 
Arrays are not almost equal to 1 decimals
 ACTUAL: 3213.9999650697919
 DESIRED: 3684

Cities



In [6]:

    
cities = pd.read_csv('http://tuvalu.santafe.edu/~aaronc/powerlaws/data/cities.txt', names=['population'])
cities.population /= 1e3  # CSN units are in thousands of persons



In [7]:

    
cities.describe()









    Out[7]:






  
    
      
      population
    
  
  
    
      count
      19447.000000
    
    
      mean
      9.002051
    
    
      std
      77.825051
    
    
      min
      0.001000
    
    
      25%
      0.369500
    
    
      50%
      1.089000
    
    
      75%
      4.135500
    
    
      max
      8008.654000



In [8]:

    
# check that I get same estimate for alpha given reported xmin...
desired_alpha, desired_xmin = 2.37, 52.46
result1 = pyreto.distributions.Pareto.fit(cities.population, xmin=desired_xmin)
test_scaling_exponent_estimation(desired_alpha, result1, decimal=2)



In [9]:

    
# check that I get the same estimates for both alpha and xmin using brute force minimization
result2 = pyreto.distributions.Pareto.fit(cities.population, xmin=None, quantile=0.99, method='brute')
test_scaling_exponent_estimation(desired_alpha, result2, decimal=2)









    



---------------------------------------------------------------------------
AssertionError                            Traceback (most recent call last)
<ipython-input-9-5b14153ed2ef> in <module>()
      1 # check that I get the same estimates for both alpha and xmin using brute force minimization
      2 result2 = pyreto.distributions.Pareto.fit(cities.population, xmin=None, quantile=0.99, method='brute')
----> 3 test_scaling_exponent_estimation(desired_alpha, result2, decimal=2)

<ipython-input-5-aac319d7d71d> in test_scaling_exponent_estimation(desired_alpha, result, decimal)
      1 def test_scaling_exponent_estimation(desired_alpha, result, decimal):
----> 2     np.testing.assert_almost_equal(result.params['alpha'], desired_alpha, decimal=decimal)
      3 
      4 
      5 def test_scaling_threshold_estimation(desired_xmin, result, decimal):

/Users/drpugh/anaconda/envs/pyreto-dev/lib/python3.5/site-packages/numpy/testing/utils.py in assert_almost_equal(actual, desired, decimal, err_msg, verbose)
    511         pass
    512     if round(abs(desired - actual), decimal) != 0:
--> 513         raise AssertionError(_build_err_msg())
    514 
    515 

AssertionError: 
Arrays are not almost equal to 2 decimals
 ACTUAL: 2.3639368738287363
 DESIRED: 2.37



In [10]:

    
test_scaling_threshold_estimation(desired_xmin, result2, decimal=2)









    



---------------------------------------------------------------------------
AssertionError                            Traceback (most recent call last)
<ipython-input-10-068cfd665dcb> in <module>()
----> 1 test_scaling_threshold_estimation(desired_xmin, result2, decimal=2)

<ipython-input-5-aac319d7d71d> in test_scaling_threshold_estimation(desired_xmin, result, decimal)
      4 
      5 def test_scaling_threshold_estimation(desired_xmin, result, decimal):
----> 6     np.testing.assert_almost_equal(result.xmin, desired_xmin, decimal=decimal)

/Users/drpugh/anaconda/envs/pyreto-dev/lib/python3.5/site-packages/numpy/testing/utils.py in assert_almost_equal(actual, desired, decimal, err_msg, verbose)
    511         pass
    512     if round(abs(desired - actual), decimal) != 0:
--> 513         raise AssertionError(_build_err_msg())
    514 
    515 

AssertionError: 
Arrays are not almost equal to 2 decimals
 ACTUAL: 51.442999999999998
 DESIRED: 52.46



In [11]:

    
# check that I get the same estimates for both alpha and xmin using bounded minimization
result3 = pyreto.distributions.Pareto.fit(cities.population, xmin=None, quantile=0.99, method='bounded')



In [12]:

    
test_scaling_exponent_estimation(desired_alpha, result3, decimal=2)



In [13]:

    
test_scaling_threshold_estimation(desired_xmin, result3, decimal=2)









    



---------------------------------------------------------------------------
AssertionError                            Traceback (most recent call last)
<ipython-input-13-3c50a6c928ba> in <module>()
----> 1 test_scaling_threshold_estimation(desired_xmin, result3, decimal=2)

<ipython-input-5-aac319d7d71d> in test_scaling_threshold_estimation(desired_xmin, result, decimal)
      4 
      5 def test_scaling_threshold_estimation(desired_xmin, result, decimal):
----> 6     np.testing.assert_almost_equal(result.xmin, desired_xmin, decimal=decimal)

/Users/drpugh/anaconda/envs/pyreto-dev/lib/python3.5/site-packages/numpy/testing/utils.py in assert_almost_equal(actual, desired, decimal, err_msg, verbose)
    511         pass
    512     if round(abs(desired - actual), decimal) != 0:
--> 513         raise AssertionError(_build_err_msg())
    514 
    515 

AssertionError: 
Arrays are not almost equal to 2 decimals
 ACTUAL: 70.075391716136934
 DESIRED: 52.46



In [19]:

    
# using brute force minmization to find xmin makes this test take a while!
pvalue, Ds = pyreto.distributions.Pareto.test_goodness_of_fit(42, result2, cities.population, quantile=0.99,
                                                              method='brute')



In [15]:

    
# pareto distribution should not be rejected...
assert pvalue > 0.10









    



---------------------------------------------------------------------------
AssertionError                            Traceback (most recent call last)
<ipython-input-15-68fb2dca5c54> in <module>()
      1 # pareto distribution should not be rejected...
----> 2 assert pvalue > 0.10

AssertionError:



In [16]:

    
pvalue









    Out[16]:





0.026783583288262355



In [17]:

    
pyreto.distributions.Pareto.test_goodness_of_fit??



In [ ]:

	acres
count	203785.000000
mean	89.563111
std	2098.732181
min	0.100000
25%	0.100000
50%	0.200000
75%	2.000000
max	412050.000000

	degree	frequency
count	1.448000e+04	1.448000e+04
mean	1.549990e+04	1.910143e+04
std	3.657173e+04	1.022556e+06
min	0.000000e+00	1.000000e+00
25%	3.619750e+03	1.000000e+00
50%	7.468500e+03	2.000000e+00
75%	1.475050e+04	1.100000e+01
max	1.199466e+06	1.066498e+08

	population
count	19447.000000
mean	9.002051
std	77.825051
min	0.001000
25%	0.369500
50%	1.089000
75%	4.135500
max	8008.654000