In [1]:
%load_ext autoreload

In [2]:
autoreload 2

In [3]:
%matplotlib inline

In [4]:
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd

import pyreto

Forest fires


In [5]:
fire_size = pd.read_csv("http://tuvalu.santafe.edu/~aaronc/powerlaws/data/fires.txt", names=['acres'])

In [6]:
fire_size.describe()


Out[6]:
acres
count 203785.000000
mean 89.563111
std 2098.732181
min 0.100000
25% 0.100000
50% 0.200000
75% 2.000000
max 412050.000000

In [7]:
# check that I get same estimate for alpha given reported xmin...
desired_alpha, desired_xmin = 2.2, 6324
result1 = pyreto.distributions.Pareto.fit(fire_size.acres, xmin=desired_xmin)

In [9]:
# check that I get the same estimates for both alpha and xmin using brute force minimization
result2 = pyreto.distributions.Pareto.fit(fire_size.acres, xmin=None, quantile=0.999, method='brute')

In [10]:
np.testing.assert_almost_equal(result2.params['alpha'], desired_alpha, decimal=1)

In [11]:
np.testing.assert_almost_equal(result2.xmin, desired_xmin, decimal=1)

In [12]:
# check that I get the same estimates for both alpha and xmin using bounded minimization
result3 = pyreto.distributions.Pareto.fit(fire_size.acres, xmin=None, quantile=0.999, method='bounded')

In [13]:
np.testing.assert_almost_equal(result3.params['alpha'], desired_alpha, decimal=1)

In [14]:
np.testing.assert_almost_equal(result3.xmin, desired_xmin, decimal=1)


---------------------------------------------------------------------------
AssertionError                            Traceback (most recent call last)
<ipython-input-14-b77cff7068bf> in <module>()
----> 1 np.testing.assert_almost_equal(result3.xmin, desired_xmin, decimal=1)

/Users/drpugh/anaconda/envs/pyreto-dev/lib/python3.5/site-packages/numpy/testing/utils.py in assert_almost_equal(actual, desired, decimal, err_msg, verbose)
    511         pass
    512     if round(abs(desired - actual), decimal) != 0:
--> 513         raise AssertionError(_build_err_msg())
    514 
    515 

AssertionError: 
Arrays are not almost equal to 1 decimals
 ACTUAL: 7149.9999509662648
 DESIRED: 6324

In [36]:
pvalue, Ds = pyreto.distributions.Pareto.test_goodness_of_fit(42, result3, fire_size.acres, method='bounded')

In [38]:
# pareto distribution should be rejected...
assert pvalue <= 0.10

Weblinks


In [20]:
weblinks_histogram = pd.read_csv('http://tuvalu.santafe.edu/~aaronc/powerlaws/data/weblinks.hist', sep='\t')

In [21]:
weblinks_histogram.describe()


Out[21]:
degree frequency
count 1.448000e+04 1.448000e+04
mean 1.549990e+04 1.910143e+04
std 3.657173e+04 1.022556e+06
min 0.000000e+00 1.000000e+00
25% 3.619750e+03 1.000000e+00
50% 7.468500e+03 2.000000e+00
75% 1.475050e+04 1.100000e+01
max 1.199466e+06 1.066498e+08

In [22]:
# convert histogram data into degree series..
raw_counts = np.repeat(weblinks_histogram.degree.values, weblinks_histogram.frequency.values)
weblinks = pd.Series(raw_counts, name='count')

In [23]:
weblinks.describe()


Out[23]:
count    2.765887e+08
mean     7.990814e+00
std      3.054431e+02
min      0.000000e+00
25%      1.000000e+00
50%      1.000000e+00
75%      4.000000e+00
max      1.199466e+06
Name: count, dtype: float64

In [10]:
# check that I get same estimate for alpha given reported xmin...
desired_alpha, desired_xmin = 2.336, 3684
result1 = pyreto.distributions.Pareto.fit(weblinks, xmin=desired_xmin)

In [ ]:
np.testing.assert_almost_equal(result1.params['alpha'], desired_alpha, decimal=3)

In [14]:
# check that I get the same estimates for both alpha and xmin using bounded minimization
result2 = pyreto.distributions.Pareto.fit(weblinks, xmin=None, quantile=0.9999, method='bounded')


---------------------------------------------------------------------------
AssertionError                            Traceback (most recent call last)
<ipython-input-14-f7e786fc530c> in <module>()
      1 # check that I get the same estimates for both alpha and xmin using bounded minimization
      2 result2 = pyreto.distributions.Pareto.fit(weblinks, xmin=None, quantile=0.9999, method='bounded')
----> 3 test_scaling_exponent_estimation(desired_alpha, result2, decimal=3)

<ipython-input-4-6899f550d2ad> in test_scaling_exponent_estimation(desired_alpha, result, decimal)
      1 def test_scaling_exponent_estimation(desired_alpha, result, decimal):
----> 2     np.testing.assert_almost_equal(result.params['alpha'], desired_alpha, decimal=decimal)
      3 
      4 
      5 def test_scaling_threshold_estimation(desired_xmin, result, tol):

/Users/drpugh/anaconda/envs/pyreto-dev/lib/python3.5/site-packages/numpy/testing/utils.py in assert_almost_equal(actual, desired, decimal, err_msg, verbose)
    511         pass
    512     if round(abs(desired - actual), decimal) != 0:
--> 513         raise AssertionError(_build_err_msg())
    514 
    515 

AssertionError: 
Arrays are not almost equal to 3 decimals
 ACTUAL: 2.3261940228265701
 DESIRED: 2.336

In [ ]:
np.testing.assert_almost_equal(result2.params['alpha'], desired_alpha, decimal=3)

In [17]:
test_scaling_threshold_estimation(desired_xmin, result2, decimal=1)


---------------------------------------------------------------------------
AssertionError                            Traceback (most recent call last)
<ipython-input-17-f11ed5f3e503> in <module>()
----> 1 test_scaling_threshold_estimation(desired_xmin, result2, decimal=1)

<ipython-input-16-aac319d7d71d> in test_scaling_threshold_estimation(desired_xmin, result, decimal)
      4 
      5 def test_scaling_threshold_estimation(desired_xmin, result, decimal):
----> 6     np.testing.assert_almost_equal(result.xmin, desired_xmin, decimal=decimal)

/Users/drpugh/anaconda/envs/pyreto-dev/lib/python3.5/site-packages/numpy/testing/utils.py in assert_almost_equal(actual, desired, decimal, err_msg, verbose)
    511         pass
    512     if round(abs(desired - actual), decimal) != 0:
--> 513         raise AssertionError(_build_err_msg())
    514 
    515 

AssertionError: 
Arrays are not almost equal to 1 decimals
 ACTUAL: 3213.9999650697919
 DESIRED: 3684

Cities


In [6]:
cities = pd.read_csv('http://tuvalu.santafe.edu/~aaronc/powerlaws/data/cities.txt', names=['population'])
cities.population /= 1e3  # CSN units are in thousands of persons

In [7]:
cities.describe()


Out[7]:
population
count 19447.000000
mean 9.002051
std 77.825051
min 0.001000
25% 0.369500
50% 1.089000
75% 4.135500
max 8008.654000

In [8]:
# check that I get same estimate for alpha given reported xmin...
desired_alpha, desired_xmin = 2.37, 52.46
result1 = pyreto.distributions.Pareto.fit(cities.population, xmin=desired_xmin)
test_scaling_exponent_estimation(desired_alpha, result1, decimal=2)

In [9]:
# check that I get the same estimates for both alpha and xmin using brute force minimization
result2 = pyreto.distributions.Pareto.fit(cities.population, xmin=None, quantile=0.99, method='brute')
test_scaling_exponent_estimation(desired_alpha, result2, decimal=2)


---------------------------------------------------------------------------
AssertionError                            Traceback (most recent call last)
<ipython-input-9-5b14153ed2ef> in <module>()
      1 # check that I get the same estimates for both alpha and xmin using brute force minimization
      2 result2 = pyreto.distributions.Pareto.fit(cities.population, xmin=None, quantile=0.99, method='brute')
----> 3 test_scaling_exponent_estimation(desired_alpha, result2, decimal=2)

<ipython-input-5-aac319d7d71d> in test_scaling_exponent_estimation(desired_alpha, result, decimal)
      1 def test_scaling_exponent_estimation(desired_alpha, result, decimal):
----> 2     np.testing.assert_almost_equal(result.params['alpha'], desired_alpha, decimal=decimal)
      3 
      4 
      5 def test_scaling_threshold_estimation(desired_xmin, result, decimal):

/Users/drpugh/anaconda/envs/pyreto-dev/lib/python3.5/site-packages/numpy/testing/utils.py in assert_almost_equal(actual, desired, decimal, err_msg, verbose)
    511         pass
    512     if round(abs(desired - actual), decimal) != 0:
--> 513         raise AssertionError(_build_err_msg())
    514 
    515 

AssertionError: 
Arrays are not almost equal to 2 decimals
 ACTUAL: 2.3639368738287363
 DESIRED: 2.37

In [10]:
test_scaling_threshold_estimation(desired_xmin, result2, decimal=2)


---------------------------------------------------------------------------
AssertionError                            Traceback (most recent call last)
<ipython-input-10-068cfd665dcb> in <module>()
----> 1 test_scaling_threshold_estimation(desired_xmin, result2, decimal=2)

<ipython-input-5-aac319d7d71d> in test_scaling_threshold_estimation(desired_xmin, result, decimal)
      4 
      5 def test_scaling_threshold_estimation(desired_xmin, result, decimal):
----> 6     np.testing.assert_almost_equal(result.xmin, desired_xmin, decimal=decimal)

/Users/drpugh/anaconda/envs/pyreto-dev/lib/python3.5/site-packages/numpy/testing/utils.py in assert_almost_equal(actual, desired, decimal, err_msg, verbose)
    511         pass
    512     if round(abs(desired - actual), decimal) != 0:
--> 513         raise AssertionError(_build_err_msg())
    514 
    515 

AssertionError: 
Arrays are not almost equal to 2 decimals
 ACTUAL: 51.442999999999998
 DESIRED: 52.46

In [11]:
# check that I get the same estimates for both alpha and xmin using bounded minimization
result3 = pyreto.distributions.Pareto.fit(cities.population, xmin=None, quantile=0.99, method='bounded')

In [12]:
test_scaling_exponent_estimation(desired_alpha, result3, decimal=2)

In [13]:
test_scaling_threshold_estimation(desired_xmin, result3, decimal=2)


---------------------------------------------------------------------------
AssertionError                            Traceback (most recent call last)
<ipython-input-13-3c50a6c928ba> in <module>()
----> 1 test_scaling_threshold_estimation(desired_xmin, result3, decimal=2)

<ipython-input-5-aac319d7d71d> in test_scaling_threshold_estimation(desired_xmin, result, decimal)
      4 
      5 def test_scaling_threshold_estimation(desired_xmin, result, decimal):
----> 6     np.testing.assert_almost_equal(result.xmin, desired_xmin, decimal=decimal)

/Users/drpugh/anaconda/envs/pyreto-dev/lib/python3.5/site-packages/numpy/testing/utils.py in assert_almost_equal(actual, desired, decimal, err_msg, verbose)
    511         pass
    512     if round(abs(desired - actual), decimal) != 0:
--> 513         raise AssertionError(_build_err_msg())
    514 
    515 

AssertionError: 
Arrays are not almost equal to 2 decimals
 ACTUAL: 70.075391716136934
 DESIRED: 52.46

In [19]:
# using brute force minmization to find xmin makes this test take a while!
pvalue, Ds = pyreto.distributions.Pareto.test_goodness_of_fit(42, result2, cities.population, quantile=0.99,
                                                              method='brute')

In [15]:
# pareto distribution should not be rejected...
assert pvalue > 0.10


---------------------------------------------------------------------------
AssertionError                            Traceback (most recent call last)
<ipython-input-15-68fb2dca5c54> in <module>()
      1 # pareto distribution should not be rejected...
----> 2 assert pvalue > 0.10

AssertionError: 

In [16]:
pvalue


Out[16]:
0.026783583288262355

In [17]:
pyreto.distributions.Pareto.test_goodness_of_fit??

In [ ]: