Generalized ESD test

  • a generalization over Grubbs for one or more outliers in a series
  • showcase for IPython widgets and @interact

In [1]:
%matplotlib inline

In [2]:
import numpy as np
import matplotlib.pylab as plt
from PyAstronomy import pyasl

# Convert data given at:
# http://www.itl.nist.gov/div898/handbook/eda/section3/eda35h3.htm
# to array.
x = np.array(map(lambda x: float(x),
        "-0.25 0.68 0.94 1.15 1.20 1.26 1.26 1.34 1.38 1.43 1.49 1.49 \
          1.55 1.56 1.58 1.65 1.69 1.70 1.76 1.77 1.81 1.91 1.94 1.96 \
          1.99 2.06 2.09 2.10 2.14 2.15 2.23 2.24 2.26 2.35 2.37 2.40 \
          2.47 2.54 2.62 2.64 2.90 2.92 2.92 2.93 3.21 3.26 3.30 3.59 \
          3.68 4.30 4.64 5.34 5.42 6.01".split()))

# Apply the generalized ESD
r = pyasl.generalizedESD(x, 10, 0.05, fullOutput=True)

print "Number of outliers: ", r[0]
print "Indices of outliers: ", r[1]
print "        R      Lambda"
for i in range(len(r[2])):
  print "%2d  %8.5f  %8.5f" % ((i+1), r[2][i], r[3][i])

# Plot the "data"
plt.plot(x, 'b.')
# and mark the outliers.
for i in range(r[0]):
  plt.plot(r[1][i], x[r[1][i]], 'rp')
plt.show()


Number of outliers:  3
Indices of outliers:  [53, 52, 51]
        R      Lambda
 1   3.14819   3.15879
 2   2.97114   3.15143
 3   3.21044   3.14389
 4   2.83814   3.13616
 5   2.84416   3.12825
 6   2.87769   3.12013
 7   2.30345   3.11180
 8   2.33534   3.10324
 9   2.12480   3.09446
10   2.09054   3.08542

In [4]:
import pandas as pd

MPI_MATRIX = pd.read_csv('potatoes.tsv', sep='\t')

In [8]:
from IPython.html import widgets # Widget definitions
from IPython.display import display # Used to display widgets in the notebook

from IPython.html.widgets.interaction import interact

# all the metrics avail. in scipy.spatial.distance.pdist


column_names = list(MPI_MATRIX.columns)
columns_dropdown = widgets.DropdownWidget(values=column_names, value='12D39077-7A6C-4BA8-9EBB-CFFF87CD9770')

@interact(column_name=columns_dropdown)
def plot_column(column_name):
    plt.plot(MPI_MATRIX[column_name], 'b.')



In [9]:
def apply_generalizedESD(column_name, max_num_outliers=10, significance=0.05):
    array = MPI_MATRIX[column_name]
    r = pyasl.generalizedESD(array, max_num_outliers, significance, 
                             fullOutput=True)

    # Plot the "data"
    plt.plot(array, 'b.')
    # and mark the outliers.
    for i in range(r[0]):
      plt.plot(r[1][i], array[r[1][i]], 'rp')
    plt.show()
    
    print "Number of outliers: ", r[0]
    print "Indices of outliers: ", r[1]
    print "        R      Lambda"
    for i in range(len(r[2])):
      print "%2d  %8.5f  %8.5f" % ((i+1), r[2][i], r[3][i])

In [10]:
interact(apply_generalizedESD, column_name=columns_dropdown)


Number of outliers:  1
Indices of outliers:  [1]
        R      Lambda
 1   6.91733   3.37010
 2   2.62610   3.36649
 3   2.70795   3.36284
 4   2.71654   3.35914
 5   2.64303   3.35539
 6   2.65510   3.35159
 7   2.71684   3.34774
 8   2.81750   3.34384
 9   2.35844   3.33988
10   2.43658   3.33587
Out[10]:
<function __main__.apply_generalizedESD>