Generalized ESD outlier report for potatoes.tsv


In [1]:
%matplotlib inline

In [2]:
import numpy as np
import matplotlib.pylab as plt
from PyAstronomy import pyasl
import pandas as pd

In [3]:
# Load the tab-separated table once; the cells below read it through the
# module-level name MPI_MATRIX.
MPI_MATRIX = pd.read_csv(
    'potatoes.tsv',
    sep='\t',
)

In [4]:
def apply_generalizedESD(column_name, max_num_outliers=10,
                         significance=0.05):
    """Run the generalized ESD test on one MPI_MATRIX column and report it.

    The column's values are plotted as blue dots, the points flagged as
    outliers are over-plotted as red pentagons, and the figure is shown.
    Afterwards the number of outliers, their indices, and a table with the
    "R statistic" and lambda value of every test iteration are printed.

    Parameters
    ----------
    column_name
        Label of the column of the module-level ``MPI_MATRIX`` to test.
    max_num_outliers : int, optional
        Passed to ``pyasl.generalizedESD`` as its second argument.
    significance : float, optional
        Passed to ``pyasl.generalizedESD`` as its third argument.

    Returns
    -------
    None
        The function only plots and prints.
    """
    array = MPI_MATRIX[column_name]
    result = pyasl.generalizedESD(array, max_num_outliers,
                                  significance, fullOutput=True)
    # Only the first four entries of the full output are used here.
    num_outliers, outlier_indices, r_stats, lambdas = result[:4]

    # Work on the raw values so that both the x axis and the outlier lookup
    # are positional and do not depend on the DataFrame's index labels.
    # NOTE(review): this treats the indices reported by generalizedESD as
    # positions into the column -- confirm against the PyAstronomy docs.
    values = array.values

    # Plot the "data"
    plt.plot(values, 'b.')
    # and mark the outliers.
    for idx in outlier_indices[:num_outliers]:
        plt.plot(idx, values[idx], 'rp')
    # Label the figure so it can be read without the surrounding text.
    plt.title(column_name)
    plt.show()

    # Single-argument print(...) calls produce the same text under the
    # Python 2 print statement and the Python 3 print function.
    print("Number of outliers:  %s" % (num_outliers,))
    print("Indices of outliers:  %s" % (outlier_indices,))
    print("        R      Lambda")
    for row, (r_value, lambda_value) in enumerate(zip(r_stats, lambdas), 1):
        # index, "R statistics", lambda
        # NOTE: the lambda value determines which point
        #       is regarded as an outlier
        print("%2d  %8.5f  %8.5f" % (row, r_value, lambda_value))

In [5]:
# Screen every remaining column and print a full report only for those in
# which the generalized ESD test flags at least one outlier.
# ignore 'group' and 'id' columns (assumed to be the first two -- the slice
# below is positional)
for column_name in MPI_MATRIX.columns[2:]:
    array = MPI_MATRIX[column_name]
    # 10 and 0.05 mirror the defaults of apply_generalizedESD so that this
    # screening step and the report agree on what counts as an outlier.
    # The test is re-run inside apply_generalizedESD for flagged columns.
    r = pyasl.generalizedESD(array, 10, 0.05, fullOutput=True)
    if r[0] > 0:
        # Parenthesized single-argument prints emit identical text under
        # both the Python 2 print statement and the Python 3 function.
        print("Analyte: %s" % column_name)
        apply_generalizedESD(column_name)
        print('\n\n')


Analyte: 0929C255-892A-4196-B906-54A5D809FA72
Number of outliers:  2
Indices of outliers:  [18, 16]
        R      Lambda
 1   3.33049   3.37010
 2   3.47838   3.36649
 3   2.11205   3.36284
 4   2.03037   3.35914
 5   2.06402   3.35539
 6   1.94255   3.35159
 7   1.97397   3.34774
 8   2.01083   3.34384
 9   1.95426   3.33988
10   1.98094   3.33587



Analyte: 0F3D1985-DBA7-4525-92F5-CEF822A2A08F
Number of outliers:  1
Indices of outliers:  [33]
        R      Lambda
 1   3.68537   3.37010
 2   2.73016   3.36649
 3   2.72092   3.36284
 4   2.48102   3.35914
 5   2.45584   3.35539
 6   2.30815   3.35159
 7   2.20700   3.34774
 8   2.21711   3.34384
 9   2.09131   3.33988
10   2.13586   3.33587



Analyte: 12D39077-7A6C-4BA8-9EBB-CFFF87CD9770
Number of outliers:  1
Indices of outliers:  [1]
        R      Lambda
 1   6.91733   3.37010
 2   2.62610   3.36649
 3   2.70795   3.36284
 4   2.71654   3.35914
 5   2.64303   3.35539
 6   2.65510   3.35159
 7   2.71684   3.34774
 8   2.81750   3.34384
 9   2.35844   3.33988
10   2.43658   3.33587



Analyte: 19EBA91E-4D0E-493E-8297-3466769C0C1B
Number of outliers:  1
Indices of outliers:  [72]
        R      Lambda
 1   4.48980   3.37010
 2   2.83064   3.36649
 3   2.65199   3.36284
 4   2.56128   3.35914
 5   2.54045   3.35539
 6   2.32535   3.35159
 7   2.23962   3.34774
 8   2.17822   3.34384
 9   2.14624   3.33988
10   1.93581   3.33587



Analyte: 1E0DFE20-FCD1-43DF-8DE2-901E95A51523
Number of outliers:  2
Indices of outliers:  [82, 83]
        R      Lambda
 1   5.98376   3.37010
 2   7.05371   3.36649
 3   2.67620   3.36284
 4   2.30320   3.35914
 5   2.30634   3.35539
 6   2.34147   3.35159
 7   2.42394   3.34774
 8   2.21745   3.34384
 9   2.12463   3.33988
10   2.11109   3.33587



Analyte: 3771176A-F95B-494A-8172-E57260AC372C
Number of outliers:  1
Indices of outliers:  [16]
        R      Lambda
 1   3.84269   3.37010
 2   3.32824   3.36649
 3   3.27767   3.36284
 4   2.98223   3.35914
 5   2.80255   3.35539
 6   2.68572   3.35159
 7   2.44508   3.34774
 8   2.31383   3.34384
 9   2.29412   3.33988
10   2.08490   3.33587



Analyte: 38ABF99F-2098-4214-83E5-4FC5270C1C6F
Number of outliers:  2
Indices of outliers:  [30, 6]
        R      Lambda
 1   3.80905   3.37010
 2   3.38783   3.36649
 3   3.29392   3.36284
 4   3.04237   3.35914
 5   2.75739   3.35539
 6   2.43438   3.35159
 7   2.26794   3.34774
 8   2.27057   3.34384
 9   2.29987   3.33988
10   2.21195   3.33587



Analyte: 4CA0F2AD-18D5-4044-8350-504FFE609B2C
Number of outliers:  1
Indices of outliers:  [33]
        R      Lambda
 1   3.43342   3.37010
 2   2.30837   3.36649
 3   2.27079   3.36284
 4   2.21215   3.35914
 5   2.19579   3.35539
 6   2.14954   3.35159
 7   2.15506   3.34774
 8   1.91530   3.34384
 9   1.86293   3.33988
10   1.83490   3.33587



Analyte: 556EA0EF-8989-41AE-8023-7703DAC7160C
Number of outliers:  1
Indices of outliers:  [61]
        R      Lambda
 1   7.57039   3.37010
 2   3.29096   3.36649
 3   3.06316   3.36284
 4   2.54228   3.35914
 5   2.45831   3.35539
 6   2.25331   3.35159
 7   2.15746   3.34774
 8   2.18592   3.34384
 9   2.10881   3.33988
10   2.08399   3.33587



Analyte: 607DBF7C-9714-402B-B03A-2CC96DC5CE02
Number of outliers:  5
Indices of outliers:  [40, 20, 44, 14, 16]
        R      Lambda
 1   3.62069   3.37010
 2   3.68956   3.36649
 3   3.60178   3.36284
 4   3.59213   3.35914
 5   3.63350   3.35539
 6   3.28529   3.35159
 7   3.04528   3.34774
 8   3.13370   3.34384
 9   3.04230   3.33988
10   3.13515   3.33587



Analyte: 6176BB6D-9D93-4714-B906-C7A54E0D3325
Number of outliers:  3
Indices of outliers:  [31, 34, 35]
        R      Lambda
 1   5.16059   3.37010
 2   4.90290   3.36649
 3   3.77526   3.36284
 4   3.05508   3.35914
 5   2.92814   3.35539
 6   3.04864   3.35159
 7   3.11830   3.34774
 8   2.80429   3.34384
 9   2.90321   3.33988
10   2.95270   3.33587



Analyte: 6689E110-85D0-445C-8056-108724435D89
Number of outliers:  1
Indices of outliers:  [61]
        R      Lambda
 1   8.33433   3.37010
 2   2.25062   3.36649
 3   2.15959   3.36284
 4   2.09800   3.35914
 5   2.11121   3.35539
 6   2.15148   3.35159
 7   2.04434   3.34774
 8   2.04458   3.34384
 9   2.04591   3.33988
10   2.10732   3.33587



Analyte: 87DAEC85-6F71-4239-8D48-5590DA42CE45
Number of outliers:  1
Indices of outliers:  [92]
        R      Lambda
 1   4.27966   3.37010
 2   3.08890   3.36649
 3   3.04897   3.36284
 4   3.03299   3.35914
 5   2.62337   3.35539
 6   2.69417   3.35159
 7   2.31692   3.34774
 8   2.33052   3.34384
 9   2.24034   3.33988
10   2.25276   3.33587



Analyte: E449A211-33FC-45F4-B4F4-03CDE1D4F4BF
Number of outliers:  1
Indices of outliers:  [76]
        R      Lambda
 1   3.76622   3.37010
 2   3.21085   3.36649
 3   2.11886   3.36284
 4   2.10693   3.35914
 5   2.15846   3.35539
 6   2.20291   3.35159
 7   2.22944   3.34774
 8   2.20894   3.34384
 9   2.28352   3.33988
10   2.36572   3.33587



Analyte: E8976AD5-04D7-4C42-8897-EFCB2CA0A171
Number of outliers:  1
Indices of outliers:  [0]
        R      Lambda
 1   4.01711   3.37010
 2   2.99871   3.36649
 3   3.01750   3.36284
 4   2.43771   3.35914
 5   2.34116   3.35539
 6   2.24044   3.35159
 7   2.20224   3.34774
 8   2.15956   3.34384
 9   2.00864   3.33988
10   2.02549   3.33587



Analyte: F1A1BFCF-BE56-43F7-B0BF-B1A64A618C86
Number of outliers:  1
Indices of outliers:  [61]
        R      Lambda
 1   8.38561   3.37010
 2   2.44401   3.36649
 3   2.08934   3.36284
 4   2.12039   3.35914
 5   2.08112   3.35539
 6   2.13387   3.35159
 7   2.04449   3.34774
 8   2.02474   3.34384
 9   1.99343   3.33988
10   2.05192   3.33587