In [1]:
%matplotlib inline
In [2]:
import numpy as np
import matplotlib.pylab as plt
from PyAstronomy import pyasl
import pandas as pd
In [3]:
# Load the tab-separated file 'potatoes.tsv' into a module-level DataFrame.
# The cells below look columns up in this frame by name.
MPI_MATRIX = pd.read_csv('potatoes.tsv', sep='\t')
In [4]:
def apply_generalizedESD(column_name, max_num_outliers=10,
                         significance=0.05):
    """Run the generalized ESD outlier test on one column, plot and print it.

    The column is read from the module-level ``MPI_MATRIX`` frame, passed to
    ``pyasl.generalizedESD`` with ``fullOutput=True``, drawn as blue dots with
    the detected outliers over-plotted as red pentagons, and then summarised
    on stdout (outlier count, outlier indices, and the table of R statistics
    and lambda values).

    Parameters
    ----------
    column_name : str
        Name of the ``MPI_MATRIX`` column to test.
    max_num_outliers : int, optional
        Forwarded to ``pyasl.generalizedESD`` as its second argument.
    significance : float, optional
        Forwarded to ``pyasl.generalizedESD`` as its third argument.

    Returns
    -------
    None
        The function only has side effects (a figure and printed text).
    """
    # Work on a plain NumPy array. The indices reported by generalizedESD are
    # used as positions into the data, whereas integer indexing on a pandas
    # Series is a *label* lookup -- the two only coincide while the frame
    # keeps its default 0..n-1 index.
    values = np.asarray(MPI_MATRIX[column_name])

    result = pyasl.generalizedESD(values, max_num_outliers,
                                  significance, fullOutput=True)
    # First four entries: outlier count, outlier indices, R statistics, lambdas.
    n_outliers, outlier_idx, r_stats, lambdas = result[:4]

    # Plot the "data" ...
    plt.plot(values, 'b.')
    # ... and mark the outliers (one call instead of one call per point).
    if n_outliers > 0:
        positions = np.asarray(outlier_idx[:n_outliers], dtype=int)
        plt.plot(positions, values[positions], 'rp')
    plt.show()

    # Single-argument print(...) is valid on both Python 2 and Python 3.
    # The double space reproduces what the Python 2 statement
    # `print "label: ", value` emitted.
    print("Number of outliers:  %s" % (n_outliers,))
    print("Indices of outliers:  %s" % (outlier_idx,))
    print(" R Lambda")
    # Columns: 1-based index, "R statistics", lambda.
    # NOTE: the lambda value determines which point
    # is regarded as an outlier
    for rank, (r_stat, lam) in enumerate(zip(r_stats, lambdas), start=1):
        print("%2d %8.5f %8.5f" % (rank, r_stat, lam))
In [5]:
# Scan every analyte column and show the detailed report only for columns in
# which the generalized ESD test flags at least one outlier.
# The first two columns ('group' and 'id') are skipped.
MAX_OUTLIERS = 10    # same values as the defaults of apply_generalizedESD
SIGNIFICANCE = 0.05

for column_name in MPI_MATRIX.columns[2:]:
    column = MPI_MATRIX[column_name]
    esd_result = pyasl.generalizedESD(column, MAX_OUTLIERS, SIGNIFICANCE,
                                      fullOutput=True)
    n_outliers = esd_result[0]
    if n_outliers > 0:
        # Single-argument print(...) runs unchanged on Python 2 and Python 3
        # and emits exactly the text the old print statements did.
        print("Analyte: %s" % column_name)
        apply_generalizedESD(column_name)
        print('\n\n')