In this notebook, we create a class DNA that leverages the new bqplot canvas-based HeatMap along with the ipywidgets Range Slider to help us detect and clean outliers in our data. The class accepts a DataFrame and allows you to visually and programmatically filter your outliers. The cleaned DataFrame can then be retrieved through a simple convenience function.
In [ ]:
from bqplot import (DateScale, ColorScale, HeatMap,
Figure, LinearScale, OrdinalScale, Axis)
from scipy.stats import percentileofscore
from scipy.interpolate import interp1d
import bqplot.pyplot as plt
from traitlets import List, Float, observe
from ipywidgets import IntRangeSlider, Layout, VBox, HBox, jslink
from pandas import DatetimeIndex
import numpy as np
import pandas as pd
def quantile_space(x, q1=0.1, q2=0.9):
    '''
    Returns a function that squashes quantiles between q1 and q2

    The returned function maps a value (or array of values) to [-1, 1]:
    values whose percentile rank in `x` lies between `q1` and `q2` map to 0,
    ranks below `q1` ramp linearly from -1 (rank 0) up to 0, and ranks above
    `q2` ramp linearly from 0 up to 1 (rank 100).

    Parameters
    ----------
    x : array-like
        Sample used to estimate the percentile ranks.
    q1, q2 : float
        Lower and upper cut-offs, expressed as percentiles on the 0-100
        scale (they are used as breakpoints alongside 0 and 100).

    Raises
    ------
    ValueError
        If `q1` or `q2` lies outside [0, 100].
    '''
    # The original code rejected out-of-range percentiles only as a side
    # effect of an otherwise unused np.percentile call; check explicitly.
    for q in (q1, q2):
        if not 0 <= q <= 100:
            raise ValueError("Percentiles must be in the range [0, 100]")

    # Percentile ranks 0..100 and the sample values found at those ranks.
    ranks = np.linspace(0, 100, 100)
    qs = np.percentile(x, ranks)

    def get_quantile(t):
        # Approximate percentile rank of t by interpolating the sampled ranks.
        return np.interp(t, qs, ranks)

    def f(y):
        return np.interp(get_quantile(y), [0, q1, q2, 100], [-1, 0, 0, 1])

    return f
class DNA(VBox):
    '''
    Heat map view of a DataFrame with a percentile range slider on top.

    Every cell is colored through `quantile_space`: values whose percentile
    rank lies between `q1` and `q2` map to the middle color, values outside
    that band shade towards the first / last color. Changing `q1` / `q2`
    (directly or through the slider) recolors the heat map, and
    `get_filtered_df` returns the data with the out-of-band values replaced.

    Parameters
    ----------
    data : pandas.DataFrame
        Data to display and filter.
    transpose : bool, optional
        When false (default) the index runs along x and the columns along y;
        when true the columns run along x and the index along y.
    quantiles : (number, number), optional
        Initial `(q1, q2)` percentiles on the 0-100 scale. Default (1, 99).
    colors : list of str, optional
        Colors handed to the ColorScale. Default ['Red', 'Black', 'Green'].
    show_axes : bool, optional
        Whether the figure gets an x and a y axis. Default True.
    height : str, optional
        Height used for the default figure layout. Default '800px'.
    layout : Layout, optional
        Layout of the inner Figure (not of this VBox).
    fig_margin : dict, optional
        Margins of the inner Figure.
    **kwargs
        Everything else (for example `title`) is forwarded to the Figure.
    '''

    colors = List()
    q1 = Float()
    q2 = Float()

    def __init__(self, data, **kwargs):
        self.data = data
        self._transpose = bool(kwargs.pop('transpose', False))

        index_is_date = isinstance(data.index, DatetimeIndex)
        columns_are_dates = isinstance(data.columns, DatetimeIndex)
        if self._transpose:
            # Columns run along x, so the date check for each axis must look
            # at whatever is actually plotted on that axis.
            date_x, date_y = columns_are_dates, index_is_date
            x, y = list(data.columns.values), data.index.values
        else:
            date_x, date_y = index_is_date, columns_are_dates
            x, y = data.index.values, list(data.columns.values)

        self.q1, self.q2 = kwargs.pop('quantiles', (1, 99))
        self.quant_func = quantile_space(self.data.values.flatten(), q1=self.q1, q2=self.q2)
        self.colors = kwargs.pop('colors', ['Red', 'Black', 'Green'])

        self.x_scale = DateScale() if date_x else LinearScale()
        self.y_scale = DateScale() if date_y else OrdinalScale(padding_y=0)
        self.color_scale = ColorScale(colors=self.colors)
        self.heat_map = HeatMap(color=self._color_values(), x=x, y=y,
                                scales={'x': self.x_scale, 'y': self.y_scale,
                                        'color': self.color_scale})

        self.x_ax = Axis(scale=self.x_scale)
        self.y_ax = Axis(scale=self.y_scale, orientation='vertical')
        show_axes = kwargs.pop('show_axes', True)
        self.axes = [self.x_ax, self.y_ax] if show_axes else []

        self.height = kwargs.pop('height', '800px')
        # Layout of the inner figure. Kept in a local because `self.layout`
        # is this VBox's own layout, which is set by the super call below.
        figure_layout = kwargs.pop('layout', Layout(width='100%', height=self.height, flex='1'))
        self.fig_margin = kwargs.pop('fig_margin', {'top': 60, 'bottom': 60, 'left': 150, 'right': 0})
        kwargs.setdefault('padding_y', 0.0)

        self.create_interaction(**kwargs)
        self.figure = Figure(marks=[self.heat_map], axes=self.axes, fig_margin=self.fig_margin,
                             layout=figure_layout, min_aspect_ratio=0., **kwargs)

        # Only hand the container the keyword arguments it declares as traits;
        # figure-only options such as `title` or `padding_y` stay with the Figure.
        box_kwargs = {key: value for key, value in kwargs.items() if self.has_trait(key)}
        super(DNA, self).__init__(children=[self.range_slider, self.figure],
                                  layout=Layout(align_items='center',
                                                width='100%',
                                                height='100%'),
                                  **box_kwargs)

    def _color_values(self):
        '''Return the squashed-quantile colors, oriented to match the x / y arrays.'''
        # Untransposed, x is the index and y the columns, so the color matrix
        # is data.T (one row per y entry). Transposed, x and y are swapped, so
        # the untransposed frame already has one row per y entry.
        source = self.data if self._transpose else self.data.T
        return self.quant_func(source)

    def create_interaction(self, **kwargs):
        '''Create the range slider and wire it to the `q1` / `q2` traits.'''
        self.range_slider = IntRangeSlider(description='Filter Range', value=(self.q1, self.q2), layout=Layout(width='100%'))
        self.range_slider.observe(self.slid_changed, 'value')
        self.observe(self.changed, ['q1', 'q2'])

    def slid_changed(self, new):
        '''Copy the slider range into `q1` / `q2`.'''
        # Snapshot both ends first and publish them together: assigning q1 on
        # its own fires `changed`, which writes (q1, old q2) back to the
        # slider and would make a later read of the slider lose the new q2.
        lower, upper = self.range_slider.value
        with self.hold_trait_notifications():
            self.q1 = lower
            self.q2 = upper

    def changed(self, new):
        '''Sync the slider with `q1` / `q2` and recolor the heat map.'''
        self.range_slider.value = (self.q1, self.q2)
        self.quant_func = quantile_space(self.data.values.flatten(), q1=self.q1, q2=self.q2)
        self.heat_map.color = self._color_values()

    def get_filtered_df(self, fill_type='median'):
        '''
        Return the data with values outside the [q1, q2] percentile band replaced.

        The band limits are percentiles of all values in the frame. Values
        outside the band are replaced column by column with that column's
        median or mean, computed from the values that remain.

        Parameters
        ----------
        fill_type : {'median', 'mean'}
            Statistic used to fill the removed values.

        Raises
        ------
        ValueError
            If `fill_type` is neither 'median' nor 'mean'.
        '''
        if fill_type not in ('median', 'mean'):
            raise ValueError("fill_type must be one of ('median', 'mean')")
        q1_x, q2_x = np.percentile(self.data, [self.q1, self.q2])
        # Boolean indexing turns every out-of-band cell into NaN.
        kept = self.data[(self.data >= q1_x) & (self.data <= q2_x)]
        return kept.apply(lambda col: col.fillna(getattr(col, fill_type)()))
We define the size of our matrix here. Larger matrices require a larger height.
In [ ]:
# Number of rows and of columns of the square random matrix built below.
size = 100
In [ ]:
def num_to_col_letters(num):
    '''
    Convert a 1-based number to spreadsheet-style column letters.

    1 -> 'A', 26 -> 'Z', 27 -> 'AA', and so on. 0 yields an empty string.

    Raises
    ------
    ValueError
        If `num` is negative (the conversion loop would never terminate).
    '''
    if num < 0:
        raise ValueError("num must be non-negative")
    digits = []
    while num:
        # Bijective base 26: shift to 0-based before splitting off a digit.
        num, remainder = divmod(num - 1, 26)
        digits.append(chr(remainder + 65))
    # Digits were produced least-significant first.
    return ''.join(reversed(digits))
# One spreadsheet-style label ('A', 'B', ..., 'AA', ...) per column.
letters = [num_to_col_letters(i) for i in range(1, size + 1)]
In [ ]:
# Square frame of standard-normal samples, with the generated letters as column labels.
data = pd.DataFrame(np.random.randn(size, size), columns=letters)
In [ ]:
# Build the widget; `title` is not consumed by DNA itself and is passed on to the Figure.
data_dna = DNA(data, title='DNA of our Data', height='1400px', colors=['Red', 'White', 'Green'])
# Leaving the widget as the cell's last expression displays it.
data_dna
Instead of setting the quantiles with the slider, we can also set them programmatically. Using a range of (5, 95) restricts the data considerably.
In [ ]:
# Assigning the traits fires the observer that moves the slider and recolors the heat map.
data_dna.q1, data_dna.q2 = 5, 95
Now, we can use the convenience function to extract a clean DataFrame.
In [ ]:
# No fill_type given, so the default 'median' fill is used.
data_clean = data_dna.get_filtered_df()
By default, the DNA fills outliers with the median of the column. Alternatively, we can fill the outliers with the column mean.
In [ ]:
# Same filter, but removed values are replaced with each column's mean.
data_mean = data_dna.get_filtered_df(fill_type='mean')
We can also visualize the new DataFrame the same way to test how our outliers look now.
In [ ]:
# A second widget on the cleaned frame; as the cell's last expression it is displayed directly.
DNA(data_clean, title='Cleaned Data', height='1200px', colors=['Red', 'White', 'Green'])