In this notebook, we create a class `DNA` that leverages the new bqplot canvas-based HeatMap along with the ipywidgets range slider to help us detect and clean outliers in our data.
The class accepts a DataFrame and allows you to visually and programmatically filter your outliers. The cleaned DataFrame can then be retrieved through a simple convenience function.
In [ ]:
from scipy.stats import percentileofscore
from scipy.interpolate import interp1d
import bqplot.pyplot as plt
from bqplot import *
from traitlets import List, Float, observe
from ipywidgets import IntRangeSlider, Layout, VBox, HBox, jslink
from pandas import DatetimeIndex
import numpy as np
import pandas as pd
def quantile_space(x, q1=0.1, q2=0.9):
    """Return a function that squashes values between two percentiles to 0.

    The returned callable first maps a value to its (interpolated) empirical
    percentile within ``x`` and then maps that percentile piecewise-linearly:
    percentile 0 -> -1, percentiles in [q1, q2] -> 0, percentile 100 -> +1.

    Parameters
    ----------
    x : array-like
        Sample used to estimate the percentile curve.
    q1, q2 : float
        Lower and upper bounds of the band mapped to 0. They are expressed on
        the 0-100 percentile scale (not as 0-1 fractions).

    Returns
    -------
    callable
        ``f(y)`` accepting a scalar or array and returning values in [-1, 1].

    Raises
    ------
    ValueError
        If ``q1`` or ``q2`` lies outside [0, 100].
    """
    if not (0 <= q1 <= 100 and 0 <= q2 <= 100):
        raise ValueError("q1 and q2 must be percentiles in the range [0, 100]")

    # One shared grid of percentile levels and the sample values found there.
    levels = np.linspace(0, 100, 100)
    values_at_levels = np.percentile(x, levels)

    def get_quantile(t):
        # Inverse lookup: value -> percentile level, linearly interpolated.
        return np.interp(t, values_at_levels, levels)

    def f(y):
        return np.interp(get_quantile(y), [0, q1, q2, 100], [-1, 0, 0, 1])

    return f
class DNA(VBox):
    """Heat-map view of a DataFrame with a percentile range slider.

    Every cell of ``data`` is coloured by ``quantile_space``: values whose
    percentile lies inside ``[q1, q2]`` map to the middle colour, values
    outside fade towards the first / last colour. ``get_filtered_df`` returns
    a copy in which values outside the band are replaced.

    Keyword arguments consumed here (all optional):

    transpose : bool
        If truthy, DataFrame columns go on the x axis and the index on the
        y axis; otherwise the index is on x and the columns on y.
    quantiles : (float, float)
        Initial ``(q1, q2)`` on the 0-100 percentile scale, default (1, 99).
    colors : list of str
        Colours of the colour scale, default ['Red', 'Black', 'Green'].
    show_axes : bool
        Whether the x / y axes are drawn, default True.
    height, layout, fig_margin
        Size, layout and margins used for the figure.

    Any remaining keyword arguments (e.g. ``title``) are forwarded to
    ``create_interaction``, to the bqplot ``Figure`` and to the container's
    own constructor.
    """

    colors = List()
    q1 = Float()
    q2 = Float()

    def __init__(self, data, **kwargs):
        self.data = data
        self._transpose = bool(kwargs.pop('transpose', False))

        index_is_dates = isinstance(data.index, DatetimeIndex)
        columns_are_dates = isinstance(data.columns, DatetimeIndex)
        if self._transpose:
            # Columns on x, index on y: the date flags follow the same axes.
            x, y = list(data.columns.values), data.index.values
            date_x, date_y = columns_are_dates, index_is_dates
        else:
            x, y = data.index.values, list(data.columns.values)
            date_x, date_y = index_is_dates, columns_are_dates

        self.q1, self.q2 = kwargs.pop('quantiles', (1, 99))
        self.quant_func = quantile_space(self.data.values.flatten(), q1=self.q1, q2=self.q2)
        self.colors = kwargs.pop('colors', ['Red', 'Black', 'Green'])

        self.x_scale = DateScale() if date_x else LinearScale()
        self.y_scale = DateScale() if date_y else OrdinalScale(padding_y=0)
        self.color_scale = ColorScale(colors=self.colors)
        self.heat_map = HeatMap(color=self._color_values(), x=x, y=y,
                                scales={'x': self.x_scale, 'y': self.y_scale,
                                        'color': self.color_scale})

        self.x_ax = Axis(scale=self.x_scale)
        self.y_ax = Axis(scale=self.y_scale, orientation='vertical')
        show_axes = kwargs.pop('show_axes', True)
        self.axes = [self.x_ax, self.y_ax] if show_axes else []

        self.height = kwargs.pop('height', '800px')
        self.layout = kwargs.pop('layout', Layout(width='100%', height=self.height, flex='1'))
        self.fig_margin = kwargs.pop('fig_margin', {'top': 60, 'bottom': 60, 'left': 150, 'right': 0})
        kwargs.setdefault('padding_y', 0.0)

        self.create_interaction(**kwargs)
        self.figure = Figure(marks=[self.heat_map], axes=self.axes, fig_margin=self.fig_margin,
                             layout=self.layout, min_aspect_ratio=0., **kwargs)

        # super(DNA, ...) rather than super(VBox, ...): the latter starts the
        # MRO lookup *after* VBox and would skip any VBox initialiser.
        # NOTE(review): the leftover kwargs are still passed to the container
        # as before; confirm whether they are meant for the Figure only.
        super(DNA, self).__init__(children=[self.range_slider, self.figure],
                                  layout=Layout(align_items='center',
                                                width='100%',
                                                height='100%'),
                                  **kwargs)

    def _color_values(self):
        """Return the squashed data as a 2-D array with rows matching the y axis."""
        # Non-transposed: y holds the DataFrame columns, so rows must be columns.
        frame = self.data if self._transpose else self.data.T
        return self.quant_func(frame.values)

    def create_interaction(self, **kwargs):
        """Create the range slider and wire slider <-> (q1, q2) in both directions."""
        self.range_slider = IntRangeSlider(description='Filter Range', value=(self.q1, self.q2),
                                           layout=Layout(width='100%'))
        self.range_slider.observe(self.slid_changed, 'value')
        self.observe(self.changed, ['q1', 'q2'])

    def slid_changed(self, new):
        """Copy the slider bounds into the q1 / q2 traits."""
        self.q1 = self.range_slider.value[0]
        self.q2 = self.range_slider.value[1]

    def changed(self, new):
        """React to a q1 / q2 change: sync the slider and recolour the heat map."""
        self.range_slider.value = (self.q1, self.q2)
        self.quant_func = quantile_space(self.data.values.flatten(), q1=self.q1, q2=self.q2)
        self.heat_map.color = self._color_values()

    def get_filtered_df(self, fill_type='median'):
        """Return the data with values outside the [q1, q2] percentile band replaced.

        Percentile cut-offs are computed over the whole frame; each replaced
        cell receives the median or mean of the remaining in-band values of
        its column.

        Parameters
        ----------
        fill_type : {'median', 'mean'}
            Column statistic used as the replacement value.

        Raises
        ------
        ValueError
            If ``fill_type`` is neither 'median' nor 'mean'.
        """
        if fill_type not in ('median', 'mean'):
            raise ValueError("fill_type must be one of ('median', 'mean')")
        q1_x, q2_x = np.percentile(self.data, [self.q1, self.q2])
        in_band = (self.data >= q1_x) & (self.data <= q2_x)
        # Boolean-frame indexing turns out-of-band cells into NaN.
        masked = self.data[in_band]
        return masked.apply(lambda col: col.fillna(getattr(col, fill_type)()))
We define the size of our matrix here. Larger matrices require a larger height.
In [ ]:
# Side length of the square random matrix; it is also the number of column labels generated.
size = 100
In [ ]:
def num_to_col_letters(num):
    """Convert a 1-based number to spreadsheet-style column letters.

    1 -> 'A', 26 -> 'Z', 27 -> 'AA', 703 -> 'AAA'. ``0`` yields ''.

    Raises
    ------
    ValueError
        If ``num`` is negative (floor division would otherwise never reach 0
        and the loop would not terminate).
    """
    if num < 0:
        raise ValueError("num must be a non-negative integer")
    digits = []
    while num:
        # Subtract 1 first: the numbering has no zero digit ('A' is 1, not 0).
        num, remainder = divmod(num - 1, 26)
        digits.append(chr(remainder + 65))
    # Least-significant letter was produced first.
    return ''.join(reversed(digits))
# Spreadsheet-style labels ('A', 'B', ..., 'Z', 'AA', ...) for numbers 1..size.
letters = [num_to_col_letters(idx) for idx in range(1, size + 1)]
In [ ]:
# size x size frame of unseeded random draws from np.random.randn, with the letter labels as column names.
data = pd.DataFrame(np.random.randn(size, size), columns=letters)
In [ ]:
# `height` and `colors` are consumed by DNA; `title` is not and is forwarded through **kwargs to the Figure.
data_dna = DNA(data, title='DNA of our Data', height='1400px', colors=['Red', 'White', 'Green'])
# Bare expression as the last line of the cell so the notebook displays the widget.
data_dna
Instead of setting the quantiles with the slider, we can also set them programmatically. Using a range of (5, 95) restricts the data considerably.
In [ ]:
# Assigning q1 / q2 fires DNA.changed, which updates the slider and recolours the heat map.
data_dna.q1, data_dna.q2 = 5, 95
Now, we can use the convenience function to extract a clean DataFrame.
In [ ]:
# Default fill_type='median': values outside the [q1, q2] percentile band are replaced by their column's median.
data_clean = data_dna.get_filtered_df()
By default, the DNA fills outliers with the median of the column. Alternatively, we can fill the outliers with the column mean.
In [ ]:
# Same filtering, but out-of-band values are replaced by their column's mean instead of the median.
data_mean = data_dna.get_filtered_df(fill_type='mean')
We can also visualize the new DataFrame the same way to test how our outliers look now.
In [ ]:
# New widget on the cleaned frame; no `quantiles` given, so it starts at the default (1, 99).
DNA(data_clean, title='Cleaned Data', height='1200px', colors=['Red', 'White', 'Green'])
In [ ]: