Data Dashboard library/plugin concept

Purpose

The purpose of this library/plugin is to make the data analysis process easier and more automatic.

The library brings together a set of methods that help with data manipulation, data cleaning and data visualization.

Data information

When a dataframe is selected, the library shows some information about it, such as:

  • total number of attributes;
  • total number of instances;
  • type of each attribute;
  • total number of NaN values for each attribute.

Data cleaning

For the best experience, the user should clean and prepare the data before calling the graphical functions.

Data visualization

Data visualization is divided into two parts: the first is the summary of the data, as described in section 1.1.1; the second consists of graphical summaries.

The graphical summaries are composed of two sections: the first is an interactive parameterized chart where the user can choose its fields; the second is a composite chart where the user can see all fields combined with a specific field.

Source

Initializing

Requirements:

  • IPython;
  • ipywidgets;
  • matplotlib;
  • numpy;
  • pandas;
  • qgrid;
  • textwrap (standard library);
  • traceback (standard library).

In [1]:
from IPython.display import display, HTML
from ipywidgets import widgets, interactive, IntSlider
from matplotlib import pyplot as plt

import numpy as np
import pandas as pd
import qgrid  # https://github.com/quantopian/qgrid
# import statsmodels.api as sm
import textwrap
import traceback

plt.style.use('ggplot')
%matplotlib notebook

Summarized data functions


In [2]:
def _format_unique_values(column: pd.Series, limit: int = 1000) -> str:
    """
    Return the distinct non-null values of ``column`` as a sorted list
    rendered to text and truncated to ``limit`` characters.
    """
    distinct = set(column.dropna())
    try:
        ordered = sorted(distinct)
    except TypeError:
        # Values that cannot be compared with each other (e.g. int and str
        # in the same object column) would abort the whole summary; order
        # them by their text form instead.
        ordered = sorted(distinct, key=str)
    return str(ordered)[:limit]


def summary(data: pd.DataFrame):
    """
    Build a per-column overview of ``data``.

    The result has one row per column of ``data`` (indexed by column name)
    and the following columns:

    - ``Types``: dtype of the column;
    - ``Set Values``: sorted distinct non-null values, as text, cut at
      1000 characters;
    - ``# Observations``: number of non-null values;
    - ``# NaN``: number of null values.

    :param data: dataframe to describe
    :return: the overview dataframe
    """
    # Columns are addressed by position so that each one is always handled
    # as a single Series.
    set_values = [
        _format_unique_values(data.iloc[:, position])
        for position in range(data.shape[1])
    ]

    column_order = ['Types', 'Set Values', '# Observations', '# NaN']
    return pd.DataFrame(
        {
            'Types': data.dtypes.values,
            'Set Values': set_values,
            '# Observations': data.count().values,
            '# NaN': data.isnull().sum().values,
        },
        index=data.columns,
        columns=column_order,
    )


def make_chart(data: pd.DataFrame, ax: plt.Axes):
    """
    Draw ``data`` as a stacked bar chart on ``ax``.

    Ex:
    k = ['Sex', 'Survived']
    df[k].groupby(by='Sex').sum()

    If plotting fails, the exception is not propagated: its traceback is
    wrapped at 80 characters and shown as HTML in the notebook output.

    :param data: table to plot
    :param ax: matplotlib axes to draw on
    :return: ``ax``
    """
    from html import escape

    try:
        data.plot.bar(ax=ax, stacked=True)

        # Style the axes that were passed in (and their figure) rather than
        # whatever pyplot currently considers the active axes.
        ax.grid(True)
        for tick_label in ax.get_xticklabels():
            tick_label.set_rotation(45)
        ax.figure.tight_layout()
    except Exception:
        wrapped = textwrap.wrap(traceback.format_exc(), 80)
        # The lines are joined with <br/>, so they must be rendered as HTML;
        # escape them first because tracebacks may contain '<' and '>'.
        display(HTML('<br/>'.join(escape(line) for line in wrapped)))
    return ax


def process_query(
    data: pd.DataFrame,
    field_reference: str,
    fields_comparison: [str],
    bins: int
) -> pd.DataFrame:
    """
    Cross-tabulate the comparison fields against the reference field.

    Numeric fields are first discretised into ``bins`` equal-width
    intervals with ``pd.cut``; a field that cannot be binned is used with
    its raw values.

    :param data: source dataframe (not modified)
    :param field_reference: column whose values become the crosstab columns
    :param fields_comparison: columns whose values become the crosstab rows
    :param bins: number of intervals used for numeric fields
    :return: ``data`` itself when either selection is empty, otherwise the
        crosstab of ``fields_comparison`` by ``field_reference``
    """
    if not (fields_comparison and field_reference):
        return data

    comparison = list(fields_comparison)

    # The same field may be chosen as both reference and comparison; select
    # each column only once so that `_data[name]` is always a single Series.
    selected = []
    for name in comparison + [field_reference]:
        if name not in selected:
            selected.append(name)

    _data = data[selected].copy()
    for name in selected:
        column = data[name]
        # `dtype.type` is the scalar class of the column; checking the class
        # avoids having to instantiate it.
        if not issubclass(column.dtype.type, np.number):
            continue
        try:
            _data[name] = pd.cut(column, bins=bins)
        except (TypeError, ValueError):
            # Best effort: keep the raw values when the column cannot be
            # binned.
            continue

    return pd.crosstab(
        [_data[name] for name in comparison],
        _data[field_reference]
    )

DataAnalysisWidget


In [3]:
class DataAnalysisWidget:
    """
    Notebook helper around a dataframe: relabels columns, displays a
    tabular summary and builds interactive crosstab bar charts.
    """

    def __init__(
        self, data: pd.DataFrame
    ):
        # Keep a private copy so later relabelling never touches the
        # caller's dataframe.
        self.data = data.copy()

    @classmethod
    def load(cls, filepath: str):
        """
        Create a widget from the CSV file at ``filepath``.
        """
        return cls(pd.read_csv(filepath))

    def prepare_data(self, fields: dict):
        """
        Relabel the given fields and convert them to categoricals.

        fields: {'field_name1': {old_value: new_value}}

        Every listed field becomes a categorical column, even when its
        mapping is empty. Replacements are matched against the values the
        column had before this call, so mappings do not chain
        (``{1: 0, 0: 1}`` swaps the two values). Categories are listed in
        order of first appearance in the column.
        """
        for field, mapping in fields.items():
            original = self.data[field]

            # Relabel on an object-typed copy so that new labels of another
            # type (e.g. text replacing integers) can be stored.
            relabelled = original.astype(object)
            for old_label, new_label in mapping.items():
                relabelled[original == old_label] = new_label

            categories = relabelled.dropna().unique()
            self.data[field] = relabelled.astype(
                pd.api.types.CategoricalDtype(categories=categories)
            )

    def summary(self):
        """
        Display the per-column summary table of the wrapped dataframe.
        """
        return display(summary(self.data))

    def _display_crosstab_chart(
        self, field_reference: str, fields_comparison: [str], bins
    ):
        """
        Open a new figure, then display the crosstab for the current
        selection both as a table and as a stacked bar chart.
        """
        ax = plt.figure().gca()

        _data = process_query(
            data=self.data,
            field_reference=field_reference,
            fields_comparison=fields_comparison,
            bins=bins
        )

        display(_data)
        make_chart(data=_data, ax=ax)

    def _build_interactive(
        self, callback, field_reference: str, fields_comparison: [str]
    ):
        """
        Wire ``callback`` to a 'Y:' dropdown, an 'Xs:' multi-select and a
        bins slider (2 to 10), pre-selecting the given fields.
        """
        columns = list(self.data.columns)

        w_field_reference = widgets.Dropdown(
            description='Y:',
            options=columns,
            value=field_reference
        )
        w_fields_comparison = widgets.SelectMultiple(
            description='Xs:',
            options=columns,
            value=tuple(fields_comparison)
        )
        w_bins = IntSlider(min=2, max=10, value=2)

        return interactive(
            callback,
            field_reference=w_field_reference,
            fields_comparison=w_fields_comparison,
            bins=w_bins
        )

    def _interative_show_chart(
        self, field_reference: str, fields_comparison: [str], bins
    ):
        """
        Widget callback used by ``show_chart``.
        """
        self._display_crosstab_chart(field_reference, fields_comparison, bins)

    def show_chart(self, field_reference: str, fields_comparison: [str]):
        """
        Build the interactive chart with ``field_reference`` selected as Y
        and ``fields_comparison`` selected as Xs.
        """
        return self._build_interactive(
            self._interative_show_chart,
            field_reference=field_reference,
            fields_comparison=fields_comparison
        )

    def _interative_show_panel_chart(
        self, field_reference: str, fields_comparison: [str], bins
    ):
        """
        Widget callback used by ``show_panel_chart``.
        """
        self._display_crosstab_chart(field_reference, fields_comparison, bins)

    def show_panel_chart(self, field_reference: str):
        """
        Build the interactive chart with ``field_reference`` selected as Y
        and every other column selected as Xs.
        """
        return self._build_interactive(
            self._interative_show_panel_chart,
            field_reference=field_reference,
            fields_comparison=[
                name for name in self.data.columns
                if not name == field_reference
            ]
        )

    def __repr__(self):
        # Empty on purpose in the original; presumably so that a cell ending
        # in a widget expression prints nothing -- TODO confirm.
        return ''

Interactive Data Analysis


In [4]:
# Load the training CSV into the widget.
daw = DataAnalysisWidget.load('data/train.csv')
# Replace coded values with readable labels and turn each listed column into
# a categorical; 'Sex' has an empty mapping, so its values are kept and it is
# only converted.
daw.prepare_data({
    'Survived': {1: 'Survived', 0: 'Died'},
    'Pclass': {1: 'Class1', 2: 'Class2', 3: 'Class3'},
    'Sex': {},
    'Embarked': {'C': 'Cherbourg', 'Q': 'Queenstown', 'S': 'Southampton'}
})

In [5]:
# Display dtype, distinct values, non-null count and NaN count per column.
daw.summary()


Types Set Values # Observations # NaN
PassengerId int64 [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14... 891 0
Survived category ['Died', 'Survived'] 891 0
Pclass category ['Class1', 'Class2', 'Class3'] 891 0
Name object ['Abbing, Mr. Anthony', 'Abbott, Mr. Rossmore ... 891 0
Sex category ['female', 'male'] 891 0
Age float64 [0.42, 0.67, 0.75, 0.83, 0.92, 1.0, 2.0, 3.0, ... 714 177
SibSp int64 [0, 1, 2, 3, 4, 5, 8] 891 0
Parch int64 [0, 1, 2, 3, 4, 5, 6] 891 0
Ticket object ['110152', '110413', '110465', '110564', '1108... 891 0
Fare float64 [0.0, 4.0125, 5.0, 6.2375, 6.4375, 6.45, 6.495... 891 0
Cabin object ['A10', 'A14', 'A16', 'A19', 'A20', 'A23', 'A2... 204 687
Embarked category ['Cherbourg', 'Queenstown', 'Southampton'] 889 2

In [6]:
# Interactive crosstab chart: 'Survived' as the reference (Y) field,
# initially compared against 'Sex'.
daw.show_chart(
    field_reference='Survived',
    fields_comparison=['Sex']
)


Survived Died Survived
Sex
male 468 109
female 81 233

References