Data Dashboard library/plugin concept

Purpose

The purpose of this library/plugin is to make the data analysis process easier and more automatic.

The library brings together a set of methods that help with data manipulation, data cleaning and data visualization.

Data information

When a dataframe is selected, the library shows some information about it, such as:

total number of attributes;
total number of instances;
type of each attribute;
total number of NaN values for each attribute.

Data cleaning

For the best experience, the user should clean and prepare the data before calling the graphical functions.

Data visualization

Data visualization is divided into 2 parts: the first is the summary of the data, as described in section 1.1.1; the second consists of graphical summaries.

The graphical summaries are composed of 2 sections: the first is an interactive parameterized chart where the user can choose its fields; the second is a composite chart where the user can see all fields combined with a specific field.

Source

Initializing

Requirements:

IPython;
ipywidgets;
matplotlib;
numpy;
pandas;
qgrid;
textwrap;
traceback.



In [1]:

    
from IPython.display import display, HTML
from ipywidgets import widgets, interactive, IntSlider
from matplotlib import pyplot as plt

import numpy as np
import pandas as pd
import qgrid  # https://github.com/quantopian/qgrid
# import statsmodels.api as sm
import textwrap
import traceback

# Apply matplotlib's 'ggplot' style sheet to the figures created afterwards.
plt.style.use('ggplot')
# IPython line magic (only valid inside IPython/Jupyter, not plain Python):
# selects matplotlib's 'notebook' backend.
%matplotlib notebook

Summarized data functions



In [2]:

    
def _unique_values_repr(series: pd.Series, limit: int = 1000) -> str:
    """
    Return the sorted distinct non-null values of ``series`` as text.

    The text is ``str()`` of the sorted list, cut to at most ``limit``
    characters so that high-cardinality columns stay readable.
    """
    values = set(series.dropna())
    try:
        ordered = sorted(values)
    except TypeError:
        # Object columns may mix types that cannot be compared with each
        # other (e.g. int and str): group by type name, then order by text.
        ordered = sorted(values, key=lambda v: (type(v).__name__, str(v)))
    return str(ordered)[:limit]


def summary(data: pd.DataFrame) -> pd.DataFrame:
    """
    Build a per-column overview of ``data``.

    Parameters
    ----------
    data : dataframe to describe.

    Returns
    -------
    A dataframe indexed by the column names of ``data`` with the columns:

    - ``Types``: dtype of the column;
    - ``Set Values``: sorted distinct non-null values as text (truncated
      to 1000 characters);
    - ``# Observations``: number of non-null values;
    - ``# NaN``: number of null values.
    """
    parts = (
        ('Types', data.dtypes),
        ('Set Values', data.apply(_unique_values_repr)),
        ('# Observations', data.count()),
        ('# NaN', data.isnull().sum()),
    )

    result = None
    for label, values in parts:
        # each part is an unnamed Series, so its single column is named 0
        column = pd.DataFrame(values).rename(columns={0: label})
        if result is None:
            result = column
        else:
            result = pd.merge(
                result, column, left_index=True, right_index=True
            )
    return result


def make_chart(data: pd.DataFrame, ax: plt.Axes):
    """
    Draw ``data`` as a stacked bar chart on ``ax``.

    Ex:
    k = ['Sex', 'Survived']
    df[k].groupby(by='Sex').sum()

    Parameters
    ----------
    data : table to plot (one bar per row, one stacked segment per column).
    ax : axes that receive the chart.

    Returns
    -------
    The same ``ax`` that was passed in.

    Plotting errors are not raised: the traceback is rendered in the
    notebook output instead, so an interactive widget keeps working after
    an invalid selection.
    """
    from html import escape

    try:
        data.plot.bar(ax=ax, stacked=True)

        # Style the axes that were handed in rather than whatever pyplot
        # currently considers the "current" axes.
        ax.grid(True)
        plt.setp(ax.get_xticklabels(), rotation=45)
        ax.figure.tight_layout()
    except Exception:
        lines = textwrap.wrap(traceback.format_exc(), 80)
        # Escape each line: tracebacks contain text such as "<module>" that
        # would otherwise be parsed as HTML tags.
        display(HTML('<br/>'.join(escape(line) for line in lines)))
    return ax


def process_query(
    data: pd.DataFrame, 
    field_reference: str, 
    fields_comparison: [str],
    bins: int
) -> pd.DataFrame:
    """
    Cross-tabulate the comparison fields against the reference field.

    Parameters
    ----------
    data : source dataframe; it is not modified.
    field_reference : column whose values become the columns of the result.
    fields_comparison : columns whose values become the rows of the result.
    bins : number of equal-width intervals used to discretise every
        selected numeric column before counting.

    Returns
    -------
    ``data`` itself when either selection is empty, otherwise the
    ``pd.crosstab`` frequency table of the selected columns.
    """
    if not (fields_comparison and field_reference):
        return data

    # dict.fromkeys drops repeated names while keeping their order; a name
    # selected twice would otherwise make ``_data[name]`` a DataFrame.
    selected = list(
        dict.fromkeys(list(fields_comparison) + [field_reference])
    )
    _data = data[selected].copy()

    for f in selected:
        # Inspect the dtype's scalar class directly; no need to build a value.
        if not issubclass(data[f].dtype.type, np.number):
            continue
        try:
            _data[f] = pd.cut(data[f], bins=bins)
        except (TypeError, ValueError):
            # Best effort: a column that cannot be binned (for example one
            # holding only NaN) is cross-tabulated on its raw values.
            pass

    return pd.crosstab(
        [_data[f] for f in fields_comparison], 
        _data[field_reference]
    )

DataAnalysisWidget



In [3]:

    
class DataAnalysisWidget:
    """
    Notebook helper around a dataframe: relabels categorical fields, shows
    a per-column summary and builds interactive cross-tabulation charts.
    """
    def __init__(
        self, data: pd.DataFrame
    ):
        # work on a private copy so the caller's dataframe is never changed
        self.data = data.copy()

    @classmethod
    def load(cls, filepath: str):
        """
        Create a widget from the CSV file at ``filepath``.
        """
        return cls(pd.read_csv(filepath))

    def prepare_data(self, fields: dict):
        """
        Relabel the values of the given fields and make them categorical.

        fields: {'field_name1': {old_value: new_value}}

        Every field named in ``fields`` is converted to the ``category``
        dtype, even when its mapping is empty. Values without a mapping
        entry are kept unchanged.
        """
        for field, mapping in fields.items():
            original = self.data[field]
            # Relabel on an object copy: the new labels may have a different
            # type than the column (e.g. strings replacing integers).
            relabeled = original.astype(object)
            for old_label, new_label in mapping.items():
                # Masks are computed on the untouched column so that one
                # replacement never feeds into another (a->b, b->a works).
                relabeled[original == old_label] = new_label
            # Categories are inferred from the distinct non-null values.
            self.data[field] = relabeled.astype('category')

    def summary(self):
        """
        Display the per-column summary table of the current data.
        """
        # ``summary`` here is the module-level function, not this method
        return display(summary(self.data))

    def _build_interactive(
        self, callback, field_reference: str, fields_comparison: [str]
    ):
        """
        Wire ``callback`` to a Y dropdown, an Xs multi-select and a bins
        slider, with the given fields preselected.
        """
        field_names = list(self.data.columns)

        w_bins = IntSlider(min=2, max=10, value=2)
        # ``value`` carries the initial selection; with a plain list of
        # options each option's value equals its label.
        w_fields_comparison = widgets.SelectMultiple(
            description='Xs:',
            options=field_names,
            value=tuple(fields_comparison)
        )
        w_field_reference = widgets.Dropdown(
            description='Y:',
            options=field_names,
            value=field_reference
        )

        return interactive(
            callback,
            field_reference=w_field_reference,
            fields_comparison=w_fields_comparison,
            bins=w_bins
        )

    def _interative_show_chart(
        self, field_reference: str, fields_comparison: [str], bins
    ):
        """
        Widget callback: display the cross-tabulation of the selected
        fields and plot it on a new figure.
        """
        ax = plt.figure().gca()

        _data = process_query(
            data=self.data, 
            field_reference=field_reference, 
            fields_comparison=fields_comparison,
            bins=bins
        )

        display(_data)
        make_chart(data=_data, ax=ax)

    def show_chart(self, field_reference: str, fields_comparison: [str]):
        """
        Return the interactive chart with ``field_reference`` as Y and
        ``fields_comparison`` preselected as Xs.
        """
        return self._build_interactive(
            self._interative_show_chart,
            field_reference=field_reference,
            fields_comparison=fields_comparison
        )

    def _interative_show_panel_chart(
        self, field_reference: str, fields_comparison: [str], bins
    ):
        """
        Widget callback of the panel chart; it renders exactly like
        ``_interative_show_chart``.
        """
        self._interative_show_chart(
            field_reference=field_reference,
            fields_comparison=fields_comparison,
            bins=bins
        )

    def show_panel_chart(self, field_reference: str):
        """
        Return the interactive chart with ``field_reference`` as Y and
        every other field preselected as Xs.
        """
        return self._build_interactive(
            self._interative_show_panel_chart,
            field_reference=field_reference,
            fields_comparison=[
                i for i in self.data.columns if not i == field_reference
            ]
        )

    def __repr__(self):
        # Empty on purpose, presumably so that a notebook cell ending with
        # the object prints nothing -- TODO confirm.
        return ''

Interactive Data Analysis



In [4]:

    
# Read the CSV file into a widget, then relabel the coded fields and turn
# all four listed fields into categoricals.
daw = DataAnalysisWidget.load('data/train.csv')
daw.prepare_data({
    'Survived': {1: 'Survived', 0: 'Died'},
    'Pclass': {1: 'Class1', 2: 'Class2', 3: 'Class3'},
    # empty mapping: values are kept, the field only becomes categorical
    'Sex': {},
    'Embarked': {'C': 'Cherbourg', 'Q': 'Queenstown', 'S': 'Southampton'}
})



In [5]:

    
# Display the per-column overview: dtype, distinct values, number of
# observations and number of NaN values.
daw.summary()









    






  
    
      
      Types
      Set Values
      # Observations
      # NaN
    
  
  
    
      PassengerId
      int64
      [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...
      891
      0
    
    
      Survived
      category
      ['Died', 'Survived']
      891
      0
    
    
      Pclass
      category
      ['Class1', 'Class2', 'Class3']
      891
      0
    
    
      Name
      object
      ['Abbing, Mr. Anthony', 'Abbott, Mr. Rossmore ...
      891
      0
    
    
      Sex
      category
      ['female', 'male']
      891
      0
    
    
      Age
      float64
      [0.42, 0.67, 0.75, 0.83, 0.92, 1.0, 2.0, 3.0, ...
      714
      177
    
    
      SibSp
      int64
      [0, 1, 2, 3, 4, 5, 8]
      891
      0
    
    
      Parch
      int64
      [0, 1, 2, 3, 4, 5, 6]
      891
      0
    
    
      Ticket
      object
      ['110152', '110413', '110465', '110564', '1108...
      891
      0
    
    
      Fare
      float64
      [0.0, 4.0125, 5.0, 6.2375, 6.4375, 6.45, 6.495...
      891
      0
    
    
      Cabin
      object
      ['A10', 'A14', 'A16', 'A19', 'A20', 'A23', 'A2...
      204
      687
    
    
      Embarked
      category
      ['Cherbourg', 'Queenstown', 'Southampton']
      889
      2



In [6]:

    
# Build the interactive chart, asking for 'Survived' as the Y field and
# 'Sex' as the initially selected X field. The call is the last expression
# of the cell, so the returned widget is what the notebook renders.
daw.show_chart(
    field_reference='Survived',
    fields_comparison=['Sex']
)

References

http://pandas.pydata.org/pandas-docs/stable/categorical.html

http://pandas.pydata.org/pandas-docs/stable/generated/pandas.crosstab.html

http://pandas.pydata.org/pandas-docs/stable/groupby.html

http://ipywidgets.readthedocs.io/en/latest/examples/Using%20Interact.html

	Types	Set Values	# Observations	# NaN
PassengerId	int64	[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...	891	0
Survived	category	['Died', 'Survived']	891	0
Pclass	category	['Class1', 'Class2', 'Class3']	891	0
Name	object	['Abbing, Mr. Anthony', 'Abbott, Mr. Rossmore ...	891	0
Sex	category	['female', 'male']	891	0
Age	float64	[0.42, 0.67, 0.75, 0.83, 0.92, 1.0, 2.0, 3.0, ...	714	177
SibSp	int64	[0, 1, 2, 3, 4, 5, 8]	891	0
Parch	int64	[0, 1, 2, 3, 4, 5, 6]	891	0
Ticket	object	['110152', '110413', '110465', '110564', '1108...	891	0
Fare	float64	[0.0, 4.0125, 5.0, 6.2375, 6.4375, 6.45, 6.495...	891	0
Cabin	object	['A10', 'A14', 'A16', 'A19', 'A20', 'A23', 'A2...	204	687
Embarked	category	['Cherbourg', 'Queenstown', 'Southampton']	889	2

Table of Contents