The purpose of this library/plugin is to make the data analysis process easier and more automated.
The library brings together a set of methods that help with data manipulation, data cleaning and data visualization.
When a dataframe is selected, the library shows some information about that dataframe, such as:
To improve the user experience, the user should clean and prepare the data before calling the graphical functions.
Data visualization is divided into 2 parts: the first is the summary of the data, as described in section 1.1.1; the second is the set of graphical summaries.
The graphical summaries are composed of 2 sections: the first is an interactive, parameterized chart in which the user can choose the fields; the second is a composite chart in which the user can see all fields combined with a specific field.
Requirements:
In [1]:
from IPython.display import display, HTML
from ipywidgets import widgets, interactive, IntSlider
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
import qgrid # https://github.com/quantopian/qgrid
# import statsmodels.api as sm
import textwrap
import traceback
plt.style.use('ggplot')
%matplotlib notebook
In [2]:
def _unique_values_repr(series: pd.Series, limit: int = 1000) -> str:
    """Return the sorted distinct non-null values of *series* as text.

    The text is cut to *limit* characters so that columns with many distinct
    values do not blow up the summary table.
    """
    distinct = set(series.dropna())
    try:
        ordered = sorted(distinct)
    except TypeError:
        # Columns mixing incomparable types (e.g. str and int) cannot be
        # sorted naturally; order them by their text form instead of failing.
        ordered = sorted(distinct, key=str)
    return str(ordered)[:limit]


def summary(data: pd.DataFrame) -> pd.DataFrame:
    """Describe every column of *data* in one table.

    Args:
        data: the table to describe.

    Returns:
        A DataFrame indexed by the column names of *data* with the columns
        ``'Types'`` (dtype), ``'Set Values'`` (sorted distinct non-null
        values as text, at most 1000 characters), ``'# Observations'``
        (non-null count) and ``'# NaN'`` (null count).
    """
    set_values = pd.Series(
        [_unique_values_repr(column) for _, column in data.items()],
        index=data.columns, dtype=object
    )
    # All four pieces share the column index of `data`, so they line up
    # without any explicit merge.
    return pd.DataFrame(
        {
            'Types': data.dtypes,
            'Set Values': set_values,
            '# Observations': data.count(),
            '# NaN': data.isnull().sum(),
        },
        columns=['Types', 'Set Values', '# Observations', '# NaN']
    )
def make_chart(data: pd.DataFrame, ax: plt.Axes):
    """Draw *data* as a stacked bar chart on *ax* and return *ax*.

    Plotting errors are not propagated: the traceback is displayed in the
    notebook instead, so a bad field selection does not break the caller.

    Example of a suitable input::

        k = ['Sex', 'Survived']
        df[k].groupby(by='Sex').sum()

    Args:
        data: table to plot; passed to ``DataFrame.plot.bar``.
        ax: axes that receive the chart.

    Returns:
        The *ax* that was passed in.
    """
    try:
        data.plot.bar(ax=ax, stacked=True)
        # Style the axes we were given rather than whatever pyplot currently
        # considers the "current" axes.
        ax.grid(True)
        plt.setp(ax.get_xticklabels(), rotation=45)
        ax.figure.tight_layout()
    except Exception:
        # Best effort: show the failure in the output area. The traceback is
        # escaped because it contains markers such as "<module>" that would
        # otherwise be read as HTML tags.
        import html
        display(HTML(
            '<pre>{}</pre>'.format(html.escape(traceback.format_exc()))
        ))
    return ax
def process_query(
    data: pd.DataFrame,
    field_reference: str,
    fields_comparison: [str],
    bins: int
) -> pd.DataFrame:
    """Cross-tabulate the comparison fields against the reference field.

    Numeric fields are first split into *bins* intervals with ``pd.cut`` so
    that they can be counted like categories.

    Args:
        data: source table.
        field_reference: column whose values become the crosstab columns.
        fields_comparison: columns whose values become the crosstab rows.
        bins: number of intervals used for numeric fields.

    Returns:
        *data* itself when either selection is empty, otherwise the
        frequency table built by ``pd.crosstab``.
    """
    if not (fields_comparison and field_reference):
        return data

    row_fields = list(fields_comparison)
    # Select each column once, even if the reference field is also one of the
    # comparison fields; duplicated column labels would turn `_data[name]`
    # into a DataFrame instead of a Series.
    selected = []
    for name in row_fields + [field_reference]:
        if name not in selected:
            selected.append(name)

    _data = data[selected].copy()
    for name in selected:
        # Compare the dtype's scalar class directly; no need to instantiate it.
        if not issubclass(data[name].dtype.type, np.number):
            continue
        try:
            _data[name] = pd.cut(data[name], bins=bins)
        except (TypeError, ValueError):
            # Binning is best effort (e.g. a column holding only NaN cannot
            # be cut); the field is then tabulated with its raw values.
            continue

    return pd.crosstab(
        [_data[name] for name in row_fields],
        _data[field_reference]
    )
In [3]:
class DataAnalysisWidget:
    """Notebook helper that summarises and charts a DataFrame interactively.

    The instance keeps its own copy of the table in ``self.data``;
    ``prepare_data`` relabels and categorises columns of that copy, and the
    ``show_*`` methods return ipywidgets controls that redraw a crosstab and
    a stacked bar chart whenever the selection changes.
    """

    def __init__(
        self, data: pd.DataFrame
    ):
        # Work on a private copy so relabelling never touches the caller's
        # frame.
        self.data = data.copy()

    @classmethod
    def load(cls, filepath: str):
        """Build a widget from the CSV file at *filepath*."""
        return cls(pd.read_csv(filepath))

    def prepare_data(self, fields: dict):
        """Relabel values and convert the listed columns to categorical.

        Args:
            fields: ``{'field_name1': {old_value: new_value}}``. A field with
                an empty mapping is converted to categorical without any
                relabelling.
        """
        for field, relabel in fields.items():
            source = self.data[field]
            # Relabel on an object-typed copy: new labels (often strings) may
            # not fit the original dtype, and matching against the untouched
            # `source` keeps one replacement from feeding into the next
            # (e.g. {1: 2, 2: 3}).
            relabelled = source.astype(object)
            for old_label, new_label in relabel.items():
                relabelled.loc[source == old_label] = new_label
            # Let pandas infer the categories from the values present.
            self.data[field] = relabelled.astype('category')

    def summary(self):
        """Display the per-column summary table of the current data."""
        return display(summary(self.data))

    def _render_query(
        self, field_reference: str, fields_comparison: [str], bins
    ):
        """Show the crosstab for the selection and chart it on a new figure."""
        ax = plt.figure().gca()
        _data = process_query(
            data=self.data,
            field_reference=field_reference,
            fields_comparison=fields_comparison,
            bins=bins
        )
        display(_data)
        make_chart(data=_data, ax=ax)

    def _build_controls(self, callback, field_reference, fields_comparison):
        """Create the Y / Xs / bins controls and bind them to *callback*.

        The initial selection is passed as ``value``; the
        ``selected_label`` / ``selected_labels`` keywords used previously
        belong to an older ipywidgets API.
        """
        columns = list(self.data.keys())
        w_field_reference = widgets.Dropdown(
            description='Y:',
            options=columns,
            value=field_reference
        )
        w_fields_comparison = widgets.SelectMultiple(
            description='Xs:',
            options=columns,
            value=tuple(fields_comparison)
        )
        w_bins = IntSlider(min=2, max=10, value=2)
        return interactive(
            callback,
            field_reference=w_field_reference,
            fields_comparison=w_fields_comparison,
            bins=w_bins
        )

    def _interative_show_chart(
        self, field_reference: str, fields_comparison: [str], bins
    ):
        """Callback of ``show_chart``: redraw for the current selection."""
        self._render_query(field_reference, fields_comparison, bins)

    def show_chart(self, field_reference: str, fields_comparison: [str]):
        """Return interactive controls preselected with the given fields.

        Args:
            field_reference: column initially selected as the Y field.
            fields_comparison: columns initially selected as the X fields.
        """
        return self._build_controls(
            self._interative_show_chart, field_reference, fields_comparison
        )

    def _interative_show_panel_chart(
        self, field_reference: str, fields_comparison: [str], bins
    ):
        """Callback of ``show_panel_chart``: redraw for the selection."""
        self._render_query(field_reference, fields_comparison, bins)

    def show_panel_chart(self, field_reference: str):
        """Return interactive controls comparing every other field to one.

        Args:
            field_reference: column initially selected as the Y field; all
                remaining columns start selected as X fields.
        """
        other_fields = [
            name for name in self.data.keys() if not name == field_reference
        ]
        return self._build_controls(
            self._interative_show_panel_chart, field_reference, other_fields
        )

    def __repr__(self):
        # Empty on purpose-looking: evaluating an instance in a cell prints
        # nothing. NOTE(review): confirm this is intended before changing it.
        return ''
In [4]:
# Per-column relabelling: {column: {old_value: new_value}}. 'Sex' has no
# replacements and is passed with an empty mapping.
relabelling = {
    'Survived': {1: 'Survived', 0: 'Died'},
    'Pclass': {1: 'Class1', 2: 'Class2', 3: 'Class3'},
    'Sex': {},
    'Embarked': {'C': 'Cherbourg', 'Q': 'Queenstown', 'S': 'Southampton'}
}

# Read the training CSV into a widget, then apply the relabelling.
daw = DataAnalysisWidget.load('data/train.csv')
daw.prepare_data(relabelling)
In [5]:
# Display the per-column summary table (types, distinct values, number of
# observations and number of NaN) for the prepared data.
daw.summary()
In [6]:
# Interactive chart with 'Survived' preselected as the Y field and 'Sex' as
# the X field; kept as the cell's final expression so the notebook renders
# the returned widget.
daw.show_chart('Survived', ['Sex'])