In [ ]:
# load libraries
import numpy as np
import pandas as pd
# Notebook-wide pandas display options, set once before any table is shown.
# NOTE(review): the per-option notes below are inferred from the option names;
# confirm them against the pandas options documentation for the installed version.
pd.set_option('display.latex.repr',True)  # presumably: give DataFrames a LaTeX repr in notebook output
pd.set_option('display.latex.longtable',False)  # presumably: do not use the longtable environment
pd.set_option('display.latex.escape',False)  # presumably: leave LaTeX special characters in cells unescaped

%matplotlib inline
import matplotlib.pyplot as plt

In [ ]:
# Synthetic demo data: three integer columns ('A', 'B', 'C') of 1000 values
# each, drawn uniformly from [0, 100).
# A locally seeded generator makes the report reproducible on every
# "Restart & Run All" without touching NumPy's global random state, and one
# vectorised draw per column replaces 1000 scalar calls.
_rng = np.random.RandomState(42)
df = pd.DataFrame({name: _rng.randint(0, 100, size=1000).astype(np.int64)
                   for name in ['A', 'B', 'C']},
                  columns=['A', 'B', 'C'])

In [ ]:
from IPython.display import display
from IPython.display import display_latex
from IPython.display import display_markdown

In [ ]:
def print_describe(var):
    """Display a two-column summary table for a single Series.

    The table has the columns 'Data' (label) and 'Value', with one row each
    for the Series name, its dtype, the dtype's name, the number of non-null
    entries and the number of null entries.

    Parameters
    ----------
    var : pandas.Series
        Column to describe.

    Returns
    -------
    None
        The table is shown via ``display`` with ipub table metadata whose
        caption is set to ``False``.
    """
    non_null = var.count()

    # (label, value) pairs in the order they appear in the rendered table.
    rows = [
        ('Variable', var.name),
        ('Data type', var.dtype),
        ('Data type name', var.dtype.name),
        ('Not NULL', non_null),
        # Everything that count() did not count is null.
        ('NULLs', len(var) - non_null),
    ]

    summary = pd.DataFrame({'Data': [label for label, _ in rows],
                            'Value': [item for _, item in rows]},
                           columns=['Data', 'Value'])
    display(summary, metadata={'ipub': {"table": {"caption": False}}})

In [ ]:
def print_top_bottom(var, n=5):
    """Display the most and least frequent values of a Series side by side.

    Values are ranked once by descending frequency. The 'TOP' columns show
    the first ``n`` entries of that ranking and the 'BOTTOM' columns show the
    last ``n`` entries, least frequent first. Both index columns are 0-based
    positions in the same ranking, so a value that appears on both sides
    carries the same index on both sides.

    Parameters
    ----------
    var : pandas.Series
        Column to summarise. Missing values are not ranked, because
        ``value_counts`` drops them by default.
    n : int, default 5
        Maximum number of rows to show; capped at the number of distinct
        values and floored at 0.

    Returns
    -------
    None
        The table is shown via ``display`` with an ipub table caption of
        ``'Column <name>'``. Nothing is displayed when the Series has no
        countable values.
    """
    ranking = var.value_counts()  # sorted by descending frequency
    total = len(ranking)
    if total == 0:
        return

    n = max(0, min(total, n))

    top = ranking.head(n)
    # Read the bottom rows from the *same* ranking (reversed) rather than
    # re-sorting ascending: a second sort may order ties differently, which
    # would make the index column disagree with the value shown next to it.
    bottom = ranking.iloc[::-1].head(n)

    table = pd.DataFrame(
        {'TOP: Idx': list(range(n)),
         'T.Data': top.index.tolist(),
         'T.Count': top.tolist(),
         # 0-based, matching 'TOP: Idx': the least common value sits at
         # position total - 1 of the ranking.
         'BOTTOM: Idx': list(range(total - 1, total - 1 - n, -1)),
         'B.Data': bottom.index.tolist(),
         'B.Count': bottom.tolist()},
        columns=['TOP: Idx', 'T.Data', 'T.Count',
                 'BOTTOM: Idx', 'B.Data', 'B.Count'])

    # Wrap the name in a 1-tuple so that a tuple-valued column name is
    # formatted as a whole instead of being unpacked by the % operator.
    caption = 'Column %s' % (var.name,)
    display(table, metadata={'ipub': {"table": {"caption": caption}}})
    return

In [ ]:
def print_bar_chart(var):
    """Plot a Series and display the resulting figure with an ipub caption.

    Parameters
    ----------
    var : pandas.Series
        Column to plot; its values are drawn with ``var.plot`` on a
        15x5 inch figure.

    Returns
    -------
    None
        The figure is shown via ``display`` with the caption
        ``'Column <name>'`` and is closed afterwards.
    """
    # NOTE(review): despite the function name, no `kind='bar'` is passed, so
    # pandas draws its default plot type — confirm that this is intended.
    ax = var.plot(figsize=(15, 5))
    fig = ax.get_figure()
    try:
        # 1-tuple so a tuple-valued column name is formatted as a whole.
        caption = 'Column %s' % (var.name,)
        display(fig, metadata={'ipub': {"figure": {"caption": caption}}})
    finally:
        # Close exactly this figure (not whichever one happens to be current),
        # and do so even if display() raises, so figures do not pile up when
        # the function is called in a loop.
        plt.close(fig)
    return

In [ ]:
def EDA_univariate(var):
    """Render the three-part univariate report for one column.

    For each part a level-2 markdown heading is emitted, followed by the
    corresponding renderer called with ``var``:

    1. '## Describe'          -> print_describe
    2. '## Most/Least common' -> print_top_bottom
    3. '## Chart'             -> print_bar_chart

    Parameters
    ----------
    var : pandas.Series
        Column handed unchanged to every renderer.
    """
    # (heading, renderer) pairs, listed in output order.
    sections = (
        ('## Describe', print_describe),
        ('## Most/Least common', print_top_bottom),
        ('## Chart', print_bar_chart),
    )
    for heading, render in sections:
        display_markdown(heading, raw=True)
        render(var)

Full Report

For each column, the report will contain:

  • Title (markdown)
  • Details (pandas table)
  • Most/Least common (pandas table)
  • Chart (image/matplotlib)
  • LaTeX (\newpage)

In [ ]:
# Build the full report: for each of the first three columns of `df`, emit a
# level-1 markdown title, the univariate sections produced by EDA_univariate,
# and finally a raw '\newpage' command.
for col_name in df.columns[0:3]:
    
    display_markdown('# Column: %s' % col_name, raw=True)

    col = df[col_name]
    # NOTE(review): `kind` is not read again inside this cell — confirm
    # whether a later cell relies on it before removing.
    kind = col.dtype.name

    EDA_univariate(col)
    
    # Emitted as raw markdown; presumably picked up by the LaTeX/PDF export
    # as a page break between columns — confirm with the export pipeline.
    display_markdown('\\newpage', raw=True)