In [ ]:
# load libraries
import numpy as np
import pandas as pd
# Configure pandas' LaTeX display options: enable the LaTeX repr, disable the
# longtable format, and disable escaping of cell contents.
# NOTE(review): the `display.latex.*` option names depend on the pandas
# version -- confirm they exist in the installed release.
pd.set_option('display.latex.repr',True)
pd.set_option('display.latex.longtable',False)
pd.set_option('display.latex.escape',False)
# IPython magic (not plain Python): selects the inline matplotlib backend.
%matplotlib inline
import matplotlib.pyplot as plt
In [ ]:
# Demo data: 1000 rows, three columns ('A', 'B', 'C') of random integers
# drawn from [0, 100).  Each column is produced by a single vectorized
# randint call instead of 1000 scalar calls; the dtype is pinned to int64 so
# the columns have the same dtype as when they were built from Python ints.
df = pd.DataFrame({
    name: np.random.randint(0, 100, size=1000, dtype=np.int64)
    for name in ('A', 'B', 'C')
})
In [ ]:
from IPython.display import display
from IPython.display import display_latex
from IPython.display import display_markdown
In [ ]:
def print_describe(var):
    """Display a two-column summary table for a pandas Series.

    The table has one row each for the series name, its dtype, the dtype's
    name, the number of non-null entries and the number of null entries.
    It is sent to the notebook through ``display`` with ``ipub`` table
    metadata whose caption is set to ``False``.

    Parameters
    ----------
    var : pandas.Series
        The column to summarise.

    Returns
    -------
    None
    """
    # Count non-null entries once; the null count is derived from it.
    not_null = var.count()

    # (label, value) pairs, in the order they appear in the table.
    rows = [
        ('Variable', var.name),
        ('Data type', var.dtype),
        ('Data type name', var.dtype.name),
        ('Not NULL', not_null),
        ('NULLs', len(var) - not_null),
    ]

    # Named `summary` so the module-level `df` is not shadowed.
    summary = pd.DataFrame(
        {'Data': [label for label, _ in rows],
         'Value': [value for _, value in rows]},
        columns=['Data', 'Value'])
    display(summary, metadata={'ipub': {"table": {"caption": False}}})
In [ ]:
def print_top_bottom(var):
    """Display the most and least frequent values of a pandas Series.

    Builds a table with up to five rows.  The ``TOP``/``T.*`` columns hold
    the most common values and their counts; the ``BOTTOM``/``B.*`` columns
    hold the least common values and their counts.  The table is sent to the
    notebook through ``display`` with an ``ipub`` table caption naming the
    column.  Nothing is displayed when ``value_counts()`` is empty.

    Parameters
    ----------
    var : pandas.Series
        The column to analyse.

    Returns
    -------
    None
    """
    counts_desc = var.value_counts()
    if counts_desc.empty:
        # No values to tabulate.
        return

    n_distinct = len(counts_desc)
    n = min(n_distinct, 5)

    # Slice first, then materialize: only n entries are turned into lists.
    top = counts_desc.iloc[:n]

    # Sorted separately in ascending order, as before, rather than reversing
    # the descending counts, so tied counts keep whatever order pandas gives
    # them for an ascending sort.
    bottom = var.value_counts(ascending=True).iloc[:n]

    # NOTE(review): 'TOP: Idx' is 0-based (0..n-1) while 'BOTTOM: Idx' counts
    # down from n_distinct (1-based).  Kept as-is; confirm which is intended.
    table = pd.DataFrame(
        {'TOP: Idx': range(n),
         'T.Data': list(top.index),
         'T.Count': list(top),
         'BOTTOM: Idx': range(n_distinct, 0, -1)[:n],
         'B.Data': list(bottom.index),
         'B.Count': list(bottom)},
        columns=['TOP: Idx', 'T.Data', 'T.Count',
                 'BOTTOM: Idx', 'B.Data', 'B.Count'])

    # Wrap the name in a 1-tuple so a tuple-valued name (e.g. a MultiIndex
    # column label) is formatted instead of raising TypeError.
    caption = 'Column %s' % (var.name,)
    display(table, metadata={'ipub': {"table": {"caption": caption}}})
In [ ]:
def print_bar_chart(var):
    """Plot a pandas Series and display the figure with an ``ipub`` caption.

    NOTE(review): despite the function name, ``var.plot`` is called without
    ``kind``, so pandas' default plot kind is used (a line plot according to
    the pandas documentation).  Confirm whether a bar chart was intended.

    Parameters
    ----------
    var : pandas.Series
        The column to plot.

    Returns
    -------
    None
    """
    ax = var.plot(figsize=(15, 5))
    fig = ax.get_figure()

    # Wrap the name in a 1-tuple so a tuple-valued name is formatted instead
    # of raising TypeError.
    caption = 'Column %s' % (var.name,)
    display(fig, metadata={'ipub': {"figure": {"caption": caption}}})

    # Close exactly the figure that was displayed rather than whichever
    # figure happens to be current (presumably this also prevents the inline
    # backend from rendering it a second time -- confirm).
    plt.close(fig)
In [ ]:
def EDA_univariate(var):
    """Render the single-column report for ``var``.

    Emits three sections in order, each a level-2 markdown heading followed
    by its output: the describe table, the most/least common values, and
    the chart.
    """
    sections = (
        ('## Describe', print_describe),
        ('## Most/Least common', print_top_bottom),
        ('## Chart', print_bar_chart),
    )
    for heading, render in sections:
        display_markdown(heading, raw=True)
        render(var)
In [ ]:
# Produce one report section per column for the first three columns of df.
for col_name in df.columns[:3]:
    # Level-1 heading naming the column.  The 1-tuple keeps the formatting
    # valid when a column label is itself a tuple (e.g. MultiIndex columns).
    display_markdown('# Column: %s' % (col_name,), raw=True)
    col = df[col_name]
    # NOTE(review): `kind` is not used in the visible code; it is kept
    # because it is a module-level name that later cells may read.
    kind = col.dtype.name
    EDA_univariate(col)
    # Raw LaTeX page break after each column's section.
    display_markdown('\\newpage', raw=True)