2.3 Explore Data

Outputs:

  • Data Exploration Report

In [127]:
import nltk
import pandas as pd
import math
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
from matplotlib import gridspec

from sklearn import datasets, linear_model
import numpy as np
from numbers import Number

import seaborn as sns

In [133]:
train= pd.read_csv("../data/train.csv")
train=train.set_index('Id')

In [134]:
def hist_boxplot(column,figsize=(13,8)):
    fig = plt.figure(figsize=figsize) 
    gs = gridspec.GridSpec(2, 1, height_ratios=[1,4])
    ax0 = plt.subplot(gs[0])
    ax0.grid(True)
    ax0.boxplot(column.dropna(),vert=False)
    ax1 = plt.subplot(gs[1])
    ax1.grid(True)
    ax1.hist(column.dropna())
    print (column.describe())
    print ('Null Values:',column.isnull().sum())
    
def hist_and_info(column,figsize=(13,4)):
    column.hist(figsize=figsize)
    print (column.describe())
    print ('Null Values:',column.isnull().sum())
    
def value_counts_and_info(column,figsize=(13,4)):
    column.value_counts().plot(kind='bar',figsize=figsize)
    print (column.value_counts())
    print ('Null Values:',column.isnull().sum())

In [135]:
value_counts_and_info(train['MSSubClass'])


20     536
60     299
50     144
120     87
30      69
160     63
70      60
80      58
90      52
190     30
85      20
75      16
45      12
180     10
40       4
Name: MSSubClass, dtype: int64
Null Values: 0

In [131]:
value_counts_and_info(train['MSZoning'])


RL         1151
RM          218
FV           65
RH           16
C (all)      10
Name: MSZoning, dtype: int64
Null Values: 0

In [136]:
#train['LotFrontage'].hist()
hist_boxplot(train['LotFrontage'])


---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
<ipython-input-136-cde979c62900> in <module>()
      1 #train['LotFrontage'].hist()
----> 2 hist_boxplot(train['LotFrontage'])

<ipython-input-134-899ec38d4142> in hist_boxplot(column, figsize)
      4     ax0 = plt.subplot(gs[0])
      5     ax0.grid(True)
----> 6     ax0.boxplot(column.dropna(),vert=False)
      7     ax1 = plt.subplot(gs[1])
      8     ax1.grid(True)

/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/matplotlib/__init__.py in inner(ax, *args, **kwargs)
   1817                     warnings.warn(msg % (label_namer, func.__name__),
   1818                                   RuntimeWarning, stacklevel=2)
-> 1819             return func(ax, *args, **kwargs)
   1820         pre_doc = inner.__doc__
   1821         if pre_doc is None:

/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/matplotlib/axes/_axes.py in boxplot(self, x, notch, sym, vert, whis, positions, widths, patch_artist, bootstrap, usermedians, conf_intervals, meanline, showmeans, showcaps, showbox, showfliers, boxprops, labels, flierprops, medianprops, meanprops, capprops, whiskerprops, manage_xticks, autorange)
   3172             bootstrap = rcParams['boxplot.bootstrap']
   3173         bxpstats = cbook.boxplot_stats(x, whis=whis, bootstrap=bootstrap,
-> 3174                                        labels=labels, autorange=autorange)
   3175         if notch is None:
   3176             notch = rcParams['boxplot.notch']

/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/matplotlib/cbook.py in boxplot_stats(X, whis, bootstrap, labels, autorange)
   1996 
   1997     # convert X to a list of lists
-> 1998     X = _reshape_2D(X)
   1999 
   2000     ncols = len(X)

/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/matplotlib/cbook.py in _reshape_2D(X)
   2244         # one item
   2245         if len(X.shape) == 1:
-> 2246             if hasattr(X[0], 'shape'):
   2247                 X = list(X)
   2248             else:

/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/pandas/core/series.py in __getitem__(self, key)
    599         key = com._apply_if_callable(key, self)
    600         try:
--> 601             result = self.index.get_value(self, key)
    602 
    603             if not is_scalar(result):

/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/pandas/indexes/base.py in get_value(self, series, key)
   2167         try:
   2168             return self._engine.get_value(s, k,
-> 2169                                           tz=getattr(series.dtype, 'tz', None))
   2170         except KeyError as e1:
   2171             if len(self) > 0 and self.inferred_type in ['integer', 'boolean']:

pandas/index.pyx in pandas.index.IndexEngine.get_value (pandas/index.c:3567)()

pandas/index.pyx in pandas.index.IndexEngine.get_value (pandas/index.c:3250)()

pandas/index.pyx in pandas.index.IndexEngine.get_loc (pandas/index.c:4289)()

pandas/src/hashtable_class_helper.pxi in pandas.hashtable.Int64HashTable.get_item (pandas/hashtable.c:8555)()

pandas/src/hashtable_class_helper.pxi in pandas.hashtable.Int64HashTable.get_item (pandas/hashtable.c:8499)()

KeyError: 0

In [137]:
hist_boxplot(train['LotArea'])


---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
<ipython-input-137-8b11b20002aa> in <module>()
----> 1 hist_boxplot(train['LotArea'])

<ipython-input-134-899ec38d4142> in hist_boxplot(column, figsize)
      4     ax0 = plt.subplot(gs[0])
      5     ax0.grid(True)
----> 6     ax0.boxplot(column.dropna(),vert=False)
      7     ax1 = plt.subplot(gs[1])
      8     ax1.grid(True)

/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/matplotlib/__init__.py in inner(ax, *args, **kwargs)
   1817                     warnings.warn(msg % (label_namer, func.__name__),
   1818                                   RuntimeWarning, stacklevel=2)
-> 1819             return func(ax, *args, **kwargs)
   1820         pre_doc = inner.__doc__
   1821         if pre_doc is None:

/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/matplotlib/axes/_axes.py in boxplot(self, x, notch, sym, vert, whis, positions, widths, patch_artist, bootstrap, usermedians, conf_intervals, meanline, showmeans, showcaps, showbox, showfliers, boxprops, labels, flierprops, medianprops, meanprops, capprops, whiskerprops, manage_xticks, autorange)
   3172             bootstrap = rcParams['boxplot.bootstrap']
   3173         bxpstats = cbook.boxplot_stats(x, whis=whis, bootstrap=bootstrap,
-> 3174                                        labels=labels, autorange=autorange)
   3175         if notch is None:
   3176             notch = rcParams['boxplot.notch']

/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/matplotlib/cbook.py in boxplot_stats(X, whis, bootstrap, labels, autorange)
   1996 
   1997     # convert X to a list of lists
-> 1998     X = _reshape_2D(X)
   1999 
   2000     ncols = len(X)

/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/matplotlib/cbook.py in _reshape_2D(X)
   2244         # one item
   2245         if len(X.shape) == 1:
-> 2246             if hasattr(X[0], 'shape'):
   2247                 X = list(X)
   2248             else:

/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/pandas/core/series.py in __getitem__(self, key)
    599         key = com._apply_if_callable(key, self)
    600         try:
--> 601             result = self.index.get_value(self, key)
    602 
    603             if not is_scalar(result):

/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/pandas/indexes/base.py in get_value(self, series, key)
   2167         try:
   2168             return self._engine.get_value(s, k,
-> 2169                                           tz=getattr(series.dtype, 'tz', None))
   2170         except KeyError as e1:
   2171             if len(self) > 0 and self.inferred_type in ['integer', 'boolean']:

pandas/index.pyx in pandas.index.IndexEngine.get_value (pandas/index.c:3567)()

pandas/index.pyx in pandas.index.IndexEngine.get_value (pandas/index.c:3250)()

pandas/index.pyx in pandas.index.IndexEngine.get_loc (pandas/index.c:4289)()

pandas/src/hashtable_class_helper.pxi in pandas.hashtable.Int64HashTable.get_item (pandas/hashtable.c:8555)()

pandas/src/hashtable_class_helper.pxi in pandas.hashtable.Int64HashTable.get_item (pandas/hashtable.c:8499)()

KeyError: 0

In [138]:
value_counts_and_info(train['Street'])


Pave    1454
Grvl       6
Name: Street, dtype: int64
Null Values: 0

In [ ]:
## Se identificaron los valores NaN indica que no hay camino de entrada.
value_counts_and_info(train['Alley'])

In [ ]:
value_counts_and_info(train['LotShape'])

In [ ]:
value_counts_and_info(train['LandContour'])

In [ ]:
#value_counts_and_info(train['Utilities'])

In [ ]:
value_counts_and_info(train['LotConfig'])

In [ ]:
value_counts_and_info(train['LandSlope'])

In [ ]:
value_counts_and_info(train['Neighborhood'])

In [ ]:
value_counts_and_info(train['Condition1'])

In [ ]:
value_counts_and_info(train['Condition2'])

In [ ]:
value_counts_and_info(train['BldgType'])

In [ ]:
value_counts_and_info(train['HouseStyle'])

In [ ]:
value_counts_and_info(train['OverallQual'])

In [ ]:
value_counts_and_info(train['OverallCond'])

In [ ]:
hist_and_info(train['YearBuilt'])

In [ ]:
hist_and_info(train['YearRemodAdd'])

In [ ]:
value_counts_and_info(train['RoofStyle'])

In [ ]:
value_counts_and_info(train['RoofMatl'])

In [ ]:
value_counts_and_info(train['Exterior1st'])

In [ ]:
value_counts_and_info(train['Exterior2nd'])

In [ ]:
value_counts_and_info(train['MasVnrType'])

In [ ]:
hist_boxplot(train['MasVnrArea'])

In [ ]:
value_counts_and_info(train['ExterQual'])

In [ ]:
value_counts_and_info(train['ExterCond'])

In [ ]:
value_counts_and_info(train['Foundation'])

In [ ]:
value_counts_and_info(train['BsmtQual'])

In [ ]:
value_counts_and_info(train['BsmtCond'])

In [ ]:
value_counts_and_info(train['BsmtExposure'])

In [ ]:
value_counts_and_info(train['BsmtFinType1'])

In [ ]:
hist_boxplot(train['BsmtFinSF1'])

In [ ]:
value_counts_and_info(train['BsmtFinType2'])

In [ ]:
hist_boxplot(train['BsmtFinSF2'])

In [ ]:
hist_and_info(train['BsmtUnfSF'])

In [ ]:
hist_boxplot(train['TotalBsmtSF'])

In [ ]:
value_counts_and_info(train['Heating'])

In [ ]:
value_counts_and_info(train['HeatingQC'])

In [ ]:
value_counts_and_info(train['CentralAir'])

In [ ]:
value_counts_and_info(train['Electrical'])

In [ ]:
hist_boxplot(train['1stFlrSF'])

In [ ]:
hist_boxplot(train['2ndFlrSF'])

In [ ]:
hist_boxplot(train['LowQualFinSF'])

In [ ]:
hist_boxplot(train['GrLivArea'])

In [ ]:
value_counts_and_info(train['BsmtFullBath'])

In [ ]:
value_counts_and_info(train['BsmtHalfBath'])

In [ ]:
value_counts_and_info(train['FullBath'])

In [ ]:
value_counts_and_info(train['HalfBath'])

In [ ]:
value_counts_and_info(train['BedroomAbvGr'])

In [ ]:
value_counts_and_info(train['KitchenAbvGr'])

In [ ]:
value_counts_and_info(train['KitchenQual'])

In [ ]:
value_counts_and_info(train['TotRmsAbvGrd'])

In [ ]:
value_counts_and_info(train['Functional'])

In [ ]:
value_counts_and_info(train['Fireplaces'])

In [ ]:
value_counts_and_info(train['FireplaceQu'])

In [ ]:
value_counts_and_info(train['GarageType'])

In [ ]:
hist_and_info(train['GarageYrBlt'])

In [ ]:
value_counts_and_info(train['GarageFinish'])

In [ ]:
value_counts_and_info(train['GarageCars'])

In [ ]:
hist_boxplot(train['GarageArea'])

In [ ]:
value_counts_and_info(train['GarageQual'])

In [ ]:
value_counts_and_info(train['GarageCond'])

In [ ]:
value_counts_and_info(train['PavedDrive'])

In [ ]:
hist_boxplot(train['WoodDeckSF'])

In [ ]:
hist_boxplot(train['OpenPorchSF'])

In [ ]:
hist_boxplot(train['EnclosedPorch'])

In [ ]:
hist_boxplot(train['3SsnPorch'])

In [ ]:
hist_and_info(train['ScreenPorch'])

In [ ]:
hist_and_info(train['PoolArea'])

In [ ]:
value_counts_and_info(train['PoolQC'])

In [ ]:
value_counts_and_info(train['Fence'])

In [ ]:
value_counts_and_info(train['MiscFeature'])

In [ ]:
value_counts_and_info(train['MiscVal'])

In [ ]:
value_counts_and_info(train['MoSold'])

In [ ]:
value_counts_and_info(train['YrSold'])

In [ ]:
value_counts_and_info(train['SaleType'])

In [ ]:
value_counts_and_info(train['SaleCondition'])

In [139]:
corrmat = train.iloc[:,:-1].corr()
f, ax = plt.subplots(figsize=(12, 9))
sns.heatmap(corrmat, vmax=.8, square=True);



In [ ]:

2.4 Verify Data Quality

Outputs:

  • Data Quality Report