In [1]:
import pandas as pd
from autoc import DataExploration, NaImputer, PreProcessor
from autoc.naimputer import missing_map
from autoc.outliersdetection import OutliersDetection
from autoc.utils.getdata import get_dataset
from autoc.utils.helpers import cserie
%matplotlib inline
import matplotlib.pyplot as plt
In [2]:
# Loading Titanic dataset
titanic = get_dataset('titanic')
In [3]:
titanic.head()
Out[3]:
The `DataExploration` class is designed to provide helpers for basic data exploration tasks.
In [4]:
# Instantiate the class this way
exploration_titanic = DataExploration(titanic)
In [5]:
# The structure function gives a good summary of important characteristics of the dataset, such as
# missing values, number of unique values, constant columns, column types ...
exploration_titanic.structure()
Out[5]:
In [6]:
# More specific primitives are also available, for example:
exploration_titanic.nacolcount()
Out[6]:
In [7]:
cserie(exploration_titanic.narows_full) # no rows of only missing values
Out[7]:
In [8]:
exploration_titanic.count_unique()
Out[8]:
In [9]:
# More complete numeric summary than describe()
exploration_titanic.numeric_summary() # NOTE(review): presumably covers only the numeric columns — confirm
Out[9]:
In [10]:
# Look at quantiles
In [11]:
exploration_titanic.dfquantiles(nb_quantiles=10)
Out[11]:
In [12]:
# Print consistency infos
# This function helps you track potential consistency errors in the dataset,
# like duplicate columns, constant columns, fully missing rows, fully missing columns.
exploration_titanic.print_infos('consistency', print_empty=False)
In [13]:
# Nearzerovariance function inspired from caret
exploration_titanic.nearzerovar()
Out[13]:
In [14]:
# Find highly correlated columns
exploration_titanic.findcorr() # no highly correlated numeric columns
Out[14]:
In [15]:
exploration_titanic.findupcol()
# no duplicated cols
Out[15]:
In [16]:
# Re-check for duplicated rows using plain pandas (sum of the flags returned by duplicated())
titanic.duplicated().sum()
Out[16]:
`OutliersDetection` is a simple class for detecting one-dimensional outliers.
In [17]:
outlier_detection = OutliersDetection(titanic)
In [18]:
outlier_detection.basic_cutoff
Out[18]:
In [19]:
outlier_detection.strong_cutoff
Out[19]:
In [20]:
# Run the 1-d outlier detection on the 'fare' column twice:
# once with the basic cutoff parameters and once with the strong ones.
soft_outliers_fare = outlier_detection.outlier_detection_serie_1d(
    'fare',
    cutoff_params=outlier_detection.basic_cutoff,
)
strong_outliers_fare = outlier_detection.outlier_detection_serie_1d(
    'fare',
    cutoff_params=outlier_detection.strong_cutoff,
)
In [21]:
# Selector for the rows flagged as outliers (is_outlier == 1) under the strong cutoff;
# it is passed to titanic.loc in a later cell.
# NOTE(review): presumably a boolean Series aligned with titanic's index — confirm
index_strong_outliers = (strong_outliers_fare.is_outlier == 1)
In [22]:
titanic.fare.describe()
Out[22]:
In [23]:
# a lot of outliers because distribution is lognormal
titanic.loc[index_strong_outliers, :].head()
Out[23]:
In [24]:
titanic.fare.hist()
Out[24]:
In [25]:
outlier_detection.outlier_detection_1d(cutoff_params=outlier_detection.basic_cutoff).head(20)
Out[25]:
In [26]:
# Set up the preprocessing helper; copy=True is requested so that it
# works on its own copy of the data, which the id() comparison below illustrates.
preprocessor = PreProcessor(titanic, copy=True)
print(
    "We made a copy so id titanic : {} different from id preprocessor.data {}".format(
        id(titanic),
        id(preprocessor.data),
    )
)
In [27]:
# using infos consistency from DataExploration
preprocessor.print_infos('consistency')
In [28]:
# Basic cleaning: deletes constant columns
# NOTE(review): a later comment in this notebook mentions duplicated columns being removed —
# confirm exactly what basic_cleaning() drops
titanic_clean = preprocessor.basic_cleaning()
In [32]:
titanic_clean.shape # shape after basic_cleaning; duplicated columns presumably removed — TODO confirm
Out[32]:
In [33]:
titanic.shape
Out[33]:
In [29]:
preprocessor.infer_subtypes() # this function tries to identify different subtypes of data
Out[29]:
In [30]:
preprocessor.subtypes
Out[30]:
This is a dataset of Airbnb users (the file used here is train_users_2.csv from the Airbnb Kaggle competition).
In [34]:
df_airbnb = get_dataset('airbnb_users')
In [35]:
exploration_airbnb = DataExploration(df_airbnb)
In [36]:
exploration_airbnb.print_infos('consistency')
In [37]:
exploration_airbnb.structure()
Out[37]:
In [38]:
exploration_airbnb.sign_summary() # Get sign summary (e.g. to look for NA values encoded as -1)
Out[38]:
In [39]:
airbnb_od = OutliersDetection(df_airbnb)
In [40]:
# OutliersDetection is a subclass of DataExploration
airbnb_od.structure()
Out[40]:
In [41]:
airbnb_od.numeric_summary() # numeric summary, called here on the OutliersDetection instance
Out[41]:
In [42]:
airbnb_od.strong_cutoff
Out[42]:
In [43]:
outliers_age = airbnb_od.outlier_detection_serie_1d('age', cutoff_params=airbnb_od.strong_cutoff)
outliers_age.head(10)
Out[43]:
In [44]:
# Report the sum of the is_outlier flags obtained with the strong cutoff
print(
    "nb strong outliers : {}".format(
        outliers_age.is_outlier.sum()
    )
)
In [45]:
index_outliers_age = cserie(outliers_age.is_outlier==1, index=True)
In [46]:
df_airbnb.loc[index_outliers_age,:]
Out[46]:
In [47]:
#plt.style.use('ggplot') # ggplot2 style for matplotlib
In [48]:
naimp = NaImputer(df_airbnb)
In [49]:
naimp.data_isna.corr()
Out[49]:
In [50]:
naimp.plot_corrplot_na()
In [51]:
missing_map(df_airbnb, nmax=200)
Out[51]:
In [54]:
naimp.get_isna_ttest('age', type_test='ks')
Out[54]:
In [55]:
naimp.get_isna_ttest('age', type_test='ttest')
Out[55]:
In [57]:
naimp.get_overlapping_matrix()
Out[57]:
In [58]:
naimp.nacolcount()
Out[58]:
In [ ]: