In [1]:
import os

import matplotlib
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

from autoc.explorer import cserie,DataExploration
from autoc.utils.coorplot import plot_corrmatrix
# Notebook-wide plotting configuration.
# Use matplotlib's built-in 'ggplot' style sheet for every figure.
matplotlib.style.use('ggplot')
# Render figures inline, directly below the cell that produces them.
%matplotlib inline
# Default figure size (width, height), set after the inline backend is
# activated. NOTE(review): presumably ordered this way so the backend does not
# reset the value -- confirm before reordering these lines.
plt.rcParams['figure.figsize'] = (12.0, 8)
You can find the Kaggle competition (Springleaf Marketing Response) here: https://www.kaggle.com/c/springleaf-marketing-response
In [2]:
# Location of the Springleaf training CSV.
# Set the SPRINGLEAF_TRAIN_CSV environment variable to point at your own copy;
# when it is unset the original author's local path is used as the default, so
# existing setups keep working unchanged.
path_to_data = os.environ.get(
    'SPRINGLEAF_TRAIN_CSV',
    '/Users/ericfourrier/Documents/Data/SpringLeaf/train.csv',
)
In [3]:
# Load the whole training set into memory with pandas' default parser options.
df = pd.read_csv(filepath_or_buffer=path_to_data)
In [4]:
# Wrap the raw frame in autoc's DataExploration helper. The cells below use its
# nacolcount / narowcount / structure / manymissing methods and its `data`
# attribute.
exploration = DataExploration(df)
In [5]:
# Missing-value counts along both axes.
# Only the last expression of a cell is rendered, so the per-row table is what
# appears in the output; the per-column call is still executed.
# NOTE(review): later cells read exploration._nacolcount / _narowcount, which
# are presumably populated by these two calls -- confirm against autoc.
col_na_counts = exploration.nacolcount()  # columns
row_na_counts = exploration.narowcount()  # rows
row_na_counts
Out[5]:
In [6]:
# Columns that are entirely missing (NA percentage equal to 1).
fully_missing_col_mask = exploration._nacolcount.Napercentage == 1
na_cols = cserie(fully_missing_col_mask)
# Show the selected columns of the original frame.
df.loc[:, na_cols]
Out[6]:
In [7]:
# Rows that are entirely missing (NA percentage equal to 1).
fully_missing_row_mask = exploration._narowcount.Napercentage == 1
na_rows = cserie(fully_missing_row_mask)
# Original author's observation: no full missing rows.
na_rows
Out[7]:
In [8]:
# Looking at the full structure of the dataset.
# The result is reused by the following cells, which read its `na_columns`,
# `constant_columns` and `perc_missing` attributes.
# NOTE(review): presumably a per-column summary table -- confirm against autoc.
df_infos = exploration.structure()
In [9]:
# Completely-NA columns, this time taken from the structure summary.
na_column_flags = df_infos.na_columns
na_cols2 = cserie(na_column_flags)
na_cols2
Out[9]:
In [10]:
# Constant columns, taken from the structure summary.
constant_column_flags = df_infos.constant_columns
constant_cols = cserie(constant_column_flags)
constant_cols
Out[10]:
In [11]:
# Distribution of the per-row missing-value percentage.
row_na_pct = exploration._narowcount.Napercentage
# The same cutoff serves as the histogram's upper x-limit and as the threshold
# for the printed fraction.
na_pct_cutoff = 0.0003

# Fraction of rows whose NA percentage lies above the cutoff.
# The division must happen inside print(): under Python 3 the previous form
# `print(...)/df.shape[0]` divided print's return value (None) by an int and
# raised TypeError after printing the raw count.
print(float((row_na_pct > na_pct_cutoff).sum()) / df.shape[0])

# Histogram of the per-row NA percentage, restricted to [0, cutoff].
row_na_pct.plot(kind='hist', bins=200, xlim=(0, na_pct_cutoff))
Out[11]:
In [12]:
# Distribution of the per-column missing percentage, ignoring columns whose
# missing percentage is exactly 0.
col_missing_pct = df_infos.perc_missing
has_missing = col_missing_pct != 0
col_missing_pct[has_missing].hist(bins=100)
Out[12]:
In [13]:
# Columns with many missing values.
# Per the original note, a=0.7 selects variables with more than 70 % missing.
missing_fraction_threshold = 0.7
exploration.manymissing(a=missing_fraction_threshold)
Out[13]:
In [14]:
# Same per-column histogram, restricted to columns whose missing percentage is
# above 0.01.
col_missing_pct_all = df_infos.perc_missing
above_cutoff = col_missing_pct_all > 0.01
col_missing_pct_all[above_cutoff].hist(bins=100)
Out[14]:
In [15]:
# Display the structure summary returned by exploration.structure().
df_infos
Out[15]:
In [21]:
# Correlation matrix of the first 100 columns of the explored data.
# A positional slice takes "up to 100" columns, so the cell also works on
# frames that have fewer columns (indexing with range(0, 100) raised
# IndexError in that case).
reduced_data = exploration.data.iloc[:, :100]

# Restrict to numeric/bool columns before correlating: older pandas dropped
# non-numeric columns silently, whereas pandas >= 2.0 raises on them.
corr_matrix = reduced_data.select_dtypes(include=['number', 'bool']).corr()

# Open a large figure before plotting, as the original cell did.
# NOTE(review): whether plot_corrmatrix draws on the current figure or creates
# its own is not visible from this notebook -- confirm against autoc.
plt.figure(figsize=(30, 15))
plot_corrmatrix(corr_matrix, size=0.4)
In [ ]: