Examples of datasets cleaning using the autoc package

Import Packages


In [1]:
import pandas as pd 
import seaborn as sns 
from autoc.explorer import cserie,DataExploration
import matplotlib.pyplot as plt
import matplotlib
matplotlib.style.use('ggplot')
%matplotlib inline
plt.rcParams['figure.figsize'] = (12.0, 8)

Springleaf dataset analysis

you can find the kaggle competition here


In [2]:
path_to_data = '/Users/ericfourrier/Documents/Data/SpringLeaf/train.csv'

In [3]:
df = pd.read_csv(path_to_data)


/Users/ericfourrier/anaconda/lib/python2.7/site-packages/pandas/io/parsers.py:1170: DtypeWarning: Columns (8,9,10,11,12,43,157,196,214,225,228,229,231,235,238) have mixed types. Specify dtype option on import or set low_memory=False.
  data = self._reader.read(nrows)

In [4]:
exploration = DataExploration(df)

Using different methods of the class


In [5]:
# Missing value count

# columns 
exploration.nacolcount()
exploration.narowcount()


Out[5]:
Nanumber Napercentage
0 25 0.000172
1 19 0.000131
2 24 0.000165
3 24 0.000165
4 24 0.000165
5 24 0.000165
6 24 0.000165
7 24 0.000165
8 16 0.000110
9 24 0.000165
10 22 0.000151
11 24 0.000165
12 17 0.000117
13 24 0.000165
14 24 0.000165
15 24 0.000165
16 24 0.000165
17 24 0.000165
18 22 0.000151
19 25 0.000172
20 24 0.000165
21 21 0.000145
22 24 0.000165
23 24 0.000165
24 24 0.000165
25 25 0.000172
26 24 0.000165
27 18 0.000124
28 22 0.000151
29 24 0.000165
... ... ...
145201 24 0.000165
145202 24 0.000165
145203 20 0.000138
145204 24 0.000165
145205 24 0.000165
145206 24 0.000165
145207 24 0.000165
145208 24 0.000165
145209 13 0.000090
145210 24 0.000165
145211 24 0.000165
145212 24 0.000165
145213 19 0.000131
145214 24 0.000165
145215 25 0.000172
145216 22 0.000151
145217 24 0.000165
145218 24 0.000165
145219 22 0.000151
145220 22 0.000151
145221 24 0.000165
145222 18 0.000124
145223 24 0.000165
145224 22 0.000151
145225 22 0.000151
145226 14 0.000096
145227 11 0.000076
145228 13 0.000090
145229 314 0.002162
145230 24 0.000165

145231 rows × 2 columns


In [6]:
# detecting full na columns 
na_cols = cserie(exploration._nacolcount.Napercentage == 1)
df.loc[:,na_cols]


Out[6]:
VAR_0207 VAR_0213 VAR_0840
0 NaN NaN NaN
1 NaN NaN NaN
2 NaN NaN NaN
3 NaN NaN NaN
4 NaN NaN NaN
5 NaN NaN NaN
6 NaN NaN NaN
7 NaN NaN NaN
8 NaN NaN NaN
9 NaN NaN NaN
10 NaN NaN NaN
11 NaN NaN NaN
12 NaN NaN NaN
13 NaN NaN NaN
14 NaN NaN NaN
15 NaN NaN NaN
16 NaN NaN NaN
17 NaN NaN NaN
18 NaN NaN NaN
19 NaN NaN NaN
20 NaN NaN NaN
21 NaN NaN NaN
22 NaN NaN NaN
23 NaN NaN NaN
24 NaN NaN NaN
25 NaN NaN NaN
26 NaN NaN NaN
27 NaN NaN NaN
28 NaN NaN NaN
29 NaN NaN NaN
... ... ... ...
145201 NaN NaN NaN
145202 NaN NaN NaN
145203 NaN NaN NaN
145204 NaN NaN NaN
145205 NaN NaN NaN
145206 NaN NaN NaN
145207 NaN NaN NaN
145208 NaN NaN NaN
145209 NaN NaN NaN
145210 NaN NaN NaN
145211 NaN NaN NaN
145212 NaN NaN NaN
145213 NaN NaN NaN
145214 NaN NaN NaN
145215 NaN NaN NaN
145216 NaN NaN NaN
145217 NaN NaN NaN
145218 NaN NaN NaN
145219 NaN NaN NaN
145220 NaN NaN NaN
145221 NaN NaN NaN
145222 NaN NaN NaN
145223 NaN NaN NaN
145224 NaN NaN NaN
145225 NaN NaN NaN
145226 NaN NaN NaN
145227 NaN NaN NaN
145228 NaN NaN NaN
145229 NaN NaN NaN
145230 NaN NaN NaN

145231 rows × 3 columns


In [7]:
# detecting full na rows
na_rows = cserie(exploration._narowcount.Napercentage == 1)
na_rows # no full missing rows


Out[7]:
[]

Using Structure


In [8]:
# Looking at full structure of the dataset 
df_infos = exploration.structure()

In [9]:
na_cols2 = cserie(df_infos.na_columns) # looking at complete na columns 
na_cols2


Out[9]:
['VAR_0207', 'VAR_0213', 'VAR_0840']

In [10]:
constant_cols = cserie(df_infos.constant_columns) # looking at constant columns
constant_cols


Out[10]:
['VAR_0008',
 'VAR_0009',
 'VAR_0010',
 'VAR_0011',
 'VAR_0012',
 'VAR_0018',
 'VAR_0019',
 'VAR_0020',
 'VAR_0021',
 'VAR_0022',
 'VAR_0023',
 'VAR_0024',
 'VAR_0025',
 'VAR_0026',
 'VAR_0027',
 'VAR_0028',
 'VAR_0029',
 'VAR_0030',
 'VAR_0031',
 'VAR_0032',
 'VAR_0038',
 'VAR_0039',
 'VAR_0040',
 'VAR_0041',
 'VAR_0042',
 'VAR_0043',
 'VAR_0044',
 'VAR_0188',
 'VAR_0189',
 'VAR_0190',
 'VAR_0196',
 'VAR_0197',
 'VAR_0199',
 'VAR_0202',
 'VAR_0203',
 'VAR_0215',
 'VAR_0216',
 'VAR_0221',
 'VAR_0222',
 'VAR_0223',
 'VAR_0229',
 'VAR_0239',
 'VAR_0246',
 'VAR_0394',
 'VAR_0438',
 'VAR_0446',
 'VAR_0527',
 'VAR_0528',
 'VAR_0530',
 'VAR_0847',
 'VAR_1428']

Small study on missing values


In [11]:
# missing values per rows distribution
print(float(sum(exploration._narowcount.Napercentage > 0.0003)))/df.shape[0]
exploration._narowcount.Napercentage.plot(kind = 'hist',bins=200,xlim = (0,0.0003))


0.00658261665898
Out[11]:
<matplotlib.axes._subplots.AxesSubplot at 0x129915450>

In [12]:
# missing values per cols distribution
df_infos.perc_missing[df_infos.perc_missing !=0].hist(bins = 100)


Out[12]:
<matplotlib.axes._subplots.AxesSubplot at 0x1548ec2d0>

In [13]:
# many missing columns 
exploration.manymissing(a=0.7) # more than 70 % missing variables


Out[13]:
['VAR_0156',
 'VAR_0157',
 'VAR_0158',
 'VAR_0159',
 'VAR_0166',
 'VAR_0167',
 'VAR_0168',
 'VAR_0169',
 'VAR_0176',
 'VAR_0177',
 'VAR_0178',
 'VAR_0179',
 'VAR_0205',
 'VAR_0206',
 'VAR_0207',
 'VAR_0208',
 'VAR_0209',
 'VAR_0210',
 'VAR_0211',
 'VAR_0213',
 'VAR_0214',
 'VAR_0840']

In [14]:
# missing values per cols distribution for higher missing percentage 
df_infos.perc_missing[df_infos.perc_missing > 0.01].hist(bins = 100)


Out[14]:
<matplotlib.axes._subplots.AxesSubplot at 0x199361e10>

In [15]:
df_infos


Out[15]:
dtypes_p dtypes_r nb_missing perc_missing nb_unique_values constant_columns na_columns is_key
ID int64 numeric 0 0.000000 145231 False False True
VAR_0001 object factor 0 0.000000 3 False False False
VAR_0002 int64 numeric 0 0.000000 820 False False False
VAR_0003 int64 numeric 0 0.000000 588 False False False
VAR_0004 int64 numeric 0 0.000000 7935 False False False
VAR_0005 object factor 0 0.000000 4 False False False
VAR_0006 float64 numeric 56 0.000386 37 False False False
VAR_0007 float64 numeric 56 0.000386 35 False False False
VAR_0008 object factor 56 0.000386 1 True False False
VAR_0009 object factor 56 0.000386 1 True False False
VAR_0010 object factor 56 0.000386 1 True False False
VAR_0011 object factor 56 0.000386 1 True False False
VAR_0012 object factor 56 0.000386 1 True False False
VAR_0013 float64 numeric 56 0.000386 37 False False False
VAR_0014 float64 numeric 56 0.000386 37 False False False
VAR_0015 float64 numeric 56 0.000386 26 False False False
VAR_0016 float64 numeric 56 0.000386 29 False False False
VAR_0017 float64 numeric 56 0.000386 25 False False False
VAR_0018 float64 numeric 56 0.000386 1 True False False
VAR_0019 float64 numeric 56 0.000386 1 True False False
VAR_0020 float64 numeric 56 0.000386 1 True False False
VAR_0021 float64 numeric 56 0.000386 1 True False False
VAR_0022 float64 numeric 56 0.000386 1 True False False
VAR_0023 float64 numeric 56 0.000386 1 True False False
VAR_0024 float64 numeric 56 0.000386 1 True False False
VAR_0025 float64 numeric 56 0.000386 1 True False False
VAR_0026 float64 numeric 56 0.000386 1 True False False
VAR_0027 float64 numeric 56 0.000386 1 True False False
VAR_0028 float64 numeric 56 0.000386 1 True False False
VAR_0029 float64 numeric 56 0.000386 1 True False False
... ... ... ... ... ... ... ... ...
VAR_1906 int64 numeric 0 0.000000 72 False False False
VAR_1907 int64 numeric 0 0.000000 41 False False False
VAR_1908 int64 numeric 0 0.000000 37 False False False
VAR_1909 int64 numeric 0 0.000000 41 False False False
VAR_1910 int64 numeric 0 0.000000 37 False False False
VAR_1911 int64 numeric 0 0.000000 107 False False False
VAR_1912 int64 numeric 0 0.000000 16370 False False False
VAR_1913 int64 numeric 0 0.000000 25426 False False False
VAR_1914 int64 numeric 0 0.000000 14226 False False False
VAR_1915 int64 numeric 0 0.000000 1148 False False False
VAR_1916 int64 numeric 0 0.000000 8 False False False
VAR_1917 int64 numeric 0 0.000000 10 False False False
VAR_1918 int64 numeric 0 0.000000 86 False False False
VAR_1919 int64 numeric 0 0.000000 383 False False False
VAR_1920 int64 numeric 0 0.000000 22 False False False
VAR_1921 int64 numeric 0 0.000000 18 False False False
VAR_1922 int64 numeric 0 0.000000 6798 False False False
VAR_1923 int64 numeric 0 0.000000 2445 False False False
VAR_1924 int64 numeric 0 0.000000 573 False False False
VAR_1925 int64 numeric 0 0.000000 11 False False False
VAR_1926 int64 numeric 0 0.000000 6 False False False
VAR_1927 int64 numeric 0 0.000000 10 False False False
VAR_1928 int64 numeric 0 0.000000 30 False False False
VAR_1929 int64 numeric 0 0.000000 591 False False False
VAR_1930 int64 numeric 0 0.000000 8 False False False
VAR_1931 int64 numeric 0 0.000000 10 False False False
VAR_1932 int64 numeric 0 0.000000 74 False False False
VAR_1933 int64 numeric 0 0.000000 363 False False False
VAR_1934 object factor 0 0.000000 5 False False False
target int64 numeric 0 0.000000 2 False False False

1934 rows × 8 columns

Explore character variables

Correlation matrix


In [21]:
reduced_data=exploration.data.loc[:,exploration.data.columns[range(0,100)]]
plt.figure(figsize=(30,15))
corr_matrix = reduced_data.corr()
from autoc.utils.coorplot import plot_corrmatrix
plot_corrmatrix(corr_matrix,size=0.4)



In [ ]: