In [1]:
# pandas
import pandas as pd
from pandas import Series,DataFrame

import math

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.mlab as mlab
from scipy import stats
from scipy.stats import norm
%matplotlib inline

In [2]:
# Load the data
df_train = pd.read_csv('../data/train.csv')

# Replace -1 with NaN, as they are missing values
df_nan = df_train.replace(-1, np.NaN)

In [3]:
nbr_missing = df_nan.isnull().sum().sort_values(ascending=False)
# Percent missing values
print(nbr_missing[nbr_missing > 0] / df_nan.shape[0] * 100)


ps_car_03_cat    69.089837
ps_car_05_cat    44.782531
ps_reg_03        18.106490
ps_car_14         7.160474
ps_car_07_cat     1.930237
ps_ind_05_cat     0.975955
ps_car_09_cat     0.095596
ps_ind_02_cat     0.036290
ps_car_01_cat     0.017977
ps_ind_04_cat     0.013945
ps_car_02_cat     0.000840
ps_car_11         0.000840
ps_car_12         0.000168
dtype: float64

In [4]:
# Chosen to drop all columns that have more than 10 % missing (and also id as it will probably not be very useful)
drop_cols = ['ps_car_03_cat', 'ps_car_05_cat', 'ps_reg_03', 'id']

# Taking easy way out by dropping all the rows with missing values. 
# Probably not very good as we lose about 50000 (10% data) entries
df_dropped = df_nan.drop(drop_cols, axis=1).dropna()

In [5]:
bob = df_dropped[['target', 'ps_car_06_cat']].groupby(['ps_car_06_cat'],as_index=False).sum()

sns.barplot(x='ps_car_06_cat', y='target', data = bob)


Out[5]:
<matplotlib.axes._subplots.AxesSubplot at 0x1fe862ff0f0>

Analyse categorical data


In [6]:
cate_cols = [col for col in list(df_dropped) if col.endswith('_cat')]
df_cate = df_dropped[cate_cols]

# Check if there are any non int values in the categories
for col in cate_cols:
    temp = df_cate[col]
    temp = temp[temp % 1 != 0]
    if temp.size == 0:
        df_cate[col] = df_cate[col].astype('int')
# No ints, so change all the columns into int type
df_cate.describe()


E:\Programs\Anaconda\envs\ml\lib\site-packages\ipykernel_launcher.py:9: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
Out[6]:
ps_ind_02_cat ps_ind_04_cat ps_ind_05_cat ps_car_01_cat ps_car_02_cat ps_car_04_cat ps_car_06_cat ps_car_07_cat ps_car_08_cat ps_car_09_cat ps_car_10_cat ps_car_11_cat
count 541860.000000 541860.000000 541860.000000 541860.000000 541860.000000 541860.000000 541860.000000 541860.000000 541860.000000 541860.000000 541860.000000 541860.000000
mean 1.355529 0.418163 0.417892 8.256677 0.835694 0.670930 6.579528 0.947128 0.852108 1.313926 0.992024 61.908004
std 0.664114 0.493258 1.348684 2.511052 0.370554 2.058453 5.465019 0.223777 0.354994 0.981970 0.092353 32.856706
min 1.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 1.000000
25% 1.000000 0.000000 0.000000 7.000000 1.000000 0.000000 1.000000 1.000000 1.000000 0.000000 1.000000 32.000000
50% 1.000000 0.000000 0.000000 7.000000 1.000000 0.000000 7.000000 1.000000 1.000000 2.000000 1.000000 64.000000
75% 2.000000 1.000000 0.000000 11.000000 1.000000 0.000000 11.000000 1.000000 1.000000 2.000000 1.000000 92.000000
max 4.000000 1.000000 6.000000 11.000000 1.000000 9.000000 17.000000 1.000000 1.000000 4.000000 2.000000 104.000000

In [20]:
for col in cate_cols:
    plot = sns.FacetGrid(df_dropped, col='target')
    plot.map(plt.hist, col, bins=20, normed=True)



In [29]:
for col in cate_cols:
    bob = df_dropped[[col, 'target']].groupby([col], as_index=False).count().sort_values(by='target', ascending=False)
    display(df_dropped[[col, 'target']].groupby([col], as_index=False).count().sort_values(by='target', ascending=False) / df_dropped.shape[0])


ps_ind_02_cat target
0 0.000002 0.731060
1 0.000004 0.201722
2 0.000006 0.047848
3 0.000007 0.019370
ps_ind_04_cat target
0 0.000000 0.581837
1 0.000002 0.418163
ps_ind_05_cat target
0 0.000000 0.896241
6 0.000011 0.034972
4 0.000007 0.031150
1 0.000002 0.014022
3 0.000006 0.013874
2 0.000004 0.006963
5 0.000009 0.002777
ps_car_01_cat target
11 0.000020 0.341059
7 0.000013 0.302822
6 0.000011 0.109927
10 0.000018 0.085690
4 0.000007 0.045789
9 0.000017 0.032361
5 0.000009 0.028950
8 0.000015 0.025763
3 0.000006 0.011320
0 0.000000 0.010440
2 0.000004 0.003721
1 0.000002 0.002159
ps_car_02_cat target
1 0.000002 0.835694
0 0.000000 0.164306
ps_car_04_cat target
0 0.000000 0.842858
1 0.000002 0.048708
2 0.000004 0.042965
8 0.000015 0.034096
9 0.000017 0.026584
6 0.000011 0.002469
3 0.000006 0.001165
5 0.000009 0.000613
4 0.000007 0.000301
7 0.000013 0.000242
ps_car_06_cat target
11 0.000020 0.228923
1 0.000002 0.193749
0 0.000000 0.186410
14 0.000026 0.097503
10 0.000018 0.061073
4 0.000007 0.055852
15 0.000028 0.035404
6 0.000011 0.031966
7 0.000013 0.028996
9 0.000017 0.028552
3 0.000006 0.019577
13 0.000024 0.010073
16 0.000030 0.007788
17 0.000031 0.005802
12 0.000022 0.004346
5 0.000009 0.001714
8 0.000015 0.001244
2 0.000004 0.001028
ps_car_07_cat target
1 0.000002 0.947128
0 0.000000 0.052872
ps_car_08_cat target
1 0.000002 0.852108
0 0.000000 0.147892
ps_car_09_cat target
2 0.000004 0.589095
0 0.000000 0.336554
1 0.000002 0.046058
3 0.000006 0.023495
4 0.000007 0.004798
ps_car_10_cat target
1 0.000002 0.991407
0 0.000000 0.008284
2 0.000004 0.000308
ps_car_11_cat target
101 0.000192 0.131623
100 0.000190 0.043747
62 0.000118 0.039853
84 0.000161 0.031026
26 0.000052 0.022844
4 0.000009 0.022545
30 0.000059 0.021948
96 0.000183 0.021747
63 0.000120 0.019898
80 0.000153 0.017440
79 0.000151 0.017294
36 0.000070 0.016822
9 0.000018 0.016043
66 0.000125 0.015404
21 0.000041 0.015177
58 0.000111 0.014378
68 0.000129 0.013568
98 0.000186 0.013371
11 0.000022 0.013293
76 0.000144 0.013280
65 0.000124 0.012957
15 0.000030 0.012488
10 0.000020 0.012300
89 0.000170 0.011754
82 0.000157 0.011606
37 0.000072 0.011091
49 0.000094 0.011042
25 0.000050 0.010789
46 0.000089 0.010676
6 0.000013 0.010399
... ... ...
8 0.000017 0.004060
93 0.000177 0.003816
77 0.000146 0.003805
45 0.000087 0.003743
14 0.000028 0.003460
71 0.000135 0.003390
69 0.000131 0.003370
48 0.000092 0.003353
19 0.000037 0.003340
88 0.000168 0.003283
57 0.000109 0.003231
43 0.000083 0.003110
22 0.000042 0.003071
73 0.000138 0.003054
52 0.000100 0.002969
83 0.000159 0.002966
75 0.000142 0.002925
3 0.000007 0.002698
78 0.000149 0.002635
33 0.000065 0.002526
1 0.000004 0.002501
54 0.000103 0.002335
56 0.000107 0.002047
99 0.000188 0.001746
17 0.000033 0.001532
31 0.000061 0.001421
94 0.000179 0.001262
67 0.000127 0.001105
61 0.000116 0.000912
7 0.000015 0.000910

102 rows × 2 columns


In [32]:
#correlation matrix
corrmat = df_cate.corr()
f, ax = plt.subplots(figsize=(12, 9))
sns.heatmap(corrmat, square=True);
print(bob.info())
display(df_cate.info())


<class 'pandas.core.frame.DataFrame'>
Int64Index: 541860 entries, 0 to 595211
Data columns (total 12 columns):
ps_ind_02_cat    541860 non-null int32
ps_ind_04_cat    541860 non-null int32
ps_ind_05_cat    541860 non-null int32
ps_car_01_cat    541860 non-null int32
ps_car_02_cat    541860 non-null int32
ps_car_04_cat    541860 non-null int32
ps_car_06_cat    541860 non-null int32
ps_car_07_cat    541860 non-null int32
ps_car_08_cat    541860 non-null int32
ps_car_09_cat    541860 non-null int32
ps_car_10_cat    541860 non-null int32
ps_car_11_cat    541860 non-null int32
dtypes: int32(12)
memory usage: 48.9 MB
None
<class 'pandas.core.frame.DataFrame'>
Int64Index: 541860 entries, 0 to 595211
Data columns (total 12 columns):
ps_ind_02_cat    541860 non-null int32
ps_ind_04_cat    541860 non-null int32
ps_ind_05_cat    541860 non-null int32
ps_car_01_cat    541860 non-null int32
ps_car_02_cat    541860 non-null int32
ps_car_04_cat    541860 non-null int32
ps_car_06_cat    541860 non-null int32
ps_car_07_cat    541860 non-null int32
ps_car_08_cat    541860 non-null int32
ps_car_09_cat    541860 non-null int32
ps_car_10_cat    541860 non-null int32
ps_car_11_cat    541860 non-null int32
dtypes: int32(12)
memory usage: 48.9 MB
None

In [ ]:
#saleprice correlation matrix
k = 10 #number of variables for heatmap (including saleprice)
cols = corrmat.nlargest(k, 'SalePrice')['SalePrice'].index
cm = np.corrcoef(df_train[cols].values.T)
sns.set(font_scale=1.25)
hm = sns.heatmap(cm, cbar=True, annot=True, square=True, fmt='.2f', annot_kws={'size': 10}, yticklabels=cols.values, xticklabels=cols.values)
plt.show()