In [1]:
# pandas
import pandas as pd
from pandas import Series,DataFrame
import math
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.mlab as mlab
from scipy import stats
from scipy.stats import norm
%matplotlib inline
In [2]:
# Load the raw training data
df_train = pd.read_csv('../data/train.csv')
# Replace -1 with NaN, as they are missing values; this lets pandas'
# missing-data tools (isnull, dropna, ...) recognise them.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
df_nan = df_train.replace(-1, np.nan)
In [3]:
# Number of NaN entries per column, largest count first
nbr_missing = (
    df_nan
    .isnull()
    .sum()
    .sort_values(ascending=False)
)
# Percentage of rows that are missing, shown only for columns with at least one gap
print(nbr_missing.loc[nbr_missing.gt(0)].div(len(df_nan)).mul(100))
In [4]:
# Columns removed up front: those with more than 10 % missing values,
# plus id (it will probably not be very useful).
drop_cols = ['ps_car_03_cat', 'ps_car_05_cat', 'ps_reg_03', 'id']
# Easy way out for the remaining gaps: discard every row that still has a
# missing value. Probably not very good, as about 50000 entries (10% of the
# data) are lost.
df_dropped = (
    df_nan
    .drop(labels=drop_cols, axis='columns')
    .dropna(axis='index', how='any')
)
In [5]:
# Sum of `target` within each ps_car_06_cat category, with the category
# kept as a regular column (as_index=False) so it can be used as the x axis
bob = df_dropped.groupby('ps_car_06_cat', as_index=False)[['target']].sum()
# One bar per category; bar height is the summed target
sns.barplot(data=bob, x='ps_car_06_cat', y='target')
Out[5]:
In [6]:
# Names of all categorical columns (identified by the '_cat' suffix)
cate_cols = [col for col in df_dropped.columns if col.endswith('_cat')]
# Work on an independent copy: assigning into a plain slice of df_dropped
# triggers SettingWithCopyWarning and may not behave as intended.
df_cate = df_dropped[cate_cols].copy()
# Cast a column to int only when every value in it is whole-numbered;
# columns that contain a fractional value are left untouched.
for col in cate_cols:
    has_fraction = (df_cate[col] % 1 != 0).any()
    if not has_fraction:
        df_cate[col] = df_cate[col].astype('int')
df_cate.describe()
Out[6]:
In [20]:
# For every categorical column, draw its histogram side by side for each
# distinct value of `target` (one FacetGrid panel per target value).
for cat_col in cate_cols:
    grid = sns.FacetGrid(df_dropped, col='target')
    # density=True normalises each histogram so panels with different row
    # counts are comparable; it replaces `normed`, which was removed from
    # plt.hist in Matplotlib 3.1.
    grid.map(plt.hist, cat_col, bins=20, density=True)
In [29]:
# For every categorical column, show what fraction of all rows falls into
# each category, most frequent category first.
for col in cate_cols:
    # Row count per category (non-null `target` entries), sorted descending
    bob = (
        df_dropped[[col, 'target']]
        .groupby([col], as_index=False)
        .count()
        .sort_values(by='target', ascending=False)
    )
    # Scale only the count column. Dividing the whole frame would also divide
    # the category labels, because as_index=False keeps them as a column.
    display(bob.assign(target=bob['target'] / df_dropped.shape[0]))
In [32]:
# Correlation matrix of the categorical columns, drawn as a heatmap
corrmat = df_cate.corr()
f, ax = plt.subplots(figsize=(12, 9))
sns.heatmap(corrmat, square=True, ax=ax)
# DataFrame.info() prints its summary itself and returns None, so it is
# called directly; wrapping it in print()/display() only adds a stray "None".
bob.info()
df_cate.info()
In [ ]:
# Heatmap of the k columns most positively correlated with `target`.
# The earlier `corrmat` holds only the '_cat' columns and has no 'SalePrice'
# (or 'target') column, so a matrix that includes `target` is built here from
# the cleaned data.
k = 10  # number of variables for heatmap (including target)
target_corrmat = df_dropped[cate_cols + ['target']].corr()
cols = target_corrmat.nlargest(k, 'target')['target'].index
# Sub-matrix of pairwise correlations restricted to the selected columns
cm = target_corrmat.loc[cols, cols].values
sns.set(font_scale=1.25)
hm = sns.heatmap(cm, cbar=True, annot=True, square=True, fmt='.2f', annot_kws={'size': 10}, yticklabels=cols.values, xticklabels=cols.values)
plt.show()