First, import packages for data cleaning and read the data
In [ ]:
from scipy.stats.mstats import mode
import pandas as pd
import numpy as np
import time
from sklearn.preprocessing import LabelEncoder
"""
Read Data
"""
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')
target = train['SalePrice']
train = train.drop(['SalePrice'],axis=1)
trainlen = train.shape[0]
Combine the train and test set for cleaning
In [ ]:
df1 = train.head()
df2 = test.head()
pd.concat([df1, df2], axis=0, ignore_index=True)
In [ ]:
alldata = pd.concat([train, test], axis=0, join='outer', ignore_index=True)
alldata = alldata.drop(['Id','Utilities'], axis=1)
alldata.dtypes
Convert all ints to floats for XGBoost
In [ ]:
alldata.ix[:,(alldata.dtypes=='int64') & (alldata.columns != 'MSSubClass')]=alldata.ix[:,(alldata.dtypes=='int64') & (alldata.columns!='MSSubClass')].astype('float64')
In [ ]:
alldata.head(20)
Save lightly prepared data (no encoding)
In [ ]:
train = alldata.ix[0:trainlen-1, :]
test = alldata.ix[trainlen:alldata.shape[0],:]
test.to_csv('data/test_prepared_light.csv', index=False)
train.to_csv('data/train_prepared_light.csv', index=False)
Dealing with the NA values in the variables, some of them equal to 0 and some equal to median, based on the txt descriptions
In [ ]:
fMedlist=['LotFrontage']
fArealist=['MasVnrArea','TotalBsmtSF','BsmtFinSF1','BsmtFinSF2','BsmtUnfSF','BsmtFullBath', 'BsmtHalfBath','MasVnrArea','Fireplaces','GarageArea','GarageYrBlt','GarageCars']
for i in fArealist:
alldata.ix[pd.isnull(alldata.ix[:,i]),i] = 0
for i in fMedlist:
alldata.ix[pd.isnull(alldata.ix[:,i]),i] = np.nanmedian(alldata.ix[:,i])
Transforming Data Use integers to encode categorical data.
In [ ]:
alldata.head(20)
Encode categorical data
In [ ]:
le = LabelEncoder()
nacount_category = np.array(alldata.columns[((alldata.dtypes=='int64') | (alldata.dtypes=='object')) & (pd.isnull(alldata).sum()>0)])
category = np.array(alldata.columns[((alldata.dtypes=='int64') | (alldata.dtypes=='object'))])
Bsmtset = set(['BsmtQual','BsmtCond','BsmtExposure','BsmtFinType1','BsmtFinType2'])
MasVnrset = set(['MasVnrType'])
Garageset = set(['GarageType','GarageYrBlt','GarageFinish','GarageQual','GarageCond'])
Fireplaceset = set(['FireplaceQu'])
Poolset = set(['PoolQC'])
NAset = set(['Fence','MiscFeature','Alley'])
# Put 0 and null values in the same category
for i in nacount_category:
if i in Bsmtset:
alldata.ix[pd.isnull(alldata.ix[:,i]) & (alldata['TotalBsmtSF']==0), i]='Empty'
alldata.ix[pd.isnull(alldata.ix[:,i]), i] = alldata.ix[:,i].value_counts().index[0]
elif i in MasVnrset:
alldata.ix[pd.isnull(alldata.ix[:,i]) & (alldata['MasVnrArea']==0),i]='Empty'
alldata.ix[pd.isnull(alldata.ix[:,i]),i]=alldata.ix[:,i].value_counts().index[0]
elif i in Garageset:
alldata.ix[pd.isnull(alldata.ix[:,i]) & (alldata['GarageArea']==0),i]='Empty'
alldata.ix[pd.isnull(alldata.ix[:,i]),i]=alldata.ix[:,i].value_counts().index[0]
elif i in Fireplaceset:
alldata.ix[pd.isnull(alldata.ix[:,i]) & (alldata['Fireplaces']==0),i]='Empty'
alldata.ix[pd.isnull(alldata.ix[:,i]),i]=alldata.ix[:,i].value_counts().index[0]
elif i in Poolset:
alldata.ix[pd.isnull(alldata.ix[:,i]) & (alldata['PoolArea']==0),i]='Empty'
alldata.ix[pd.isnull(alldata.ix[:,i]),i]=alldata.ix[:,i].value_counts().index[0]
elif i in NAset:
alldata.ix[pd.isnull(alldata.ix[:,i]),i]='Empty'
else:
alldata.ix[pd.isnull(alldata.ix[:,i]),i]=alldata.ix[:,i].value_counts().index[0]
for i in category:
alldata.ix[:,i]=le.fit_transform(alldata.ix[:,i])
train = alldata.ix[0:trainlen-1, :]
test = alldata.ix[trainlen:alldata.shape[0],:]
In [ ]:
alldata.head()
Export data
In [ ]:
train.to_csv('data/train_prepared.csv')
test.to_csv('data/test_prepared.csv')
In [ ]:
train.head()
In [ ]:
target.to_csv('data/train_target.csv', header='SalePrice', index=False)
In [ ]: