In [28]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import numpy
from scipy.stats import norm
from sklearn.preprocessing import StandardScaler
import scipy
from scipy import  stats
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

In [51]:
data =pd.read_csv('dataset/house_predictions_kaggle/train.csv')
data.head()


Out[51]:
Id MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape LandContour Utilities ... PoolArea PoolQC Fence MiscFeature MiscVal MoSold YrSold SaleType SaleCondition SalePrice
0 1 60 RL 65.0 8450 Pave NaN Reg Lvl AllPub ... 0 NaN NaN NaN 0 2 2008 WD Normal 208500
1 2 20 RL 80.0 9600 Pave NaN Reg Lvl AllPub ... 0 NaN NaN NaN 0 5 2007 WD Normal 181500
2 3 60 RL 68.0 11250 Pave NaN IR1 Lvl AllPub ... 0 NaN NaN NaN 0 9 2008 WD Normal 223500
3 4 70 RL 60.0 9550 Pave NaN IR1 Lvl AllPub ... 0 NaN NaN NaN 0 2 2006 WD Abnorml 140000
4 5 60 RL 84.0 14260 Pave NaN IR1 Lvl AllPub ... 0 NaN NaN NaN 0 12 2008 WD Normal 250000

5 rows × 81 columns


In [10]:
data.columns


Out[10]:
Index([u'Id', u'MSSubClass', u'MSZoning', u'LotFrontage', u'LotArea',
       u'Street', u'Alley', u'LotShape', u'LandContour', u'Utilities',
       u'LotConfig', u'LandSlope', u'Neighborhood', u'Condition1',
       u'Condition2', u'BldgType', u'HouseStyle', u'OverallQual',
       u'OverallCond', u'YearBuilt', u'YearRemodAdd', u'RoofStyle',
       u'RoofMatl', u'Exterior1st', u'Exterior2nd', u'MasVnrType',
       u'MasVnrArea', u'ExterQual', u'ExterCond', u'Foundation', u'BsmtQual',
       u'BsmtCond', u'BsmtExposure', u'BsmtFinType1', u'BsmtFinSF1',
       u'BsmtFinType2', u'BsmtFinSF2', u'BsmtUnfSF', u'TotalBsmtSF',
       u'Heating', u'HeatingQC', u'CentralAir', u'Electrical', u'1stFlrSF',
       u'2ndFlrSF', u'LowQualFinSF', u'GrLivArea', u'BsmtFullBath',
       u'BsmtHalfBath', u'FullBath', u'HalfBath', u'BedroomAbvGr',
       u'KitchenAbvGr', u'KitchenQual', u'TotRmsAbvGrd', u'Functional',
       u'Fireplaces', u'FireplaceQu', u'GarageType', u'GarageYrBlt',
       u'GarageFinish', u'GarageCars', u'GarageArea', u'GarageQual',
       u'GarageCond', u'PavedDrive', u'WoodDeckSF', u'OpenPorchSF',
       u'EnclosedPorch', u'3SsnPorch', u'ScreenPorch', u'PoolArea', u'PoolQC',
       u'Fence', u'MiscFeature', u'MiscVal', u'MoSold', u'YrSold', u'SaleType',
       u'SaleCondition', u'SalePrice'],
      dtype='object')

In [12]:
data['SalePrice'].describe()


Out[12]:
count      1460.000000
mean     180921.195890
std       79442.502883
min       34900.000000
25%      129975.000000
50%      163000.000000
75%      214000.000000
max      755000.000000
Name: SalePrice, dtype: float64

In [19]:
sns.distplot(data['SalePrice'])
plt.show()



In [35]:
print("Skewness: ",data['SalePrice'].skew())
print("Skewness: ",data['SalePrice'].kurt())


('Skewness: ', 1.8828757597682129)
('Skewness: ', 6.5362818600645287)

In [56]:
from keras.models import Sequential
from keras.layers import  Dense,Flatten,activations
from sklearn.preprocessing import  StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction import DictVectorizer as DV

In [80]:
data_num = data._get_numeric_data()
data_cat = data.select_dtypes(include=["object"]).copy()
print(data.columns.size)
print(data_num.columns.size)
print(data_cat.columns.size)
print(data.shape)
data_cat = data_cat.fillna("zero",axis=1)


81
38
43
(1460, 81)

In [81]:
data_cat[data_cat.isnull().any(axis=1)]
#data_cat = data_cat.drop('Alley',axis=1)
data_cat[data_cat.isnull().any(axis=1)]


Out[81]:
MSZoning Street Alley LotShape LandContour Utilities LotConfig LandSlope Neighborhood Condition1 ... GarageType GarageFinish GarageQual GarageCond PavedDrive PoolQC Fence MiscFeature SaleType SaleCondition

0 rows × 43 columns


In [106]:
tot=0
for i in data_cat.columns.tolist():
    j = len(data_cat[i].value_counts())
    tot += j
print(tot)


268

In [105]:
len(data_cat['MSZoning'].value_counts())


Out[105]:
5

In [58]:
#vectoizing the categorical values
vect = DV(sparse=False)
vect_values = vect.fit_transform(data_cat.values)
##one hot encoding the categorical values

enc = OneHotEncoder()
enc.fit(data_cat.values)


---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-58-01e9bdd7e847> in <module>()
      1 #vectoizing the categorical values
      2 vect = DV(sparse=False)
----> 3 vect_values = vect.fit_transform(data_cat.values)
      4 ##one hot encoding the categorical values
      5 

/home/surya/anaconda2/lib/python2.7/site-packages/sklearn/feature_extraction/dict_vectorizer.pyc in fit_transform(self, X, y)
    229             Feature vectors; always 2-d.
    230         """
--> 231         return self._transform(X, fitting=True)
    232 
    233     def inverse_transform(self, X, dict_type=dict):

/home/surya/anaconda2/lib/python2.7/site-packages/sklearn/feature_extraction/dict_vectorizer.pyc in _transform(self, X, fitting)
    165         # same time
    166         for x in X:
--> 167             for f, v in six.iteritems(x):
    168                 if isinstance(v, six.string_types):
    169                     f = "%s%s%s" % (f, self.separator, v)

/home/surya/anaconda2/lib/python2.7/site-packages/sklearn/externals/six.pyc in iteritems(d, **kw)
    437 def iteritems(d, **kw):
    438     """Return an iterator over the (key, value) pairs of a dictionary."""
--> 439     return iter(getattr(d, _iteritems)(**kw))
    440 
    441 def iterlists(d, **kw):

AttributeError: 'numpy.ndarray' object has no attribute 'iteritems'

In [37]:
model = Sequential()
model.add(Dense(79))