notebook.community

Edit and run



In [28]:

    
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import numpy
from scipy.stats import norm
from sklearn.preprocessing import StandardScaler
import scipy
from scipy import  stats
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline



In [51]:

    
data =pd.read_csv('dataset/house_predictions_kaggle/train.csv')
data.head()









    Out[51]:







  
    
      
      Id
      MSSubClass
      MSZoning
      LotFrontage
      LotArea
      Street
      Alley
      LotShape
      LandContour
      Utilities
      ...
      PoolArea
      PoolQC
      Fence
      MiscFeature
      MiscVal
      MoSold
      YrSold
      SaleType
      SaleCondition
      SalePrice
    
  
  
    
      0
      1
      60
      RL
      65.0
      8450
      Pave
      NaN
      Reg
      Lvl
      AllPub
      ...
      0
      NaN
      NaN
      NaN
      0
      2
      2008
      WD
      Normal
      208500
    
    
      1
      2
      20
      RL
      80.0
      9600
      Pave
      NaN
      Reg
      Lvl
      AllPub
      ...
      0
      NaN
      NaN
      NaN
      0
      5
      2007
      WD
      Normal
      181500
    
    
      2
      3
      60
      RL
      68.0
      11250
      Pave
      NaN
      IR1
      Lvl
      AllPub
      ...
      0
      NaN
      NaN
      NaN
      0
      9
      2008
      WD
      Normal
      223500
    
    
      3
      4
      70
      RL
      60.0
      9550
      Pave
      NaN
      IR1
      Lvl
      AllPub
      ...
      0
      NaN
      NaN
      NaN
      0
      2
      2006
      WD
      Abnorml
      140000
    
    
      4
      5
      60
      RL
      84.0
      14260
      Pave
      NaN
      IR1
      Lvl
      AllPub
      ...
      0
      NaN
      NaN
      NaN
      0
      12
      2008
      WD
      Normal
      250000
    
  

5 rows × 81 columns



In [10]:

    
data.columns









    Out[10]:





Index([u'Id', u'MSSubClass', u'MSZoning', u'LotFrontage', u'LotArea',
       u'Street', u'Alley', u'LotShape', u'LandContour', u'Utilities',
       u'LotConfig', u'LandSlope', u'Neighborhood', u'Condition1',
       u'Condition2', u'BldgType', u'HouseStyle', u'OverallQual',
       u'OverallCond', u'YearBuilt', u'YearRemodAdd', u'RoofStyle',
       u'RoofMatl', u'Exterior1st', u'Exterior2nd', u'MasVnrType',
       u'MasVnrArea', u'ExterQual', u'ExterCond', u'Foundation', u'BsmtQual',
       u'BsmtCond', u'BsmtExposure', u'BsmtFinType1', u'BsmtFinSF1',
       u'BsmtFinType2', u'BsmtFinSF2', u'BsmtUnfSF', u'TotalBsmtSF',
       u'Heating', u'HeatingQC', u'CentralAir', u'Electrical', u'1stFlrSF',
       u'2ndFlrSF', u'LowQualFinSF', u'GrLivArea', u'BsmtFullBath',
       u'BsmtHalfBath', u'FullBath', u'HalfBath', u'BedroomAbvGr',
       u'KitchenAbvGr', u'KitchenQual', u'TotRmsAbvGrd', u'Functional',
       u'Fireplaces', u'FireplaceQu', u'GarageType', u'GarageYrBlt',
       u'GarageFinish', u'GarageCars', u'GarageArea', u'GarageQual',
       u'GarageCond', u'PavedDrive', u'WoodDeckSF', u'OpenPorchSF',
       u'EnclosedPorch', u'3SsnPorch', u'ScreenPorch', u'PoolArea', u'PoolQC',
       u'Fence', u'MiscFeature', u'MiscVal', u'MoSold', u'YrSold', u'SaleType',
       u'SaleCondition', u'SalePrice'],
      dtype='object')



In [12]:

    
data['SalePrice'].describe()









    Out[12]:





count      1460.000000
mean     180921.195890
std       79442.502883
min       34900.000000
25%      129975.000000
50%      163000.000000
75%      214000.000000
max      755000.000000
Name: SalePrice, dtype: float64



In [19]:

    
sns.distplot(data['SalePrice'])
plt.show()



In [35]:

    
print("Skewness: ",data['SalePrice'].skew())
print("Skewness: ",data['SalePrice'].kurt())









    



('Skewness: ', 1.8828757597682129)
('Skewness: ', 6.5362818600645287)



In [56]:

    
from keras.models import Sequential
from keras.layers import  Dense,Flatten,activations
from sklearn.preprocessing import  StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction import DictVectorizer as DV



In [80]:

    
data_num = data._get_numeric_data()
data_cat = data.select_dtypes(include=["object"]).copy()
print(data.columns.size)
print(data_num.columns.size)
print(data_cat.columns.size)
print(data.shape)
data_cat = data_cat.fillna("zero",axis=1)



In [81]:

    
data_cat[data_cat.isnull().any(axis=1)]
#data_cat = data_cat.drop('Alley',axis=1)
data_cat[data_cat.isnull().any(axis=1)]









    Out[81]:







  
    
      
      MSZoning
      Street
      Alley
      LotShape
      LandContour
      Utilities
      LotConfig
      LandSlope
      Neighborhood
      Condition1
      ...
      GarageType
      GarageFinish
      GarageQual
      GarageCond
      PavedDrive
      PoolQC
      Fence
      MiscFeature
      SaleType
      SaleCondition
    
  
  
  

0 rows × 43 columns



In [106]:

    
tot=0
for i in data_cat.columns.tolist():
    j = len(data_cat[i].value_counts())
    tot += j
print(tot)



In [105]:

    
len(data_cat['MSZoning'].value_counts())









    Out[105]:





5



In [58]:

    
#vectoizing the categorical values
vect = DV(sparse=False)
vect_values = vect.fit_transform(data_cat.values)
##one hot encoding the categorical values

enc = OneHotEncoder()
enc.fit(data_cat.values)









    



---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-58-01e9bdd7e847> in <module>()
      1 #vectoizing the categorical values
      2 vect = DV(sparse=False)
----> 3 vect_values = vect.fit_transform(data_cat.values)
      4 ##one hot encoding the categorical values
      5 

/home/surya/anaconda2/lib/python2.7/site-packages/sklearn/feature_extraction/dict_vectorizer.pyc in fit_transform(self, X, y)
    229             Feature vectors; always 2-d.
    230         """
--> 231         return self._transform(X, fitting=True)
    232 
    233     def inverse_transform(self, X, dict_type=dict):

/home/surya/anaconda2/lib/python2.7/site-packages/sklearn/feature_extraction/dict_vectorizer.pyc in _transform(self, X, fitting)
    165         # same time
    166         for x in X:
--> 167             for f, v in six.iteritems(x):
    168                 if isinstance(v, six.string_types):
    169                     f = "%s%s%s" % (f, self.separator, v)

/home/surya/anaconda2/lib/python2.7/site-packages/sklearn/externals/six.pyc in iteritems(d, **kw)
    437 def iteritems(d, **kw):
    438     """Return an iterator over the (key, value) pairs of a dictionary."""
--> 439     return iter(getattr(d, _iteritems)(**kw))
    440 
    441 def iterlists(d, **kw):

AttributeError: 'numpy.ndarray' object has no attribute 'iteritems'



In [37]:

    
model = Sequential()
model.add(Dense(79))

	Id	MSSubClass	MSZoning	LotFrontage	LotArea	Street	Alley	LotShape	LandContour	Utilities	...	PoolQC	Fence	MiscFeature	MoSold	YrSold	SaleType	SaleCondition	SalePrice
0	1	60	RL	65.0	8450	Pave	NaN	Reg	Lvl	AllPub	...	NaN	NaN	NaN	2	2008	WD	Normal	208500
1	2	20	RL	80.0	9600	Pave	NaN	Reg	Lvl	AllPub	...	NaN	NaN	NaN	5	2007	WD	Normal	181500
2	3	60	RL	68.0	11250	Pave	NaN	IR1	Lvl	AllPub	...	NaN	NaN	NaN	9	2008	WD	Normal	223500
3	4	70	RL	60.0	9550	Pave	NaN	IR1	Lvl	AllPub	...	NaN	NaN	NaN	2	2006	WD	Abnorml	140000
4	5	60	RL	84.0	14260	Pave	NaN	IR1	Lvl	AllPub	...	NaN	NaN	NaN	12	2008	WD	Normal	250000