King County House Data


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import utils
import models as ml

from sklearn.model_selection import train_test_split
from dateutil import parser

%matplotlib inline

1) Prepare data


In [3]:
df = pd.read_csv('./data/kc_house_data.csv')
df.head()

# 80/20 split; take copies so the derived-column assignments below do not
# raise SettingWithCopyWarning on views of the original frame
df_train, df_test = train_test_split(df, test_size=0.2)
df_train = df_train.copy()
df_test = df_test.copy()

np_train = np.array(df_train)
np_test = np.array(df_test)

np.savez_compressed('./data/kc_house_data.npz', train=np_train, test=np_test)
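
On later runs the cached split can be loaded straight from the .npz archive instead of re-reading and re-splitting the CSV; a minimal sketch (allow_pickle is likely needed because the mixed-type DataFrame becomes an object array):

# reload the cached split; keys match the savez_compressed call above
cached = np.load('./data/kc_house_data.npz', allow_pickle=True)
np_train, np_test = cached['train'], cached['test']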

In [4]:
df_train.head()


Out[4]:
id date price bedrooms bathrooms sqft_living sqft_lot floors waterfront view ... grade sqft_above sqft_basement yr_built yr_renovated zipcode lat long sqft_living15 sqft_lot15
8309 1426049054 20140701T000000 450000.0 3 1.75 1400 13775 1.0 0 0 ... 8 1400 0 1963 0 98028 47.7413 -122.259 2200 10450
16757 3392100050 20140625T000000 205000.0 3 1.00 1230 8750 1.0 0 0 ... 6 1230 0 1965 0 98003 47.3266 -122.334 1230 8750
18291 7203220300 20140724T000000 895990.0 4 2.75 3555 6565 2.0 0 0 ... 9 3555 0 2014 0 98053 47.6847 -122.017 3625 5637
18029 2558640110 20140514T000000 498000.0 4 2.75 2270 7375 1.0 0 0 ... 7 1290 980 1973 0 98034 47.7222 -122.168 1750 7760
17965 4402700593 20150428T000000 395000.0 2 1.00 1440 7808 1.0 0 0 ... 7 860 580 1949 0 98133 47.7431 -122.336 1550 7682

5 rows × 21 columns


In [5]:
def parse_date(val):
    # extract the sale year from a date string such as '20140701T000000'
    return parser.parse(val).year
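
As a side note, the year could also be extracted without dateutil by letting pandas parse the whole column at once; a vectorized alternative (assuming the dates keep the 20140701T000000 format), while the cells below stick with the row-wise parser:

df_train['yr_sold'] = pd.to_datetime(df_train['date'], format='%Y%m%dT%H%M%S').dt.year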

In [6]:
df_train['yr_sold'] = df_train['date'].apply(parse_date)

In [7]:
df_train.head()


Out[7]:
id date price bedrooms bathrooms sqft_living sqft_lot floors waterfront view ... sqft_above sqft_basement yr_built yr_renovated zipcode lat long sqft_living15 sqft_lot15 yr_sold
8309 1426049054 20140701T000000 450000.0 3 1.75 1400 13775 1.0 0 0 ... 1400 0 1963 0 98028 47.7413 -122.259 2200 10450 2014
16757 3392100050 20140625T000000 205000.0 3 1.00 1230 8750 1.0 0 0 ... 1230 0 1965 0 98003 47.3266 -122.334 1230 8750 2014
18291 7203220300 20140724T000000 895990.0 4 2.75 3555 6565 2.0 0 0 ... 3555 0 2014 0 98053 47.6847 -122.017 3625 5637 2014
18029 2558640110 20140514T000000 498000.0 4 2.75 2270 7375 1.0 0 0 ... 1290 980 1973 0 98034 47.7222 -122.168 1750 7760 2014
17965 4402700593 20150428T000000 395000.0 2 1.00 1440 7808 1.0 0 0 ... 860 580 1949 0 98133 47.7431 -122.336 1550 7682 2015

5 rows × 22 columns


In [8]:
df_train.columns.values


Out[8]:
array(['id', 'date', 'price', 'bedrooms', 'bathrooms', 'sqft_living',
       'sqft_lot', 'floors', 'waterfront', 'view', 'condition', 'grade',
       'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated',
       'zipcode', 'lat', 'long', 'sqft_living15', 'sqft_lot15', 'yr_sold'], dtype=object)

In [9]:
# quick look at sale price against sale year (',' = pixel markers)
plt.plot(df_train['yr_sold'], df_train['price'], 'r,')
plt.show()



In [10]:
# drop 'id' and 'date'; of the remaining columns, index 0 is the target
# (price) and the other 19 numeric columns are the features
train_data = np.array(df_train)[:,2:]
train_X = train_data[:,1:].astype(np.float32)
train_y = train_data[:,0].astype(np.float32)
train_y = train_y.reshape(train_y.shape[0], 1)

In [11]:
stds_X, means_X = ml.std_normalize(train_X)

In [12]:
std_y, mean_y = ml.std_normalize(train_y.reshape(train_y.shape[0], 1))
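
std_normalize lives in the local models module, which is not shown in this notebook. Judging by how it is used (the arrays are not reassigned and only the statistics are kept), it presumably standardizes each column in place and returns the per-column std and mean; a sketch of that assumed behaviour:

def std_normalize(x):
    # assumed behaviour of models.std_normalize, not the actual source:
    # z-score each column in place and return (stds, means)
    stds = x.std(axis=0, keepdims=True)
    means = x.mean(axis=0, keepdims=True)
    x -= means
    x /= stds
    return stds, means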

2) Train a linear model


In [13]:
feature_cnt = train_X.shape[1]
sample_cnt = train_X.shape[0]

In [14]:
W, b = ml.create_parameters(feature_cnt)

# full-batch gradient descent on the standardized training data
for epoch in range(0, 10000):
    h = ml.linear_model(train_X, W, b)
    dW, db = ml.mse_cost_dev(train_X, train_y, h)
    W, b = ml.gd_update(W, b, dW, db, lr=0.01)
    if (epoch + 1) % 1000 == 0:
        cur_cost = ml.mse_cost(h, train_y)
        print('epoch: {0}, cost:{1}'.format(epoch + 1, cur_cost))

# finish
predictions = ml.linear_model(train_X, W, b)
final_cost = ml.mse_cost(predictions, train_y)
print('training finished!')
print('final cost: {0}'.format(final_cost))


epoch: 1000, cost:[[ 0.15446093]]
epoch: 2000, cost:[[ 0.15024163]]
epoch: 3000, cost:[[ 0.15016424]]
epoch: 4000, cost:[[ 0.15016262]]
epoch: 5000, cost:[[ 0.15016259]]
epoch: 6000, cost:[[ 0.15016259]]
epoch: 7000, cost:[[ 0.15016259]]
epoch: 8000, cost:[[ 0.15016259]]
epoch: 9000, cost:[[ 0.15016259]]
epoch: 10000, cost:[[ 0.15016259]]
training finished!
final cost: [[ 0.15016259]]
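
The other models helpers used above are likewise not shown. Assuming they implement plain full-batch gradient descent on the mean-squared-error cost, they would look roughly like this (a hypothetical sketch; the real module may differ in details such as initialization or cost scaling):

def create_parameters(n_features):
    # zero-initialized weights and bias
    return np.zeros((n_features, 1), dtype=np.float32), 0.0

def linear_model(X, W, b):
    # h = XW + b
    return X @ W + b

def mse_cost(h, y):
    # J = sum((h - y)^2) / (2m)
    m = y.shape[0]
    return np.sum((h - y) ** 2) / (2 * m)

def mse_cost_dev(X, y, h):
    # gradients of J with respect to W and b
    m = y.shape[0]
    dW = X.T @ (h - y) / m
    db = np.sum(h - y) / m
    return dW, db

def gd_update(W, b, dW, db, lr):
    # one gradient-descent step
    return W - lr * dW, b - lr * db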

3) Using Keras


In [20]:
from keras.layers import Dense, Activation, Flatten
from keras.layers import BatchNormalization, Dropout
from keras.models import Sequential
from keras.optimizers import Adam, SGD

In [16]:
# add a trailing axis so the data matches the Flatten input layer below
train_X = train_X.reshape((train_X.shape[0], feature_cnt, 1))

In [45]:
# MLP: three tanh hidden layers with batch normalization, linear output
mlp = Sequential()

mlp.add(Flatten(input_shape=(feature_cnt, 1)))
mlp.add(Dense(500))
mlp.add(BatchNormalization())
mlp.add(Activation('tanh'))
mlp.add(Dense(300))
mlp.add(BatchNormalization())
mlp.add(Activation('tanh'))
mlp.add(Dense(100))
mlp.add(BatchNormalization())
mlp.add(Activation('tanh'))
mlp.add(Dense(1))
mlp.add(BatchNormalization())
mlp.add(Activation('linear'))

mlp.summary()


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
flatten_5 (Flatten)          (None, 19)                0         
_________________________________________________________________
dense_12 (Dense)             (None, 500)               10000     
_________________________________________________________________
batch_normalization_11 (Batc (None, 500)               2000      
_________________________________________________________________
activation_12 (Activation)   (None, 500)               0         
_________________________________________________________________
dense_13 (Dense)             (None, 300)               150300    
_________________________________________________________________
batch_normalization_12 (Batc (None, 300)               1200      
_________________________________________________________________
activation_13 (Activation)   (None, 300)               0         
_________________________________________________________________
dense_14 (Dense)             (None, 100)               30100     
_________________________________________________________________
batch_normalization_13 (Batc (None, 100)               400       
_________________________________________________________________
activation_14 (Activation)   (None, 100)               0         
_________________________________________________________________
dense_15 (Dense)             (None, 1)                 101       
_________________________________________________________________
batch_normalization_14 (Batc (None, 1)                 4         
_________________________________________________________________
activation_15 (Activation)   (None, 1)                 0         
=================================================================
Total params: 194,105
Trainable params: 192,303
Non-trainable params: 1,802
_________________________________________________________________
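
The parameter counts follow directly from the layer sizes: a Dense layer has (inputs + 1) × units parameters, e.g. (19 + 1) × 500 = 10,000 for dense_12, and each BatchNormalization layer carries 4 parameters per unit (gamma, beta, moving mean, moving variance), the last two of which are not trained, hence the 2 × (500 + 300 + 100 + 1) = 1,802 non-trainable parameters.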

In [46]:
# two candidate optimizers; the configured Adam instance is the one used below
sgd = SGD(lr=0.001, momentum=0.5)
adam = Adam(lr=0.001)

mlp.compile(optimizer=adam,
            loss='mse',
            metrics=['mae'])

In [47]:
epochs = 100

# each outer iteration runs 100 training epochs, then evaluates on the full
# training set, i.e. 10,000 epochs in total with a progress line every 100
for epoch in range(0, epochs):
    mlp.fit(train_X,
            train_y,
            batch_size=1000,
            epochs=100,
            verbose=0)

    evl = mlp.evaluate(train_X, 
                 train_y, 
                 batch_size=sample_cnt, 
                 verbose=0)

    print("epoch {0}, cost {1}".format(epoch+1, evl))

print("finished")


epoch 1, cost [0.042596533894538879, 0.15156526863574982]
epoch 2, cost [0.023116141557693481, 0.11406562477350235]
epoch 3, cost [0.017665963619947433, 0.1018495187163353]
epoch 4, cost [0.011753103695809841, 0.081339552998542786]
epoch 5, cost [0.011164415627717972, 0.079401932656764984]
epoch 6, cost [0.0072766509838402271, 0.062301784753799438]
epoch 7, cost [0.0063523086719214916, 0.059343673288822174]
epoch 8, cost [0.006051134318113327, 0.056869737803936005]
epoch 9, cost [0.006089432630687952, 0.059468001127243042]
epoch 10, cost [0.0064008594490587711, 0.06160716712474823]
epoch 11, cost [0.0049832561053335667, 0.053613379597663879]
epoch 12, cost [0.0036616930738091469, 0.044653140008449554]
epoch 13, cost [0.0044356677681207657, 0.051452264189720154]
epoch 14, cost [0.0054807658307254314, 0.0596189945936203]
epoch 15, cost [0.0035088218282908201, 0.045463010668754578]
epoch 16, cost [0.0021140612661838531, 0.033490635454654694]
epoch 17, cost [0.0027109375223517418, 0.038814831525087357]
epoch 18, cost [0.0022268195170909166, 0.035365022718906403]
epoch 19, cost [0.0014960614498704672, 0.028214573860168457]
epoch 20, cost [0.0023670454975217581, 0.03518097847700119]
epoch 21, cost [0.0025837435387074947, 0.040727272629737854]
epoch 22, cost [0.0019667521119117737, 0.031445115804672241]
epoch 23, cost [0.0019548807758837938, 0.033897686749696732]
epoch 24, cost [0.0014254453126341105, 0.027543431147933006]
epoch 25, cost [0.001334702130407095, 0.027003195136785507]
epoch 26, cost [0.0016107655828818679, 0.031196005642414093]
epoch 27, cost [0.0013054272858425975, 0.027070038020610809]
epoch 28, cost [0.0011191652156412601, 0.0237398911267519]
epoch 29, cost [0.001217616256326437, 0.02558743953704834]
epoch 30, cost [0.00098166556563228369, 0.020845841616392136]
epoch 31, cost [0.001811120891943574, 0.033362492918968201]
epoch 32, cost [0.00099789979867637157, 0.022127078846096992]
epoch 33, cost [0.0014012150932103395, 0.027777139097452164]
epoch 34, cost [0.00085654784925282001, 0.020221859216690063]
epoch 35, cost [0.0012536115245893598, 0.02552727609872818]
epoch 36, cost [0.0012580358888953924, 0.025458786636590958]
epoch 37, cost [0.00098597689066082239, 0.022378796711564064]
epoch 38, cost [0.0011035543866455555, 0.023898674175143242]
epoch 39, cost [0.0010883484501391649, 0.024294456467032433]
epoch 40, cost [0.00076899700798094273, 0.019194319844245911]
epoch 41, cost [0.00071686645969748497, 0.018577434122562408]
epoch 42, cost [0.00097277143504470587, 0.022560084238648415]
epoch 43, cost [0.0006625752430409193, 0.017304709181189537]
epoch 44, cost [0.0007146722637116909, 0.017532872036099434]
epoch 45, cost [0.00090803147759288549, 0.021597990766167641]
epoch 46, cost [0.00065971288131549954, 0.017035061493515968]
epoch 47, cost [0.00075431627919897437, 0.019165864214301109]
epoch 48, cost [0.0011683984193950891, 0.024299081414937973]
epoch 49, cost [0.0009734177147038281, 0.0227342639118433]
epoch 50, cost [0.00096143706468865275, 0.02164725586771965]
epoch 51, cost [0.00070254231104627252, 0.018420400097966194]
epoch 52, cost [0.0006740400567650795, 0.017762081697583199]
epoch 53, cost [0.00063957989914342761, 0.017045389860868454]
epoch 54, cost [0.0008085844456218183, 0.020657407119870186]
epoch 55, cost [0.00059572106692939997, 0.016455652192234993]
epoch 56, cost [0.00072457373607903719, 0.019222376868128777]
epoch 57, cost [0.00085456605302169919, 0.020940521731972694]
epoch 58, cost [0.00060993066290393472, 0.017357205972075462]
epoch 59, cost [0.00050135242054238915, 0.014212223701179028]
epoch 60, cost [0.00056576094357296824, 0.015658615157008171]
epoch 61, cost [0.00057795224711298943, 0.015831379219889641]
epoch 62, cost [0.00064139295136556029, 0.015619144774973392]
epoch 63, cost [0.0005070690531283617, 0.01386511605232954]
epoch 64, cost [0.00053754891268908978, 0.015494700521230698]
epoch 65, cost [0.00051825749687850475, 0.014815717935562134]
epoch 66, cost [0.00066039484227076173, 0.01734868623316288]
epoch 67, cost [0.00064582802588120103, 0.016534974798560143]
epoch 68, cost [0.00066019862424582243, 0.017709784209728241]
epoch 69, cost [0.00052424275781959295, 0.014552146196365356]
epoch 70, cost [0.00049948605010285974, 0.014531265944242477]
epoch 71, cost [0.00048337838961742818, 0.013190025463700294]
epoch 72, cost [0.00061464624013751745, 0.016970198601484299]
epoch 73, cost [0.00046077126171439886, 0.013313576579093933]
epoch 74, cost [0.0005984022282063961, 0.016165098175406456]
epoch 75, cost [0.00055822578724473715, 0.015787448734045029]
epoch 76, cost [0.00052820518612861633, 0.015115771442651749]
epoch 77, cost [0.00052616762695834041, 0.015062123537063599]
epoch 78, cost [0.00050605088472366333, 0.014231611043214798]
epoch 79, cost [0.00044783699559047818, 0.013179933652281761]
epoch 80, cost [0.00065016036387532949, 0.018032945692539215]
epoch 81, cost [0.00038043980021029711, 0.011176672764122486]
epoch 82, cost [0.00043487298535183072, 0.01262020505964756]
epoch 83, cost [0.00051021075341850519, 0.014212720096111298]
epoch 84, cost [0.00049424613825976849, 0.014336287975311279]
epoch 85, cost [0.00052080146269872785, 0.014895777218043804]
epoch 86, cost [0.00039987184572964907, 0.01159924641251564]
epoch 87, cost [0.00050957466010004282, 0.01449400931596756]
epoch 88, cost [0.00053589686285704374, 0.014791413210332394]
epoch 89, cost [0.000419276999309659, 0.012248446233570576]
epoch 90, cost [0.00053634139476343989, 0.013867438770830631]
epoch 91, cost [0.00040928935050033033, 0.012252765707671642]
epoch 92, cost [0.00042877532541751862, 0.012188299559056759]
epoch 93, cost [0.00042335456237196922, 0.012715430930256844]
epoch 94, cost [0.00036127513158135116, 0.010598655790090561]
epoch 95, cost [0.00045014050556346774, 0.01314904447644949]
epoch 96, cost [0.0005324044032022357, 0.015414323657751083]
epoch 97, cost [0.00037619320210069418, 0.010994644835591316]
epoch 98, cost [0.00060077832313254476, 0.017064141109585762]
epoch 99, cost [0.00053641648264601827, 0.015811318531632423]
epoch 100, cost [0.00037202471867203712, 0.011110742576420307]
finished
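
The outer Python loop is only there to get a progress line every 100 epochs; a single fit call with a callback does roughly the same job (the logged loss is the running epoch average rather than a separate full-batch evaluation). A sketch of that alternative, where log_every_100 is just an illustrative name:

from keras.callbacks import LambdaCallback

# 10,000 epochs in one call, printing the running metrics every 100 epochs
log_every_100 = LambdaCallback(
    on_epoch_end=lambda e, logs: print('epoch {0}, cost {1}'.format(e + 1, logs))
                                 if (e + 1) % 100 == 0 else None)
history = mlp.fit(train_X, train_y, batch_size=1000, epochs=10000,
                  verbose=0, callbacks=[log_every_100])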

In [56]:
def parse_back(h):
    # undo the target standardization, i.e. map predictions back to dollars
    return h * std_y + mean_y

In [59]:
k = 2300  # index of the first sample to inspect

In [60]:
parse_back(mlp.predict(train_X[k:k+4,:]))


Out[60]:
array([[ 361666.1875 ],
       [ 227042.65625],
       [ 351246.     ],
       [ 214414.3125 ]], dtype=float32)

In [61]:
parse_back(train_y[k:k+4,:])


Out[61]:
array([[ 359000.],
       [ 229000.],
       [ 354000.],
       [ 210000.]], dtype=float32)

In [62]:
df_test['yr_sold'] = df_test['date'].apply(parse_date)

In [63]:
# same column layout as the training array: price first, then the 19 features
test_data = np.array(df_test)[:,2:]
test_X = test_data[:,1:].astype(np.float32)
test_y = test_data[:,0].astype(np.float32)
test_y = test_y.reshape(test_y.shape[0], 1)

In [68]:
# scale the test set with the statistics computed on the training set
ml.data_normalize(test_X, stds_X, means_X)

In [70]:
ml.data_normalize(test_y, std_y, mean_y)
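
data_normalize is another unshown models helper; since the test arrays are not reassigned, it presumably applies previously computed statistics in place, roughly:

def data_normalize(x, stds, means):
    # assumed behaviour of models.data_normalize, not the actual source:
    # standardize x in place using the training-set statistics
    x -= means
    x /= stds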

In [77]:
# match the (samples, features, 1) shape used for training
test_X = test_X.reshape((test_X.shape[0], test_X.shape[1], 1))

In [79]:
parse_back(mlp.predict(test_X[k:k+10,:]))


Out[79]:
array([[ 1623289.     ],
       [  314894.1875 ],
       [  483438.84375],
       [  793371.875  ],
       [  789109.625  ],
       [  658603.8125 ],
       [  212731.78125],
       [  339327.5    ],
       [  911020.1875 ],
       [  587879.875  ]], dtype=float32)

In [80]:
parse_back(test_y[k:k+10,:])


Out[80]:
array([[ 1118000.],
       [  295000.],
       [  539000.],
       [  812000.],
       [  888000.],
       [  720000.],
       [  250000.],
       [  329000.],
       [  990000.],
       [  537000.]], dtype=float32)

In [81]:
mlp.evaluate(test_X, test_y, batch_size=test_X.shape[0], verbose=0)


Out[81]:
[0.14545422792434692, 0.21197842061519623]
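
The test cost of roughly 0.145 MSE (in normalized units) is far above the final training cost of about 0.0004, so the network overfits the training set heavily. Because the target was standardized, the reported MAE can be put back on a dollar scale with the statistics computed earlier; a sketch (assuming std_y holds the single standard deviation of the price column):

test_mse, test_mae = mlp.evaluate(test_X, test_y,
                                  batch_size=test_X.shape[0], verbose=0)
# 0.21 in standardized units corresponds to about 0.21 * std(price) dollars
print('test MAE: ~${0:,.0f}'.format(float(test_mae * std_y)))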
