In [1]:
# import common APIs
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import seaborn as sns
import os
from sklearn.preprocessing import MinMaxScaler, StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.decomposition import PCA
from sklearn import model_selection, naive_bayes, tree, svm, ensemble
from sklearn.metrics import classification_report, confusion_matrix, precision_recall_curve, auc, roc_auc_score, roc_curve
from xgboost import XGBRegressor



In [2]:
# Data observation
filepath = '/Users/mac/Desktop/Kaggle_datasets/KC_housePrice/'
filename01 = 'kc_house_data.csv'

df_full = pd.read_csv(os.path.join(filepath, filename01))
df_full.head()


Out[2]:
id date price bedrooms bathrooms sqft_living sqft_lot floors waterfront view ... grade sqft_above sqft_basement yr_built yr_renovated zipcode lat long sqft_living15 sqft_lot15
0 7129300520 20141013T000000 221900.0 3 1.00 1180 5650 1.0 0 0 ... 7 1180 0 1955 0 98178 47.5112 -122.257 1340 5650
1 6414100192 20141209T000000 538000.0 3 2.25 2570 7242 2.0 0 0 ... 7 2170 400 1951 1991 98125 47.7210 -122.319 1690 7639
2 5631500400 20150225T000000 180000.0 2 1.00 770 10000 1.0 0 0 ... 6 770 0 1933 0 98028 47.7379 -122.233 2720 8062
3 2487200875 20141209T000000 604000.0 4 3.00 1960 5000 1.0 0 0 ... 7 1050 910 1965 0 98136 47.5208 -122.393 1360 5000
4 1954400510 20150218T000000 510000.0 3 2.00 1680 8080 1.0 0 0 ... 8 1680 0 1987 0 98074 47.6168 -122.045 1800 7503

5 rows × 21 columns


In [3]:
df_full.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21613 entries, 0 to 21612
Data columns (total 21 columns):
id               21613 non-null int64
date             21613 non-null object
price            21613 non-null float64
bedrooms         21613 non-null int64
bathrooms        21613 non-null float64
sqft_living      21613 non-null int64
sqft_lot         21613 non-null int64
floors           21613 non-null float64
waterfront       21613 non-null int64
view             21613 non-null int64
condition        21613 non-null int64
grade            21613 non-null int64
sqft_above       21613 non-null int64
sqft_basement    21613 non-null int64
yr_built         21613 non-null int64
yr_renovated     21613 non-null int64
zipcode          21613 non-null int64
lat              21613 non-null float64
long             21613 non-null float64
sqft_living15    21613 non-null int64
sqft_lot15       21613 non-null int64
dtypes: float64(5), int64(15), object(1)
memory usage: 3.5+ MB

In [8]:
df_full.columns


Out[8]:
Index(['id', 'date', 'price', 'bedrooms', 'bathrooms', 'sqft_living',
       'sqft_lot', 'floors', 'waterfront', 'view', 'condition', 'grade',
       'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated', 'zipcode',
       'lat', 'long', 'sqft_living15', 'sqft_lot15'],
      dtype='object')

In [7]:
df_full.grade.unique()


Out[7]:
array([ 7,  6,  8, 11,  9,  5, 10, 12,  4,  3, 13,  1])

In [11]:
cols = ['price', 'bedrooms', 'bathrooms', 'sqft_living',
       'sqft_lot', 'floors', 'waterfront', 'view', 'condition', 'grade',
       'sqft_above', 'sqft_basement', 'yr_built', 'zipcode',
       'lat', 'long', 'sqft_living15', 'sqft_lot15']

df_num = df_full[cols].copy()  # .copy() avoids SettingWithCopyWarning when scaling in place

minmax_cols = ['bedrooms', 'bathrooms', 'sqft_living',
       'sqft_lot', 'floors', 'waterfront', 'view', 'condition', 'grade',
       'sqft_above', 'sqft_basement', 'yr_built', # zipcode excluded: it is one-hot encoded below
       'lat', 'long', 'sqft_living15', 'sqft_lot15']

# scale all features to [0, 1] in one pass
df_num[minmax_cols] = MinMaxScaler().fit_transform(df_num[minmax_cols])

# keep the price scaler so predictions can be inverse-transformed later
p_scaler = MinMaxScaler()
df_num['price'] = p_scaler.fit_transform(df_num['price'].values.reshape(-1, 1))

# one-hot encode zipcode
df_num = pd.get_dummies(df_num, columns=['zipcode'])


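The column-wise scaling and the get_dummies call above can also be expressed as a single preprocessing step. A minimal sketch, assuming sklearn's ColumnTransformer is available (it was added in sklearn 0.20, newer than the version used in this notebook):

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder

# scale the numeric features and one-hot encode zipcode in one transformer
preprocess = ColumnTransformer([
    ('minmax', MinMaxScaler(), minmax_cols),
    ('onehot', OneHotEncoder(handle_unknown='ignore'), ['zipcode']),
])
X = preprocess.fit_transform(df_full[cols].drop('price', axis=1))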

In [12]:
df_num.head()


Out[12]:
price bedrooms bathrooms sqft_living sqft_lot floors waterfront view condition grade ... zipcode_98146 zipcode_98148 zipcode_98155 zipcode_98166 zipcode_98168 zipcode_98177 zipcode_98178 zipcode_98188 zipcode_98198 zipcode_98199
0 0.019266 0.090909 0.12500 0.067170 0.003108 0.0 0.0 0.0 0.5 0.500000 ... 0 0 0 0 0 0 1 0 0 0
1 0.060721 0.090909 0.28125 0.172075 0.004072 0.4 0.0 0.0 0.5 0.500000 ... 0 0 0 0 0 0 0 0 0 0
2 0.013770 0.060606 0.12500 0.036226 0.005743 0.0 0.0 0.0 0.5 0.416667 ... 0 0 0 0 0 0 0 0 0 0
3 0.069377 0.121212 0.37500 0.126038 0.002714 0.0 0.0 0.0 1.0 0.500000 ... 0 0 0 0 0 0 0 0 0 0
4 0.057049 0.090909 0.25000 0.104906 0.004579 0.0 0.0 0.0 0.5 0.583333 ... 0 0 0 0 0 0 0 0 0 0

5 rows × 87 columns


In [13]:
# Shuffle and split into train/test sets
from sklearn.utils import shuffle

shuffle_df = shuffle(df_num, random_state=42)

df_label = shuffle_df['price']
df_feature = shuffle_df.drop('price', axis=1)

cut_point = round(len(df_num) * 0.6)  # 60/40 train/test split
train_feature = df_feature.values[:cut_point, :]
train_label = df_label.values[:cut_point]
test_feature = df_feature.values[cut_point:, :]
test_label = df_label.values[cut_point:]
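The shuffle-then-cut split above can be collapsed into one call with train_test_split; a roughly equivalent sketch (the exact row assignment will differ from the manual cut, since train_test_split shuffles internally):

from sklearn.model_selection import train_test_split

# 60/40 split of features and target
train_feature, test_feature, train_label, test_label = train_test_split(
    df_num.drop('price', axis=1).values, df_num['price'].values,
    test_size=0.4, random_state=42)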

Scikit-Learn


In [14]:
## tree.DecisionTreeRegressor()
from sklearn import model_selection, tree
X_train, X_test, y_train, y_test = model_selection.train_test_split(train_feature, train_label,
                                              test_size=0.25, random_state=0)
regr = tree.DecisionTreeRegressor()
regr.fit(X_train, y_train)
print("Training Score:%f" % regr.score(X_train, y_train))
print("Testing Score:%f" % regr.score(X_test, y_test))
np.mean(np.abs((regr.predict(test_feature) - test_label) / test_label)) # mean absolute percentage error


Training Score:0.999499
Testing Score:0.765737
Out[14]:
0.25120004515339039
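The mean-absolute-percentage-error expression above is repeated verbatim after every model below; a small helper keeps it in one place (newer sklearn releases, 0.24+, also provide sklearn.metrics.mean_absolute_percentage_error):

def mape(model, X, y):
    # mean absolute percentage error of the model's predictions on (X, y)
    return np.mean(np.abs((model.predict(X) - y) / y))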

In [15]:
### svm.LinearSVR()
from sklearn import model_selection, svm
X_train, X_test, y_train, y_test = model_selection.train_test_split(train_feature, train_label,
                                                                  test_size=0.25, random_state=0)
regr = svm.LinearSVR()
regr.fit(X_train, y_train)
print("Training Score:%f" % regr.score(X_train, y_train))
print("Testing Score:%f" % regr.score(X_test, y_test))
np.mean(np.abs((regr.predict(test_feature) - test_label) / test_label)) # mean absolute percentage error


Training Score:0.773419
Testing Score:0.745703
Out[15]:
0.22163226051203525

In [16]:
### ensemble.AdaBoostRegressor()
from sklearn import model_selection, ensemble
X_train, X_test, y_train, y_test = model_selection.train_test_split(train_feature, train_label,
                                                                  test_size=0.25, random_state=0)
regr = ensemble.AdaBoostRegressor()
regr.fit(X_train, y_train)
print("Training Score:%f" % regr.score(X_train, y_train))
print("Testing Score:%f" % regr.score(X_test, y_test))
np.mean(np.abs((regr.predict(test_feature) - test_label) / test_label)) # mean absolute percentage error


Training Score:0.228179
Testing Score:0.238159
Out[16]:
1.1438836452705718

In [86]:
### ensemble.GradientBoostingRegressor()
from sklearn import model_selection, ensemble
X_train, X_test, y_train, y_test = model_selection.train_test_split(train_feature, train_label,
                                                                  test_size=0.25, random_state=0)
regr = ensemble.GradientBoostingRegressor()
regr.fit(X_train, y_train)
print("Training Score:%f" % regr.score(X_train, y_train))
print("Testing Score:%f" % regr.score(X_test, y_test))
np.mean(np.abs((regr.predict(test_feature) - test_label) / test_label)) # mean absolute percentage error


Training Score:0.914783
Testing Score:0.877460
Out[86]:
0.20587734294095589

In [90]:
### ensemble.RandomForestRegressor()
from sklearn import model_selection, ensemble
X_train, X_test, y_train, y_test = model_selection.train_test_split(train_feature, train_label,
                                                test_size=0.25, random_state=0)
regr = ensemble.RandomForestRegressor()
regr.fit(X_train, y_train)
print("Training Score:%f" % regr.score(X_train, y_train))
print("Testing Score:%f" % regr.score(X_test, y_test))
np.mean(np.abs((regr.predict(test_feature) - test_label) / test_label)) # mean absolute percentage error


Training Score:0.972302
Testing Score:0.856844
Out[90]:
0.19024343468167751

In [81]:
# XGBRegressor
from xgboost import XGBRegressor
X_train, X_test, y_train, y_test = model_selection.train_test_split(train_feature, train_label,
                                          test_size=0.25, random_state=0) # plain random split (not stratified)
regr = XGBRegressor()
regr.fit(X_train, y_train)
print("Training Score:%f" % regr.score(X_train, y_train))
print("Testing Score:%f" % regr.score(X_test, y_test))
np.mean(np.abs((regr.predict(test_feature) - test_label) / test_label)) # mean absolute percentage error


Training Score:0.910821
Testing Score:0.873735
Out[81]:
0.20623351834388065
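All of the models above run with default hyperparameters; a grid search would likely tighten the test scores further. A minimal sketch using model_selection.GridSearchCV (the parameter values here are illustrative, not tuned):

from sklearn.model_selection import GridSearchCV
from sklearn import ensemble

param_grid = {
    'n_estimators': [100, 300],
    'max_depth': [3, 5],
    'learning_rate': [0.05, 0.1],
}
search = GridSearchCV(ensemble.GradientBoostingRegressor(),
                      param_grid, cv=3, scoring='r2')
search.fit(X_train, y_train)
print(search.best_params_, search.best_score_)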

In [91]:
real = p_scaler.inverse_transform(test_label.reshape(-1,1)).reshape(-1)
ans = p_scaler.inverse_transform(regr.predict(test_feature).reshape(-1,1)).reshape(-1)

df_ans = pd.DataFrame({'real':real,'ans':ans }, 
                      index = range(len(real))
                      )
df_ans.head()


Out[91]:
ans real
0 2.888338e+05 230000.0
1 6.662170e+05 713000.0
2 3.515175e+05 315000.0
3 6.574667e+05 379900.0
4 1.059171e+06 1100000.0

In [143]:
plt.figure(figsize=(10,10))
plt.scatter(ans, real, alpha=0.5)
plt.xlabel('pred price')
plt.ylabel('real price')

x=np.linspace(0,5000000,100)
y=x

plt.plot(x, y, c='red', label='perfect prediction')
plt.legend()
plt.show()



In [139]:
sns.jointplot('real', 'ans', data=df_ans, size=8)  # jointplot manages its own figure; axes are labeled by column name
plt.show()



In [127]:
plt.figure(figsize=(10,5))
plt.plot(range(len(real)), (ans-real)/real)
plt.show()

np.mean(np.abs((ans-real)/real))


Out[127]:
0.13858608505266715

In [94]:
df_ans['ans'].corr(df_ans['real'])


Out[94]:
0.92968093327194179
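Correlation alone can flatter a regressor; R² and RMSE on the real-scale prices round out the picture. A short sketch with sklearn.metrics:

from sklearn.metrics import r2_score, mean_squared_error

print('R^2 :', r2_score(real, ans))
print('RMSE:', np.sqrt(mean_squared_error(real, ans)))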

Keras: the MLP consistently performs worse than the ensemble models


In [121]:
### Keras MLP models
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout

def show_train_history(train_history,train,validation):
    plt.plot(train_history.history[train])
    plt.plot(train_history.history[validation])
    plt.title('Train History')
    plt.ylabel(train)
    plt.xlabel('Epoch')
    plt.legend(['train', 'validation'], loc='best')
    plt.show()

model = Sequential() 
model.add(Dense(units=1000, 
                input_dim=86, 
                kernel_initializer='uniform', 
                ))
model.add(Dropout(0.5))

model.add(Dense(units=400,  
                kernel_initializer='uniform', 
                ))
model.add(Dropout(0.5))

model.add(Dense(units=100,  
                kernel_initializer='uniform', 
                ))
model.add(Dropout(0.5))

model.add(Dense(units=1, # single numeric output
                kernel_initializer='uniform',
                ))

print(model.summary()) # shows the architecture and the parameter counts

model.compile(loss='mean_squared_error',
              optimizer='adam', metrics=['accuracy']) # note: accuracy is not meaningful for regression

train_history = model.fit(x=train_feature, y=train_label,  # Keras handles the validation split internally
                          validation_split=0.8, epochs=40,  # 0.8 holds out 80% for validation, training on only 20%
                          batch_size=2000, verbose=2) # verbose=2 prints one line per epoch

show_train_history(train_history,'acc','val_acc')
show_train_history(train_history,'loss','val_loss')

scores = model.evaluate(test_feature, test_label)
print('\n')
print('accuracy=',scores[1])

prediction = model.predict(test_feature)


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
dense_37 (Dense)             (None, 1000)              87000     
_________________________________________________________________
dropout_28 (Dropout)         (None, 1000)              0         
_________________________________________________________________
dense_38 (Dense)             (None, 400)               400400    
_________________________________________________________________
dropout_29 (Dropout)         (None, 400)               0         
_________________________________________________________________
dense_39 (Dense)             (None, 100)               40100     
_________________________________________________________________
dropout_30 (Dropout)         (None, 100)               0         
_________________________________________________________________
dense_40 (Dense)             (None, 1)                 101       
=================================================================
Total params: 527,601
Trainable params: 527,601
Non-trainable params: 0
_________________________________________________________________
None
Train on 2593 samples, validate on 10375 samples
Epoch 1/40
1s - loss: 0.0063 - acc: 0.0000e+00 - val_loss: 0.0042 - val_acc: 9.6386e-05
Epoch 2/40
0s - loss: 0.0043 - acc: 0.0000e+00 - val_loss: 0.0025 - val_acc: 9.6386e-05
Epoch 3/40
0s - loss: 0.0030 - acc: 0.0000e+00 - val_loss: 0.0022 - val_acc: 9.6386e-05
...
Epoch 40/40
3s - loss: 7.6010e-04 - acc: 0.0000e+00 - val_loss: 5.2639e-04 - val_acc: 9.6386e-05
8544/8645 [============================>.] - ETA: 0s

accuracy= 0.0
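The reported accuracy of 0.0 is expected: accuracy is a classification metric and carries no meaning for a continuous target. Compiling with an error metric instead, such as mean absolute error, yields a readable number; a sketch:

# mean absolute error is interpretable on the [0, 1]-scaled price target
model.compile(loss='mean_squared_error', optimizer='adam', metrics=['mae'])
scores = model.evaluate(test_feature, test_label)
print('mae =', scores[1])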

In [122]:
# Inverse-transform the MLP predictions back to the original price scale
real = p_scaler.inverse_transform(test_label.reshape(-1,1)).reshape(-1)
ans2 = p_scaler.inverse_transform(prediction.reshape(-1,1)).reshape(-1)

df_ans2 = pd.DataFrame({'real':real,'ans':ans2 }, 
                      index = range(len(real))
                      )
df_ans2.head()


Out[122]:
ans real
0 192319.84375 230000.0
1 612252.31250 713000.0
2 276880.93750 315000.0
3 563177.43750 379900.0
4 910415.56250 1100000.0

In [131]:
plt.figure(figsize=(10,10))
plt.scatter(ans2, real)
plt.xlabel('pred price')
plt.ylabel('real price')

x=np.linspace(0,5000000,100)
y=x

plt.plot(x, y, c='red', label='perfect prediction')
plt.legend()
plt.show()



In [125]:
plt.figure(figsize=(10,5))
plt.plot(range(len(real)), (ans2-real)/real)
plt.show()

np.mean(np.abs((ans2-real)/real))


Out[125]:
0.18963831714529647

In [120]:
df_ans2['ans'].corr(df_ans2['real'])


Out[120]:
0.89304438555544596
