In [1]:
    
# import common APIs
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import seaborn as sns
import os
from sklearn.preprocessing import MinMaxScaler, StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.decomposition import PCA
from sklearn import cross_validation, naive_bayes, tree, svm, ensemble
from sklearn.metrics import classification_report,confusion_matrix,precision_recall_curve,auc,roc_auc_score,roc_curve
from xgboost import XGBRegressor
    
    
In [2]:
    
# Data observation: load the raw house-price CSV.
# The data directory can be overridden with the KC_HOUSE_DATA_DIR environment
# variable so the notebook is not tied to a single machine; the original
# location is kept as the default.
filepath = os.environ.get('KC_HOUSE_DATA_DIR',
                          '/Users/mac/Desktop/Kaggle_datasets/KC_housePrice/')
filename01 = 'kc_house_data.csv'
df_full = pd.read_csv(os.path.join(filepath, filename01))
df_full.head()
    
    Out[2]:
In [3]:
    
# Print the per-column summary (dtype, non-null count) of the raw frame.
df_full.info()
    
    
In [8]:
    
# List the column names of the raw frame.
df_full.columns
    
    Out[8]:
In [7]:
    
# Distinct values that occur in the `grade` column.
df_full['grade'].unique()
    
    Out[7]:
In [11]:
    
# Columns kept for modelling (target first, then the features).
cols = ['price', 'bedrooms', 'bathrooms', 'sqft_living',
       'sqft_lot', 'floors', 'waterfront', 'view', 'condition', 'grade',
       'sqft_above', 'sqft_basement', 'yr_built', 'zipcode',
       'lat', 'long', 'sqft_living15', 'sqft_lot15']
# .copy() gives df_num its own data; writing into a plain slice of df_full
# triggers SettingWithCopyWarning and is not guaranteed to stick.
df_num = df_full[cols].copy()

# Features rescaled to [0, 1]. zipcode is left out: it is one-hot encoded below.
minmax_cols = ['bedrooms', 'bathrooms', 'sqft_living',
       'sqft_lot', 'floors', 'waterfront', 'view', 'condition', 'grade',
       'sqft_above', 'sqft_basement', 'yr_built',
       'lat', 'long', 'sqft_living15', 'sqft_lot15']
# MinMaxScaler rescales each column independently, so a single fit over the
# whole block gives the same result as one scaler per column.
# NOTE(review): the scalers are fitted on the full data set, before the
# train/test split further down, so held-out rows influence the scaling.
df_num[minmax_cols] = MinMaxScaler().fit_transform(df_num[minmax_cols].values)

# The target keeps its own scaler so predictions can be mapped back to prices.
p_scaler = MinMaxScaler()
df_num['price'] = p_scaler.fit_transform(df_num['price'].values.reshape(-1, 1)).ravel()
df_num = pd.get_dummies(df_num, columns=['zipcode'])
    
    
In [12]:
    
# Preview the scaled, one-hot-encoded frame.
df_num.head()
    
    Out[12]:
In [13]:
    
# Data preprocessing: shuffle the rows once, then use the first 60% for
# training and the remaining 40% as the held-out test set.
from sklearn.utils import shuffle

shuffle_df = shuffle(df_num, random_state=42)
df_feature = shuffle_df.drop('price', axis=1)
df_label = shuffle_df['price']

cut_point = round(0.6 * len(df_num))
feature_matrix = df_feature.values
label_vector = df_label.values

train_feature = np.array(feature_matrix[:cut_point, :])
test_feature = np.array(feature_matrix[cut_point:, :])
train_label = np.array(label_vector[:cut_point])
test_label = np.array(label_vector[cut_point:])
    
In [14]:
    
## tree.DecisionTreeRegressor()
# `cross_validation` and `tree` are already imported in the first cell.
# NOTE(review): sklearn.cross_validation was removed in scikit-learn 0.20;
# newer installs need sklearn.model_selection.train_test_split instead.
# Hold out 25% of the training portion as a validation set.
X_train, X_test, y_train, y_test = cross_validation.train_test_split(
    train_feature, train_label, test_size=0.25, random_state=0)
# Seeded so that repeated runs build the same tree.
regr = tree.DecisionTreeRegressor(random_state=0)
regr.fit(X_train, y_train)
print("Traing Score:%f"%regr.score(X_train,y_train))
print("Testing Score:%f"%regr.score(X_test,y_test))
# Mean absolute percentage error on the held-out test set.
# Labels are min-max scaled, so the cheapest house has label 0 and a relative
# error taken in scaled space divides by (near) zero. Map predictions and
# labels back to prices before computing it.
pred_price = p_scaler.inverse_transform(regr.predict(test_feature).reshape(-1, 1)).reshape(-1)
true_price = p_scaler.inverse_transform(test_label.reshape(-1, 1)).reshape(-1)
np.mean(np.abs((pred_price - true_price) / true_price))
    
    
    Out[14]:
In [15]:
    
### svm.LinearSVR()
# `cross_validation` and `svm` are already imported in the first cell.
# Hold out 25% of the training portion as a validation set.
X_train, X_test, y_train, y_test = cross_validation.train_test_split(
    train_feature, train_label, test_size=0.25, random_state=0)
# Seeded so that repeated runs give the same fit.
regr = svm.LinearSVR(random_state=0)
regr.fit(X_train, y_train)
print("Traing Score:%f"%regr.score(X_train,y_train))
print("Testing Score:%f"%regr.score(X_test,y_test))
# Mean absolute percentage error on the held-out test set.
# Labels are min-max scaled, so the cheapest house has label 0 and a relative
# error taken in scaled space divides by (near) zero. Map predictions and
# labels back to prices before computing it.
pred_price = p_scaler.inverse_transform(regr.predict(test_feature).reshape(-1, 1)).reshape(-1)
true_price = p_scaler.inverse_transform(test_label.reshape(-1, 1)).reshape(-1)
np.mean(np.abs((pred_price - true_price) / true_price))
    
    
    Out[15]:
In [16]:
    
### ensemble.AdaBoostRegressor()
# `cross_validation` and `ensemble` are already imported in the first cell.
# Hold out 25% of the training portion as a validation set.
X_train, X_test, y_train, y_test = cross_validation.train_test_split(
    train_feature, train_label, test_size=0.25, random_state=0)
# Seeded so that repeated runs give the same ensemble.
regr = ensemble.AdaBoostRegressor(random_state=0)
regr.fit(X_train, y_train)
print("Traing Score:%f"%regr.score(X_train,y_train))
print("Testing Score:%f"%regr.score(X_test,y_test))
# Mean absolute percentage error on the held-out test set.
# Labels are min-max scaled, so the cheapest house has label 0 and a relative
# error taken in scaled space divides by (near) zero. Map predictions and
# labels back to prices before computing it.
pred_price = p_scaler.inverse_transform(regr.predict(test_feature).reshape(-1, 1)).reshape(-1)
true_price = p_scaler.inverse_transform(test_label.reshape(-1, 1)).reshape(-1)
np.mean(np.abs((pred_price - true_price) / true_price))
    
    
    Out[16]:
In [86]:
    
### ensemble.GradientBoostingRegressor()
# `cross_validation` and `ensemble` are already imported in the first cell.
# Hold out 25% of the training portion as a validation set.
X_train, X_test, y_train, y_test = cross_validation.train_test_split(
    train_feature, train_label, test_size=0.25, random_state=0)
# Seeded so that repeated runs give the same ensemble.
regr = ensemble.GradientBoostingRegressor(random_state=0)
regr.fit(X_train, y_train)
print("Traing Score:%f"%regr.score(X_train,y_train))
print("Testing Score:%f"%regr.score(X_test,y_test))
# Mean absolute percentage error on the held-out test set.
# Labels are min-max scaled, so the cheapest house has label 0 and a relative
# error taken in scaled space divides by (near) zero. Map predictions and
# labels back to prices before computing it.
pred_price = p_scaler.inverse_transform(regr.predict(test_feature).reshape(-1, 1)).reshape(-1)
true_price = p_scaler.inverse_transform(test_label.reshape(-1, 1)).reshape(-1)
np.mean(np.abs((pred_price - true_price) / true_price))
    
    
    Out[86]:
In [90]:
    
### ensemble.RandomForestRegressor()
# `cross_validation` and `ensemble` are already imported in the first cell.
# Hold out 25% of the training portion as a validation set.
X_train, X_test, y_train, y_test = cross_validation.train_test_split(
    train_feature, train_label, test_size=0.25, random_state=0)
# Seeded so that repeated runs give the same forest.
regr = ensemble.RandomForestRegressor(random_state=0)
regr.fit(X_train, y_train)
print("Traing Score:%f"%regr.score(X_train,y_train))
print("Testing Score:%f"%regr.score(X_test,y_test))
# Mean absolute percentage error on the held-out test set.
# Labels are min-max scaled, so the cheapest house has label 0 and a relative
# error taken in scaled space divides by (near) zero. Map predictions and
# labels back to prices before computing it.
pred_price = p_scaler.inverse_transform(regr.predict(test_feature).reshape(-1, 1)).reshape(-1)
true_price = p_scaler.inverse_transform(test_label.reshape(-1, 1)).reshape(-1)
np.mean(np.abs((pred_price - true_price) / true_price))
    
    
    Out[90]:
In [81]:
    
# XGBRegressor
# `XGBRegressor` and `cross_validation` are already imported in the first cell.
# Plain random 75/25 split of the training portion (no `stratify` argument is
# passed, so this is not stratified sampling).
X_train, X_test, y_train, y_test = cross_validation.train_test_split(
    train_feature, train_label, test_size=0.25, random_state=0)
regr = XGBRegressor()
regr.fit(X_train, y_train)
print("Traing Score:%f"%regr.score(X_train,y_train))
print("Testing Score:%f"%regr.score(X_test,y_test))
# Mean absolute percentage error on the held-out test set.
# Labels are min-max scaled, so the cheapest house has label 0 and a relative
# error taken in scaled space divides by (near) zero. Map predictions and
# labels back to prices before computing it.
pred_price = p_scaler.inverse_transform(regr.predict(test_feature).reshape(-1, 1)).reshape(-1)
true_price = p_scaler.inverse_transform(test_label.reshape(-1, 1)).reshape(-1)
np.mean(np.abs((pred_price - true_price) / true_price))
    
    
    Out[81]:
In [91]:
    
# Map the scaled test labels and the predictions of the current `regr` back to
# prices, and collect both in one frame.
# NOTE(review): `regr` is whichever model was fitted last; the execution
# counts suggest this cell was not run in top-to-bottom order.
scaled_truth = test_label.reshape(-1, 1)
scaled_pred = regr.predict(test_feature).reshape(-1, 1)
real = p_scaler.inverse_transform(scaled_truth).ravel()
ans = p_scaler.inverse_transform(scaled_pred).ravel()
df_ans = pd.DataFrame({'real': real, 'ans': ans}, index=range(real.shape[0]))
df_ans.head()
    
    Out[91]:
In [143]:
    
# Predicted vs. real price; points on the red diagonal are perfect predictions.
plt.figure(figsize=(10,10))
plt.scatter(ans, real, alpha=0.5)
plt.xlabel('pred price')
plt.ylabel('real price')
# The reference line y = x follows the data range instead of a fixed
# 5,000,000, so it also reaches the most expensive houses.
upper = max(np.max(ans), np.max(real))
x = np.linspace(0, upper, 100)
y = x
plt.plot(x, y, c='red', label='correct answer')
plt.legend()
plt.show()
    
    
In [139]:
    
# Joint distribution of real and predicted price.
# jointplot creates its own figure, so the previous plt.figure() call only
# produced an extra empty figure and is dropped.
# NOTE(review): `size` is named `height` from seaborn 0.9 on.
grid = sns.jointplot(x='real', y='ans', data=df_ans, size=8)
# x carries the real price and y the prediction. Label through the grid so
# the text lands on the joint axes rather than on a marginal one.
grid.set_axis_labels('real price', 'pred price')
plt.show()
    
    
    
In [127]:
    
# Signed relative error of every test sample, then its mean absolute value.
rel_err = (ans - real) / real
plt.figure(figsize=(10, 5))
plt.plot(np.arange(real.shape[0]), rel_err)
plt.show()
np.abs(rel_err).mean()
    
    
    Out[127]:
In [94]:
    
# Correlation between predicted and real prices.
pred_col = df_ans['ans']
real_col = df_ans['real']
pred_col.corr(real_col)
    
    Out[94]:
In [121]:
    
### Keras MLP models
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
def show_train_history(train_history,train,validation):
    """Plot a training curve together with its validation counterpart.

    train_history: object whose ``history`` attribute maps metric names to
        per-epoch values.
    train: key of the training curve; also used as the y-axis label.
    validation: key of the validation curve.
    """
    curves = train_history.history
    for key in (train, validation):
        plt.plot(curves[key])
    plt.title('Train History')
    plt.xlabel('Epoch')
    plt.ylabel(train)
    plt.legend(['train', 'validation'], loc='best')
    plt.show()
# Fully connected regressor: three hidden layers with dropout and one linear
# output unit.
# The input width is read from the data instead of being hard-coded (was 86),
# so the cell keeps working if the feature set changes.
n_features = train_feature.shape[1]
model = Sequential()
# Hidden layers use ReLU; without an activation the stacked Dense layers
# collapse into a single linear map.
model.add(Dense(units=1000,
                input_dim=n_features,
                kernel_initializer='uniform',
                activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(units=400,
                kernel_initializer='uniform',
                activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(units=100,
                kernel_initializer='uniform',
                activation='relu'))
model.add(Dropout(0.5))
# Single linear output: the predicted (scaled) price.
model.add(Dense(units=1,
                kernel_initializer='uniform'))
# summary() prints the layer table itself; wrapping it in print() added "None".
model.summary()
# Regression task: track mean absolute error. Classification accuracy is
# meaningless for a continuous target.
model.compile(loss='mean_squared_error',
              optimizer='adam', metrics=['mae'])
# Keras does the train/validation split itself. validation_split is the share
# held out for validation: 0.2 trains on 80% (the previous 0.8 trained on
# only 20% of the data).
train_history = model.fit(x=train_feature, y=train_label,
                          validation_split=0.2, epochs=40,
                          batch_size=2000, verbose=2)  # verbose=2: one log line per epoch
# The history key of the metric differs between Keras releases
# ('mean_absolute_error' vs 'mae'), so look it up instead of hard-coding it.
metric_key = next(key for key in train_history.history
                  if key != 'loss' and not key.startswith('val_'))
show_train_history(train_history, metric_key, 'val_' + metric_key)
show_train_history(train_history,'loss','val_loss')
# scores[0] is the loss (MSE), scores[1] the MAE, both on scaled prices.
scores = model.evaluate(test_feature, test_label)
print('\n')
print('test MAE (scaled price)=',scores[1])
prediction = model.predict(test_feature)
    
    
    
    
    
In [122]:
    
# Map the scaled test labels and the Keras predictions back to prices and
# collect both in one frame.
scaled_truth = test_label.reshape(-1, 1)
scaled_pred = prediction.reshape(-1, 1)
real = p_scaler.inverse_transform(scaled_truth).ravel()
ans2 = p_scaler.inverse_transform(scaled_pred).ravel()
df_ans2 = pd.DataFrame({'real': real, 'ans': ans2}, index=range(real.shape[0]))
df_ans2.head()
    
    Out[122]:
In [131]:
    
# Keras predictions vs. real price; points on the red diagonal are perfect
# predictions.
plt.figure(figsize=(10,10))
plt.scatter(ans2, real)
plt.xlabel('pred price')
plt.ylabel('real price')
# The reference line y = x follows the data range instead of a fixed
# 5,000,000, so it also reaches the most expensive houses.
upper = max(np.max(ans2), np.max(real))
x = np.linspace(0, upper, 100)
y = x
plt.plot(x, y, c='red', label='correct answer')
# The label above is only drawn once legend() is called.
plt.legend()
plt.show()
    
    
In [125]:
    
# Signed relative error of every Keras prediction, then its mean absolute value.
rel_err2 = (ans2 - real) / real
plt.figure(figsize=(10, 5))
plt.plot(np.arange(real.shape[0]), rel_err2)
plt.show()
np.abs(rel_err2).mean()
    
    
    Out[125]:
In [120]:
    
# Correlation between the Keras predictions and the real prices.
pred_col2 = df_ans2['ans']
real_col2 = df_ans2['real']
pred_col2.corr(real_col2)
    
    Out[120]:
In [ ]: