In [1]:
# import common APIs
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import seaborn as sns
import os
from sklearn.preprocessing import MinMaxScaler, StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.decomposition import PCA
from sklearn import cross_validation, naive_bayes, tree, svm, ensemble
from sklearn.metrics import classification_report,confusion_matrix,precision_recall_curve,auc,roc_auc_score,roc_curve
from xgboost import XGBRegressor
In [2]:
# Data observation
# Read kc_house_data.csv into df_full and preview the first rows.
# The directory can be overridden through the KC_HOUSE_DATA_DIR environment
# variable so the notebook is not tied to a single machine; the original
# location stays the default, so existing setups keep working unchanged.
filepath = os.environ.get('KC_HOUSE_DATA_DIR',
                          '/Users/mac/Desktop/Kaggle_datasets/KC_housePrice/')
filename01 = 'kc_house_data.csv'
df_full = pd.read_csv(os.path.join(filepath, filename01))
df_full.head()
Out[2]:
In [3]:
# Print pandas' summary of df_full (column dtypes and non-null counts).
df_full.info()
In [8]:
# Show the column labels available in df_full.
df_full.columns
Out[8]:
In [7]:
# Distinct values that occur in the 'grade' column.
df_full['grade'].unique()
Out[7]:
In [11]:
# Columns kept for modelling (target 'price' plus the features).
cols = ['price', 'bedrooms', 'bathrooms', 'sqft_living',
        'sqft_lot', 'floors', 'waterfront', 'view', 'condition', 'grade',
        'sqft_above', 'sqft_basement', 'yr_built', 'zipcode',
        'lat', 'long', 'sqft_living15', 'sqft_lot15']
# .copy() makes df_num an independent frame: the column assignments below
# would otherwise write into a selection of df_full, which can raise
# SettingWithCopyWarning and leaves it unclear which frame was modified.
df_num = df_full[cols].copy()

# Feature columns that are min-max scaled.
minmax_cols = ['bedrooms', 'bathrooms', 'sqft_living',
               'sqft_lot', 'floors', 'waterfront', 'view', 'condition', 'grade',
               'sqft_above', 'sqft_basement', 'yr_built',  # zipcode removed (one-hot encoded below)
               'lat', 'long', 'sqft_living15', 'sqft_lot15']
# NOTE(review): every scaler here is fitted on all rows, i.e. before the
# train/test split performed later in the notebook.
for col in minmax_cols:
    scaler = MinMaxScaler()
    # The scaler works on an (n, 1) array; flatten the result so a plain
    # 1-D column is written back.
    df_num[col] = scaler.fit_transform(df_num[col].values.reshape(-1, 1)).ravel()

# The target gets a dedicated scaler that is kept around: later cells call
# p_scaler.inverse_transform to map predictions back to price units.
p_scaler = MinMaxScaler()
df_num['price'] = p_scaler.fit_transform(df_num['price'].values.reshape(-1, 1)).ravel()

# zipcode is categorical: expand it into one indicator column per value.
df_num = pd.get_dummies(df_num, columns=['zipcode'])
In [12]:
# Preview df_num after scaling and one-hot encoding of zipcode.
df_num.head()
Out[12]:
In [13]:
# Data preprocessing
# Shuffle the rows with a fixed seed, then use the first 60% for training
# and the remaining 40% as the held-out test set.
from sklearn.utils import shuffle

shuffle_df = shuffle(df_num, random_state=42)
df_label = shuffle_df['price']
df_feature = shuffle_df.drop('price', axis=1)
cut_point = round(len(df_num)*0.6)

# Force float64: when the one-hot columns are boolean (the get_dummies default
# in recent pandas) `.values` on the mixed frame is an object array, which the
# estimators downstream cannot consume reliably.
train_feature = np.array(df_feature.values[:cut_point, :], dtype=np.float64)
train_label = np.array(df_label.values[:cut_point], dtype=np.float64)
test_feature = np.array(df_feature.values[cut_point:, :], dtype=np.float64)
test_label = np.array(df_label.values[cut_point:], dtype=np.float64)
In [14]:
## tree.DecisionTreeRegressor()
from sklearn import tree
try:
    from sklearn.model_selection import train_test_split
except ImportError:  # older scikit-learn that only ships the legacy module
    from sklearn.cross_validation import train_test_split

# Hold back 25% of the training portion for validation.
X_train, X_test, y_train, y_test = train_test_split(
    train_feature, train_label, test_size=0.25, random_state=0)

# Fixed seed so the fitted tree is the same on every run.
regr = tree.DecisionTreeRegressor(random_state=0)
regr.fit(X_train, y_train)
print("Traing Score:%f"%regr.score(X_train,y_train))
print("Testing Score:%f"%regr.score(X_test,y_test))

# Mean absolute percentage error on the held-out test set.
# It is computed on prices mapped back through p_scaler: test_label is
# min-max scaled (lowest price -> 0), so dividing by it directly can yield
# inf and does not measure the error relative to the actual price.
pred_price = p_scaler.inverse_transform(regr.predict(test_feature).reshape(-1, 1)).reshape(-1)
real_price = p_scaler.inverse_transform(test_label.reshape(-1, 1)).reshape(-1)
np.mean(np.abs((pred_price - real_price) / real_price))
Out[14]:
In [15]:
### svm.LinearSVR()
from sklearn import svm
try:
    from sklearn.model_selection import train_test_split
except ImportError:  # older scikit-learn that only ships the legacy module
    from sklearn.cross_validation import train_test_split

# Hold back 25% of the training portion for validation.
X_train, X_test, y_train, y_test = train_test_split(
    train_feature, train_label, test_size=0.25, random_state=0)

# Fixed seed so the fit is the same on every run.
regr = svm.LinearSVR(random_state=0)
regr.fit(X_train, y_train)
print("Traing Score:%f"%regr.score(X_train,y_train))
print("Testing Score:%f"%regr.score(X_test,y_test))

# Mean absolute percentage error on the held-out test set.
# It is computed on prices mapped back through p_scaler: test_label is
# min-max scaled (lowest price -> 0), so dividing by it directly can yield
# inf and does not measure the error relative to the actual price.
pred_price = p_scaler.inverse_transform(regr.predict(test_feature).reshape(-1, 1)).reshape(-1)
real_price = p_scaler.inverse_transform(test_label.reshape(-1, 1)).reshape(-1)
np.mean(np.abs((pred_price - real_price) / real_price))
Out[15]:
In [16]:
### ensemble.AdaBoostRegressor()
from sklearn import ensemble
try:
    from sklearn.model_selection import train_test_split
except ImportError:  # older scikit-learn that only ships the legacy module
    from sklearn.cross_validation import train_test_split

# Hold back 25% of the training portion for validation.
X_train, X_test, y_train, y_test = train_test_split(
    train_feature, train_label, test_size=0.25, random_state=0)

# Fixed seed so the boosted ensemble is the same on every run.
regr = ensemble.AdaBoostRegressor(random_state=0)
regr.fit(X_train, y_train)
print("Traing Score:%f"%regr.score(X_train,y_train))
print("Testing Score:%f"%regr.score(X_test,y_test))

# Mean absolute percentage error on the held-out test set.
# It is computed on prices mapped back through p_scaler: test_label is
# min-max scaled (lowest price -> 0), so dividing by it directly can yield
# inf and does not measure the error relative to the actual price.
pred_price = p_scaler.inverse_transform(regr.predict(test_feature).reshape(-1, 1)).reshape(-1)
real_price = p_scaler.inverse_transform(test_label.reshape(-1, 1)).reshape(-1)
np.mean(np.abs((pred_price - real_price) / real_price))
Out[16]:
In [86]:
### ensemble.GradientBoostingRegressor()
from sklearn import ensemble
try:
    from sklearn.model_selection import train_test_split
except ImportError:  # older scikit-learn that only ships the legacy module
    from sklearn.cross_validation import train_test_split

# Hold back 25% of the training portion for validation.
X_train, X_test, y_train, y_test = train_test_split(
    train_feature, train_label, test_size=0.25, random_state=0)

# Fixed seed so the boosted ensemble is the same on every run.
regr = ensemble.GradientBoostingRegressor(random_state=0)
regr.fit(X_train, y_train)
print("Traing Score:%f"%regr.score(X_train,y_train))
print("Testing Score:%f"%regr.score(X_test,y_test))

# Mean absolute percentage error on the held-out test set.
# It is computed on prices mapped back through p_scaler: test_label is
# min-max scaled (lowest price -> 0), so dividing by it directly can yield
# inf and does not measure the error relative to the actual price.
pred_price = p_scaler.inverse_transform(regr.predict(test_feature).reshape(-1, 1)).reshape(-1)
real_price = p_scaler.inverse_transform(test_label.reshape(-1, 1)).reshape(-1)
np.mean(np.abs((pred_price - real_price) / real_price))
Out[86]:
In [90]:
### ensemble.RandomForestRegressor()
from sklearn import ensemble
try:
    from sklearn.model_selection import train_test_split
except ImportError:  # older scikit-learn that only ships the legacy module
    from sklearn.cross_validation import train_test_split

# Hold back 25% of the training portion for validation.
X_train, X_test, y_train, y_test = train_test_split(
    train_feature, train_label, test_size=0.25, random_state=0)

# Fixed seed so the forest is the same on every run.
regr = ensemble.RandomForestRegressor(random_state=0)
regr.fit(X_train, y_train)
print("Traing Score:%f"%regr.score(X_train,y_train))
print("Testing Score:%f"%regr.score(X_test,y_test))

# Mean absolute percentage error on the held-out test set.
# It is computed on prices mapped back through p_scaler: test_label is
# min-max scaled (lowest price -> 0), so dividing by it directly can yield
# inf and does not measure the error relative to the actual price.
pred_price = p_scaler.inverse_transform(regr.predict(test_feature).reshape(-1, 1)).reshape(-1)
real_price = p_scaler.inverse_transform(test_label.reshape(-1, 1)).reshape(-1)
np.mean(np.abs((pred_price - real_price) / real_price))
Out[90]:
In [81]:
# XGBRegressor
from xgboost import XGBRegressor
try:
    from sklearn.model_selection import train_test_split
except ImportError:  # older scikit-learn that only ships the legacy module
    from sklearn.cross_validation import train_test_split

# Plain random 75/25 split of the training portion (no stratification is
# requested from train_test_split).
X_train, X_test, y_train, y_test = train_test_split(
    train_feature, train_label, test_size=0.25, random_state=0)

regr = XGBRegressor()
regr.fit(X_train, y_train)
print("Traing Score:%f"%regr.score(X_train,y_train))
print("Testing Score:%f"%regr.score(X_test,y_test))

# Mean absolute percentage error on the held-out test set.
# It is computed on prices mapped back through p_scaler: test_label is
# min-max scaled (lowest price -> 0), so dividing by it directly can yield
# inf and does not measure the error relative to the actual price.
pred_price = p_scaler.inverse_transform(regr.predict(test_feature).reshape(-1, 1)).reshape(-1)
real_price = p_scaler.inverse_transform(test_label.reshape(-1, 1)).reshape(-1)
np.mean(np.abs((pred_price - real_price) / real_price))
Out[81]:
In [91]:
# Convert the scaled test labels and the predictions of `regr` back to price
# units and put them side by side.
# NOTE(review): `regr` is whichever estimator was fitted most recently in the
# kernel -- confirm it is the intended model before reading the results.
scaled_true = test_label.reshape(-1, 1)
scaled_pred = regr.predict(test_feature).reshape(-1, 1)
real = p_scaler.inverse_transform(scaled_true).reshape(-1)
ans = p_scaler.inverse_transform(scaled_pred).reshape(-1)
df_ans = pd.DataFrame({'real': real, 'ans': ans}, index=range(len(real)))
df_ans.head()
Out[91]:
In [143]:
# Predicted vs. actual price; points on the red diagonal are exact predictions.
fig, ax = plt.subplots(figsize=(10, 10))
ax.scatter(ans, real, alpha=0.5)
ax.set_xlabel('pred price')
ax.set_ylabel('real price')
# Reference line y = x over the 0 .. 5,000,000 range.
x = np.linspace(0, 5000000, 100)
y = x
ax.plot(x, y, c='red', label='correct answer')
ax.legend()
plt.show()
In [139]:
# Joint distribution of predicted ('ans') and actual ('real') prices.
# jointplot creates its own figure, so no plt.figure() call precedes it (that
# would only leave an extra, empty figure in the output).
# x/y are passed by keyword and mapped so that they agree with the axis
# labels below: predictions on x, actual prices on y.
grid = sns.jointplot(x='ans', y='real', data=df_ans)
# Resize after creation instead of via a keyword: the size keyword was renamed
# (`size` -> `height`) between seaborn releases.
grid.ax_joint.figure.set_size_inches(8, 8)
# Label the joint axes explicitly; plt.xlabel/plt.ylabel act on whichever axes
# is current, which is not guaranteed to be the joint panel.
grid.set_axis_labels('pred price', 'real price')
plt.show()
In [127]:
# Signed relative error of every test prediction, in sample order, followed
# by the mean absolute percentage error as the cell result.
rel_err = (ans - real) / real
plt.figure(figsize=(10, 5))
plt.plot(range(len(real)), rel_err)
plt.show()
np.mean(np.abs(rel_err))
Out[127]:
In [94]:
# Pearson correlation between predicted and actual prices.
df_ans['ans'].corr(other=df_ans['real'], method='pearson')
Out[94]:
In [121]:
### Keras MLP models
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
def show_train_history(train_history,train,validation):
    """Plot a training curve and its validation counterpart against the epoch.

    Args:
        train_history: object whose ``history`` attribute maps metric names
            to per-epoch value sequences (here, the return value of
            ``model.fit``).
        train: key of the training curve; also used as the y-axis label.
        validation: key of the validation curve.
    """
    # Draw the training curve first, then the validation curve, so the legend
    # entries below line up with them.
    for curve_key in (train, validation):
        plt.plot(train_history.history[curve_key])
    plt.title('Train History')
    plt.xlabel('Epoch')
    plt.ylabel(train)
    plt.legend(['train', 'validation'], loc='best')
    plt.show()
# Multi-layer perceptron for regression on the scaled price: three hidden
# layers with dropout after each, and a single linear output unit.
# The input width is read from the data instead of being hard-coded, so the
# model keeps working if the feature set changes.
n_features = train_feature.shape[1]

model = Sequential()
# Hidden layers use ReLU: Dense layers without an activation are linear, and
# a stack of linear layers can only represent a linear model.
model.add(Dense(units=1000,
                input_dim=n_features,
                kernel_initializer='uniform',
                activation='relu',
                ))
model.add(Dropout(0.5))
model.add(Dense(units=400,
                kernel_initializer='uniform',
                activation='relu',
                ))
model.add(Dropout(0.5))
model.add(Dense(units=100,
                kernel_initializer='uniform',
                activation='relu',
                ))
model.add(Dropout(0.5))
model.add(Dense(units=1,  # outputs a single number
                kernel_initializer='uniform',
                ))
# summary() prints the layer table itself (layers and parameter counts);
# wrapping it in print() only adds a stray "None".
model.summary()

# Accuracy is a classification metric and carries no meaning for a continuous
# target, so mean absolute error is tracked next to the MSE loss.
model.compile(loss='mean_squared_error',
              optimizer='adam', metrics=['mae'])

# Keras carves the validation set out of the training arrays itself.
# validation_split is the fraction held OUT for validation: 0.2 trains on 80%
# of the rows (the previous 0.8 trained on only 20% of them).
train_history = model.fit(x=train_feature, y=train_label,
                          validation_split=0.2, epochs=40,
                          batch_size=2000, verbose=2)  # verbose=2: one log line per epoch

# The history key of the metric is spelled differently across Keras versions,
# so look it up rather than hard-coding it.
metric_key = [key for key in train_history.history
              if key != 'loss' and not key.startswith('val_')][0]
show_train_history(train_history, metric_key, 'val_' + metric_key)
show_train_history(train_history,'loss','val_loss')

# scores[0] is the loss, scores[1] the metric passed to compile() (MAE).
scores = model.evaluate(test_feature, test_label)
print('\n')
print('mae=',scores[1])
prediction = model.predict(test_feature)
In [122]:
# Map the scaled test labels and the network's predictions back to price
# units and collect them in one frame for the plots that follow.
scaled_true = test_label.reshape(-1, 1)
scaled_pred = prediction.reshape(-1, 1)
real = p_scaler.inverse_transform(scaled_true).reshape(-1)
ans2 = p_scaler.inverse_transform(scaled_pred).reshape(-1)
df_ans2 = pd.DataFrame({'real': real, 'ans': ans2}, index=range(len(real)))
df_ans2.head()
Out[122]:
In [131]:
# Predicted (ans2) vs. actual price; points on the red diagonal are exact
# predictions.
plt.figure(figsize=(10,10))
# Semi-transparent markers keep dense regions readable, matching the other
# predicted-vs-real scatter plot in this notebook.
plt.scatter(ans2, real, alpha=0.5)
plt.xlabel('pred price')
plt.ylabel('real price')
# Reference line y = x over the 0 .. 5,000,000 range.
x=np.linspace(0,5000000,100)
y=x
plt.plot(x,y, c='red', label='correct answer')
# Without legend() the label given to the reference line is never displayed.
plt.legend()
plt.show()
In [125]:
# Signed relative error of every ans2 prediction, in sample order, followed
# by the mean absolute percentage error as the cell result.
rel_err2 = (ans2 - real) / real
plt.figure(figsize=(10, 5))
plt.plot(range(len(real)), rel_err2)
plt.show()
np.mean(np.abs(rel_err2))
Out[125]:
In [120]:
# Pearson correlation between the ans2 predictions and the actual prices.
df_ans2['ans'].corr(other=df_ans2['real'], method='pearson')
Out[120]:
In [ ]: