In [51]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import explained_variance_score, r2_score, mean_absolute_error
In [52]:
# Kaggle Melbourne Housing Snapshot dataset (path relative to the notebook's input dir).
path = '../input/melbourne-housing-snapshot/melb_data.csv'
In [63]:
melbourne_data = pd.read_csv(path)
In [54]:
# Quick numeric summary of every column to eyeball ranges and missingness.
melbourne_data.describe()
Out[54]:
In [64]:
melbourne_data.columns
Out[64]:
In [65]:
# NOTE(review): dropna(axis=0) removes every row with ANY missing value,
# which discards a large share of this dataset -- consider imputation instead.
melbourne_data = melbourne_data.dropna(axis=0) # Just drop empty values
In [66]:
# Prediction target: sale price.
y = melbourne_data.Price
In [67]:
# First candidate feature set, including the geo coordinates.
melbourne_features = ['Rooms', 'Bathroom', 'Landsize', 'Lattitude', 'Longtitude', 'BuildingArea']
X = melbourne_data[melbourne_features]
# The original cell called X.describe() mid-cell (its result was silently
# discarded) and then re-assigned y, duplicating the previous cell's
# `y = melbourne_data.Price`. Make the summary the cell's last expression so
# it actually renders, and drop the redundant target assignment.
X.describe()
In [68]:
# Final feature set: overwrite the previous selection with columns a user
# could realistically supply ('Lattitude'/'Longtitude' are dropped here).
melbourne_features = ['Rooms', 'Bathroom', 'Landsize', 'BuildingArea', 'YearBuilt']
X = melbourne_data[melbourne_features]
y = melbourne_data['Price']
# Removed the commented-out pd.get_dummies(X) experiment: every selected
# column is numeric, so one-hot encoding would be a no-op anyway.
X.describe()
Out[68]:
In [69]:
# Hold out a validation set; random_state pins the split for reproducibility.
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=1)
In [70]:
# Ensemble model: 100-tree random forest with a fixed seed.
model = RandomForestRegressor(random_state=1, n_estimators=100)
In [71]:
model.fit(train_X, train_y)
Out[71]:
In [72]:
# Single decision tree as a baseline to compare against the forest.
tree_model = DecisionTreeRegressor(random_state=1)
tree_model.fit(train_X, train_y)
Out[72]:
In [73]:
# sklearn metrics take (y_true, y_pred); the original calls reversed them.
# MAE is symmetric so its value was unaffected, but explained_variance_score
# and r2_score are NOT symmetric -- reversed arguments give wrong scores.
forest_preds = model.predict(val_X)
tree_preds = tree_model.predict(val_X)
random_forest_val_mae = mean_absolute_error(val_y, forest_preds)
tree_val_mae = mean_absolute_error(val_y, tree_preds)
print(random_forest_val_mae, tree_val_mae)
print(explained_variance_score(val_y, forest_preds), explained_variance_score(val_y, tree_preds))
print(r2_score(val_y, forest_preds), r2_score(val_y, tree_preds))
In [75]:
# Column order of the training features -- reused so hand-built inputs line up
# with what the model was fit on.
columns = X.columns
my_house = pd.DataFrame([{'Rooms': 2, 'Bathroom': 1, 'Landsize': 700, 'BuildingArea': 150, 'YearBuilt': 1990}, ], columns=columns)
In [78]:
model.predict([[2, 1, 700, 150, 1990], ])
Out[78]:
In [15]:
from sklearn.model_selection import train_test_split
# Imputer was removed from sklearn.preprocessing in 0.22; SimpleImputer is
# its replacement (default strategy: column mean, same as the old Imputer).
from sklearn.impute import SimpleImputer
path = '../input/melbourne-housing-snapshot/melb_data.csv'
data = pd.read_csv(path)
# Rows without a target are unusable for supervised training.
data.dropna(axis=0, subset=['Price'], inplace=True)
y = data.Price
# Keep every numeric column as a feature.
X = data.drop(['Price'], axis=1).select_dtypes(exclude=['object'])
# DataFrame.as_matrix() was removed in pandas 1.0 -- use .to_numpy().
train_X, test_X, train_y, test_y = train_test_split(X.to_numpy(), y.to_numpy(), test_size=0.25)
# Fill remaining NaNs with column means; fit on train only to avoid leakage.
my_imputer = SimpleImputer()
train_X = my_imputer.fit_transform(train_X)
test_X = my_imputer.transform(test_X)
X.describe()
Out[15]:
In [16]:
from xgboost import XGBRegressor
# Default-parameter XGBoost baseline.
my_model = XGBRegressor()
# verbose=False suppresses the per-iteration training log.
my_model.fit(train_X, train_y, verbose=False)
Out[16]:
In [17]:
# make predictions and score them on the held-out split
predictions = my_model.predict(test_X)
from sklearn.metrics import mean_absolute_error
# Metrics take (y_true, y_pred): MAE is symmetric, but explained variance and
# R^2 are not, so the original reversed argument order skewed those two scores.
print("Mean Absolute Error : " + str(mean_absolute_error(test_y, predictions)))
print("Explained Variance Score :" + str(explained_variance_score(test_y, predictions)))
print("R2 Score :" + str(r2_score(test_y, predictions)))
In [18]:
my_model = XGBRegressor(n_estimators=1000)
# Cap boosting at 1000 rounds but stop once the eval-set score fails to
# improve for 5 consecutive rounds. NOTE(review): test_X doubles as the
# early-stopping eval set here, so the test metrics below are optimistically
# biased -- a separate validation split would be cleaner.
my_model.fit(train_X, train_y, early_stopping_rounds=5,
eval_set=[(test_X, test_y)], verbose=False)
Out[18]:
In [19]:
# Predict once instead of three times, and pass metrics as (y_true, y_pred):
# explained variance and R^2 are asymmetric, so the original reversed order
# reported wrong values for them.
predictions = my_model.predict(test_X)
print("Mean Absolute Error : " + str(mean_absolute_error(test_y, predictions)))
print("Explained Variance Score :" + str(explained_variance_score(test_y, predictions)))
print("R2 Score :" + str(r2_score(test_y, predictions)))
In [20]:
# Same early-stopped model but with a slower learning rate.
my_model = XGBRegressor(n_estimators=1000, learning_rate=0.05)
my_model.fit(train_X, train_y, early_stopping_rounds=5,
eval_set=[(test_X, test_y)], verbose=False)
# make predictions
predictions = my_model.predict(test_X)
# Dropped the redundant re-import of mean_absolute_error (already imported at
# the top of the notebook). Metrics take (y_true, y_pred); the reversed order
# distorted the asymmetric scores (explained variance, R^2).
print("Mean Absolute Error : " + str(mean_absolute_error(test_y, predictions)))
print("Explained Variance Score :" + str(explained_variance_score(test_y, predictions)))
print("R2 Score :" + str(r2_score(test_y, predictions)))
In [21]:
from sklearn.model_selection import train_test_split
# Removed `from sklearn.preprocessing import Imputer`: that class was dropped
# from sklearn in 0.22, and nothing in this cell uses it (the imputation lines
# were already commented out -- XGBoost handles NaN features natively).
path = '../input/melbourne-housing-snapshot/melb_data.csv'
data = pd.read_csv(path)
# Rows without a target are unusable for supervised training.
data.dropna(axis=0, subset=['Price'], inplace=True)
y = data.Price
# Keep every numeric column; remember the schema for hand-built inputs later.
X = data.drop(['Price'], axis=1).select_dtypes(exclude=['object'])
columns = X.columns
# DataFrames (not matrices) this time, so feature names survive the split.
train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.20, random_state=1)
X.describe()
Out[21]:
In [22]:
# Retrain on the full numeric feature set and re-evaluate.
my_model = XGBRegressor(n_estimators=1000, learning_rate=0.05)
my_model.fit(train_X, train_y, early_stopping_rounds=5,
eval_set=[(test_X, test_y)], verbose=False)
# make predictions
predictions = my_model.predict(test_X)
# Dropped the redundant re-import of mean_absolute_error (imported at the top
# of the notebook). Metrics take (y_true, y_pred); the reversed order
# distorted the asymmetric scores (explained variance, R^2).
print("Mean Absolute Error : " + str(mean_absolute_error(test_y, predictions)))
print("Explained Variance Score :" + str(explained_variance_score(test_y, predictions)))
print("R2 Score :" + str(r2_score(test_y, predictions)))
In [23]:
# Hand-built sample row; `columns` now spans ALL numeric features, so every
# field not listed here becomes NaN (XGBoost tolerates NaN inputs).
my_house = pd.DataFrame([{'Rooms': 2, 'Bathroom': 1, 'Landsize': 700, 'BuildingArea': 150, 'YearBuilt': 1990}, ], columns=columns)
In [24]:
my_house
Out[24]:
In [25]:
# my_house = my_imputer.transform(my_house)
In [26]:
my_house
Out[26]:
In [27]:
my_model.predict(my_house)
Out[27]:
In [28]:
# Bundle model + imputer for persistence ('impu' is my_imputer from the
# earlier imputation cell).
z = {'model': my_model, 'impu': my_imputer}
In [29]:
import pickle
In [30]:
# Round-trip the bundle through pickle to check it serializes.
# NOTE(review): never pickle.loads() untrusted bytes -- it can execute code.
s = pickle.dumps(z)
In [31]:
clf2 = pickle.loads(s)
In [32]:
# The unpickled model should predict identically to the original.
clf2['model'].predict(my_house)
Out[32]:
In [75]:
from sklearn.model_selection import KFold, train_test_split, GridSearchCV
# Grid-search the number of boosting rounds around ~475 trees at fixed
# depth / learning rate, using 3-fold CV on the training split.
xgb_model = XGBRegressor()
clf = GridSearchCV(
xgb_model,
{'max_depth': [6,],
'learning_rate': [0.05,],
'n_estimators': [450, 470, 475, 480, 485]},
n_jobs=4,
cv=3,
verbose=2
)
clf.fit(train_X, train_y)
print(clf.best_score_)   # best cross-validated score (R^2 by default)
print(clf.best_params_)
predictions = clf.predict(test_X)
# Metrics take (y_true, y_pred); the asymmetric scores (explained variance,
# R^2) were previously computed with the arguments reversed.
print("Mean Absolute Error : " + str(mean_absolute_error(test_y, predictions)))
print("Explained Variance Score :" + str(explained_variance_score(test_y, predictions)))
print("R2 Score :" + str(r2_score(test_y, predictions)))
In [68]:
# GridSearchCV delegates predict() to the refit best_estimator_, so the two
# prediction calls below should return the same value.
print(type(clf))
print(type(clf.best_estimator_))
print(clf.best_estimator_.predict(my_house))
clf.predict(my_house)
Out[68]:
In [120]:
import numpy as np
# Build a full-width input row: every training column defaults to NaN and the
# user-supplied fields overwrite their entries (XGBoost accepts NaN features).
# np.NaN -> np.nan: the upper-case alias was removed in NumPy 2.0.
# (Removed the commented-out clf inspection lines duplicated from the
# previous cell.)
print(columns)
print(list(columns))
features = {col: np.nan for col in columns}
my_house = {'Rooms': 2, 'Bathroom': 1, 'Landsize': 700, 'BuildingArea': 150, 'YearBuilt': 1990}
features.update(my_house)
print(features)
pd.DataFrame([features], columns=columns)
Out[120]:
In [105]:
# Persist only the tuned booster; save_model writes XGBoost's native format,
# which is more portable across library versions than pickling the wrapper.
clf.best_estimator_.save_model('./test_model')
In [114]:
some_model = XGBRegressor()
some_model.load_model('./test_model')
# NOTE(review): validate_features=True asks XGBoost to check column names
# against those seen at fit time -- confirm this kwarg is accepted by the
# installed xgboost version's sklearn predict() signature.
some_model.predict(pd.DataFrame([features], columns=columns), validate_features=True)
Out[114]:
In [35]:
# Sanity-check prediction for a larger hypothetical property.
my_model.predict(pd.DataFrame([{'Rooms': 6, 'Bathroom': 2, 'Landsize': 2500, 'BuildingArea': 800, 'YearBuilt': 1980}, ], columns=columns))
Out[35]:
In [70]:
# What happens if every column except the user-facing five is NaN in test_X?
# (NOTE: test_X.head() was a discarded mid-cell expression in the original;
# kept for parity but it renders nothing here.)
test_X.head()
reduced_test_X = test_X[['Rooms', 'Bathroom', 'Landsize', 'BuildingArea', 'YearBuilt']]
# Reindex back to the full training schema; all other columns become NaN.
reduced_test_X = pd.DataFrame(reduced_test_X, columns=columns)
predictions = clf.predict(reduced_test_X)
# Metrics take (y_true, y_pred); reversed order distorts the asymmetric scores.
print("Mean Absolute Error : " + str(mean_absolute_error(test_y, predictions)))
print("Explained Variance Score :" + str(explained_variance_score(test_y, predictions)))
print("R2 Score :" + str(r2_score(test_y, predictions)))
In [50]:
# OK, let's train both models with only the five user-facing features and
# re-evaluate. (Fixed typo "wiht" in the original comment.)
xgb_model = XGBRegressor()
clf = GridSearchCV(xgb_model,
{'max_depth': [2,4,6],
'n_estimators': [50,100,200]}, verbose=1)
clf.fit(pd.DataFrame(train_X[['Rooms', 'Bathroom', 'Landsize', 'BuildingArea', 'YearBuilt']], columns=columns), train_y)
print(clf.best_score_)
print(clf.best_params_)
# The original predicted on test_X and then immediately overwrote the result
# with the reduced-feature predictions; the dead first call is removed.
predictions = clf.predict(reduced_test_X)
# Metrics take (y_true, y_pred); the asymmetric ones (explained variance,
# R^2) were previously computed with the arguments reversed.
print("Mean Absolute Error : " + str(mean_absolute_error(test_y, predictions)))
print("Explained Variance Score :" + str(explained_variance_score(test_y, predictions)))
print("R2 Score :" + str(r2_score(test_y, predictions)))
# Early-stopped XGBoost trained on the full features but evaluated against
# the NaN-padded reduced test set.
my_model = XGBRegressor(n_estimators=1000, learning_rate=0.05)
my_model.fit(train_X, train_y, early_stopping_rounds=5,
eval_set=[(reduced_test_X, test_y)], verbose=False)
# make predictions
predictions = my_model.predict(reduced_test_X)
print("Mean Absolute Error : " + str(mean_absolute_error(test_y, predictions)))
print("Explained Variance Score :" + str(explained_variance_score(test_y, predictions)))
print("R2 Score :" + str(r2_score(test_y, predictions)))