In [1]:
import numpy as np
import pandas as pd
import os
import math
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
In [15]:
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV
In [2]:
df = pd.read_csv('C:/Users/Ruoying/notebook/cp290/rental-listings-census/data/ba_block_variables.csv')
In [3]:
df.columns
Out[3]:
In [8]:
df_listing = pd.read_csv('C:/Users/Ruoying/notebook/cp290/rental-listings-census/data/sf_filtered.csv')
In [9]:
df_listing.columns
Out[9]:
In [10]:
print(df.shape)
In [11]:
print(df_listing.shape)
In [12]:
dfA = pd.merge(df_listing, df, on='block_id')
In [13]:
print(dfA.shape)
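A quick sanity check (not in the original run): the inner merge silently drops any listing whose block_id has no match in the block-level table, so it is worth counting how many listings survived.
In [ ]:
# sketch: how many listings matched a census block after the inner merge on block_id
print('listings:', len(df_listing))
print('matched to a block:', len(dfA))
print('dropped by the merge:', len(df_listing) - len(dfA))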
In [20]:
cols_to_use = ['rent','rent_sqft','sqft','bedrooms','bathrooms','longitude', 'latitude',
'bgpop','bgjobs', 'bgmedkids', 'bgmedhhs',
'bgmedinc', 'proprent', 'lowinc1500m', 'highinc1500m', 'lnjobs5000m',
'lnjobs30km', 'lnpop400m', 'lnpop800m', 'lnjobs800m', 'propwhite',
'propblack', 'propasian', 'pumahhden', 'lnbasic3000m', 'lntcpuw3000m',
'pumajobden', 'lnjobs40km', 'lnret3000m', 'lnfire3000m', 'lnserv3000m',
'prop1per', 'prop2per', 'bgmedagehd', 'puma1', 'puma2', 'puma3',
'puma4', 'northsf', 'pct1per', 'pct2per', 'pctrent',
'pctblack', 'pctwhite', 'pctasian', 'y17jan', 'y17feb', 'y17mar',
'bgpopden', 'bgjobden', 'highlowinc1500m']
x_cols = cols_to_use[2:]
y_col = 'rent'
print(len(x_cols))
In [22]:
# Outliers look like they have already been filtered.
# What about NaNs? About half of the rows are missing 'bathrooms'.
# Is it better to drop the bathrooms feature, or to keep it and drop the rows with NaNs?
df1 = dfA[cols_to_use]
print(len(df1))
df_notnull = df1.dropna(how='any')
print(len(df_notnull))
df2 = df_notnull
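To make the trade-off above concrete, a small check (added here, not in the original run) of the per-column share of missing values shows how many rows 'bathrooms' alone costs:
In [ ]:
# sketch: share of missing values per column, worst offenders first
na_share = df1.isnull().mean().sort_values(ascending=False)
print(na_share.head(10))
# rows that would be retained if 'bathrooms' were dropped before dropping NaNs
print(len(df1.drop('bathrooms', axis=1).dropna(how='any')))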
In [16]:
def RMSE(y_actual, y_predicted):
    return np.sqrt(mean_squared_error(y_actual, y_predicted))

def cross_val_gb(X, y, cv_method='kfold', k=5, **params):
    """Estimate gradient boosting regressor using cross validation.

    Args:
        X (DataFrame): features data
        y (Series): target data
        cv_method (str): how to split the data ('kfold' (default) or 'timeseries')
        k (int): number of folds (default=5)
        **params: keyword arguments for regressor

    Returns:
        float: mean error (RMSE) across all training/test sets.
    """
    if cv_method == 'kfold':
        kf = KFold(n_splits=k, shuffle=True, random_state=2012016)  # fix the random seed for reproducibility
        E = np.ones(k)  # this array will hold the per-fold errors
        i = 0
        for train, test in kf.split(X, y):
            train_data_x = X.iloc[train]
            train_data_y = y.iloc[train]
            test_data_x = X.iloc[test]
            test_data_y = y.iloc[test]
            # n_estimators is the number of trees to build
            grad_boost = GradientBoostingRegressor(loss='ls', criterion='mse', **params)
            grad_boost.fit(train_data_x, train_data_y)
            predict_y = grad_boost.predict(test_data_x)
            E[i] = RMSE(test_data_y, predict_y)
            i += 1
    return np.mean(E)
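The docstring advertises a 'timeseries' option that is not implemented above. A minimal sketch of that branch, assuming the rows are already ordered by listing date, could use sklearn's TimeSeriesSplit:
In [ ]:
# sketch of the unimplemented 'timeseries' option, assuming rows are sorted by listing date
from sklearn.model_selection import TimeSeriesSplit

def cross_val_gb_ts(X, y, k=5, **params):
    """Same idea as cross_val_gb, but with forward-chaining splits."""
    tscv = TimeSeriesSplit(n_splits=k)
    errors = []
    for train, test in tscv.split(X):
        model = GradientBoostingRegressor(loss='ls', criterion='mse', **params)
        model.fit(X.iloc[train], y.iloc[train])
        errors.append(RMSE(y.iloc[test], model.predict(X.iloc[test])))
    return np.mean(errors)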
In [23]:
df_X = df2[x_cols]
df_y = df2[y_col]
In [24]:
params = {'n_estimators': 100,
          'learning_rate': 0.1,
          'max_depth': 1,
          'min_samples_leaf': 4
          }
grad_boost = GradientBoostingRegressor(loss='ls',criterion='mse', **params)
grad_boost.fit(df_X,df_y)
cross_val_gb(df_X,df_y, **params)
Out[24]:
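For context (an added check, not in the original run), the cross-validated RMSE above can be compared with a trivial baseline that always predicts the mean rent:
In [ ]:
# sketch: RMSE of a constant mean-rent predictor, as a floor to beat
baseline_pred = np.full(len(df_y), df_y.mean())
print(RMSE(df_y, baseline_pred))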
In [26]:
# find the best model by grid-searching over the parameters
param_grid = {'learning_rate': [.05, .2, .1],
              'max_depth': [6, 8, 12],
              'min_samples_leaf': [17, 25],
              'max_features': [.3]
              }
est= GradientBoostingRegressor(n_estimators = 100)
gs_cv = GridSearchCV(est,param_grid).fit(df_X,df_y)
print(gs_cv.best_params_)
print(gs_cv.best_score_)
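Note that best_score_ here is the regressor's default score (R² on the held-out folds), so it is not directly comparable to the RMSE from cross_val_gb. One way to score the grid search on the same metric (an assumed variant, not the original run) is to wrap RMSE with make_scorer:
In [ ]:
# sketch: grid search scored by (negated) RMSE so it is comparable to cross_val_gb
from sklearn.metrics import make_scorer
rmse_scorer = make_scorer(RMSE, greater_is_better=False)
gs_cv_rmse = GridSearchCV(est, param_grid, scoring=rmse_scorer).fit(df_X, df_y)
print(gs_cv_rmse.best_params_)
print(-gs_cv_rmse.best_score_)  # flip the sign back to a positive RMSE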
In [28]:
# refit with the best parameters from the grid search, using more trees (n_estimators=500)
params = {'n_estimators': 500,
          'learning_rate': 0.1,
          'max_depth': 12,
          'min_samples_leaf': 17,
          'max_features': .3
          }
grad_boost = GradientBoostingRegressor(loss='ls',criterion='mse', **params)
grad_boost.fit(df_X,df_y)
cross_val_gb(df_X,df_y, cv_method='kfold',k=5, **params)
Out[28]:
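A deep tree (max_depth=12) with 500 estimators can fit the training data almost perfectly, so it is worth checking the gap between in-sample and held-out error. A quick holdout comparison (added here, not in the original run):
In [ ]:
# sketch: compare in-sample and held-out RMSE for the chosen parameters
from sklearn.model_selection import train_test_split
X_tr, X_te, y_tr, y_te = train_test_split(df_X, df_y, test_size=0.2, random_state=0)
gb_check = GradientBoostingRegressor(loss='ls', criterion='mse', **params)
gb_check.fit(X_tr, y_tr)
print('train RMSE:', RMSE(y_tr, gb_check.predict(X_tr)))
print('test RMSE:', RMSE(y_te, gb_check.predict(X_te)))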
In [32]:
# plot the importances
gb_o = pd.DataFrame({'features':x_cols,'importance':grad_boost.feature_importances_})
gb_o = gb_o.sort_values('importance', ascending=False)
plt.figure(1,figsize=(12, 6))
plt.xticks(range(len(gb_o)), gb_o.features,rotation=45)
plt.plot(range(len(gb_o)),gb_o.importance,"o")
plt.title('Feature importances')
plt.show()
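Alongside the plot, printing the head of the sorted importance table (added here for convenience) gives the exact values:
In [ ]:
# sketch: the ten most important features with their importance scores
print(gb_o.head(10))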
In [33]:
from sklearn.ensemble.partial_dependence import plot_partial_dependence
from sklearn.ensemble.partial_dependence import partial_dependence
In [34]:
features = [0,1,2,3,4, 13, 46,5,11]
names = df_X.columns
fig, axs = plot_partial_dependence(grad_boost, df_X, features,feature_names=names, grid_resolution=50, figsize = (10,8))
fig.suptitle('Partial dependence of rental price features')
plt.subplots_adjust(top=0.9) # tight_layout causes overlap with suptitle
plt.show()
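The features list above holds positional indices into df_X.columns; printing the index-to-name mapping (added here, not in the original run) makes the panels easier to read:
In [ ]:
# sketch: show which column names the positional indices refer to
for i in features:
    print(i, names[i])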
In [35]:
# interact variables and see their partial dependence
features = [(0,2),(1,2),(3,4),(13,2)]
names = df_X.columns
fig, axs = plot_partial_dependence(grad_boost, df_X, features,feature_names=names, grid_resolution=50, figsize = (9,6))
fig.suptitle('Partial dependence of rental price features')
plt.subplots_adjust(top=0.9) # tight_layout causes overlap with suptitle
plt.show()
Try new features
In [ ]:
cols_to_use = ['rent','rent_sqft','sqft','bedrooms','bathrooms','longitude', 'latitude',
'bgpop','bgjobs', 'bgmedkids', 'bgmedhhs',
'bgmedinc', 'proprent', 'lowinc1500m', 'highinc1500m', 'lnjobs5000m',
'lnjobs30km', 'lnpop400m', 'lnpop800m', 'lnjobs800m', 'propwhite',
'propblack', 'propasian', 'pumahhden', 'lnbasic3000m', 'lntcpuw3000m',
'pumajobden', 'lnjobs40km', 'lnret3000m', 'lnfire3000m', 'lnserv3000m',
'prop1per', 'prop2per', 'bgmedagehd', 'puma1', 'puma2', 'puma3',
'puma4', 'northsf', 'pct1per', 'pct2per', 'pctrent',
'pctblack', 'pctwhite', 'pctasian', 'y17jan', 'y17feb', 'y17mar',
'bgpopden', 'bgjobden', 'highlowinc1500m']
x_cols = cols_to_use[2:]
y_col = 'rent'
print(len(x_cols))
In [ ]:
# Outliers look like they have already been filtered.
# What about NaNs? About half of the rows are missing 'bathrooms'.
# Is it better to drop the bathrooms feature, or to keep it and drop the rows with NaNs?
df1 = dfA[cols_to_use]
print(len(df1))
df_notnull = df1.dropna(how='any')
print(len(df_notnull))
df2 = df_notnull
In [ ]:
df_X = df2[x_cols]
df_y = df2[y_col]