In [34]:
import pandas as pd
# import paramiko
import os
import numpy as np
import math
import json
import matplotlib.pyplot as plt
import seaborn as sns
import mpl_toolkits.basemap.pyproj as pyproj
%matplotlib inline
This notebook explores merged craigslist listings/census data and fits some initial models.
In [2]:
# # TODO: add putty connection too.
# #read SSH connection parameters
# with open('ssh_settings.json') as settings_file:
# settings = json.load(settings_file)
# hostname = settings['hostname']
# username = settings['username']
# password = settings['password']
# local_key_dir = settings['local_key_dir']
# census_dir = 'synthetic_population/'
# """Remote directory with census data"""
# results_dir = 'craigslist_census/'
# """Remote directory for results"""
# # establish SSH connection
# ssh = paramiko.SSHClient()
# ssh.load_host_keys(local_key_dir)
# ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy())
# ssh.connect(hostname,username=username, password=password)
# sftp = ssh.open_sftp()
In [4]:
# def read_listings_file(fname):
# """Read csv file via SFTP and return as dataframe."""
# with sftp.open(os.path.join(listings_dir,fname)) as f:
# df = pd.read_csv(f, delimiter=',', dtype={'fips_block':str,'state':str,'mpo_id':str}, parse_dates=['date'])
# return df
def log_var(x):
"""Return log of x, but NaN if zero."""
if x==0:
return np.nan
else:
return np.log(x)
def create_census_vars(df):
"""Make meaningful variables and return the dataframe."""
df['pct_white'] = df['race_of_head_1']/df['hhs_tot']
df['pct_black'] = df['race_of_head_2']/df['hhs_tot']
df['pct_amer_native'] = df['race_of_head_3']/df['hhs_tot']
df['pct_alaska_native'] = df['race_of_head_4']/df['hhs_tot']
df['pct_any_native'] = df['race_of_head_5']/df['hhs_tot']
df['pct_asian'] = df['race_of_head_6']/df['hhs_tot']
df['pct_pacific'] = df['race_of_head_7']/df['hhs_tot']
df['pct_other_race'] = df['race_of_head_8']/df['hhs_tot']
df['pct_mixed_race'] = df['race_of_head_9']/df['hhs_tot']
df['pct_mover'] = df['recent_mover_1']/df['hhs_tot']
df['pct_owner'] = df['tenure_1']/df['hhs_tot']
df['avg_hh_size'] = df['persons_tot']/df['hhs_tot']
df['cars_per_hh'] = df['cars_tot']/df['hhs_tot']
df['ln_rent'] = df['rent'].apply(log_var)
df['ln_income'] = df.income_med.apply(log_var)
return df
def filter_outliers(df, rent_range=(100,10000),sqft_range=(10,5000)):
"""Drop outliers from listings dataframe. For now, only need to filter out rent and sq ft.
Args:
df: Dataframe with listings. Cols names include ['rent','sqft']
rent_range (tuple): min and max rent
sqft_range (tuple): min and max sqft
Returns:
DataFrame: listings data without outliers.
"""
n0=len(df)
df=df[(df.rent>=rent_range[0])&(df.rent<rent_range[1])]
n1=len(df)
print('Dropped {} outside rent range ${}-${}'.format(n0-n1,rent_range[0],rent_range[1]))
df=df[(df.sqft>=sqft_range[0])&(df.sqft<sqft_range[1])]
n2=len(df)
print('Dropped {} outside sqft range {}-{} sqft. {} rows remaining'.format(n1-n2,sqft_range[0],sqft_range[1],len(df)))
return(df)
In [5]:
# get list of files and load.
# for remotely stored data by state (just do one state for now)
state='CA'
infile='cl_census_{}.csv'.format(state)
#data = read_listings_file(infile) # uncomment to get remote data.
# for local data:
data_dir = '../data/'
data_file = 'sfbay_listings_03172017.csv'
data = pd.read_csv(os.path.join(data_dir, data_file), parse_dates=[1], dtype={'listing_id':str, 'rent':float, 'bedrooms':float, 'bathrooms':float, 'sqft':float,
'rent_sqft':float, 'fips_block':str, 'state':str, 'region':str, 'mpo_id':str, 'lng':float, 'lat':float,
'cars_tot':float, 'children_tot':float, 'persons_tot':float, 'workers_tot':float,
'age_of_head_med':float, 'income_med':float, 'hhs_tot':float, 'race_of_head_1':float,
'race_of_head_2':float, 'race_of_head_3':float, 'race_of_head_4':float, 'race_of_head_5':float,
'race_of_head_6':float, 'race_of_head_7':float, 'race_of_head_8':float, 'race_of_head_9':float,
'recent_mover_0':float, 'recent_mover_1':float, 'tenure_1':float, 'tenure_2':float})
print(len(data))
data.head()
Out[5]:
In [6]:
# for census vars, NA really means 0...
census_cols = ['cars_tot', 'children_tot','persons_tot', 'workers_tot', 'age_of_head_med', 'income_med','hhs_tot', 'race_of_head_1', 'race_of_head_2', 'race_of_head_3','race_of_head_4', 'race_of_head_5', 'race_of_head_6', 'race_of_head_7','race_of_head_8', 'race_of_head_9', 'recent_mover_0', 'recent_mover_1','tenure_1', 'tenure_2']
for col in census_cols:
data[col] = data[col].fillna(0)
Variable codes
Race codes (from PUMS):
1. White alone
2. Black or African American alone
3. American Indian alone
4. Alaska Native alone
5. American Indian and Alaska Native tribes specified; or American Indian or Alaska Native, not specified and no other races
6. Asian alone
7. Native Hawaiian and Other Pacific Islander alone
8. Some other race alone
9. Two or more major race groups
tenure_1 = owner (based on my guess; it didn't match the PUMS codes)
recent_mover_1 = moved in the past year (based on my guess)
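These code-to-label mappings can live in code too. A minimal sketch (the tenure and mover labels are my guesses, as noted above):
In [ ]:
# Map PUMS race-of-head codes to labels; tenure/mover labels are guesses (see above).
RACE_LABELS = {1: 'white', 2: 'black', 3: 'amer_native', 4: 'alaska_native',
               5: 'any_native', 6: 'asian', 7: 'pacific', 8: 'other_race', 9: 'mixed_race'}
TENURE_LABELS = {1: 'owner', 2: 'renter'}              # guess: 1 = owner, so 2 = renter
MOVER_LABELS = {0: 'not_mover', 1: 'moved_past_year'}  # guess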
In [7]:
# create useful variables
data = create_census_vars(data)
# define some features to include in the model.
features_to_examine = ['rent','ln_rent', 'bedrooms','bathrooms','sqft','pct_white', 'pct_black','pct_asian','pct_mover','pct_owner','income_med','age_of_head_med','avg_hh_size','cars_per_hh']
data[features_to_examine].describe()
Out[7]:
In [8]:
# I've already identified these ranges as good at excluding outliers
rent_range=(100,10000)
sqft_range=(10,5000)
data = filter_outliers(data, rent_range=rent_range, sqft_range=sqft_range)
In [9]:
# Use this to explore outliers yourself.
g=sns.distplot(data['rent'], kde=False)
g.set_xlim(0,10000)
g=sns.distplot(data['sqft'], kde=False)
g.set_xlim(0,10000)
Out[9]:
In [10]:
# examine NA's
print('Total rows:',len(data))
print('Rows with any NA:',len(data[pd.isnull(data).any(axis=1)]))
print('Rows with bathroom NA:',len(data[pd.isnull(data.bathrooms)]))
print('% rows missing bathroom col:',len(data[pd.isnull(data.bathrooms)])/len(data))
Uh oh, 74% of rows are missing the bathrooms feature, so we might have to omit it. Only 0.02% of rows have missing values in other columns, so dropping those should be fine.
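To see exactly which columns contribute the remaining missing values, a quick sketch:
In [ ]:
# share of missing values per column, highest first
data.isnull().mean().sort_values(ascending=False).head(10)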
In [11]:
#for d in range(1,31):
# print(d,'% rows missing bathroom col:',len(data[pd.isnull(data.bathrooms)&((data.date.dt.month==12)&(data.date.dt.day==d))])/len(data[(data.date.dt.month==12)&(data.date.dt.day==d)]))
The bathrooms column was added on Dec 21; after that date, listings without a bathrooms value are thrown out. (The commented-out loop above is how to find the date the column was added.) So if we need the bathrooms feature, we can use listings from Dec 22 onward.
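A more direct way to pin down the cutoff (a sketch, assuming date was parsed as a datetime column above) is the daily share of listings missing bathrooms, which should drop to near zero after Dec 21:
In [ ]:
# daily share of listings with a missing bathrooms value
missing_by_day = data['bathrooms'].isnull().groupby(data['date'].dt.date).mean()
missing_by_day.tail(15)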
In [12]:
# uncomment to only use data after Dec 21.
#data=data[(data.date.dt.month>=12)&(data.date.dt.day>=22)]
#data.shape
In [13]:
# Uncomment to drop NA's
#data = data.dropna()
#print('Dropped {} rows with NAs'.format(n0-len(data)))
In [14]:
p=sns.distplot(data.rent, kde=False)
p.set_title('rent')
Out[14]:
In [15]:
p=sns.distplot(data.ln_rent, kde=False)
p.set_title('ln rent')
Out[15]:
In [16]:
plot_rows = math.ceil(len(features_to_examine)/2)
f, axes = plt.subplots(plot_rows,2, figsize=(8,15))
sns.despine(left=True)
for i,col in enumerate(features_to_examine):
row_position = math.floor(i/2)
col_position = i%2
data_notnull = data[pd.notnull(data[col])] # exclude NA values from plot
sns.distplot(data_notnull[col], ax=axes[row_position, col_position],kde=False)
axes[row_position, col_position].set_title('{}'.format(col))
plt.tight_layout()
plt.show()
In [17]:
data_notnull = data[pd.notnull(data['ln_income'])]
p=sns.distplot(data_notnull['ln_income'],kde=False)
p.set_title('ln med income')
# ln of median income is no more normal than median income; use median income instead.
Out[17]:
In [18]:
# correlation heatmap
corrmat=data[features_to_examine].corr()
corrmat.head()
f, ax = plt.subplots(figsize=(12, 9))
sns.heatmap(corrmat, vmax=.8, square=True)
f.tight_layout()
The correlations appear as expected, except for cars_per_hh. Maybe that's because cars_per_hh reflects household size more than income. Might want to try cars per adult instead.
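To test that hunch, a cars-per-adult variable could be built like this (a sketch; it assumes adults = persons_tot - children_tot, which may not hold exactly in the synthetic population):
In [ ]:
# cars per adult, guarding against zero-adult blocks
adults = data['persons_tot'] - data['children_tot']
data['cars_per_adult'] = (data['cars_tot'] / adults).where(adults > 0)
data[['cars_per_adult', 'cars_per_hh', 'income_med']].corr()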
In [19]:
print(data.columns)
#'pct_amer_native','pct_alaska_native',
x_cols = ['bedrooms','bathrooms', 'sqft','age_of_head_med', 'income_med','pct_white', 'pct_black', 'pct_any_native', 'pct_asian', 'pct_pacific',
'pct_other_race', 'pct_mixed_race', 'pct_mover', 'pct_owner', 'avg_hh_size', 'cars_per_hh']
y_col = 'ln_rent'
print(len(data))
# exclude missing values
data_notnull= data[(pd.notnull(data[x_cols])).all(axis=1)]
data_notnull= data_notnull[(pd.notnull(data_notnull[y_col]))]
print('using {} rows of {} total'.format(len(data_notnull),len(data)))
In [30]:
data_notnull.head()
Out[30]:
In [33]:
data_notnull.to_csv(os.path.join(data_dir, 'sfbay_listings_03172017_notnull.csv'), index=False)
In [20]:
from sklearn import linear_model
from sklearn.model_selection import train_test_split
In [21]:
# create training and testing datasets.
# this creates a test set that is 30% of total obs.
X_train, X_test, y_train, y_test = train_test_split(data_notnull[x_cols], data_notnull[y_col], test_size=0.3, random_state=201)
In [22]:
regr = linear_model.LinearRegression()
regr.fit(X_train, y_train)
Out[22]:
In [23]:
# Intercept
print('Intercept:', regr.intercept_)
# The coefficients
print('Coefficients:')
pd.Series(regr.coef_, index=x_cols)
Out[23]:
In [24]:
# See mean square error, using test data
print("Mean squared error: %.2f" % np.mean((regr.predict(X_test) - y_test) ** 2))
print("RMSE:", np.sqrt(np.mean((regr.predict(X_test) - y_test) ** 2)))
# Explained variance score: 1 is perfect prediction
print('Variance score: %.2f' % regr.score(X_test, y_test))
In [25]:
# Plot predicted values vs. observed
plt.scatter(regr.predict(X_train),y_train, color='blue',s=1, alpha=.5)
plt.show()
In [26]:
# plot residuals vs predicted values
plt.scatter(regr.predict(X_train), regr.predict(X_train)- y_train, color='blue',s=1, alpha=.5)
plt.scatter(regr.predict(X_test), regr.predict(X_test)- y_test, color='green',s=1, alpha=.5)
plt.show()
The residuals look pretty normally distributed.
I wonder if inclusion of all these race variables is leading to overfitting. If so, we'd have small error on training set and large error on test set.
In [26]:
print("Training set. Mean squared error: %.5f" % np.mean((regr.predict(X_train) - y_train) ** 2), '| Variance score: %.5f' % regr.score(X_train, y_train))
print("Test set. Mean squared error: %.5f" % np.mean((regr.predict(X_test) - y_test) ** 2), '| Variance score: %.5f' % regr.score(X_test, y_test))
Since the training error and test error are about the same, and since we're using few features, overfitting probably isn't a problem. If it were a problem, we would want to try a regression with regularization. Let's try it just for the sake of demonstration.
In [27]:
from sklearn.linear_model import Ridge
In [28]:
# try a range of different regularization terms.
for a in [10,1,0.1,.01,.001,.00001]:
ridgereg = Ridge(alpha=a)
ridgereg.fit(X_train, y_train)
print('\n alpha:',a)
print("Mean squared error: %.5f" % np.mean((ridgereg.predict(X_test) - y_test) ** 2),'| Variance score: %.5f' % ridgereg.score(X_test, y_test))
In [29]:
# Intercept
print('Intercept:', ridgereg.intercept_)
# The coefficients
print('Coefficients:')
pd.Series(ridgereg.coef_, index=x_cols)
Out[29]:
As expected, Ridge regression doesn't help much. The best way to improve the model at this point is probably to add more features.
In [30]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
In [ ]:
def RMSE(y_actual, y_predicted):
return np.sqrt(mean_squared_error(y_actual, y_predicted))
def cross_val_rf(X, y, max_f='auto', n_trees=50, cv_method='kfold', k=5):
"""Estimate a random forest model using cross-validation and return the average error across the folds.
Args:
X (DataFrame): features data
y (Series): target data
        max_f (str or int): how to select the max features to consider for the best split.
            If 'auto', then max_features=n_features.
            If 'sqrt', then max_features=sqrt(n_features).
            If 'log2', then max_features=log2(n_features).
            If int, then consider max_features features at each split.
        n_trees (int): number of trees to build (default=50)
cv_method (str): how to split the data ('kfold' (default) or 'timeseries')
k (int): number of folds (default=5)
Returns:
float: mean error (RMSE) across all training/test sets.
"""
    if cv_method == 'kfold':
        kf = KFold(n_splits=k, shuffle=True, random_state=2012016) # use random seed for reproducibility.
    else:
        raise NotImplementedError("cv_method '{}' is not implemented yet".format(cv_method))
E = np.ones(k) # this array will hold the errors.
i=0
for train, test in kf.split(X, y):
train_data_x = X.iloc[train]
train_data_y = y.iloc[train]
test_data_x = X.iloc[test]
test_data_y = y.iloc[test]
# n_estimators is number of trees to build.
# max_features = 'auto' means the max_features = n_features. This is a parameter we should tune.
random_forest = RandomForestRegressor(n_estimators=n_trees, max_features=max_f, criterion='mse', max_depth=None)
random_forest.fit(train_data_x,train_data_y)
predict_y=random_forest.predict(test_data_x)
E[i] = RMSE(test_data_y, predict_y)
i+=1
return np.mean(E)
def optimize_rf(df_X, df_y, max_n_trees=100, n_step = 20, cv_method='kfold', k=5):
"""Optimize hyperparameters for a random forest regressor.
Args:
df_X (DataFrame): features data
df_y (Series): target data
max_n_trees (int): max number of trees to generate
n_step (int): intervals to use for max_n_trees
cv_method (str): how to split the data ('kfold' (default) or 'timeseries')
k (int): number of folds (default=5)
"""
max_features_methods = ['auto','sqrt','log2'] # methods of defining max_features to try.
# create a place to store the results, for easy plotting later.
    results = pd.DataFrame(columns=max_features_methods, index=list(range(10, max_n_trees + n_step, n_step)))
for m in max_features_methods:
print('max_features:',m)
for n in results.index:
error = cross_val_rf(df_X, df_y,max_f=m, n_trees=n)
print('n_trees:',n,' error:',error)
            results.loc[n, m] = error
return results
In [34]:
# data to use - exclude nulls
df_X = data_notnull[x_cols]
df_y = data_notnull[y_col]
print(df_X.shape, df_y.shape)
#df_all = pd.concat([data_notnull[x_cols],data_notnull[y_col]], axis=1)
#df_all.shape
In [ ]:
# basic model to make sure it works
random_forest = RandomForestRegressor(n_estimators=10, criterion='mse', max_depth=None)
random_forest.fit(df_X,df_y)
y_predict = random_forest.predict(df_X)
RMSE(df_y,y_predict)
We can use k-fold validation if we believe the samples are independently and identically distributed. That's probably fine right now because we have only 1.5 months of data, but later we may have some time-dependent processes in these time series data. If we do use k-fold, I think we should shuffle the samples, because they do not come in a random sequence.
In [ ]:
# without parameter tuning
cross_val_rf(df_X,df_y)
In [ ]:
# tune the parameters
rf_results = optimize_rf(df_X,df_y, max_n_trees = 100, n_step = 20) # this is sufficient; very little improvement after n_trees=100.
#rf_results2 = optimize_rf(df_X,df_y, max_n_trees = 500, n_step=100)
rf_results
In [ ]:
ax = rf_results.plot()
ax.set_xlabel('number of trees')
ax.set_ylabel('RMSE')
#rf_results2.plot()
Using m=sqrt(n_features) and log2(n_features) gives similar performance, and a slight improvement over m = n_features. After about 100 trees the error levels off. One of the nice things about random forest is that using additional trees doesn't lead to overfitting, so we could use more, but it's not necessary. Now we can fit the model using n_trees = 100 and m = sqrt.
In [ ]:
random_forest = RandomForestRegressor(n_estimators=100, max_features='sqrt', criterion='mse', max_depth=None)
random_forest.fit(df_X,df_y)
predict_y=random_forest.predict(df_X)
The 'importance' score provides an ordered, qualitative ranking of the features. It is calculated from the total reduction in MSE each feature contributes across the splits where it is used.
In [ ]:
# plot the importances
rf_o = pd.DataFrame({'features':x_cols,'importance':random_forest.feature_importances_})
rf_o= rf_o.sort_values(by='importance',ascending=False)
plt.figure(1,figsize=(12, 6))
plt.xticks(range(len(rf_o)), rf_o.features,rotation=45)
plt.plot(range(len(rf_o)),rf_o.importance,"o")
plt.title('Feature importances')
plt.show()
It's not surprising that sqft is the most important predictor, although it is strange that cars_per_hh is the second most important. I would have expected income to be higher in the list.
If we don't think the samples are i.i.d., it's better to use time series CV.
In [ ]:
from sklearn.model_selection import TimeSeriesSplit
tscv = TimeSeriesSplit(n_splits=5)
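A sketch of how tscv would slot into the same evaluation loop (it assumes df_X and df_y are sorted by listing date, since TimeSeriesSplit always trains on earlier rows and tests on later ones):
In [ ]:
# time-series CV: each fold trains on earlier listings and tests on later ones
errors = []
for train_idx, test_idx in tscv.split(df_X):
    rf = RandomForestRegressor(n_estimators=100, max_features='sqrt')
    rf.fit(df_X.iloc[train_idx], df_y.iloc[train_idx])
    errors.append(RMSE(df_y.iloc[test_idx], rf.predict(df_X.iloc[test_idx])))
np.mean(errors)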
In [32]:
from sklearn.ensemble import GradientBoostingRegressor
In [83]:
def cross_val_gb(X, y, cv_method='kfold', k=5, **params):
"""Estimate gradient boosting regressor using cross validation.
Args:
X (DataFrame): features data
y (Series): target data
cv_method (str): how to split the data ('kfold' (default) or 'timeseries')
k (int): number of folds (default=5)
**params: keyword arguments for regressor
Returns:
float: mean error (RMSE) across all training/test sets.
"""
    if cv_method == 'kfold':
        kf = KFold(n_splits=k, shuffle=True, random_state=2012016) # use random seed for reproducibility.
    else:
        raise NotImplementedError("cv_method '{}' is not implemented yet".format(cv_method))
E = np.ones(k) # this array will hold the errors.
i=0
for train, test in kf.split(X, y):
train_data_x = X.iloc[train]
train_data_y = y.iloc[train]
test_data_x = X.iloc[test]
test_data_y = y.iloc[test]
# n_estimators is number of trees to build.
grad_boost = GradientBoostingRegressor(loss='ls',criterion='mse', **params)
grad_boost.fit(train_data_x,train_data_y)
predict_y=grad_boost.predict(test_data_x)
E[i] = RMSE(test_data_y, predict_y)
i+=1
return np.mean(E)
In [37]:
params = {'n_estimators':100,
'learning_rate':0.1,
'max_depth':1,
'min_samples_leaf':4
}
grad_boost = GradientBoostingRegressor(loss='ls',criterion='mse', **params)
grad_boost.fit(df_X,df_y)
RMSE(y_test, grad_boost.predict(X_test))
In [40]:
n_trees = 100
l_rate = 0.1
max_d = 1
cross_val_gb(df_X, df_y, n_estimators=n_trees, learning_rate=l_rate, max_depth=max_d)
Out[40]:
In [70]:
from sklearn.model_selection import GridSearchCV
param_grid = {'learning_rate':[.1, .05, .02, .01],
'max_depth':[2,4,6],
'min_samples_leaf': [3,5,9,17],
'max_features': [1, .3, .1]
}
est= GradientBoostingRegressor(n_estimators = 1000)
gs_cv = GridSearchCV(est,param_grid).fit(df_X,df_y)
In [81]:
print(gs_cv.best_params_)
print(gs_cv.best_score_)
In [85]:
# best parameters
params = {'n_estimators':1000,
'learning_rate':0.05,
'max_depth':6,
'min_samples_leaf':3
}
grad_boost = GradientBoostingRegressor(loss='ls',criterion='mse', **params)
grad_boost.fit(df_X,df_y)
RMSE(y_test, grad_boost.predict(X_test))
Out[85]:
Wow, that's a big improvement on the random forest model!
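One caveat: grad_boost was fit on df_X, which includes the rows in X_test, so the RMSE above is optimistic. A fairer check (a sketch) refits on the training split only:
In [ ]:
# refit on the training split only, then score on the held-out test set
grad_boost_holdout = GradientBoostingRegressor(loss='ls', criterion='mse', **params)
grad_boost_holdout.fit(X_train, y_train)
RMSE(y_test, grad_boost_holdout.predict(X_test))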
In [86]:
# plot the importances
gb_o = pd.DataFrame({'features':x_cols,'importance':grad_boost.feature_importances_})
gb_o= gb_o.sort_values(by='importance',ascending=False)
plt.figure(1,figsize=(12, 6))
plt.xticks(range(len(gb_o)), gb_o.features,rotation=45)
plt.plot(range(len(gb_o)),gb_o.importance,"o")
plt.title('Feature importances')
plt.show()
Let's use partial_dependence to look at feature interactions, focusing on the four most important features.
In [87]:
from sklearn.ensemble.partial_dependence import plot_partial_dependence
from sklearn.ensemble.partial_dependence import partial_dependence
In [97]:
df_X.columns
Out[97]:
In [106]:
features = [2, 15, 4, 12,(4,2), (4,15)]
names = df_X.columns
fig, axs = plot_partial_dependence(grad_boost, df_X, features,feature_names=names, grid_resolution=50, figsize = (9,6))
fig.suptitle('Partial dependence of rental price features')
plt.subplots_adjust(top=0.9) # tight_layout causes overlap with suptitle
plt.show()
In the last two plots, darker values indicate a stronger interaction between the two variables plotted. So income and sqft have a stronger dependence at higher values of both. The final plot suggests that for low levels of car ownership, there is a stronger dependence on median income.
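To read off exact values rather than eyeballing the contours, the partial_dependence function imported above returns the raw grid. A sketch using the older sklearn.ensemble.partial_dependence API that this notebook imports:
In [ ]:
# raw partial dependence of ln_rent over the income_med x sqft grid (feature indices 4 and 2)
pdp, axes = partial_dependence(grad_boost, target_variables=(4, 2), X=df_X, grid_resolution=20)
print(pdp.shape)    # (1, n_grid_points): partial dependence values
print(axes[0][:5])  # first few grid values for income_med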