In [34]:
import pandas as pd
# import paramiko
import os
import numpy as np
import math
import json
import matplotlib.pyplot as plt
import seaborn as sns
# import mpl_toolkits.basemap.pyproj as pyproj  # basemap not installed; pyproj isn't used below
%matplotlib inline


This notebook explores merged Craigslist listings/census data and fits some initial models.

Remote connection parameters

If data is stored remotely


In [2]:
# # TODO: add putty connection too. 

# #read SSH connection parameters
# with open('ssh_settings.json') as settings_file:    
#     settings = json.load(settings_file)

# hostname = settings['hostname']
# username = settings['username']
# password = settings['password']
# local_key_dir = settings['local_key_dir']

# census_dir = 'synthetic_population/'
# """Remote directory with census data"""

# results_dir = 'craigslist_census/'
# """Remote directory for results"""

# # establish SSH connection
# ssh = paramiko.SSHClient() 
# ssh.load_host_keys(local_key_dir)
# ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy())
# ssh.connect(hostname,username=username, password=password)
# sftp = ssh.open_sftp()

Data Preparation


In [4]:
# def read_listings_file(fname):
#     """Read csv file via SFTP and return as dataframe."""
#     with sftp.open(os.path.join(listings_dir,fname)) as f:
#         df = pd.read_csv(f, delimiter=',', dtype={'fips_block':str,'state':str,'mpo_id':str}, parse_dates=['date'])
#     return df

def log_var(x):
    """Return log of x, but NaN if zero."""
    if x==0:
        return np.nan
    else:
        return np.log(x)
    
def create_census_vars(df):
    """Make meaningful variables and return the dataframe."""
    df['pct_white'] = df['race_of_head_1']/df['hhs_tot']
    df['pct_black'] = df['race_of_head_2']/df['hhs_tot']
    df['pct_amer_native'] = df['race_of_head_3']/df['hhs_tot']
    df['pct_alaska_native'] = df['race_of_head_4']/df['hhs_tot']
    df['pct_any_native'] = df['race_of_head_5']/df['hhs_tot']
    df['pct_asian'] = df['race_of_head_6']/df['hhs_tot']
    df['pct_pacific'] = df['race_of_head_7']/df['hhs_tot']
    df['pct_other_race'] = df['race_of_head_8']/df['hhs_tot']
    df['pct_mixed_race'] = df['race_of_head_9']/df['hhs_tot']
    df['pct_mover'] = df['recent_mover_1']/df['hhs_tot']
    df['pct_owner'] = df['tenure_1']/df['hhs_tot']
    df['avg_hh_size'] = df['persons_tot']/df['hhs_tot']
    df['cars_per_hh'] = df['cars_tot']/df['hhs_tot']
    
    df['ln_rent'] = df['rent'].apply(log_var)
    df['ln_income'] = df.income_med.apply(log_var)
    return df

def filter_outliers(df, rent_range=(100,10000),sqft_range=(10,5000)):
    """Drop outliers from listings dataframe. For now, only need to filter out rent and sq ft. 
    Args: 
        df: Dataframe with listings. Cols names include ['rent','sqft']
        rent_range (tuple): min and max rent
        sqft_range (tuple): min and max sqft
    Returns: 
        DataFrame: listings data without outliers. 
    """
    n0=len(df)
    df=df[(df.rent>=rent_range[0])&(df.rent<rent_range[1])]
    n1=len(df)
    print('Dropped {} outside rent range ${}-${}'.format(n0-n1,rent_range[0],rent_range[1]))
    df=df[(df.sqft>=sqft_range[0])&(df.sqft<sqft_range[1])]
    n2=len(df)
    print('Dropped {} outside sqft range {}-{} sqft. {} rows remaining'.format(n1-n2,sqft_range[0],sqft_range[1],len(df)))
    return(df)

In [5]:
# get list of files and load. 
# for remotely stored data by state (just do one state for now)
state='CA'
infile='cl_census_{}.csv'.format(state)
#data = read_listings_file(infile)  # uncomment to get remote data. 

# for local data: 
data_dir = r'..\data'
data_file = 'sfbay_listings_03172017.csv'

data = pd.read_csv(os.path.join(data_dir, data_file), parse_dates=[1], dtype={'listing_id':str, 'rent':float, 'bedrooms':float, 'bathrooms':float, 'sqft':float,
       'rent_sqft':float, 'fips_block':str, 'state':str, 'region':str, 'mpo_id':str, 'lng':float, 'lat':float,
       'cars_tot':float, 'children_tot':float, 'persons_tot':float, 'workers_tot':float,
       'age_of_head_med':float, 'income_med':float, 'hhs_tot':float, 'race_of_head_1':float,
       'race_of_head_2':float, 'race_of_head_3':float, 'race_of_head_4':float, 'race_of_head_5':float,
       'race_of_head_6':float, 'race_of_head_7':float, 'race_of_head_8':float, 'race_of_head_9':float,
       'recent_mover_0':float, 'recent_mover_1':float, 'tenure_1':float, 'tenure_2':float})

print(len(data))
data.head()


125539
Out[5]:
listing_id date rent bedrooms bathrooms sqft rent_sqft fips_block state region ... race_of_head_4 race_of_head_5 race_of_head_6 race_of_head_7 race_of_head_8 race_of_head_9 recent_mover_0 recent_mover_1 tenure_1 tenure_2
0 5915417519 2016-12-11 2586.0 2.0 NaN 1039.0 2.488932 060971530013001 CA sfbay ... NaN 2.0 23.0 NaN 115.0 29.0 1031.0 145.0 560.0 616.0
1 5915416703 2016-12-11 3178.0 1.0 NaN 574.0 5.536585 060750615003012 CA sfbay ... NaN NaN 472.0 NaN 5.0 2.0 697.0 513.0 613.0 597.0
2 5915414697 2016-12-11 3029.0 1.0 NaN 771.0 3.928664 060816080021000 CA sfbay ... NaN NaN 293.0 3.0 9.0 16.0 636.0 70.0 526.0 180.0
3 5915384383 2016-12-11 2395.0 1.0 NaN 770.0 3.110390 060855081021004 CA sfbay ... NaN NaN 704.0 NaN 1.0 21.0 1005.0 265.0 660.0 610.0
4 5915414020 2016-12-11 2735.0 2.0 NaN 922.0 2.966377 060411042002000 CA sfbay ... NaN 8.0 50.0 2.0 75.0 47.0 898.0 113.0 504.0 507.0

5 rows × 33 columns


In [6]:
# for census vars, NA really means 0...
census_cols = ['cars_tot', 'children_tot','persons_tot', 'workers_tot', 'age_of_head_med', 'income_med','hhs_tot', 'race_of_head_1', 'race_of_head_2', 'race_of_head_3','race_of_head_4', 'race_of_head_5', 'race_of_head_6', 'race_of_head_7','race_of_head_8', 'race_of_head_9', 'recent_mover_0', 'recent_mover_1','tenure_1', 'tenure_2']
for col in census_cols:
    data[col] = data[col].fillna(0)

Create variables

Variable codes

Race codes (from PUMS):

1. White alone
2. Black or African American alone
3. American Indian alone
4. Alaska Native alone
5. American Indian and Alaska Native tribes specified; or American Indian or Alaska Native, not specified and no other races
6. Asian alone
7. Native Hawaiian and Other Pacific Islander alone
8. Some other race alone
9. Two or more major race groups

tenure_1 = owner (based on my guess; didn't match the PUMS codes)

mover_1 = moved past year (based on my guess)
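
The mapping from the raw race_of_head_* counts to the pct_* shares built in create_census_vars above can be written out explicitly. A minimal sketch (it just restates the logic of that function; nothing here is new data):


In [ ]:
# Map each PUMS race-of-head code to the pct_* column created in create_census_vars.
RACE_CODE_COLUMNS = {1: 'pct_white', 2: 'pct_black', 3: 'pct_amer_native',
                     4: 'pct_alaska_native', 5: 'pct_any_native', 6: 'pct_asian',
                     7: 'pct_pacific', 8: 'pct_other_race', 9: 'pct_mixed_race'}

def add_race_shares(df):
    """Add race share columns from race_of_head_<code> counts (same logic as create_census_vars)."""
    for code, col in RACE_CODE_COLUMNS.items():
        df[col] = df['race_of_head_{}'.format(code)] / df['hhs_tot']
    return df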


In [7]:
# create useful variables 
data = create_census_vars(data)

# define some features to include in the model. 
features_to_examine = ['rent','ln_rent', 'bedrooms','bathrooms','sqft','pct_white', 'pct_black','pct_asian','pct_mover','pct_owner','income_med','age_of_head_med','avg_hh_size','cars_per_hh']
data[features_to_examine].describe()


C:\Anaconda3\lib\site-packages\numpy\lib\function_base.py:3834: RuntimeWarning: Invalid value encountered in percentile
  RuntimeWarning)
Out[7]:
rent ln_rent bedrooms bathrooms sqft pct_white pct_black pct_asian pct_mover pct_owner income_med age_of_head_med avg_hh_size cars_per_hh
count 1.255390e+05 125539.000000 125539.000000 63137.000000 125539.000000 125473.000000 125473.000000 125473.000000 125473.000000 125473.000000 125539.000000 125539.000000 125473.000000 125473.000000
mean 6.681448e+03 7.896976 1.772246 1.500744 1061.272983 0.613227 0.051669 0.252512 0.240305 0.441876 84181.561563 47.518759 2.386755 1.679141
std 6.574609e+05 0.406612 0.996353 0.640847 3834.013217 0.222957 0.099596 0.199378 0.135384 0.267546 35849.719881 7.499268 0.572601 0.480328
min 1.000000e+00 0.000000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 1.119074 0.084123
25% 2.135000e+03 7.666222 1.000000 NaN 713.000000 NaN NaN NaN NaN NaN 57650.000000 42.000000 NaN NaN
50% 2.600000e+03 7.863267 2.000000 NaN 922.000000 NaN NaN NaN NaN NaN 79500.000000 48.000000 NaN NaN
75% 3.341000e+03 8.114025 2.000000 NaN 1180.000000 NaN NaN NaN NaN NaN 105000.000000 53.000000 NaN NaN
max 2.100204e+08 19.162715 8.000000 7.500000 985024.000000 1.000000 0.940909 0.950382 0.782759 1.000000 267500.000000 78.000000 5.241071 2.966543

Filter outliers


In [8]:
# I've already identified these ranges as good at excluding outliers
rent_range=(100,10000)
sqft_range=(10,5000)
data = filter_outliers(data, rent_range=rent_range, sqft_range=sqft_range)


Dropped 566 outside rent range $100-$10000
Dropped 153 outside sqft range 10-5000 sqft. 124820 rows remaining

In [9]:
# Use this to explore outliers yourself. 
g=sns.distplot(data['rent'],  kde=False)
g.set_xlim(0,10000)

g=sns.distplot(data['sqft'], kde=False)
g.set_xlim(0,10000)


Out[9]:
(0, 10000)

Examine missing data


In [10]:
# examine NA's
print('Total rows:',len(data))
print('Rows with any NA:',len(data[pd.isnull(data).any(axis=1)]))
print('Rows with bathroom NA:',len(data[pd.isnull(data.bathrooms)]))
print('% rows missing bathroom col:',len(data[pd.isnull(data.bathrooms)])/len(data))


Total rows: 124820
Rows with any NA: 62105
Rows with bathroom NA: 62024
% rows missing bathroom col: 0.49690754686748917

Uh oh, about 50% of rows are missing the bathrooms feature. Might have to omit that one. Fewer than 0.1% of rows have any other missing values, so those should be ok.
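
A per-column summary makes this check easy to repeat as new columns are added; a quick sketch:


In [ ]:
# Share of missing values in each column, largest first.
na_share = data.isnull().mean().sort_values(ascending=False)
print(na_share[na_share > 0])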


In [11]:
#for d in range(1,31):
#    print(d,'% rows missing bathroom col:',len(data[pd.isnull(data.bathrooms)&((data.date.dt.month==12)&(data.date.dt.day==d))])/len(data[(data.date.dt.month==12)&(data.date.dt.day==d)]))

The commented-out check above shows the bathrooms column was added on Dec 21; from that date on, listings without a bathrooms value are thrown out. So if we need the bathrooms feature, we can restrict the data to listings from Dec 22 onward.
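
The day-by-day check in the commented cell above can also be written as a single groupby; a sketch, using data.date which was parsed as a datetime on load:


In [ ]:
# Share of listings missing bathrooms, by listing date.
bath_missing_by_day = data.bathrooms.isnull().groupby(data.date.dt.date).mean()
print(bath_missing_by_day)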


In [12]:
# uncomment to only use data from Dec 22 onward. 
#data=data[(data.date.dt.month>=12)&(data.date.dt.day>=22)]
#data.shape

In [13]:
# Uncomment to drop NA's
#n0 = len(data)
#data = data.dropna()
#print('Dropped {} rows with NAs'.format(n0-len(data)))

Look at distributions

Since rent has a roughly log-normal distribution, use ln_rent instead.


In [14]:
p=sns.distplot(data.rent, kde=False)
p.set_title('rent')


Out[14]:
<matplotlib.text.Text at 0x1f35fb15b38>

In [15]:
p=sns.distplot(data.ln_rent, kde=False)
p.set_title('ln rent')


Out[15]:
<matplotlib.text.Text at 0x1f35fc284a8>

In [16]:
plot_rows = math.ceil(len(features_to_examine)/2)

f, axes = plt.subplots(plot_rows,2, figsize=(8,15))
sns.despine(left=True)

for i,col in enumerate(features_to_examine):
    row_position = math.floor(i/2)
    col_position = i%2
    data_notnull = data[pd.notnull(data[col])]  # exclude NA values from plot
    sns.distplot(data_notnull[col], ax=axes[row_position, col_position],kde=False)
    axes[row_position, col_position].set_title('{}'.format(col)) 

plt.tight_layout()
plt.show()



In [17]:
data_notnull = data[pd.notnull(data['ln_income'])]
p=sns.distplot(data_notnull['ln_income'],kde=False)
p.set_title('ln med income')
# ln of median income is not any more normal; use median income instead.


Out[17]:
<matplotlib.text.Text at 0x1f360960e10>

Look at correlations


In [18]:
# correlation heatmap
corrmat=data[features_to_examine].corr()
corrmat.head()

f, ax = plt.subplots(figsize=(12, 9))
sns.heatmap(corrmat, vmax=.8, square=True)

f.tight_layout()


The correlations appear as expected, except for cars_per_hh. Maybe that's because cars_per_hh reflects household size more than income. Might want to try cars per adult instead.
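
A cars-per-adult variable could be built the same way as cars_per_hh; a quick sketch, approximating adults as persons_tot minus children_tot (that definition is a guess, not something taken from the synthetic population):


In [ ]:
# Hypothetical cars-per-adult variable; "adults" approximated as persons minus children.
adults_tot = data['persons_tot'] - data['children_tot']
data['cars_per_adult'] = data['cars_tot'] / adults_tot.replace(0, np.nan)
data[['cars_per_hh', 'cars_per_adult', 'income_med']].corr()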


In [19]:
print(data.columns)
#'pct_amer_native','pct_alaska_native',
x_cols = ['bedrooms','bathrooms', 'sqft','age_of_head_med', 'income_med','pct_white', 'pct_black', 'pct_any_native', 'pct_asian', 'pct_pacific',
       'pct_other_race', 'pct_mixed_race', 'pct_mover', 'pct_owner', 'avg_hh_size', 'cars_per_hh']
y_col = 'ln_rent'

print(len(data))

# exclude missing values
data_notnull= data[(pd.notnull(data[x_cols])).all(axis=1)]
data_notnull= data_notnull[(pd.notnull(data_notnull[y_col]))]
print('using {} rows of {} total'.format(len(data_notnull),len(data)))


Index(['listing_id', 'date', 'rent', 'bedrooms', 'bathrooms', 'sqft',
       'rent_sqft', 'fips_block', 'state', 'region', 'mpo_id', 'lng', 'lat',
       'cars_tot', 'children_tot', 'persons_tot', 'workers_tot',
       'age_of_head_med', 'income_med', 'hhs_tot', 'race_of_head_1',
       'race_of_head_2', 'race_of_head_3', 'race_of_head_4', 'race_of_head_5',
       'race_of_head_6', 'race_of_head_7', 'race_of_head_8', 'race_of_head_9',
       'recent_mover_0', 'recent_mover_1', 'tenure_1', 'tenure_2', 'pct_white',
       'pct_black', 'pct_amer_native', 'pct_alaska_native', 'pct_any_native',
       'pct_asian', 'pct_pacific', 'pct_other_race', 'pct_mixed_race',
       'pct_mover', 'pct_owner', 'avg_hh_size', 'cars_per_hh', 'ln_rent',
       'ln_income'],
      dtype='object')
124820
using 62754 rows of 124820 total

In [30]:
data_notnull.head()


Out[30]:
listing_id date rent bedrooms bathrooms sqft rent_sqft fips_block state region ... pct_asian pct_pacific pct_other_race pct_mixed_race pct_mover pct_owner avg_hh_size cars_per_hh ln_rent ln_income
90 5952685186 2017-01-09 1700.0 1.0 1.0 435.0 3.908046 060133170001018 CA sfbay ... 0.000000 0.005587 0.013035 0.067039 0.253259 0.504655 1.802607 1.791434 7.438384 11.238489
91 5952700057 2017-01-09 2392.0 1.0 1.0 556.0 4.302158 060855091021031 CA sfbay ... 0.488615 0.000000 0.015180 0.023719 0.187856 0.668880 1.857685 1.582543 7.779885 11.626254
92 5952699799 2017-01-09 3195.0 3.0 1.0 1107.0 2.886179 060855058002002 CA sfbay ... 0.166220 0.002681 0.058981 0.016086 0.203753 0.404826 2.016086 1.839142 8.069342 11.302204
93 5952699780 2017-01-09 2398.0 2.0 1.0 1030.0 2.328155 060855002004019 CA sfbay ... 0.244554 0.000000 0.028713 0.054455 0.500990 0.256436 1.887129 1.630693 7.782390 11.302204
94 5952699706 2017-01-09 3696.0 3.0 2.0 1327.0 2.785230 060855064011006 CA sfbay ... 0.122523 0.008108 0.114414 0.065766 0.177477 0.370270 2.406306 1.839640 8.215006 10.915088

5 rows × 48 columns


In [33]:
data_notnull.to_csv(os.path.join(data_dir, 'sfbay_listings_03172017_notnull.csv'), index=False)

Comparison of models

Try a linear model

We'll start with a linear model to use as the baseline.


In [20]:
from sklearn import linear_model
from sklearn.model_selection import train_test_split

In [21]:
# create training and testing datasets. 
# this creates a test set that is 30% of total obs.  
X_train, X_test, y_train, y_test = train_test_split(data_notnull[x_cols], data_notnull[y_col], test_size=.3, random_state=201)

In [22]:
regr = linear_model.LinearRegression()
regr.fit(X_train, y_train)


Out[22]:
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [23]:
# Intercept
print('Intercept:', regr.intercept_)

# The coefficients
print('Coefficients:')
pd.Series(regr.coef_, index=x_cols)


Intercept: 4.19441673223
Coefficients:
Out[23]:
bedrooms           0.072043
bathrooms          0.072522
sqft               0.000265
age_of_head_med    0.000375
income_med         0.000003
pct_white          3.662811
pct_black          3.260943
pct_any_native    -1.835531
pct_asian          3.768787
pct_pacific        5.576854
pct_other_race     4.323419
pct_mixed_race     3.062192
pct_mover         -0.153098
pct_owner         -0.079592
avg_hh_size       -0.041547
cars_per_hh       -0.341830
dtype: float64

In [24]:
# See mean square error, using test data
print("Mean squared error: %.2f" % np.mean((regr.predict(X_test) - y_test) ** 2))
print("RMSE:", np.sqrt(np.mean((regr.predict(X_test) - y_test) ** 2)))
# Explained variance score: 1 is perfect prediction
print('Variance score: %.2f' % regr.score(X_test, y_test))


Mean squared error: 0.06
RMSE: 0.237953812109
Variance score: 0.57

In [25]:
# Plot predicted values vs. observed
plt.scatter(regr.predict(X_train),y_train, color='blue',s=1, alpha=.5)
plt.show()



In [26]:
# plot residuals vs predicted values
plt.scatter(regr.predict(X_train), regr.predict(X_train)- y_train, color='blue',s=1, alpha=.5)
plt.scatter(regr.predict(X_test), regr.predict(X_test)- y_test, color='green',s=1, alpha=.5)
plt.show()


The residuals look pretty normally distributed.

I wonder if inclusion of all these race variables is leading to overfitting. If so, we'd have small error on training set and large error on test set.


In [26]:
print("Training set. Mean squared error: %.5f" % np.mean((regr.predict(X_train) - y_train) ** 2), '| Variance score: %.5f' % regr.score(X_train, y_train))
print("Test set. Mean squared error: %.5f" % np.mean((regr.predict(X_test) - y_test) ** 2), '| Variance score: %.5f' % regr.score(X_test, y_test))


Training set. Mean squared error: 0.06211 | Variance score: 0.54432
Test set. Mean squared error: 0.06038 | Variance score: 0.54917

Try Ridge Regression (linear regression with regularization)

Since the training error and test error are about the same, and since we're using few features, overfitting probably isn't a problem. If it were a problem, we would want to try a regression with regularization. Let's try it just for the sake of demonstration.


In [27]:
from sklearn.linear_model import Ridge

In [28]:
# try a range of different regularization terms.
for a in [10,1,0.1,.01,.001,.00001]:
    ridgereg = Ridge(alpha=a)
    ridgereg.fit(X_train, y_train)
    
    print('\n alpha:',a)
    print("Mean squared error: %.5f" % np.mean((ridgereg.predict(X_test) - y_test) ** 2),'| Variance score: %.5f' % ridgereg.score(X_test, y_test))


 alpha: 10
Mean squared error: 0.06101 | Variance score: 0.54446

 alpha: 1
Mean squared error: 0.06070 | Variance score: 0.54678

 alpha: 0.1
Mean squared error: 0.06040 | Variance score: 0.54897

 alpha: 0.01
Mean squared error: 0.06035 | Variance score: 0.54936

 alpha: 0.001
Mean squared error: 0.06037 | Variance score: 0.54920

 alpha: 1e-05
Mean squared error: 0.06038 | Variance score: 0.54917

In [29]:
# Intercept
print('Intercept:', ridgereg.intercept_)

# The coefficients
print('Coefficients:')
pd.Series(ridgereg.coef_, index=x_cols)


Intercept: 3.79611296526
Coefficients:
Out[29]:
bedrooms           0.080902
bathrooms          0.084796
sqft               0.000223
age_of_head_med    0.000365
income_med         0.000003
pct_white          4.053343
pct_black          3.653153
pct_any_native    -2.392905
pct_asian          4.183189
pct_pacific        6.299132
pct_other_race     4.881522
pct_mixed_race     3.471789
pct_mover         -0.154453
pct_owner         -0.078191
avg_hh_size       -0.060284
cars_per_hh       -0.334152
dtype: float64

As expected, Ridge regression doesn't help much. The best way to improve the model at this point is probably to add more features.

Random Forest


In [30]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error

In [ ]:
def RMSE(y_actual, y_predicted):
    return np.sqrt(mean_squared_error(y_actual, y_predicted))

def cross_val_rf(X, y,max_f='auto', n_trees = 50, cv_method='kfold', k=5):
    """Estimate a random forest model using cross-validation and return the average error across the folds.
    Args: 
        X (DataFrame): features data
        y (Series): target data
        max_f (str or int): how to select max features to consider for the best split. 
            If 'auto', then max_features=n_features.
            If 'sqrt', then max_features=sqrt(n_features).
            If 'log2', then max_features=log2(n_features).
            If int, then consider max_features features at each split.
        n_trees (int): number of trees to build
        cv_method (str): how to split the data ('kfold' (default) or 'timeseries')
        k (int): number of folds (default=5)
    Returns: 
        float: mean error (RMSE) across all training/test sets.
    """
    if cv_method == 'kfold':
        kf = KFold(n_splits=k, shuffle=True, random_state=2012016)  # use random seed for reproducibility. 
    
    E = np.ones(k)   # this array will hold the errors. 
    i=0
    for train, test in kf.split(X, y): 
        train_data_x = X.iloc[train]
        train_data_y = y.iloc[train]    
        test_data_x = X.iloc[test]
        test_data_y = y.iloc[test]

        # n_estimators is number of trees to build. 
        # max_features = 'auto' means the max_features = n_features. This is a parameter we should tune. 
        random_forest = RandomForestRegressor(n_estimators=n_trees, max_features=max_f, criterion='mse', max_depth=None)
        random_forest.fit(train_data_x,train_data_y)
        predict_y=random_forest.predict(test_data_x)
        E[i] = RMSE(test_data_y, predict_y)
        i+=1
    return np.mean(E)


def optimize_rf(df_X, df_y, max_n_trees=100, n_step = 20, cv_method='kfold', k=5): 
    """Optimize hyperparameters for a random forest regressor.
    Args: 
        df_X (DataFrame): features data
        df_y (Series): target data
        max_n_trees (int): max number of trees to generate
        n_step (int): intervals to use for max_n_trees
        cv_method (str): how to split the data ('kfold' (default) or 'timeseries')
        k (int): number of folds (default=5)
    """
    max_features_methods = ['auto','sqrt','log2']  # methods of defining max_features to try.
    
    # create a place to store the results, for easy plotting later. 
    results = pd.DataFrame(columns=max_features_methods, index=[x for x in range(10,max_n_trees+n_step,n_step)])
    
    for m in max_features_methods:
        print('max_features:',m)
        for n in results.index:
            error = cross_val_rf(df_X, df_y,max_f=m, n_trees=n)
            print('n_trees:',n,' error:',error)
            results.loc[n, m] = error
    return results

In [34]:
# data to use - exclude nulls
df_X = data_notnull[x_cols]
df_y = data_notnull[y_col]
print(df_X.shape, df_y.shape)
#df_all = pd.concat([data_notnull[x_cols],data_notnull[y_col]], axis=1)
#df_all.shape


(21006, 16) (21006,)

In [ ]:
# basic model to make sure it works
random_forest = RandomForestRegressor(n_estimators=10, criterion='mse', max_depth=None)
random_forest.fit(df_X,df_y)
y_predict = random_forest.predict(df_X)
RMSE(df_y,y_predict)

We can use k-fold validation if we believe the samples are independently and identically distributed. That's probably fine right now because we have only 1.5 months of data, but later we may have some time-dependent processes in these time series data. If we do use k-fold, I think we should shuffle the samples, because they come in a non-random (chronological) order.


In [ ]:
# without parameter tuning
cross_val_rf(df_X,df_y)

In [ ]:
# tune the parameters
rf_results = optimize_rf(df_X,df_y, max_n_trees = 100, n_step = 20)  # this is sufficient; very little improvement after n_trees=100. 
#rf_results2 = optimize_rf(df_X,df_y, max_n_trees = 500, n_step=100)
rf_results

In [ ]:
ax = rf_results.plot()
ax.set_xlabel('number of trees')
ax.set_ylabel('RMSE')
#rf_results2.plot()

Using m=sqrt(n_features) and log2(n_features) gives similar performance, and a slight improvement over m = n_features. After about 100 trees the error levels off. One of the nice things about random forest is that using additional trees doesn't lead to overfitting, so we could use more, but it's not necessary. Now we can fit the model using n_trees = 100 and m = sqrt.


In [ ]:
random_forest = RandomForestRegressor(n_estimators=100, max_features='sqrt', criterion='mse', max_depth=None)
random_forest.fit(df_X,df_y)
predict_y=random_forest.predict(df_X)

The 'importance' score provides an ordered qualitative ranking of the importance of each feature. It is calculated from the improvement in MSE provided by each feature when it is used to split the tree.


In [ ]:
# plot the importances
rf_o = pd.DataFrame({'features':x_cols,'importance':random_forest.feature_importances_})
rf_o= rf_o.sort_values(by='importance',ascending=False)


plt.figure(1,figsize=(12, 6))
plt.xticks(range(len(rf_o)), rf_o.features,rotation=45)
plt.plot(range(len(rf_o)),rf_o.importance,"o")
plt.title('Feature importances')
plt.show()

It's not surprising that sqft is the most important predictor, although it is strange that cars_per_hh is the second most important. I would have expected income to be higher in the list.

If we don't think the samples are i.i.d., it's better to use time series CV.


In [ ]:
from sklearn.model_selection import TimeSeriesSplit
tscv = TimeSeriesSplit(n_splits=5)
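
The cv_method='timeseries' option mentioned in the cross_val_rf docstring isn't implemented above; a minimal sketch of how the time-ordered split could be used in place of KFold, assuming the rows are sorted by date first:


In [ ]:
# Sketch: time-ordered cross-validation for the random forest.
data_sorted = data_notnull.sort_values('date')
X_ts, y_ts = data_sorted[x_cols], data_sorted[y_col]

ts_errors = []
for train_idx, test_idx in tscv.split(X_ts):
    rf = RandomForestRegressor(n_estimators=100, max_features='sqrt')
    rf.fit(X_ts.iloc[train_idx], y_ts.iloc[train_idx])
    ts_errors.append(RMSE(y_ts.iloc[test_idx], rf.predict(X_ts.iloc[test_idx])))
print(np.mean(ts_errors))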

Try Gradient Boosting


In [32]:
from sklearn.ensemble import GradientBoostingRegressor

In [83]:
def cross_val_gb(X,y,cv_method='kfold',k=5, **params):
    """Estimate gradient boosting regressor using cross validation.
    
    Args: 
        X (DataFrame): features data
        y (Series): target data
        cv_method (str): how to split the data ('kfold' (default) or 'timeseries')
        k (int): number of folds (default=5)
        **params: keyword arguments for regressor
    Returns: 
        float: mean error (RMSE) across all training/test sets.
    """
    if cv_method == 'kfold':
        kf = KFold(n_splits=k, shuffle=True, random_state=2012016)  # use random seed for reproducibility. 
    
    E = np.ones(k)   # this array will hold the errors. 
    i=0
    for train, test in kf.split(X, y): 
        train_data_x = X.iloc[train]
        train_data_y = y.iloc[train]    
        test_data_x = X.iloc[test]
        test_data_y = y.iloc[test]

        # n_estimators is number of trees to build. 
        grad_boost = GradientBoostingRegressor(loss='ls',criterion='mse', **params)
        grad_boost.fit(train_data_x,train_data_y)
        predict_y=grad_boost.predict(test_data_x)
        E[i] = RMSE(test_data_y, predict_y)
        i+=1
    return np.mean(E)

In [37]:
params = {'n_estimators':100,
          'learning_rate':0.1,
          'max_depth':1,
          'min_samples_leaf':4
          }
grad_boost = GradientBoostingRegressor(loss='ls',criterion='mse', **params)
grad_boost.fit(df_X,df_y)
RMSE(y_test, grad_boost.predict(X_test))

In [40]:
n_trees = 100
l_rate = 0.1
max_d = 1
cross_val_gb(df_X, df_y, n_estimators=n_trees, learning_rate=l_rate, max_depth=max_d)


Out[40]:
0.27197436244292783

Tune parameters

This time we'll use grid search in scikit-learn (GridSearchCV). This conducts an exhaustive search over the given parameter values to find the best combination for the given estimator.


In [70]:
from sklearn.model_selection import GridSearchCV
param_grid = {'learning_rate':[.1, .05, .02, .01],
              'max_depth':[2,4,6],
              'min_samples_leaf': [3,5,9,17],
              'max_features': [1, .3, .1]
              }

est= GradientBoostingRegressor(n_estimators = 1000)
gs_cv = GridSearchCV(est,param_grid).fit(df_X,df_y)

In [81]:
print(gs_cv.best_params_)
print(gs_cv.best_score_)


{'max_depth': 6, 'max_features': 0.3, 'learning_rate': 0.05, 'min_samples_leaf': 3}
0.73224907111

In [85]:
# best parameters
params = {'n_estimators':1000,
          'learning_rate':0.05,
          'max_depth':6,
          'min_samples_leaf':3
          }
grad_boost = GradientBoostingRegressor(loss='ls',criterion='mse', **params)
grad_boost.fit(df_X,df_y)
# note: df_X includes the rows in X_test, so this error estimate is partly in-sample and likely optimistic
RMSE(y_test, grad_boost.predict(X_test))


Out[85]:
0.10569476051230522

Wow, that's a big improvement on the random forest model!


In [86]:
# plot the importances
gb_o = pd.DataFrame({'features':x_cols,'importance':grad_boost.feature_importances_})
gb_o= gb_o.sort_values(by='importance',ascending=False)


plt.figure(1,figsize=(12, 6))
plt.xticks(range(len(gb_o)), gb_o.features,rotation=45)
plt.plot(range(len(gb_o)),gb_o.importance,"o")
plt.title('Feature importances')
plt.show()


Let's use partial_dependence to look at feature interactions. Look at the four most important features.


In [87]:
from sklearn.ensemble.partial_dependence import plot_partial_dependence
from sklearn.ensemble.partial_dependence import partial_dependence

In [97]:
df_X.columns


Out[97]:
Index(['bedrooms', 'bathrooms', 'sqft', 'age_of_head_med', 'income_med',
       'pct_white', 'pct_black', 'pct_any_native', 'pct_asian', 'pct_pacific',
       'pct_other_race', 'pct_mixed_race', 'pct_mover', 'pct_owner',
       'avg_hh_size', 'cars_per_hh'],
      dtype='object')

In [106]:
features = [2, 15, 4, 12,(4,2), (4,15)]
names = df_X.columns
fig, axs = plot_partial_dependence(grad_boost, df_X, features,feature_names=names, grid_resolution=50, figsize = (9,6))
fig.suptitle('Partial dependence of rental price features')
plt.subplots_adjust(top=0.9)  # tight_layout causes overlap with suptitle
plt.show()


In the last two plots, darker values indicate a stronger interaction between the two variables plotted. So income and sqft have a stronger dependence at higher values of both. The final plot suggests that for low levels of car ownership, there is a stronger dependence on median income.
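
The partial_dependence function imported above (but not used) returns the values behind these plots, which is handy for custom figures; a quick sketch for median income (column index 4 in df_X), using the same sklearn.ensemble.partial_dependence API imported earlier:


In [ ]:
# Sketch: raw partial dependence of ln_rent on income_med.
pdp, pd_axes = partial_dependence(grad_boost, [4], X=df_X, grid_resolution=50)
plt.plot(pd_axes[0], pdp[0])
plt.xlabel('income_med')
plt.ylabel('partial dependence of ln_rent')
plt.show()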