In [1]:

    
import pandas as pd
import paramiko
import os
import numpy as np
import math
import json
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

This notebook explores merged craigslist listings/census data and fits some initial models

Remote connection parameters

If data is stored remotely



In [2]:

    
# TODO: add putty connection too. 

#read SSH connection parameters
with open('ssh_settings.json') as settings_file:    
    settings = json.load(settings_file)

hostname = settings['hostname']
username = settings['username']
password = settings['password']
local_key_dir = settings['local_key_dir']

census_dir = 'synthetic_population/'
"""Remote directory with census data"""

results_dir = 'craigslist_census/'
"""Remote directory for results"""

# estbalish SSH connection
ssh = paramiko.SSHClient() 
ssh.load_host_keys(local_key_dir)
ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy())
ssh.connect(hostname,username=username, password=password)
sftp = ssh.open_sftp()

Data Preparation



In [2]:

    
def read_listings_file(fname):
    """Read csv file via SFTP and return as dataframe."""
    with sftp.open(os.path.join(listings_dir,fname)) as f:
        df = pd.read_csv(f, delimiter=',', dtype={'fips_block':str,'state':str,'mpo_id':str}, date_parser=['date'])
        # TODO: parse dates. 
    return df

def log_var(x):
    """Return log of x, but NaN if zero."""
    if x==0:
        return np.nan
    else:
        return np.log(x)
    
def create_census_vars(df):
    """Make meaningful variables and return the dataframe."""
    df['pct_white'] = df['race_of_head_1']/df['hhs_tot']
    df['pct_black'] = df['race_of_head_2']/df['hhs_tot']
    df['pct_amer_native'] = df['race_of_head_3']/df['hhs_tot']
    df['pct_alaska_native'] = df['race_of_head_4']/df['hhs_tot']
    df['pct_any_native'] = df['race_of_head_5']/df['hhs_tot']
    df['pct_asian'] = df['race_of_head_6']/df['hhs_tot']
    df['pct_pacific'] = df['race_of_head_7']/df['hhs_tot']
    df['pct_other_race'] = df['race_of_head_8']/df['hhs_tot']
    df['pct_mixed_race'] = df['race_of_head_9']/df['hhs_tot']
    df['pct_mover'] = df['recent_mover_1']/df['hhs_tot']
    df['pct_owner'] = df['tenure_1']/df['hhs_tot']
    df['avg_hh_size'] = df['persons_tot']/df['hhs_tot']
    df['cars_per_hh'] = df['cars_tot']/df['hhs_tot']
    
    df['ln_rent'] = df['rent'].apply(log_var)
    df['ln_income'] = df.income_med.apply(log_var)
    return df

def filter_outliers(df, rent_range=(100,10000),sqft_range=(10,5000)):
    """Drop outliers from listings dataframe. For now, only need to filter out rent and sq ft. 
    Args: 
        df: Dataframe with listings. Cols names include ['rent','sqft']
        rent_range (tuple): min and max rent
        sqft_range (tuple): min and max sqft
    Returns: 
        DataFrame: listings data without outliers. 
    """
    n0=len(df)
    df=df[(df.rent>=rent_range[0])&(df.rent<rent_range[1])]
    n1=len(df)
    print('Dropped {} outside rent range ${}-${}'.format(n0-n1,rent_range[0],rent_range[1]))
    df=df[(df.sqft>=sqft_range[0])&(df.sqft<sqft_range[1])]
    n2=len(df)
    print('Dropped {} outside sqft range {}-{} sqft. {} rows remaining'.format(n1-n2,sqft_range[0],sqft_range[1],len(df)))
    return(df)



In [3]:

    
# get list of files and load. 
# for remotely stored data by state (just do one state for now)
state='CA'
infile='cl_census_{}.csv'.format(state)
#data = read_listings_file(infile)  # uncomment to get remote data. 

# for local data: 
data_dir = '../data/'
data_file = 'sfbay_listings_03032017.csv'

data = pd.read_csv(os.path.join(data_dir,data_file),parse_dates=[1],dtype={'listing_id':str, 'rent':float, 'bedrooms':float, 'bathrooms':float, 'sqft':float,
       'rent_sqft':float, 'fips_block':str, 'state':str, 'region':str, 'mpo_id':str, 'lng':float, 'lat':float,
       'cars_tot':float, 'children_tot':float, 'persons_tot':float, 'workers_tot':float,
       'age_of_head_med':float, 'income_med':float, 'hhs_tot':float, 'race_of_head_1':float,
       'race_of_head_2':float, 'race_of_head_3':float, 'race_of_head_4':float, 'race_of_head_5':float,
       'race_of_head_6':float, 'race_of_head_7':float, 'race_of_head_8':float, 'race_of_head_9':float,
       'recent_mover_0':float, 'recent_mover_1':float, 'tenure_1':float, 'tenure_2':float})

print(len(data))
data.head()









    



82727






    Out[3]:






  
    
      
      listing_id
      date
      rent
      bedrooms
      bathrooms
      sqft
      rent_sqft
      fips_block
      state
      region
      ...
      race_of_head_4
      race_of_head_5
      race_of_head_6
      race_of_head_7
      race_of_head_8
      race_of_head_9
      recent_mover_0
      recent_mover_1
      tenure_1
      tenure_2
    
  
  
    
      0
      5950749277
      2017-01-08
      3900.0
      2.0
      1.0
      1000.0
      3.900000
      060750102001012
      CA
      sfbay
      ...
      NaN
      NaN
      51.0
      1.0
      1.0
      14.0
      547.0
      90.0
      288.0
      349.0
    
    
      1
      5930739630
      2017-01-08
      2107.0
      2.0
      1.5
      1028.0
      2.049611
      060971506122048
      CA
      sfbay
      ...
      NaN
      NaN
      11.0
      2.0
      19.0
      19.0
      536.0
      25.0
      264.0
      297.0
    
    
      2
      5950748411
      2017-01-08
      3644.0
      2.0
      2.0
      1362.0
      2.675477
      060855067033008
      CA
      sfbay
      ...
      NaN
      NaN
      130.0
      NaN
      1.0
      3.0
      322.0
      8.0
      91.0
      239.0
    
    
      3
      5950747869
      2017-01-08
      4350.0
      2.0
      1.0
      900.0
      4.833333
      060750102001012
      CA
      sfbay
      ...
      NaN
      NaN
      51.0
      1.0
      1.0
      14.0
      547.0
      90.0
      288.0
      349.0
    
    
      4
      5950747694
      2017-01-08
      2450.0
      3.0
      2.0
      1850.0
      1.324324
      060133270005038
      CA
      sfbay
      ...
      NaN
      1.0
      27.0
      2.0
      16.0
      25.0
      634.0
      25.0
      547.0
      112.0
    
  

5 rows × 33 columns



In [4]:

    
# for census vars, NA really means 0...
census_cols = ['cars_tot', 'children_tot','persons_tot', 'workers_tot', 'age_of_head_med', 'income_med','hhs_tot', 'race_of_head_1', 'race_of_head_2', 'race_of_head_3','race_of_head_4', 'race_of_head_5', 'race_of_head_6', 'race_of_head_7','race_of_head_8', 'race_of_head_9', 'recent_mover_0', 'recent_mover_1','tenure_1', 'tenure_2']
for col in census_cols:
    data[col] = data[col].fillna(0)

create variables

variable codes

Race codes (from PUMS) 1 .White alone 2 .Black or African American alone 3 .American Indian alone 4 .Alaska Native alone 5 .American Indian and Alaska Native tribes specified; or American .Indian or Alaska native, not specified and no other races 6 .Asian alone 7 .Native Hawaiian and Other Pacific Islander alone 8 .Some other race alone 9 .Two or more major race groups

tenure_1 = owner (based on my guess; didn't match the PUMS codes)

mover_1 = moved past year (based on my guess)



In [5]:

    
# create useful variables 
data = create_census_vars(data)

# define some feature to include in the model. 
features_to_examine = ['rent','ln_rent', 'bedrooms','bathrooms','sqft','pct_white', 'pct_black','pct_asian','pct_mover','pct_owner','income_med','age_of_head_med','avg_hh_size','cars_per_hh']
data[features_to_examine].describe()









    Out[5]:






  
    
      
      rent
      ln_rent
      bedrooms
      bathrooms
      sqft
      pct_white
      pct_black
      pct_asian
      pct_mover
      pct_owner
      income_med
      age_of_head_med
      avg_hh_size
      cars_per_hh
    
  
  
    
      count
      8.272700e+04
      82727.000000
      82727.000000
      21135.000000
      82727.000000
      82701.000000
      82701.000000
      82701.000000
      82701.000000
      82701.000000
      82727.000000
      82727.000000
      82701.000000
      82701.000000
    
    
      mean
      8.424425e+03
      7.892339
      1.786938
      1.528507
      1070.799352
      0.614164
      0.051737
      0.251347
      0.238773
      0.444723
      84269.025923
      47.596722
      2.392070
      1.685853
    
    
      std
      8.079905e+05
      0.415221
      0.995156
      0.658353
      4395.502018
      0.222690
      0.100005
      0.199291
      0.134969
      0.267616
      35749.132951
      7.469119
      0.570091
      0.477950
    
    
      min
      1.000000e+00
      0.000000
      0.000000
      0.000000
      1.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      1.119074
      0.084123
    
    
      25%
      2.125000e+03
      7.661527
      1.000000
      1.000000
      716.000000
      0.445419
      0.000000
      0.087121
      0.130178
      0.238063
      58000.000000
      42.000000
      1.973294
      1.422348
    
    
      50%
      2.600000e+03
      7.863267
      2.000000
      1.000000
      930.000000
      0.625000
      0.014629
      0.210407
      0.231550
      0.438642
      79630.000000
      48.000000
      2.348315
      1.764550
    
    
      75%
      3.308000e+03
      8.104099
      2.000000
      2.000000
      1187.000000
      0.791878
      0.050829
      0.383133
      0.333826
      0.643371
      105000.000000
      53.000000
      2.776371
      2.007761
    
    
      max
      2.100204e+08
      19.162715
      8.000000
      6.500000
      985024.000000
      1.000000
      0.940909
      0.950382
      0.782759
      1.000000
      267500.000000
      78.000000
      5.241071
      2.890467

Filter outliers



In [6]:

    
# I've already identified these ranges as good at exluding outliers
rent_range=(100,10000)
sqft_range=(10,5000)
data = filter_outliers(data, rent_range=rent_range, sqft_range=sqft_range)









    



Dropped 386 outside rent range $100-$10000
Dropped 107 outside sqft range 10-5000 sqft. 82234 rows remaining



In [7]:

    
# Use this to explore outliers yourself. 
g=sns.distplot(data['rent'],  kde=False)
g.set_xlim(0,10000)

g=sns.distplot(data['sqft'], kde=False)
g.set_xlim(0,10000)









    Out[7]:





(0, 10000)

Examine missing data



In [8]:

    
# examine NA's
print('Total rows:',len(data))
print('Rows with any NA:',len(data[pd.isnull(data).any(axis=1)]))
print('Rows with bathroom NA:',len(data[pd.isnull(data.bathrooms)]))
print('% rows missing bathroom col:',len(data[pd.isnull(data.bathrooms)])/len(data))









    



Total rows: 82234
Rows with any NA: 61242
Rows with bathroom NA: 61225
% rows missing bathroom col: 0.744521730671012

uh oh, 74% are missing bathrooms feature. Might have to omit that one. Only 0.02% of rows have other missing values, so that should be ok.



In [9]:

    
#for d in range(1,31):
#    print(d,'% rows missing bathroom col:',len(data[pd.isnull(data.bathrooms)&((data.date.dt.month==12)&(data.date.dt.day==d))])/len(data[(data.date.dt.month==12)&(data.date.dt.day==d)]))

Bathrooms were added on Dec 21. After that, if bathrooms aren't in the listing, the listing is thrown out. Let's try to find the date when the bathrooms column was added. So if need to use bathrooms feature, can use listings Dec 22 and after.



In [10]:

    
# uncommon to only use data after Dec 21. 
#data=data[(data.date.dt.month>=12)&(data.date.dt.day>=22)]
#data.shape



In [11]:

    
# Uncomment to drop NA's
#data = data.dropna()
#print('Dropped {} rows with NAs'.format(n0-len(data)))

Look at distributions

Since rent has a more or less logarithmic distribution, use ln_rent instead



In [12]:

    
p=sns.distplot(data.rent, kde=False)
p.set_title('rent')









    Out[12]:





<matplotlib.text.Text at 0x11283d358>



In [13]:

    
p=sns.distplot(data.ln_rent, kde=False)
p.set_title('ln rent')









    Out[13]:





<matplotlib.text.Text at 0x112e3d048>



In [14]:

    
plot_rows = math.ceil(len(features_to_examine)/2)

f, axes = plt.subplots(plot_rows,2, figsize=(8,15))
sns.despine(left=True)

for i,col in enumerate(features_to_examine):
    row_position = math.floor(i/2)
    col_position = i%2
    data_notnull = data[pd.notnull(data[col])]  # exclude NA values from plot
    sns.distplot(data_notnull[col], ax=axes[row_position, col_position],kde=False)
    axes[row_position, col_position].set_title('{}'.format(col)) 

plt.tight_layout()
plt.show()



In [15]:

    
data_notnull = data[pd.notnull(data['ln_income'])]
p=sns.distplot(data_notnull['ln_income'],kde=False)
p.set_title('ln med income')
# ln med income is not more normal.. use med income instead.









    Out[15]:





<matplotlib.text.Text at 0x1138b1240>

look at correlations



In [16]:

    
# correlation heatmap
corrmat=data[features_to_examine].corr()
corrmat.head()

f, ax = plt.subplots(figsize=(12, 9))
sns.heatmap(corrmat, vmax=.8, square=True)

f.tight_layout()

The correlations appear as expected, except for cars_per_hh. Maybe this is because cars_per_hh is reflecting the size of the household more than income. Might want to try cars per adult instead..



In [17]:

    
print(data.columns)
#'pct_amer_native','pct_alaska_native',
x_cols = ['bedrooms','bathrooms', 'sqft','age_of_head_med', 'income_med','pct_white', 'pct_black', 'pct_any_native', 'pct_asian', 'pct_pacific',
       'pct_other_race', 'pct_mixed_race', 'pct_mover', 'pct_owner', 'avg_hh_size', 'cars_per_hh']
y_col = 'ln_rent'

print(len(data))

# exclude missing values
data_notnull= data[(pd.notnull(data[x_cols])).all(axis=1)]
data_notnull= data_notnull[(pd.notnull(data_notnull[y_col]))]
print('using {} rows of {} total'.format(len(data_notnull),len(data)))









    



Index(['listing_id', 'date', 'rent', 'bedrooms', 'bathrooms', 'sqft',
       'rent_sqft', 'fips_block', 'state', 'region', 'mpo_id', 'lng', 'lat',
       'cars_tot', 'children_tot', 'persons_tot', 'workers_tot',
       'age_of_head_med', 'income_med', 'hhs_tot', 'race_of_head_1',
       'race_of_head_2', 'race_of_head_3', 'race_of_head_4', 'race_of_head_5',
       'race_of_head_6', 'race_of_head_7', 'race_of_head_8', 'race_of_head_9',
       'recent_mover_0', 'recent_mover_1', 'tenure_1', 'tenure_2', 'pct_white',
       'pct_black', 'pct_amer_native', 'pct_alaska_native', 'pct_any_native',
       'pct_asian', 'pct_pacific', 'pct_other_race', 'pct_mixed_race',
       'pct_mover', 'pct_owner', 'avg_hh_size', 'cars_per_hh', 'ln_rent',
       'ln_income'],
      dtype='object')
82234
using 21006 rows of 82234 total

Comparison of models

Try a linear model

We'll start with a linear model to use as the baseline.



In [18]:

    
from sklearn import linear_model, cross_validation









    



//anaconda/lib/python3.5/site-packages/sklearn/cross_validation.py:44: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.
  "This module will be removed in 0.20.", DeprecationWarning)



In [19]:

    
# create training and testing datasets. 
# this creates a test set that is 30% of total obs.  
X_train, X_test, y_train, y_test = cross_validation.train_test_split(data_notnull[x_cols],data_notnull[y_col], test_size = .3, random_state = 201)



In [21]:

    
regr = linear_model.LinearRegression()
regr.fit(X_train, y_train)









    Out[21]:





LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)



In [22]:

    
# Intercept
print('Intercept:', regr.intercept_)

# The coefficients
print('Coefficients:')
pd.Series(regr.coef_, index=x_cols)









    



Intercept: 3.79537239111
Coefficients:






    Out[22]:





bedrooms           0.080902
bathrooms          0.084796
sqft               0.000223
age_of_head_med    0.000365
income_med         0.000003
pct_white          4.054088
pct_black          3.653897
pct_any_native    -2.392451
pct_asian          4.183934
pct_pacific        6.300127
pct_other_race     4.882282
pct_mixed_race     3.472586
pct_mover         -0.154455
pct_owner         -0.078191
avg_hh_size       -0.060285
cars_per_hh       -0.334151
dtype: float64



In [23]:

    
# See mean square error, using test data
print("Mean squared error: %.2f" % np.mean((regr.predict(X_test) - y_test) ** 2))
print("RMSE:", np.sqrt(np.mean((regr.predict(X_test) - y_test) ** 2)))
# Explained variance score: 1 is perfect prediction
print('Variance score: %.2f' % regr.score(X_test, y_test))









    



Mean squared error: 0.06
RMSE: 0.245718331795
Variance score: 0.55



In [24]:

    
# Plot predicted values vs. observed
plt.scatter(regr.predict(X_train),y_train, color='blue',s=1, alpha=.5)
plt.show()



In [25]:

    
# plot residuals vs predicted values
plt.scatter(regr.predict(X_train), regr.predict(X_train)- y_train, color='blue',s=1, alpha=.5)
plt.scatter(regr.predict(X_test), regr.predict(X_test)- y_test, color='green',s=1, alpha=.5)
plt.show()

The residuals look pretty normally distributed.

I wonder if inclusion of all these race variables is leading to overfitting. If so, we'd have small error on training set and large error on test set.



In [26]:

    
print("Training set. Mean squared error: %.5f" % np.mean((regr.predict(X_train) - y_train) ** 2), '| Variance score: %.5f' % regr.score(X_train, y_train))
print("Test set. Mean squared error: %.5f" % np.mean((regr.predict(X_test) - y_test) ** 2), '| Variance score: %.5f' % regr.score(X_test, y_test))









    



Training set. Mean squared error: 0.06211 | Variance score: 0.54432
Test set. Mean squared error: 0.06038 | Variance score: 0.54917

Try Ridge Regression (linear regression with regularization )

Since the training error and test error are about the same, and since we're using few features, overfitting probably isn't a problem. If it were a problem, we would want to try a regression with regularization. Let's try it just for the sake of demonstration.



In [27]:

    
from sklearn.linear_model import Ridge



In [28]:

    
# try a range of different regularization terms.
for a in [10,1,0.1,.01,.001,.00001]:
    ridgereg = Ridge(alpha=a)
    ridgereg.fit(X_train, y_train)
    
    print('\n alpha:',a)
    print("Mean squared error: %.5f" % np.mean((ridgereg.predict(X_test) - y_test) ** 2),'| Variance score: %.5f' % ridgereg.score(X_test, y_test))









    



 alpha: 10
Mean squared error: 0.06101 | Variance score: 0.54446

 alpha: 1
Mean squared error: 0.06070 | Variance score: 0.54678

 alpha: 0.1
Mean squared error: 0.06040 | Variance score: 0.54897

 alpha: 0.01
Mean squared error: 0.06035 | Variance score: 0.54936

 alpha: 0.001
Mean squared error: 0.06037 | Variance score: 0.54920

 alpha: 1e-05
Mean squared error: 0.06038 | Variance score: 0.54917



In [29]:

    
# Intercept
print('Intercept:', ridgereg.intercept_)

# The coefficients
print('Coefficients:')
pd.Series(ridgereg.coef_, index=x_cols)









    



Intercept: 3.79611296526
Coefficients:






    Out[29]:





bedrooms           0.080902
bathrooms          0.084796
sqft               0.000223
age_of_head_med    0.000365
income_med         0.000003
pct_white          4.053343
pct_black          3.653153
pct_any_native    -2.392905
pct_asian          4.183189
pct_pacific        6.299132
pct_other_race     4.881522
pct_mixed_race     3.471789
pct_mover         -0.154453
pct_owner         -0.078191
avg_hh_size       -0.060284
cars_per_hh       -0.334152
dtype: float64

As expected, Ridge regression doesn't help much. The best way to improve the model at this point is probably to add more features.

Random Forest



In [25]:

    
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error



In [26]:

    
def RMSE(y_actual, y_predicted):
    return np.sqrt(mean_squared_error(y_actual, y_predicted))

def cross_val_rf(X, y,max_f='auto', n_trees = 50, cv_method='kfold', k=5):
    """Estimate a random forest model using cross-validation and return the average error across the folds.
    Args: 
        X (DataFrame): features data
        y (Series): target data
        max_f (str or int): how to select max features to consider for the best split. 
            If “auto”, then max_features=n_features.
            If “sqrt”, then max_features=sqrt(n_features)
            If “log2”, then max_features=log2(n_features)
            If int, then consider max_features features at each split
        n_trees (number of trees to build)
        cv_method (str): how to split the data ('kfold' (default) or 'timeseries')
        k (int): number of folds (default=5)
    Returns: 
        float: mean error (RMSE) across all training/test sets.
    """
    if cv_method == 'kfold':
        kf = KFold(n_splits=k, shuffle=True, random_state=2012016)  # use random seed for reproducibility. 
    
    E = np.ones(k)   # this array will hold the errors. 
    i=0
    for train, test in kf.split(X, y): 
        train_data_x = X.iloc[train]
        train_data_y = y.iloc[train]    
        test_data_x = X.iloc[test]
        test_data_y = y.iloc[test]

        # n_estimators is number of trees to build. 
        # max_features = 'auto' means the max_features = n_features. This is a parameter we should tune. 
        random_forest = RandomForestRegressor(n_estimators=n_trees, max_features=max_f, criterion='mse', max_depth=None)
        random_forest.fit(train_data_x,train_data_y)
        predict_y=random_forest.predict(test_data_x)
        E[i] = RMSE(test_data_y, predict_y)
        i+=1
    return np.mean(E)


def optimize_rf(df_X, df_y, max_n_trees=100, n_step = 20, cv_method='kfold', k=5): 
    """Optimize hyperparameters for a random forest regressor.
    Args: 
        df_X (DataFrame): features data
        df_y (Series): target data
        max_n_trees (int): max number of trees to generate
        n_step (int): intervals to use for max_n_trees
        cv_method (str): how to split the data ('kfold' (default) or 'timeseries')
        k (int): number of folds (default=5)
    """
    max_features_methods = ['auto','sqrt','log2']  # methods of defining max_features to try.
    
    # create a place to store the results, for easy plotting later. 
    results = pd.DataFrame(columns=max_features_methods, index=[x for x in range(10,max_n_trees+n_step,n_step)])
    
    for m in max_features_methods:
        print('max_features:',m)
        for n in results.index:
            error = cross_val_rf(df_X, df_y,max_f=m, n_trees=n)
            print('n_trees:',n,' error:',error)
            results.ix[n,m] = error
    return results



In [27]:

    
# data to use - exclude nulls
df_X = data_notnull[x_cols]
df_y = data_notnull[y_col]
print(df_X.shape, df_y.shape)
#df_all = pd.concat([data_notnull[x_cols],data_notnull[y_col]], axis=1)
#df_all.shape









    



(21006, 16) (21006,)



In [ ]:

    
# basic model to make sure it workds
random_forest = RandomForestRegressor(n_estimators=10, criterion='mse', max_depth=None)
random_forest.fit(df_X,df_y)
y_predict = random_forest.predict(df_X)
RMSE(df_y,y_predict)

We can use k-fold validation if we believe the samples are independently and identically distributed. That's probably fine right now because we have only 1.5 months of data, but later we may have some time-dependent processes in these timeseries data. If we do use k-fold, I think we should shuffle the samples, because they do not come in a non-random sequence.



In [ ]:

    
# without parameter tuning
cross_val_rf(df_X,df_y)



In [ ]:

    
# tune the parameters
rf_results = optimize_rf(df_X,df_y, max_n_trees = 100, n_step = 20)  # this is sufficient; very little improvement after n_trees=100. 
#rf_results2 = optimize_rf(df_X,df_y, max_n_trees = 500, n_step=100)
rf_results



In [ ]:

    
ax = rf_results.plot()
ax.set_xlabel('number of trees')
ax.set_ylabel('RMSE')
#rf_results2.plot()

Using m=sqrt(n_features) and log2(n_features) gives similar performance, and a slight improvement over m = n_features. After about 100 trees the error levels off. One of the nice things about random forest is that using additional trees doesn't lead to overfitting, so we could use more, but it's not necessary. Now we can fit the model using n_trees = 100 and m = sqrt.



In [ ]:

    
random_forest = RandomForestRegressor(n_estimators=100, max_features='sqrt', criterion='mse', max_depth=None)
random_forest.fit(df_X,df_y)
predict_y=random_forest.predict(df_X)

The 'importance' score provides an ordered qualitative ranking of the importance of each feature. It is calculated from the improvement in MSE provided by each feature when it is used to split the tree.



In [ ]:

    
# plot the importances
rf_o = pd.DataFrame({'features':x_cols,'importance':random_forest.feature_importances_})
rf_o= rf_o.sort_values(by='importance',ascending=False)


plt.figure(1,figsize=(12, 6))
plt.xticks(range(len(rf_o)), rf_o.features,rotation=45)
plt.plot(range(len(rf_o)),rf_o.importance,"o")
plt.title('Feature importances')
plt.show()

It's not surprising sqft is the most important predictor, although it is strange cars_per_hh is the second most important. I would have expected incometo be higher in the list.

If we don't think the samples are i.i.d., it's better to use time series CV.



In [ ]:

    
from sklearn.model_selection import TimeSeriesSplit
tscv = TimeSeriesSplit(n_splits=5)

Try Boosted Forest



In [20]:

    
from sklearn.ensemble import GradientBoostingRegressor



In [24]:

    
def cross_val_gb(X,y,cv_method='kfold',k=5, **params):
    """Estimate gradient boosting regressor using cross validation.
    
    Args: 
        X (DataFrame): features data
        y (Series): target data
        cv_method (str): how to split the data ('kfold' (default) or 'timeseries')
        k (int): number of folds (default=5)
        **params: keyword arguments for regressor
    Returns: 
        float: mean error (RMSE) across all training/test sets.
    """
    if cv_method == 'kfold':
        kf = KFold(n_splits=k, shuffle=True, random_state=2012016)  # use random seed for reproducibility. 
    
    E = np.ones(k)   # this array will hold the errors. 
    i=0
    for train, test in kf.split(X, y): 
        train_data_x = X.iloc[train]
        train_data_y = y.iloc[train]    
        test_data_x = X.iloc[test]
        test_data_y = y.iloc[test]

        # n_estimators is number of trees to build. 
        grad_boost = GradientBoostingRegressor(loss='ls',criterion='mse', **params)
        grad_boost.fit(train_data_x,train_data_y)
        predict_y=grad_boost.predict(test_data_x)
        E[i] = RMSE(test_data_y, predict_y)
        i+=1
    return np.mean(E)



In [41]:

    
params = {'n_estimators':100,
          'learning_rate':0.1,
          'max_depth':1,
          'min_samples_leaf':4
          }
grad_boost = GradientBoostingRegressor(loss='ls',criterion='mse', **params)
grad_boost.fit(df_X,df_y)
cross_val_gb(df_X,df_y, **params)









    Out[41]:





0.25199322945255337



In [ ]:

    
n_trees = 100
l_rate = 0.1
max_d = 1
cross_val_gb(df_X,df_y, l_rate,max_d)

tune parameters

This time we'll use Grid Search in scikit-learn. This conducts an exhaustive search through the given parameters to find the best for the given estimator.



In [ ]:

    
from sklearn.model_selection import GridSearchCV
param_grid = {'learning_rate':[.1, .05, .02, .01],
              'max_depth':[2,4,6],
              'min_samples_leaf': [3,5,9,17],
              'max_features': [1, .3, .1]
              }

est= GradientBoostingRegressor(n_estimators = 1000)
gs_cv = GridSearchCV(est,param_grid).fit(df_X,df_y)



In [23]:

    
print(gs_cv.best_params_)
print(gs_cv.best_score_)









    



---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-23-905164be9736> in <module>()
----> 1 print(gs_cv.best_params_)
      2 print(gs_cv.best_score_)

NameError: name 'gs_cv' is not defined



In [42]:

    
# best parameters
params = {'n_estimators':1000,
          'learning_rate':0.05,
          'max_depth':6,
          'min_samples_leaf':3
          }
grad_boost = GradientBoostingRegressor(loss='ls',criterion='mse', **params)
grad_boost.fit(df_X,df_y)
cross_val_gb(df_X,df_y, **params)









    Out[42]:





0.17111455700588998



In [29]:

    
# plot the importances
gb_o = pd.DataFrame({'features':x_cols,'importance':grad_boost.feature_importances_})
gb_o= gb_o.sort_values(by='importance',ascending=False)


plt.figure(1,figsize=(12, 6))
plt.xticks(range(len(gb_o)), gb_o.features,rotation=45)
plt.plot(range(len(gb_o)),gb_o.importance,"o")
plt.title('Feature importances')
plt.show()

Let's use partial_dependence to look at feature interactions. Look at the four most important features.



In [30]:

    
from sklearn.ensemble.partial_dependence import plot_partial_dependence
from sklearn.ensemble.partial_dependence import partial_dependence



In [31]:

    
df_X.columns









    Out[31]:





Index(['bedrooms', 'bathrooms', 'sqft', 'age_of_head_med', 'income_med',
       'pct_white', 'pct_black', 'pct_any_native', 'pct_asian', 'pct_pacific',
       'pct_other_race', 'pct_mixed_race', 'pct_mover', 'pct_owner',
       'avg_hh_size', 'cars_per_hh'],
      dtype='object')



In [38]:

    
features = [0,1,2, 15, 4,5,14, 12]
names = df_X.columns
fig, axs = plot_partial_dependence(grad_boost, df_X, features,feature_names=names, grid_resolution=50, figsize = (10,8))
fig.suptitle('Partial dependence of rental price features')
plt.subplots_adjust(top=0.9)  # tight_layout causes overlap with suptitle
plt.show()

The partial dependence plots show how predicted values vary with the given covariate, "controlling for" the influence of other covariates (Friedman, 2001). In the top three plots above, we can see non-linear relationships between the features and predicted values. Bedrooms generally has a positive influence on rent, but it peaks at 5-6 bedrooms and then falls. With bedrooms, there is an interesting dip at 3 bedrooms. Perhaps this reflects lower rents for large shared houses that have many rooms that might be in older, poor-conditions buildings.

The most important feature, sqft, has a generally positive influence on rent, except for a dip at the high end. Also interesting is the plot for income_med; rent increases with income, but levels off at the high end. Put another way, income of the neigborhood matters, up to a limit.



In [39]:

    
features = [(0,1),(0,2),(4,2), (4,15),(14,15)]
names = df_X.columns
fig, axs = plot_partial_dependence(grad_boost, df_X, features,feature_names=names, grid_resolution=50, figsize = (9,6))
fig.suptitle('Partial dependence of rental price features')
plt.subplots_adjust(top=0.9)  # tight_layout causes overlap with suptitle
plt.show()



In [ ]:

	listing_id	date	rent	bedrooms	bathrooms	sqft	rent_sqft	fips_block	state	region	...	race_of_head_4	race_of_head_5	race_of_head_6	race_of_head_7	race_of_head_8	race_of_head_9	recent_mover_0	recent_mover_1	tenure_1	tenure_2
0	5950749277	2017-01-08	3900.0	2.0	1.0	1000.0	3.900000	060750102001012	CA	sfbay	...	NaN	NaN	51.0	1.0	1.0	14.0	547.0	90.0	288.0	349.0
1	5930739630	2017-01-08	2107.0	2.0	1.5	1028.0	2.049611	060971506122048	CA	sfbay	...	NaN	NaN	11.0	2.0	19.0	19.0	536.0	25.0	264.0	297.0
2	5950748411	2017-01-08	3644.0	2.0	2.0	1362.0	2.675477	060855067033008	CA	sfbay	...	NaN	NaN	130.0	NaN	1.0	3.0	322.0	8.0	91.0	239.0
3	5950747869	2017-01-08	4350.0	2.0	1.0	900.0	4.833333	060750102001012	CA	sfbay	...	NaN	NaN	51.0	1.0	1.0	14.0	547.0	90.0	288.0	349.0
4	5950747694	2017-01-08	2450.0	3.0	2.0	1850.0	1.324324	060133270005038	CA	sfbay	...	NaN	1.0	27.0	2.0	16.0	25.0	634.0	25.0	547.0	112.0

	rent	ln_rent	bedrooms	bathrooms	sqft	pct_white	pct_black	pct_asian	pct_mover	pct_owner	income_med	age_of_head_med	avg_hh_size	cars_per_hh
count	8.272700e+04	82727.000000	82727.000000	21135.000000	82727.000000	82701.000000	82701.000000	82701.000000	82701.000000	82701.000000	82727.000000	82727.000000	82701.000000	82701.000000
mean	8.424425e+03	7.892339	1.786938	1.528507	1070.799352	0.614164	0.051737	0.251347	0.238773	0.444723	84269.025923	47.596722	2.392070	1.685853
std	8.079905e+05	0.415221	0.995156	0.658353	4395.502018	0.222690	0.100005	0.199291	0.134969	0.267616	35749.132951	7.469119	0.570091	0.477950
min	1.000000e+00	0.000000	0.000000	0.000000	1.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	1.119074	0.084123
25%	2.125000e+03	7.661527	1.000000	1.000000	716.000000	0.445419	0.000000	0.087121	0.130178	0.238063	58000.000000	42.000000	1.973294	1.422348
50%	2.600000e+03	7.863267	2.000000	1.000000	930.000000	0.625000	0.014629	0.210407	0.231550	0.438642	79630.000000	48.000000	2.348315	1.764550
75%	3.308000e+03	8.104099	2.000000	2.000000	1187.000000	0.791878	0.050829	0.383133	0.333826	0.643371	105000.000000	53.000000	2.776371	2.007761
max	2.100204e+08	19.162715	8.000000	6.500000	985024.000000	1.000000	0.940909	0.950382	0.782759	1.000000	267500.000000	78.000000	5.241071	2.890467