In [34]:
import pandas as pd
# import paramiko
import os
import numpy as np
import math
import json
import matplotlib.pyplot as plt
import seaborn as sns
# import mpl_toolkits.basemap.pyproj as pyproj  # basemap not installed; pyproj isn't used below
%matplotlib inline


This notebook explores merged Craigslist listings/census data and fits some initial models.

Remote connection parameters

If data is stored remotely


In [2]:
# # TODO: add putty connection too. 

# #read SSH connection parameters
# with open('ssh_settings.json') as settings_file:    
#     settings = json.load(settings_file)

# hostname = settings['hostname']
# username = settings['username']
# password = settings['password']
# local_key_dir = settings['local_key_dir']

# census_dir = 'synthetic_population/'
# """Remote directory with census data"""

# results_dir = 'craigslist_census/'
# """Remote directory for results"""

# # establish SSH connection
# ssh = paramiko.SSHClient() 
# ssh.load_host_keys(local_key_dir)
# ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy())
# ssh.connect(hostname,username=username, password=password)
# sftp = ssh.open_sftp()

Data Preparation


In [4]:
# def read_listings_file(fname):
#     """Read csv file via SFTP and return as dataframe."""
#     with sftp.open(os.path.join(listings_dir,fname)) as f:
#         df = pd.read_csv(f, delimiter=',', dtype={'fips_block':str,'state':str,'mpo_id':str}, parse_dates=['date'])
#     return df

def log_var(x):
    """Return log of x, but NaN if zero."""
    if x==0:
        return np.nan
    else:
        return np.log(x)
    
def create_census_vars(df):
    """Make meaningful variables and return the dataframe."""
    df['pct_white'] = df['race_of_head_1']/df['hhs_tot']
    df['pct_black'] = df['race_of_head_2']/df['hhs_tot']
    df['pct_amer_native'] = df['race_of_head_3']/df['hhs_tot']
    df['pct_alaska_native'] = df['race_of_head_4']/df['hhs_tot']
    df['pct_any_native'] = df['race_of_head_5']/df['hhs_tot']
    df['pct_asian'] = df['race_of_head_6']/df['hhs_tot']
    df['pct_pacific'] = df['race_of_head_7']/df['hhs_tot']
    df['pct_other_race'] = df['race_of_head_8']/df['hhs_tot']
    df['pct_mixed_race'] = df['race_of_head_9']/df['hhs_tot']
    df['pct_mover'] = df['recent_mover_1']/df['hhs_tot']
    df['pct_owner'] = df['tenure_1']/df['hhs_tot']
    df['avg_hh_size'] = df['persons_tot']/df['hhs_tot']
    df['cars_per_hh'] = df['cars_tot']/df['hhs_tot']
    
    df['ln_rent'] = df['rent'].apply(log_var)
    df['ln_income'] = df.income_med.apply(log_var)
    return df

def filter_outliers(df, rent_range=(100,10000),sqft_range=(10,5000)):
    """Drop outliers from listings dataframe. For now, only need to filter out rent and sq ft. 
    Args: 
        df: Dataframe with listings. Cols names include ['rent','sqft']
        rent_range (tuple): min and max rent
        sqft_range (tuple): min and max sqft
    Returns: 
        DataFrame: listings data without outliers. 
    """
    n0=len(df)
    df=df[(df.rent>=rent_range[0])&(df.rent<rent_range[1])]
    n1=len(df)
    print('Dropped {} outside rent range ${}-${}'.format(n0-n1,rent_range[0],rent_range[1]))
    df=df[(df.sqft>=sqft_range[0])&(df.sqft<sqft_range[1])]
    n2=len(df)
    print('Dropped {} outside sqft range {}-{} sqft. {} rows remaining'.format(n1-n2,sqft_range[0],sqft_range[1],len(df)))
    return(df)

In [5]:
# get list of files and load. 
# for remotely stored data by state (just do one state for now)
state='CA'
infile='cl_census_{}.csv'.format(state)
#data = read_listings_file(infile)  # uncomment to get remote data. 

# for local data: 
data_dir = r'..\data'
data_file = 'sfbay_listings_03172017.csv'

data = pd.read_csv(os.path.join(data_dir, data_file), parse_dates=[1], dtype={'listing_id':str, 'rent':float, 'bedrooms':float, 'bathrooms':float, 'sqft':float,
       'rent_sqft':float, 'fips_block':str, 'state':str, 'region':str, 'mpo_id':str, 'lng':float, 'lat':float,
       'cars_tot':float, 'children_tot':float, 'persons_tot':float, 'workers_tot':float,
       'age_of_head_med':float, 'income_med':float, 'hhs_tot':float, 'race_of_head_1':float,
       'race_of_head_2':float, 'race_of_head_3':float, 'race_of_head_4':float, 'race_of_head_5':float,
       'race_of_head_6':float, 'race_of_head_7':float, 'race_of_head_8':float, 'race_of_head_9':float,
       'recent_mover_0':float, 'recent_mover_1':float, 'tenure_1':float, 'tenure_2':float})

print(len(data))
data.head()


125539
Out[5]:
listing_id date rent bedrooms bathrooms sqft rent_sqft fips_block state region ... race_of_head_4 race_of_head_5 race_of_head_6 race_of_head_7 race_of_head_8 race_of_head_9 recent_mover_0 recent_mover_1 tenure_1 tenure_2
0 5915417519 2016-12-11 2586.0 2.0 NaN 1039.0 2.488932 060971530013001 CA sfbay ... NaN 2.0 23.0 NaN 115.0 29.0 1031.0 145.0 560.0 616.0
1 5915416703 2016-12-11 3178.0 1.0 NaN 574.0 5.536585 060750615003012 CA sfbay ... NaN NaN 472.0 NaN 5.0 2.0 697.0 513.0 613.0 597.0
2 5915414697 2016-12-11 3029.0 1.0 NaN 771.0 3.928664 060816080021000 CA sfbay ... NaN NaN 293.0 3.0 9.0 16.0 636.0 70.0 526.0 180.0
3 5915384383 2016-12-11 2395.0 1.0 NaN 770.0 3.110390 060855081021004 CA sfbay ... NaN NaN 704.0 NaN 1.0 21.0 1005.0 265.0 660.0 610.0
4 5915414020 2016-12-11 2735.0 2.0 NaN 922.0 2.966377 060411042002000 CA sfbay ... NaN 8.0 50.0 2.0 75.0 47.0 898.0 113.0 504.0 507.0

5 rows × 33 columns


In [6]:
# for census vars, NA really means 0...
census_cols = ['cars_tot', 'children_tot','persons_tot', 'workers_tot', 'age_of_head_med', 'income_med','hhs_tot', 'race_of_head_1', 'race_of_head_2', 'race_of_head_3','race_of_head_4', 'race_of_head_5', 'race_of_head_6', 'race_of_head_7','race_of_head_8', 'race_of_head_9', 'recent_mover_0', 'recent_mover_1','tenure_1', 'tenure_2']
for col in census_cols:
    data[col] = data[col].fillna(0)

Create variables

Variable codes

Race codes (from PUMS):

1. White alone
2. Black or African American alone
3. American Indian alone
4. Alaska Native alone
5. American Indian and Alaska Native tribes specified; or American Indian or Alaska Native, not specified and no other races
6. Asian alone
7. Native Hawaiian and Other Pacific Islander alone
8. Some other race alone
9. Two or more major race groups

tenure_1 = owner (based on my guess; didn't match the PUMS codes)

mover_1 = moved past year (based on my guess)
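
The mapping from the raw race_of_head_* counts to the pct_* shares built in create_census_vars above can be written out explicitly. A minimal sketch (it just restates the logic of that function; nothing here is new data):


In [ ]:
# Map each PUMS race-of-head code to the pct_* column created in create_census_vars.
RACE_CODE_COLUMNS = {1: 'pct_white', 2: 'pct_black', 3: 'pct_amer_native',
                     4: 'pct_alaska_native', 5: 'pct_any_native', 6: 'pct_asian',
                     7: 'pct_pacific', 8: 'pct_other_race', 9: 'pct_mixed_race'}

def add_race_shares(df):
    """Add race share columns from race_of_head_<code> counts (same logic as create_census_vars)."""
    for code, col in RACE_CODE_COLUMNS.items():
        df[col] = df['race_of_head_{}'.format(code)] / df['hhs_tot']
    return df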


In [7]:
# create useful variables 
data = create_census_vars(data)

# define some features to include in the model. 
features_to_examine = ['rent','ln_rent', 'bedrooms','bathrooms','sqft','pct_white', 'pct_black','pct_asian','pct_mover','pct_owner','income_med','age_of_head_med','avg_hh_size','cars_per_hh']
data[features_to_examine].describe()


C:\Anaconda3\lib\site-packages\numpy\lib\function_base.py:3834: RuntimeWarning: Invalid value encountered in percentile
  RuntimeWarning)
Out[7]:
rent ln_rent bedrooms bathrooms sqft pct_white pct_black pct_asian pct_mover pct_owner income_med age_of_head_med avg_hh_size cars_per_hh
count 1.255390e+05 125539.000000 125539.000000 63137.000000 125539.000000 125473.000000 125473.000000 125473.000000 125473.000000 125473.000000 125539.000000 125539.000000 125473.000000 125473.000000
mean 6.681448e+03 7.896976 1.772246 1.500744 1061.272983 0.613227 0.051669 0.252512 0.240305 0.441876 84181.561563 47.518759 2.386755 1.679141
std 6.574609e+05 0.406612 0.996353 0.640847 3834.013217 0.222957 0.099596 0.199378 0.135384 0.267546 35849.719881 7.499268 0.572601 0.480328
min 1.000000e+00 0.000000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 1.119074 0.084123
25% 2.135000e+03 7.666222 1.000000 NaN 713.000000 NaN NaN NaN NaN NaN 57650.000000 42.000000 NaN NaN
50% 2.600000e+03 7.863267 2.000000 NaN 922.000000 NaN NaN NaN NaN NaN 79500.000000 48.000000 NaN NaN
75% 3.341000e+03 8.114025 2.000000 NaN 1180.000000 NaN NaN NaN NaN NaN 105000.000000 53.000000 NaN NaN
max 2.100204e+08 19.162715 8.000000 7.500000 985024.000000 1.000000 0.940909 0.950382 0.782759 1.000000 267500.000000 78.000000 5.241071 2.966543

Filter outliers


In [8]:
# I've already identified these ranges as good at excluding outliers
rent_range=(100,10000)
sqft_range=(10,5000)
data = filter_outliers(data, rent_range=rent_range, sqft_range=sqft_range)


Dropped 566 outside rent range $100-$10000
Dropped 153 outside sqft range 10-5000 sqft. 124820 rows remaining

In [9]:
# Use this to explore outliers yourself. 
g=sns.distplot(data['rent'],  kde=False)
g.set_xlim(0,10000)

g=sns.distplot(data['sqft'], kde=False)
g.set_xlim(0,10000)


Out[9]:
(0, 10000)

Examine missing data


In [10]:
# examine NA's
print('Total rows:',len(data))
print('Rows with any NA:',len(data[pd.isnull(data).any(axis=1)]))
print('Rows with bathroom NA:',len(data[pd.isnull(data.bathrooms)]))
print('% rows missing bathroom col:',len(data[pd.isnull(data.bathrooms)])/len(data))


Total rows: 124820
Rows with any NA: 62105
Rows with bathroom NA: 62024
% rows missing bathroom col: 0.49690754686748917

Uh oh, about 50% of rows are missing the bathrooms feature. Might have to omit that one. Fewer than 0.1% of rows have any other missing values, so those should be ok.
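
A per-column summary makes this check easy to repeat as new columns are added; a quick sketch:


In [ ]:
# Share of missing values in each column, largest first.
na_share = data.isnull().mean().sort_values(ascending=False)
print(na_share[na_share > 0])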


In [11]:
#for d in range(1,31):
#    print(d,'% rows missing bathroom col:',len(data[pd.isnull(data.bathrooms)&((data.date.dt.month==12)&(data.date.dt.day==d))])/len(data[(data.date.dt.month==12)&(data.date.dt.day==d)]))

The commented-out check above shows the bathrooms column was added on Dec 21; from that date on, listings without a bathrooms value are thrown out. So if we need the bathrooms feature, we can restrict the data to listings from Dec 22 onward.
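
The day-by-day check in the commented cell above can also be written as a single groupby; a sketch, using data.date which was parsed as a datetime on load:


In [ ]:
# Share of listings missing bathrooms, by listing date.
bath_missing_by_day = data.bathrooms.isnull().groupby(data.date.dt.date).mean()
print(bath_missing_by_day)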


In [12]:
# uncomment to only use data from Dec 22 onward. 
#data=data[(data.date.dt.month>=12)&(data.date.dt.day>=22)]
#data.shape

In [13]:
# Uncomment to drop NA's
#n0 = len(data)
#data = data.dropna()
#print('Dropped {} rows with NAs'.format(n0-len(data)))

Look at distributions

Since rent has a roughly log-normal distribution, use ln_rent instead.


In [14]:
p=sns.distplot(data.rent, kde=False)
p.set_title('rent')


Out[14]:
<matplotlib.text.Text at 0x1f35fb15b38>

In [15]:
p=sns.distplot(data.ln_rent, kde=False)
p.set_title('ln rent')


Out[15]:
<matplotlib.text.Text at 0x1f35fc284a8>

In [16]:
plot_rows = math.ceil(len(features_to_examine)/2)

f, axes = plt.subplots(plot_rows,2, figsize=(8,15))
sns.despine(left=True)

for i,col in enumerate(features_to_examine):
    row_position = math.floor(i/2)
    col_position = i%2
    data_notnull = data[pd.notnull(data[col])]  # exclude NA values from plot
    sns.distplot(data_notnull[col], ax=axes[row_position, col_position],kde=False)
    axes[row_position, col_position].set_title('{}'.format(col)) 

plt.tight_layout()
plt.show()



In [17]:
data_notnull = data[pd.notnull(data['ln_income'])]
p=sns.distplot(data_notnull['ln_income'],kde=False)
p.set_title('ln med income')
# ln of median income is not any more normal; use median income instead.


Out[17]:
<matplotlib.text.Text at 0x1f360960e10>

Look at correlations


In [18]:
# correlation heatmap
corrmat=data[features_to_examine].corr()
corrmat.head()

f, ax = plt.subplots(figsize=(12, 9))
sns.heatmap(corrmat, vmax=.8, square=True)

f.tight_layout()


The correlations appear as expected, except for cars_per_hh. Maybe that's because cars_per_hh reflects household size more than income. Might want to try cars per adult instead.
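
A cars-per-adult variable could be built the same way as cars_per_hh; a quick sketch, approximating adults as persons_tot minus children_tot (that definition is a guess, not something taken from the synthetic population):


In [ ]:
# Hypothetical cars-per-adult variable; "adults" approximated as persons minus children.
adults_tot = data['persons_tot'] - data['children_tot']
data['cars_per_adult'] = data['cars_tot'] / adults_tot.replace(0, np.nan)
data[['cars_per_hh', 'cars_per_adult', 'income_med']].corr()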


In [19]:
print(data.columns)
#'pct_amer_native','pct_alaska_native',
x_cols = ['bedrooms','bathrooms', 'sqft','age_of_head_med', 'income_med','pct_white', 'pct_black', 'pct_any_native', 'pct_asian', 'pct_pacific',
       'pct_other_race', 'pct_mixed_race', 'pct_mover', 'pct_owner', 'avg_hh_size', 'cars_per_hh']
y_col = 'ln_rent'

print(len(data))

# exclude missing values
data_notnull= data[(pd.notnull(data[x_cols])).all(axis=1)]
data_notnull= data_notnull[(pd.notnull(data_notnull[y_col]))]
print('using {} rows of {} total'.format(len(data_notnull),len(data)))


Index(['listing_id', 'date', 'rent', 'bedrooms', 'bathrooms', 'sqft',
       'rent_sqft', 'fips_block', 'state', 'region', 'mpo_id', 'lng', 'lat',
       'cars_tot', 'children_tot', 'persons_tot', 'workers_tot',
       'age_of_head_med', 'income_med', 'hhs_tot', 'race_of_head_1',
       'race_of_head_2', 'race_of_head_3', 'race_of_head_4', 'race_of_head_5',
       'race_of_head_6', 'race_of_head_7', 'race_of_head_8', 'race_of_head_9',
       'recent_mover_0', 'recent_mover_1', 'tenure_1', 'tenure_2', 'pct_white',
       'pct_black', 'pct_amer_native', 'pct_alaska_native', 'pct_any_native',
       'pct_asian', 'pct_pacific', 'pct_other_race', 'pct_mixed_race',
       'pct_mover', 'pct_owner', 'avg_hh_size', 'cars_per_hh', 'ln_rent',
       'ln_income'],
      dtype='object')
124820
using 62754 rows of 124820 total

In [30]:
data_notnull.head()


Out[30]:
listing_id date rent bedrooms bathrooms sqft rent_sqft fips_block state region ... pct_asian pct_pacific pct_other_race pct_mixed_race pct_mover pct_owner avg_hh_size cars_per_hh ln_rent ln_income
90 5952685186 2017-01-09 1700.0 1.0 1.0 435.0 3.908046 060133170001018 CA sfbay ... 0.000000 0.005587 0.013035 0.067039 0.253259 0.504655 1.802607 1.791434 7.438384 11.238489
91 5952700057 2017-01-09 2392.0 1.0 1.0 556.0 4.302158 060855091021031 CA sfbay ... 0.488615 0.000000 0.015180 0.023719 0.187856 0.668880 1.857685 1.582543 7.779885 11.626254
92 5952699799 2017-01-09 3195.0 3.0 1.0 1107.0 2.886179 060855058002002 CA sfbay ... 0.166220 0.002681 0.058981 0.016086 0.203753 0.404826 2.016086 1.839142 8.069342 11.302204
93 5952699780 2017-01-09 2398.0 2.0 1.0 1030.0 2.328155 060855002004019 CA sfbay ... 0.244554 0.000000 0.028713 0.054455 0.500990 0.256436 1.887129 1.630693 7.782390 11.302204
94 5952699706 2017-01-09 3696.0 3.0 2.0 1327.0 2.785230 060855064011006 CA sfbay ... 0.122523 0.008108 0.114414 0.065766 0.177477 0.370270 2.406306 1.839640 8.215006 10.915088

5 rows × 48 columns


In [33]:
data_notnull.to_csv(os.path.join(data_dir, 'sfbay_listings_03172017_notnull.csv'), index=False)

Comparison of models

Try a linear model

We'll start with a linear model to use as the baseline.


In [20]:
from sklearn import linear_model
from sklearn.model_selection import train_test_split

In [21]:
# create training and testing datasets. 
# this creates a test set that is 30% of total obs.  
X_train, X_test, y_train, y_test = train_test_split(data_notnull[x_cols], data_notnull[y_col], test_size=.3, random_state=201)

In [22]:
regr = linear_model.LinearRegression()
regr.fit(X_train, y_train)


Out[22]:
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [23]:
# Intercept
print('Intercept:', regr.intercept_)

# The coefficients
print('Coefficients:')
pd.Series(regr.coef_, index=x_cols)


Intercept: 4.19441673223
Coefficients:
Out[23]:
bedrooms           0.072043
bathrooms          0.072522
sqft               0.000265
age_of_head_med    0.000375
income_med         0.000003
pct_white          3.662811
pct_black          3.260943
pct_any_native    -1.835531
pct_asian          3.768787
pct_pacific        5.576854
pct_other_race     4.323419
pct_mixed_race     3.062192
pct_mover         -0.153098
pct_owner         -0.079592
avg_hh_size       -0.041547
cars_per_hh       -0.341830
dtype: float64

In [24]:
# See mean square error, using test data
print("Mean squared error: %.2f" % np.mean((regr.predict(X_test) - y_test) ** 2))
print("RMSE:", np.sqrt(np.mean((regr.predict(X_test) - y_test) ** 2)))
# Explained variance score: 1 is perfect prediction
print('Variance score: %.2f' % regr.score(X_test, y_test))


Mean squared error: 0.06
RMSE: 0.237953812109
Variance score: 0.57

In [25]:
# Plot predicted values vs. observed
plt.scatter(regr.predict(X_train),y_train, color='blue',s=1, alpha=.5)
plt.show()



In [26]:
# plot residuals vs predicted values
plt.scatter(regr.predict(X_train), regr.predict(X_train)- y_train, color='blue',s=1, alpha=.5)
plt.scatter(regr.predict(X_test), regr.predict(X_test)- y_test, color='green',s=1, alpha=.5)
plt.show()


The residuals look pretty normally distributed.

I wonder if inclusion of all these race variables is leading to overfitting. If so, we'd have small error on training set and large error on test set.


In [26]:
print("Training set. Mean squared error: %.5f" % np.mean((regr.predict(X_train) - y_train) ** 2), '| Variance score: %.5f' % regr.score(X_train, y_train))
print("Test set. Mean squared error: %.5f" % np.mean((regr.predict(X_test) - y_test) ** 2), '| Variance score: %.5f' % regr.score(X_test, y_test))


Training set. Mean squared error: 0.06211 | Variance score: 0.54432
Test set. Mean squared error: 0.06038 | Variance score: 0.54917

Try Ridge Regression (linear regression with regularization)

Since the training error and test error are about the same, and since we're using few features, overfitting probably isn't a problem. If it were a problem, we would want to try a regression with regularization. Let's try it just for the sake of demonstration.


In [27]:
from sklearn.linear_model import Ridge

In [28]:
# try a range of different regularization terms.
for a in [10,1,0.1,.01,.001,.00001]:
    ridgereg = Ridge(alpha=a)
    ridgereg.fit(X_train, y_train)
    
    print('\n alpha:',a)
    print("Mean squared error: %.5f" % np.mean((ridgereg.predict(X_test) - y_test) ** 2),'| Variance score: %.5f' % ridgereg.score(X_test, y_test))


 alpha: 10
Mean squared error: 0.06101 | Variance score: 0.54446

 alpha: 1
Mean squared error: 0.06070 | Variance score: 0.54678

 alpha: 0.1
Mean squared error: 0.06040 | Variance score: 0.54897

 alpha: 0.01
Mean squared error: 0.06035 | Variance score: 0.54936

 alpha: 0.001
Mean squared error: 0.06037 | Variance score: 0.54920

 alpha: 1e-05
Mean squared error: 0.06038 | Variance score: 0.54917

In [29]:
# Intercept
print('Intercept:', ridgereg.intercept_)

# The coefficients
print('Coefficients:')
pd.Series(ridgereg.coef_, index=x_cols)


Intercept: 3.79611296526
Coefficients:
Out[29]:
bedrooms           0.080902
bathrooms          0.084796
sqft               0.000223
age_of_head_med    0.000365
income_med         0.000003
pct_white          4.053343
pct_black          3.653153
pct_any_native    -2.392905
pct_asian          4.183189
pct_pacific        6.299132
pct_other_race     4.881522
pct_mixed_race     3.471789
pct_mover         -0.154453
pct_owner         -0.078191
avg_hh_size       -0.060284
cars_per_hh       -0.334152
dtype: float64

As expected, Ridge regression doesn't help much. The best way to improve the model at this point is probably to add more features.

Random Forest


In [30]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error

In [ ]:
def RMSE(y_actual, y_predicted):
    return np.sqrt(mean_squared_error(y_actual, y_predicted))

def cross_val_rf(X, y,max_f='auto', n_trees = 50, cv_method='kfold', k=5):
    """Estimate a random forest model using cross-validation and return the average error across the folds.
    Args: 
        X (DataFrame): features data
        y (Series): target data
        max_f (str or int): how to select max features to consider for the best split. 
            If 'auto', then max_features=n_features.
            If 'sqrt', then max_features=sqrt(n_features).
            If 'log2', then max_features=log2(n_features).
            If int, then consider max_features features at each split.
        n_trees (int): number of trees to build
        cv_method (str): how to split the data ('kfold' (default) or 'timeseries')
        k (int): number of folds (default=5)
    Returns: 
        float: mean error (RMSE) across all training/test sets.
    """
    if cv_method == 'kfold':
        kf = KFold(n_splits=k, shuffle=True, random_state=2012016)  # use random seed for reproducibility. 
    
    E = np.ones(k)   # this array will hold the errors. 
    i=0
    for train, test in kf.split(X, y): 
        train_data_x = X.iloc[train]
        train_data_y = y.iloc[train]    
        test_data_x = X.iloc[test]
        test_data_y = y.iloc[test]

        # n_estimators is number of trees to build. 
        # max_features = 'auto' means the max_features = n_features. This is a parameter we should tune. 
        random_forest = RandomForestRegressor(n_estimators=n_trees, max_features=max_f, criterion='mse', max_depth=None)
        random_forest.fit(train_data_x,train_data_y)
        predict_y=random_forest.predict(test_data_x)
        E[i] = RMSE(test_data_y, predict_y)
        i+=1
    return np.mean(E)


def optimize_rf(df_X, df_y, max_n_trees=100, n_step = 20, cv_method='kfold', k=5): 
    """Optimize hyperparameters for a random forest regressor.
    Args: 
        df_X (DataFrame): features data
        df_y (Series): target data
        max_n_trees (int): max number of trees to generate
        n_step (int): intervals to use for max_n_trees
        cv_method (str): how to split the data ('kfold' (default) or 'timeseries')
        k (int): number of folds (default=5)
    """
    max_features_methods = ['auto','sqrt','log2']  # methods of defining max_features to try.
    
    # create a place to store the results, for easy plotting later. 
    results = pd.DataFrame(columns=max_features_methods, index=[x for x in range(10,max_n_trees+n_step,n_step)])
    
    for m in max_features_methods:
        print('max_features:',m)
        for n in results.index:
            error = cross_val_rf(df_X, df_y,max_f=m, n_trees=n)
            print('n_trees:',n,' error:',error)
            results.loc[n, m] = error
    return results

In [34]:
# data to use - exclude nulls
df_X = data_notnull[x_cols]
df_y = data_notnull[y_col]
print(df_X.shape, df_y.shape)
#df_all = pd.concat([data_notnull[x_cols],data_notnull[y_col]], axis=1)
#df_all.shape


(21006, 16) (21006,)

In [ ]:
# basic model to make sure it works
random_forest = RandomForestRegressor(n_estimators=10, criterion='mse', max_depth=None)
random_forest.fit(df_X,df_y)
y_predict = random_forest.predict(df_X)
RMSE(df_y,y_predict)

We can use k-fold validation if we believe the samples are independently and identically distributed. That's probably fine right now because we have only 1.5 months of data, but later we may have some time-dependent processes in these time series data. If we do use k-fold, I think we should shuffle the samples, because they come in a non-random (chronological) order.


In [ ]:
# without parameter tuning
cross_val_rf(df_X,df_y)

In [ ]:
# tune the parameters
rf_results = optimize_rf(df_X,df_y, max_n_trees = 100, n_step = 20)  # this is sufficient; very little improvement after n_trees=100. 
#rf_results2 = optimize_rf(df_X,df_y, max_n_trees = 500, n_step=100)
rf_results

In [ ]:
ax = rf_results.plot()
ax.set_xlabel('number of trees')
ax.set_ylabel('RMSE')
#rf_results2.plot()

Using m=sqrt(n_features) and log2(n_features) gives similar performance, and a slight improvement over m = n_features. After about 100 trees the error levels off. One of the nice things about random forest is that using additional trees doesn't lead to overfitting, so we could use more, but it's not necessary. Now we can fit the model using n_trees = 100 and m = sqrt.


In [ ]:
random_forest = RandomForestRegressor(n_estimators=100, max_features='sqrt', criterion='mse', max_depth=None)
random_forest.fit(df_X,df_y)
predict_y=random_forest.predict(df_X)

The 'importance' score provides an ordered qualitative ranking of the importance of each feature. It is calculated from the improvement in MSE provided by each feature when it is used to split the tree.


In [ ]:
# plot the importances
rf_o = pd.DataFrame({'features':x_cols,'importance':random_forest.feature_importances_})
rf_o= rf_o.sort_values(by='importance',ascending=False)


plt.figure(1,figsize=(12, 6))
plt.xticks(range(len(rf_o)), rf_o.features,rotation=45)
plt.plot(range(len(rf_o)),rf_o.importance,"o")
plt.title('Feature importances')
plt.show()

It's not surprising that sqft is the most important predictor, although it is strange that cars_per_hh is the second most important. I would have expected income to be higher in the list.

If we don't think the samples are i.i.d., it's better to use time series CV.


In [ ]:
from sklearn.model_selection import TimeSeriesSplit
tscv = TimeSeriesSplit(n_splits=5)
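
The cv_method='timeseries' option mentioned in the cross_val_rf docstring isn't implemented above; a minimal sketch of how the time-ordered split could be used in place of KFold, assuming the rows are sorted by date first:


In [ ]:
# Sketch: time-ordered cross-validation for the random forest.
data_sorted = data_notnull.sort_values('date')
X_ts, y_ts = data_sorted[x_cols], data_sorted[y_col]

ts_errors = []
for train_idx, test_idx in tscv.split(X_ts):
    rf = RandomForestRegressor(n_estimators=100, max_features='sqrt')
    rf.fit(X_ts.iloc[train_idx], y_ts.iloc[train_idx])
    ts_errors.append(RMSE(y_ts.iloc[test_idx], rf.predict(X_ts.iloc[test_idx])))
print(np.mean(ts_errors))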

Try Gradient Boosting


In [32]:
from sklearn.ensemble import GradientBoostingRegressor

In [83]:
def cross_val_gb(X,y,cv_method='kfold',k=5, **params):
    """Estimate gradient boosting regressor using cross validation.
    
    Args: 
        X (DataFrame): features data
        y (Series): target data
        cv_method (str): how to split the data ('kfold' (default) or 'timeseries')
        k (int): number of folds (default=5)
        **params: keyword arguments for regressor
    Returns: 
        float: mean error (RMSE) across all training/test sets.
    """
    if cv_method == 'kfold':
        kf = KFold(n_splits=k, shuffle=True, random_state=2012016)  # use random seed for reproducibility. 
    
    E = np.ones(k)   # this array will hold the errors. 
    i=0
    for train, test in kf.split(X, y): 
        train_data_x = X.iloc[train]
        train_data_y = y.iloc[train]    
        test_data_x = X.iloc[test]
        test_data_y = y.iloc[test]

        # n_estimators is number of trees to build. 
        grad_boost = GradientBoostingRegressor(loss='ls',criterion='mse', **params)
        grad_boost.fit(train_data_x,train_data_y)
        predict_y=grad_boost.predict(test_data_x)
        E[i] = RMSE(test_data_y, predict_y)
        i+=1
    return np.mean(E)

In [37]:
params = {'n_estimators':100,
          'learning_rate':0.1,
          'max_depth':1,
          'min_samples_leaf':4
          }
grad_boost = GradientBoostingRegressor(loss='ls',criterion='mse', **params)
grad_boost.fit(df_X,df_y)
RMSE(y_test, grad_boost.predict(X_test))

In [40]:
n_trees = 100
l_rate = 0.1
max_d = 1
cross_val_gb(df_X, df_y, n_estimators=n_trees, learning_rate=l_rate, max_depth=max_d)


Out[40]:
0.27197436244292783

Tune parameters

This time we'll use grid search in scikit-learn (GridSearchCV). This conducts an exhaustive search over the given parameter values to find the best combination for the given estimator.


In [70]:
from sklearn.model_selection import GridSearchCV
param_grid = {'learning_rate':[.1, .05, .02, .01],
              'max_depth':[2,4,6],
              'min_samples_leaf': [3,5,9,17],
              'max_features': [1, .3, .1]
              }

est= GradientBoostingRegressor(n_estimators = 1000)
gs_cv = GridSearchCV(est,param_grid).fit(df_X,df_y)

In [81]:
print(gs_cv.best_params_)
print(gs_cv.best_score_)


{'max_depth': 6, 'max_features': 0.3, 'learning_rate': 0.05, 'min_samples_leaf': 3}
0.73224907111

In [85]:
# best parameters
params = {'n_estimators':1000,
          'learning_rate':0.05,
          'max_depth':6,
          'min_samples_leaf':3
          }
grad_boost = GradientBoostingRegressor(loss='ls',criterion='mse', **params)
grad_boost.fit(df_X,df_y)
# note: df_X includes the rows in X_test, so this error estimate is partly in-sample and likely optimistic
RMSE(y_test, grad_boost.predict(X_test))


Out[85]:
0.10569476051230522

Wow, that's a big improvement on the random forest model!


In [86]:
# plot the importances
gb_o = pd.DataFrame({'features':x_cols,'importance':grad_boost.feature_importances_})
gb_o= gb_o.sort_values(by='importance',ascending=False)


plt.figure(1,figsize=(12, 6))
plt.xticks(range(len(gb_o)), gb_o.features,rotation=45)
plt.plot(range(len(gb_o)),gb_o.importance,"o")
plt.title('Feature importances')
plt.show()


Let's use partial_dependence to look at feature interactions. Look at the four most important features.


In [87]:
from sklearn.ensemble.partial_dependence import plot_partial_dependence
from sklearn.ensemble.partial_dependence import partial_dependence

In [97]:
df_X.columns


Out[97]:
Index(['bedrooms', 'bathrooms', 'sqft', 'age_of_head_med', 'income_med',
       'pct_white', 'pct_black', 'pct_any_native', 'pct_asian', 'pct_pacific',
       'pct_other_race', 'pct_mixed_race', 'pct_mover', 'pct_owner',
       'avg_hh_size', 'cars_per_hh'],
      dtype='object')

In [106]:
features = [2, 15, 4, 12,(4,2), (4,15)]
names = df_X.columns
fig, axs = plot_partial_dependence(grad_boost, df_X, features,feature_names=names, grid_resolution=50, figsize = (9,6))
fig.suptitle('Partial dependence of rental price features')
plt.subplots_adjust(top=0.9)  # tight_layout causes overlap with suptitle
plt.show()


In the last two plots, darker values indicate a stronger interaction between the two variables plotted. So income and sqft have a stronger dependence at higher values of both. The final plot suggests that for low levels of car ownership, there is a stronger dependence on median income.
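
The partial_dependence function imported above (but not used) returns the values behind these plots, which is handy for custom figures; a quick sketch for median income (column index 4 in df_X), using the same sklearn.ensemble.partial_dependence API imported earlier:


In [ ]:
# Sketch: raw partial dependence of ln_rent on income_med.
pdp, pd_axes = partial_dependence(grad_boost, [4], X=df_X, grid_resolution=50)
plt.plot(pd_axes[0], pdp[0])
plt.xlabel('income_med')
plt.ylabel('partial dependence of ln_rent')
plt.show()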