In [84]:
import pandas as pd
import paramiko
import os
import numpy as np
import math
import pylab
from math import sqrt
from sklearn.metrics import mean_squared_error

Remote connection parameters


In [4]:
hostname = "169.229.154.119"
username = "cy290e"
password = ""  # local password 
#local_key_dir = '~/.ssh/known_hosts'
local_key_dir = 'C:/Users/Ruoying/.ssh'
"""SSH connection parameters"""

listings_dir = 'craigslist_census_02202017/'
"""Remote directory for results"""

# establish SSH connection
ssh = paramiko.SSHClient() 
ssh.load_host_keys(local_key_dir)
ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy())
ssh.connect(hostname,username=username, password=password)
sftp = ssh.open_sftp()


---------------------------------------------------------------------------
IOError                                   Traceback (most recent call last)
<ipython-input-4-49579fcafc92> in <module>()
     11 # estbalish SSH connection
     12 ssh = paramiko.SSHClient()
---> 13 ssh.load_host_keys(local_key_dir)
     14 ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy())
     15 ssh.connect(hostname,username=username, password=password)

C:\Anaconda\lib\site-packages\paramiko\client.pyc in load_host_keys(self, filename)
    122         """
    123         self._host_keys_filename = filename
--> 124         self._host_keys.load(filename)
    125 
    126     def save_host_keys(self, filename):

C:\Anaconda\lib\site-packages\paramiko\hostkeys.pyc in load(self, filename)
     93         :raises IOError: if there was an error reading the file
     94         """
---> 95         with open(filename, 'r') as f:
     96             for lineno, line in enumerate(f, 1):
     97                 line = line.strip()

IOError: [Errno 13] Permission denied: 'C:/Users/Ruoying/.ssh'
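The IOError comes from passing the .ssh directory to load_host_keys, which expects the path to a known_hosts file (the commented-out '~/.ssh/known_hosts' path above is closer to what it needs). A minimal sketch of the corrected setup, assuming known_hosts exists in the usual location:

import os
import paramiko

# load_host_keys() wants a known_hosts *file*, not the .ssh directory
known_hosts = os.path.expanduser('~/.ssh/known_hosts')  # assumed default location

ssh = paramiko.SSHClient()
if os.path.isfile(known_hosts):
    ssh.load_host_keys(known_hosts)
ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy())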

In [41]:
import pysftp

In [42]:
srv = pysftp.Connection(host="169.229.154.119", username="cy290e", password="")


---------------------------------------------------------------------------
BadAuthenticationType                     Traceback (most recent call last)
<ipython-input-42-7eb43a73f19c> in <module>()
----> 1 srv = pysftp.Connection(host="169.229.154.119", username="cy290e", password="")

C:\Anaconda\lib\site-packages\pysftp\__init__.pyc in __init__(self, host, username, private_key, password, port, private_key_pass, ciphers, log, cnopts, default_path)
    141         self._transport.use_compression(self._cnopts.compression)
    142         self._set_authentication(password, private_key, private_key_pass)
--> 143         self._transport.connect(**self._tconnect)
    144 
    145     def _set_authentication(self, password, private_key, private_key_pass):

C:\Anaconda\lib\site-packages\paramiko\transport.pyc in connect(self, hostkey, username, password, pkey, gss_host, gss_auth, gss_kex, gss_deleg_creds)
   1110             else:
   1111                 self._log(DEBUG, 'Attempting password auth...')
-> 1112                 self.auth_password(username, password)
   1113 
   1114         return

C:\Anaconda\lib\site-packages\paramiko\transport.pyc in auth_password(self, username, password, event, fallback)
   1274             return []
   1275         try:
-> 1276             return self.auth_handler.wait_for_response(my_event)
   1277         except BadAuthenticationType as e:
   1278             # if password auth isn't allowed, but keyboard-interactive *is*, try to fudge it

C:\Anaconda\lib\site-packages\paramiko\auth_handler.pyc in wait_for_response(self, event)
    206             if issubclass(e.__class__, PartialAuthentication):
    207                 return e.allowed_types
--> 208             raise e
    209         return []
    210 

BadAuthenticationType: ('Bad authentication type', [u'publickey']) (allowed_types=[u'publickey'])
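The server only advertises publickey authentication (allowed_types=[u'publickey']), so password login is rejected regardless of the password. A hedged sketch of key-based authentication with paramiko, assuming the PuTTY .ppk key has first been converted to OpenSSH format (e.g. with PuTTYgen) and saved at a hypothetical path:

import paramiko

# allowed_types=['publickey']: authenticate with a private key instead of a password
key = paramiko.RSAKey.from_private_key_file('C:/Users/Ruoying/.ssh/id_rsa')  # hypothetical path
ssh = paramiko.SSHClient()
ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy())
ssh.connect("169.229.154.119", username="cy290e", pkey=key)
sftp = ssh.open_sftp()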

In [26]:
#https://www.example-code.com/python/sftp_use_ppk_putty_private_key.asp
import sys
import chilkat

sftp = chilkat.CkSFtp()
success = sftp.UnlockComponent("Anything for 30-day trial.")
if (success != True):
    print(sftp.lastErrorText())
    sys.exit()

# Load a .ppk PuTTY private key.
puttyKey = chilkat.CkSshKey()
ppkText = puttyKey.loadText("C:/Users/Ruoying/.ssh/putty_rsa_private.ppk")

puttyKey.put_Password("")
success = puttyKey.FromPuttyPrivateKey(ppkText)
if (success != True):
    print(puttyKey.lastErrorText())
    sys.exit()

sshHostname = "169.229.154.119"
sshPort = 22
success = sftp.Connect(sshHostname,sshPort)
if (success != True):
    print(sftp.lastErrorText())
    sys.exit()
    
success = sftp.AuthenticatePk("cy290e",puttyKey)
if (success != True):
    print(sftp.lastErrorText())
    sys.exit()

In [17]:
#https://www.example-code.com/python/sftp_use_ppk_putty_private_key.asp
import sys
import chilkat

ssh = chilkat.CkSsh()
success = ssh.UnlockComponent("Anything for 30-day trial.")
if (success != True):
    print(ssh.lastErrorText())
    sys.exit()

# Load a .ppk PuTTY private key.
puttyKey = chilkat.CkSshKey()
ppkText = puttyKey.loadText("C:/Users/Ruoying/.ssh/putty_rsa_private.ppk")

puttyKey.put_Password("")
success = puttyKey.FromPuttyPrivateKey(ppkText)
if (success != True):
    print(puttyKey.lastErrorText())
    sys.exit()

sshHostname = "169.229.154.119"
sshPort = 22
success = ssh.Connect(sshHostname,sshPort)
if (success != True):
    print(ssh.lastErrorText())
    sys.exit()
    
success = ssh.AuthenticatePk("cy290e",puttyKey)
if (success != True):
    print(ssh.lastErrorText())
    sys.exit()

In [31]:
# open a file on the server
# After authenticating, the SFTP subsystem must be initialized:
#https://www.example-code.com/python/sftp_downloadFile.asp

success = sftp.InitializeSftp()
if (success != True):
    print(sftp.lastErrorText())
    sys.exit()

In [34]:
newfile = sftp.openFile("craigslist_census_02202017\cl_census_CA.csv","readOnly","openExisting")
#newfile = sftp.openFile("test.py","readOnly","openExisting")

In [37]:
success=sftp.DownloadFile(newfile,"C:/Users/Ruoying/notebook/cp290/local_data/cl_census_CA.csv")

In [40]:
len(newfile)


Out[40]:
8

In [7]:
def read_listings_file(fname):
    """Read csv file via SFTP and return as dataframe."""
    with sftp.open(listings_dir+fname) as f:
        df = pd.read_csv(f, delimiter=',', dtype={'date':str,'fips_block':str,'state':str,'mpo_id':str})
        # TODO: parse dates. 
    return df

def filter_outliers(df, rent_range=(100,10000),sqft_range=(10,5000)):
    """Drop outliers from listings dataframe. For now, only need to filter out rent and sq ft. 
    Args: 
        df: Dataframe with listings. Cols names include ['rent','sqft']
        rent_range (tuple): min and max rent
        sqft_range (tuple): min and max sqft
    Returns: 
        DataFrame: listings data without outliers. 
    """
    n0=len(df)
    df=df[(df.rent>=rent_range[0])&(df.rent<rent_range[1])]
    n1=len(df)
    print('Dropped {} outside rent range ${}-${}'.format(n0-n1,rent_range[0],rent_range[1]))
    df=df[(df.sqft>=sqft_range[0])&(df.sqft<sqft_range[1])]
    n2=len(df)
    print('Dropped {} outside sqft range {}-{} sqft. {} rows remaining'.format(n1-n2,sqft_range[0],sqft_range[1],len(df)))
    return(df)

Load data


In [10]:
# get list of files and load. 
# TODO: decide what to load. 
state='NY'
infile='cl_census_{}.csv'.format(state)
df = read_listings_file(infile)
df.head()


---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-10-259ec032052a> in <module>()
      3 state='NY'
      4 infile='cl_census_{}.csv'.format(state)
----> 5 df = read_listings_file(infile)
      6 df.head()

<ipython-input-7-d71d48036a4d> in read_listings_file(fname)
      1 def read_listings_file(fname):
      2     """Read csv file via SFTP and return as dataframe."""
----> 3     with sftp.open(listings_dir+fname) as f:
      4         df = pd.read_csv(f, delimiter=',', dtype={'date':str,'fips_block':str,'state':str,'mpo_id':str})
      5         # TODO: parse dates.

C:\Users\Ruoying\AppData\Roaming\Python\Python27\site-packages\chilkat.pyc in <lambda>(self, name)
  17068     __setattr__ = lambda self, name, value: _swig_setattr(self, CkSFtp, name, value)
  17069     __swig_getmethods__ = {}
> 17070     __getattr__ = lambda self, name: _swig_getattr(self, CkSFtp, name)
  17071     __repr__ = _swig_repr
  17072 

C:\Users\Ruoying\AppData\Roaming\Python\Python27\site-packages\chilkat.pyc in _swig_getattr(self, class_type, name)
     72 
     73 def _swig_getattr(self, class_type, name):
---> 74     return _swig_getattr_nondynamic(self, class_type, name, 0)
     75 
     76 

C:\Users\Ruoying\AppData\Roaming\Python\Python27\site-packages\chilkat.pyc in _swig_getattr_nondynamic(self, class_type, name, static)
     67         return method(self)
     68     if (not static):
---> 69         return object.__getattr__(self, name)
     70     else:
     71         raise AttributeError(name)

AttributeError: type object 'object' has no attribute '__getattr__'
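The AttributeError arises because read_listings_file assumes sftp is a paramiko SFTPClient, whose open() returns a file-like object pandas can read; here sftp is the chilkat CkSFtp object from In [26], which has no open attribute. A hypothetical call once a paramiko SFTP session is available (the cells below fall back to a local copy of the CSV instead):

# assumes sftp = ssh.open_sftp() from a successful paramiko connection
with sftp.open(listings_dir + 'cl_census_NY.csv') as f:
    df = pd.read_csv(f, dtype={'date': str, 'fips_block': str, 'state': str, 'mpo_id': str})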

In [2]:
# local data
df = pd.read_csv('C:/Users/Ruoying/notebook/cp290/rental-listings-census/data/cl_census_CA.csv')

In [3]:
df.head()


Out[3]:
listing_id date rent bedrooms sqft rent_sqft fips_block state mpo_id cars_tot ... race_of_head_4 race_of_head_5 race_of_head_6 race_of_head_7 race_of_head_8 race_of_head_9 recent_mover_0 recent_mover_1 tenure_1 tenure_2
0 5873877617 2016-11-13 925 2 874 1.058352 60470010021060 CA 6197202 4603 ... NaN 8 527 1 198 42 1516 628 1310 834
1 5873876292 2016-11-13 735 0 650 1.130769 60070012001017 CA 6198000 646 ... NaN NaN 6 1 3 16 371 44 98 317
2 5873889346 2016-11-13 1675 3 1000 1.675000 60790125023017 CA 6199200 1902 ... NaN NaN 10 NaN 30 21 712 257 477 492
3 5873893871 2016-11-13 1818 2 1084 1.677122 60830020061208 CA 6196600 1058 ... NaN 2 21 1 8 8 427 41 420 48
4 5849643168 2016-11-13 1050 2 1100 0.954545 60890113004008 CA 6198100 786 ... NaN 3 NaN 2 26 23 287 205 202 290

5 rows × 29 columns


In [4]:
# for census vars, NA really means 0...
census_cols = ['cars_tot', 'children_tot', 'persons_tot', 'workers_tot', 'age_of_head_med',
               'income_med', 'hhs_tot', 'race_of_head_1', 'race_of_head_2', 'race_of_head_3',
               'race_of_head_4', 'race_of_head_5', 'race_of_head_6', 'race_of_head_7',
               'race_of_head_8', 'race_of_head_9', 'recent_mover_0', 'recent_mover_1',
               'tenure_1', 'tenure_2']
for col in census_cols:
    df[col] = df[col].fillna(0)
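The per-column loop works; an equivalent vectorized form is:

df[census_cols] = df[census_cols].fillna(0)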

In [5]:
df[census_cols].head()


Out[5]:
cars_tot children_tot persons_tot workers_tot age_of_head_med income_med hhs_tot race_of_head_1 race_of_head_2 race_of_head_3 race_of_head_4 race_of_head_5 race_of_head_6 race_of_head_7 race_of_head_8 race_of_head_9 recent_mover_0 recent_mover_1 tenure_1 tenure_2
0 4603 2790 7775 2978 44 64205 2144 1295 66 7 0 8 527 1 198 42 1516 628 1310 834
1 646 53 645 394 55 21000 415 387 0 2 0 0 6 1 3 16 371 44 98 317
2 1902 598 2307 1183 48 50000 969 899 6 3 0 0 10 0 30 21 712 257 477 492
3 1058 252 1210 516 55 82675 468 414 0 14 0 2 21 1 8 8 427 41 420 48
4 786 398 1385 454 49 39500 492 425 0 13 0 3 0 2 26 23 287 205 202 290

In [6]:
# Derive interpretable shares and rates from the census block counts.
df['pct_race_1'] = df['race_of_head_1']/df['hhs_tot']  # share of households by race of head (category 1)
df['pct_race_2'] = df['race_of_head_2']/df['hhs_tot']  # category 2
df['pct_race_3'] = df['race_of_head_3']/df['hhs_tot']  # category 3
df['pct_mover'] = df['recent_mover_1']/df['hhs_tot']   # share of recent-mover households
df['pct_tenure_1'] = df['tenure_1']/df['hhs_tot']      # share of owner households (tenure_1: owner)
df['avg_hh_size'] = df['persons_tot']/df['hhs_tot']
df['cars_per_hh'] = df['cars_tot']/df['hhs_tot']


df['ln_rent'] = np.log(df['rent'])  # rent is roughly log-normal, so model its log

In [7]:
features_to_examine = ['rent','ln_rent', 'bedrooms', 'sqft','pct_race_1','pct_race_2','pct_race_3','pct_mover','pct_tenure_1','income_med','age_of_head_med','avg_hh_size','cars_per_hh']
df[features_to_examine].describe()


Out[7]:
rent ln_rent bedrooms sqft pct_race_1 pct_race_2 pct_race_3 pct_mover pct_tenure_1 income_med age_of_head_med avg_hh_size cars_per_hh
count 99983.000000 99983.000000 99983.000000 99983.000000 99873.000000 99873.000000 99873.000000 99873.000000 99873.000000 99983.000000 99983.000000 99873.000000 99873.000000
mean 4146.943140 7.606714 1.855145 1099.243551 0.689928 0.055831 0.004523 0.250387 0.422082 67663.143429 47.536421 2.454458 1.712874
std 295221.312495 0.514716 1.026275 1588.197792 0.208653 0.089254 0.008103 0.144263 0.289548 33404.836031 8.167785 0.650203 0.436757
min 1.000000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000
25% 1500.000000 7.313220 1.000000 740.000000 0.564815 0.000000 0.000000 0.137255 0.166667 43575.000000 42.000000 1.956522 1.440000
50% 1995.000000 7.598399 2.000000 950.000000 0.727833 0.022613 0.002198 0.244579 0.400000 61600.000000 48.000000 2.410755 1.745501
75% 2685.000000 7.895436 2.000000 1200.000000 0.850420 0.072745 0.005634 0.349929 0.657823 85000.000000 53.000000 2.884972 2.008929
max 83157676.000000 18.236249 9.000000 227510.000000 1.000000 0.973036 0.252294 1.000000 1.000000 421000.000000 79.000000 5.248571 2.917143

Filter outliers

The following can be used to examine the outliers.


In [8]:
rent_range=(100,10000)
sqft_range=(10,5000)
"""Ranges to define outliers in CL data. I defined based on inspection of distributions."""


Out[8]:
'Ranges to define outliers in CL data, chosen by inspecting the distributions.'

In [27]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
# use these distributions to choose the outlier cutoffs
g=sns.distplot(df['rent'],kde=False)
g.set_xlim(0,100000)

g=sns.distplot(df['sqft'],kde=False)
g.set_xlim(0,10000)


Out[27]:
(0, 10000)

In [15]:
df = filter_outliers(df)


Dropped 614 outside rent range $100-$10000
Dropped 161 outside sqft range 10-5000 sqft. 99208 rows remaining

Drop NA


In [16]:
df = df[features_to_examine]
print(len(df))
print(len(df[pd.isnull(df).any(axis=1)]))
df = df.dropna()


99208
110

Look at distributions

Since rent is roughly log-normally distributed, we use ln_rent instead of rent.


In [17]:
sns.distplot(df.rent)


Out[17]:
<matplotlib.axes._subplots.AxesSubplot at 0x1a142b70>

In [18]:
sns.distplot(df.ln_rent)


Out[18]:
<matplotlib.axes._subplots.AxesSubplot at 0x858def0>

In [26]:
plot_rows = math.ceil(len(features_to_examine)/2)

f, axes = plt.subplots(plot_rows,2, figsize=(8,12))
sns.despine(left=True)

for i,col in enumerate(features_to_examine):
    row_position = round(i/2)
    col_position = i%2
    p=sns.distplot(df[col], ax=axes[row_position, col_position])
    p.set_title(''.format(col))  # why isn't the subplot title working?
plt.show()


---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-26-5620a62eade9> in <module>()
      1 plot_rows = math.ceil(len(features_to_examine)/2)
      2 
----> 3 f, axes = plt.subplots(plot_rows,2, figsize=(8,12))
      4 sns.despine(left=True)
      5 

C:\Anaconda\lib\site-packages\matplotlib\pyplot.pyc in subplots(nrows, ncols, sharex, sharey, squeeze, subplot_kw, gridspec_kw, **fig_kw)
   1184 
   1185     # Create first subplot separately, so we can share it if requested
-> 1186     ax0 = fig.add_subplot(gs[0, 0], **subplot_kw)
   1187     axarr[0] = ax0
   1188 

C:\Anaconda\lib\site-packages\matplotlib\figure.pyc in add_subplot(self, *args, **kwargs)
   1003                     self._axstack.remove(ax)
   1004 
-> 1005             a = subplot_class_factory(projection_class)(self, *args, **kwargs)
   1006 
   1007         self._axstack.add(key, a)

C:\Anaconda\lib\site-packages\matplotlib\axes\_subplots.pyc in __init__(self, fig, *args, **kwargs)
     68             raise ValueError('Illegal argument(s) to subplot: %s' % (args,))
     69 
---> 70         self.update_params()
     71 
     72         # _axes_class is set in the subplot_class_factory

C:\Anaconda\lib\site-packages\matplotlib\axes\_subplots.pyc in update_params(self)
    112         self.figbox, self.rowNum, self.colNum, self.numRows, self.numCols = \
    113             self.get_subplotspec().get_position(self.figure,
--> 114                                                 return_all=True)
    115 
    116     def is_first_col(self):

C:\Anaconda\lib\site-packages\matplotlib\gridspec.pyc in get_position(self, fig, return_all)
    423 
    424         figBottoms, figTops, figLefts, figRights = \
--> 425                     gridspec.get_grid_positions(fig)
    426 
    427 

C:\Anaconda\lib\site-packages\matplotlib\gridspec.pyc in get_grid_positions(self, fig)
    103             cellHeights = [netHeight*r/tr for r in self._row_height_ratios]
    104         else:
--> 105             cellHeights = [cellH] * nrows
    106 
    107         sepHeights = [0] + ([sepH] * (nrows-1))

TypeError: can't multiply sequence by non-int of type 'float'
<matplotlib.figure.Figure at 0x19c7c390>
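The TypeError comes from the Python 2 environment: math.ceil returns a float, and plt.subplots needs integer row/column counts. Likewise round(i/2) yields float indices into the axes array, and the empty format string explains the missing subplot titles. A sketch of the corrected cell, under those assumptions:

plot_rows = int(math.ceil(len(features_to_examine) / 2.0))  # integer row count

f, axes = plt.subplots(plot_rows, 2, figsize=(8, 12))
sns.despine(left=True)

for i, col in enumerate(features_to_examine):
    row_position = i // 2          # integer indices into the axes grid
    col_position = i % 2
    p = sns.distplot(df[col], ax=axes[row_position, col_position])
    p.set_title('{}'.format(col))  # pass the column name into the format string
plt.tight_layout()
plt.show()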

Look at correlations


In [45]:
# correlation heatmap
corrmat=df[features_to_examine].corr()
corrmat.head()

f, ax = plt.subplots(figsize=(12, 9))
sns.heatmap(corrmat, vmax=.8, square=True)

f.tight_layout()



In [21]:
df.head()


Out[21]:
rent ln_rent bedrooms sqft pct_race_1 pct_race_2 pct_race_3 pct_mover pct_tenure_1 income_med age_of_head_med avg_hh_size cars_per_hh
0 925 6.829794 2 874 0.604011 0.030784 0.003265 0.292910 0.611007 64205 44 3.626399 2.146922
1 735 6.599870 0 650 0.932530 0.000000 0.004819 0.106024 0.236145 21000 55 1.554217 1.556627
2 1675 7.423568 3 1000 0.927761 0.006192 0.003096 0.265222 0.492260 50000 48 2.380805 1.962848
3 1818 7.505492 2 1084 0.884615 0.000000 0.029915 0.087607 0.897436 82675 55 2.585470 2.260684
4 1050 6.956545 2 1100 0.863821 0.000000 0.026423 0.416667 0.410569 39500 49 2.815041 1.597561

In [24]:
import statsmodels.api as sm
from patsy import dmatrices

In [25]:
y, X = dmatrices('ln_rent ~ bedrooms+sqft+pct_race_1+pct_race_2+pct_race_3+pct_mover+pct_tenure_1+income_med+cars_per_hh', 
                 data=df, return_type='dataframe')
mod = sm.OLS(y, X)
res = mod.fit() 
print res.summary() 

# tenure_1: owner


                            OLS Regression Results                            
==============================================================================
Dep. Variable:                ln_rent   R-squared:                       0.472
Model:                            OLS   Adj. R-squared:                  0.472
Method:                 Least Squares   F-statistic:                     9827.
Date:                Fri, 10 Mar 2017   Prob (F-statistic):               0.00
Time:                        10:13:04   Log-Likelihood:                -33714.
No. Observations:               99098   AIC:                         6.745e+04
Df Residuals:                   99088   BIC:                         6.754e+04
Df Model:                           9                                         
Covariance Type:            nonrobust                                         
================================================================================
                   coef    std err          t      P>|t|      [95.0% Conf. Int.]
--------------------------------------------------------------------------------
Intercept        7.5936      0.007   1020.176      0.000         7.579     7.608
bedrooms         0.0161      0.002      9.098      0.000         0.013     0.020
sqft             0.0004   3.41e-06    110.067      0.000         0.000     0.000
pct_race_1      -0.1606      0.006    -26.667      0.000        -0.172    -0.149
pct_race_2      -0.4731      0.014    -33.816      0.000        -0.501    -0.446
pct_race_3      -6.9133      0.138    -50.061      0.000        -7.184    -6.643
pct_mover       -0.3336      0.010    -32.136      0.000        -0.354    -0.313
pct_tenure_1    -0.4853      0.007    -65.837      0.000        -0.500    -0.471
income_med    7.632e-06   4.33e-08    176.290      0.000      7.55e-06  7.72e-06
cars_per_hh     -0.2819      0.004    -78.510      0.000        -0.289    -0.275
==============================================================================
Omnibus:                    13777.824   Durbin-Watson:                   1.456
Prob(Omnibus):                  0.000   Jarque-Bera (JB):            48979.321
Skew:                          -0.688   Prob(JB):                         0.00
Kurtosis:                       6.158   Cond. No.                     9.59e+06
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 9.59e+06. This might indicate that there are
strong multicollinearity or other numerical problems.

Random forest regressor


In [37]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.cross_validation import KFold


C:\Anaconda\lib\site-packages\sklearn\cross_validation.py:44: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.
  "This module will be removed in 0.20.", DeprecationWarning)

In [38]:
df.columns.values


Out[38]:
array(['rent', 'ln_rent', 'bedrooms', 'sqft', 'pct_race_1', 'pct_race_2',
       'pct_race_3', 'pct_mover', 'pct_tenure_1', 'income_med',
       'age_of_head_med', 'avg_hh_size', 'cars_per_hh'], dtype=object)

In [64]:
cols_all = ['ln_rent','bedrooms', 'sqft','pct_race_1', 'pct_race_2',
       'pct_race_3', 'pct_mover', 'pct_tenure_1', 'income_med',
       'age_of_head_med', 'avg_hh_size', 'cars_per_hh']

cols = ['bedrooms', 'sqft','pct_race_1', 'pct_race_2',
       'pct_race_3', 'pct_mover', 'pct_tenure_1', 'income_med',
       'age_of_head_med', 'avg_hh_size', 'cars_per_hh']

df_val=df[cols_all].values
X_df= df[cols].values
y_df = df['ln_rent'].values

In [65]:
#xx=X_df.as_matrix()
#yy=y_df.as_matrix()

# .values already returns NumPy arrays, so these calls just copy them
X_df1=np.array(X_df)
y_df1=np.array(y_df)
df_np=np.array(df_val)

In [73]:
X_df2=df_np[:,1:]
print X_df2.shape


(99098L, 11L)

In [74]:
y_df2=df_np[:,:1]
print y_df2.shape


(99098L, 1L)

In [78]:
random_forest = RandomForestRegressor(n_estimators=100, criterion='mse', max_depth=None)
random_forest.fit(X_df2, y_df2)


C:\Anaconda\lib\site-packages\ipykernel\__main__.py:3: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().
  app.launch_new_instance()
Out[78]:
RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_split=1e-07, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=100, n_jobs=1, oob_score=False, random_state=None,
           verbose=0, warm_start=False)
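The DataConversionWarning is raised because y_df2 has shape (n_samples, 1); passing a flattened target gives the same fit without the warning:

random_forest.fit(X_df2, y_df2.ravel())  # y with shape (n_samples,) silences the warning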

In [67]:
print df_np.shape


(99098L, 12L)

In [59]:
print X_df1.shape


(99098L, 11L)

In [60]:
random_forest = RandomForestRegressor(n_estimators=10, criterion='mse', max_depth=None)
random_forest.fit(X_df1, y_df1)
#random_forest.fit(X_df, y_df)
#y_test_rf = random_forest.predict(X_test_rf)


Out[60]:
RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_split=1e-07, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
           verbose=0, warm_start=False)

In [61]:
rf_o = pd.DataFrame({'features':cols,'importance':random_forest.feature_importances_})
rf_o = rf_o.sort('importance', ascending=False)  # DataFrame.sort(); renamed sort_values() in newer pandas
pylab.figure(1,figsize=(12, 6))
pylab.xticks(range(len(rf_o)), rf_o.features,rotation=45)
pylab.plot(range(len(rf_o)),rf_o.importance,"o")

pylab.show()



In [80]:
#RMSE
def RMSE(y_actual, y_predicted):
    return sqrt(mean_squared_error(y_actual, y_predicted))

# 10-fold cross validation returning the mean RMSE across folds
def cross_val(Traindata, est):
    kf=KFold(len(Traindata),n_folds=10)
    E = np.ones(10)
    i = 0
    for train,test in kf:
        train_data = Traindata[train,:]
        train_data_x = train_data[:,1:]
        train_data_y = train_data[:,:1]
        x = Traindata[test,:]
        test_data_x = x[:,1:]

        real_test_Y = x[:,:1]
        
        #all_predict=[]
        random_forest = RandomForestRegressor(n_estimators=est, criterion='mse', max_depth=None)
        random_forest.fit(train_data_x,train_data_y)
        predict_Y=random_forest.predict(test_data_x)
        error = RMSE(real_test_Y, predict_Y)
        E[i] = error
        i = i + 1
    return np.mean(E)
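A hedged usage example, with df_np laid out as built above (first column ln_rent, remaining columns the features):

mean_rmse = cross_val(df_np, est=20)  # mean 10-fold RMSE for a 20-tree forest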

In [86]:
# find the optimal number of trees (n_estimators) for the random forest
# by comparing cross-validated RMSE
def opt_est(data):

    # search n_estimators from 20 to 30
    est_max=30
    est=20
    est_list = []
    error_list = []
    
    while est<=est_max:
           
        err=cross_val(data,est)
        error_list.append(err)
        est_list.append(est)
        est+=1

    # return the lowest RMSE and the n_estimators that achieved it
    best = int(np.argmin(error_list))
    return error_list[best], est_list[best]

In [87]:
print "min RMSE", "   opt #est"
opt_est(df_np)


min RMSE    opt #est
C:\Anaconda\lib\site-packages\ipykernel\__main__.py:21: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().
Out[87]:
(0.18566607643666994, 30)

In [28]:
df1 = pd.read_csv('C:/Users/Ruoying/notebook/cp290/rental-listings-census/data/cl_census_CA.csv')
df1.head()


Out[28]:
listing_id date rent bedrooms sqft rent_sqft fips_block state mpo_id cars_tot ... race_of_head_4 race_of_head_5 race_of_head_6 race_of_head_7 race_of_head_8 race_of_head_9 recent_mover_0 recent_mover_1 tenure_1 tenure_2
0 5873877617 2016-11-13 925 2 874 1.058352 60470010021060 CA 6197202 4603 ... NaN 8 527 1 198 42 1516 628 1310 834
1 5873876292 2016-11-13 735 0 650 1.130769 60070012001017 CA 6198000 646 ... NaN NaN 6 1 3 16 371 44 98 317
2 5873889346 2016-11-13 1675 3 1000 1.675000 60790125023017 CA 6199200 1902 ... NaN NaN 10 NaN 30 21 712 257 477 492
3 5873893871 2016-11-13 1818 2 1084 1.677122 60830020061208 CA 6196600 1058 ... NaN 2 21 1 8 8 427 41 420 48
4 5849643168 2016-11-13 1050 2 1100 0.954545 60890113004008 CA 6198100 786 ... NaN 3 NaN 2 26 23 287 205 202 290

5 rows × 29 columns


In [35]:
df1.loc['rent']


---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
<ipython-input-35-adce12dc3044> in <module>()
----> 1 df1.loc['rent']

C:\Anaconda\lib\site-packages\pandas\core\indexing.pyc in __getitem__(self, key)
   1187             return self._getitem_tuple(key)
   1188         else:
-> 1189             return self._getitem_axis(key, axis=0)
   1190 
   1191     def _getitem_axis(self, key, axis=0):

C:\Anaconda\lib\site-packages\pandas\core\indexing.pyc in _getitem_axis(self, key, axis)
   1331 
   1332         # fall thru to straight lookup
-> 1333         self._has_valid_type(key, axis)
   1334         return self._get_label(key, axis=axis)
   1335 

C:\Anaconda\lib\site-packages\pandas\core\indexing.pyc in _has_valid_type(self, key, axis)
   1293                 raise
   1294             except:
-> 1295                 error()
   1296 
   1297         return True

C:\Anaconda\lib\site-packages\pandas\core\indexing.pyc in error()
   1280                         "cannot use label indexing with a null key")
   1281                 raise KeyError("the label [%s] is not in the [%s]" %
-> 1282                                (key, self.obj._get_axis_name(axis)))
   1283 
   1284             try:

KeyError: 'the label [rent] is not in the [index]'
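.loc indexes rows by label, and 'rent' is not in the row index, hence the KeyError. To select the rent column, index columns instead:

df1['rent']            # column selection
df1.loc[:, 'rent']     # equivalent with .loc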