In [84]:
    
import pandas as pd
import paramiko
import os
import numpy as np
import math
import pylab
from math import sqrt
from sklearn.metrics import mean_squared_error
    
In [4]:
    
hostname = "169.229.154.119"
username = "cy290e"
password = ""  # local password 
#local_key_dir = '~/.ssh/known_hosts'
local_key_dir = 'C:/Users/Ruoying/.ssh/known_hosts'  # load_host_keys expects the known_hosts file, not the .ssh directory
"""SSH connection parameters"""
listings_dir = 'craigslist_census_02202017/'
"""Remote directory for results"""
# establish SSH connection
ssh = paramiko.SSHClient() 
ssh.load_host_keys(local_key_dir)
ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy())
ssh.connect(hostname,username=username, password=password)
sftp = ssh.open_sftp()
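    
To check the connection and see which state files are available for download, the remote results directory can be listed; a minimal sketch, assuming the paramiko client above connected successfully:
In [ ]:
    
# Sketch: list the csv files in the remote results directory, using the
# sftp client and listings_dir defined above.
remote_files = sorted(sftp.listdir(listings_dir))
print([f for f in remote_files if f.endswith('.csv')])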
    
    
In [41]:
    
import pysftp
    
In [42]:
    
srv = pysftp.Connection(host="169.229.154.119", username="cy290e", password="")
    
    
In [26]:
    
#https://www.example-code.com/python/sftp_use_ppk_putty_private_key.asp
import sys
import chilkat
sftp = chilkat.CkSFtp()
success = sftp.UnlockComponent("Anything for 30-day trial.")
if (success != True):
    print(sftp.lastErrorText())
    sys.exit()
#  Load a .ppk PuTTY private key.
puttyKey = chilkat.CkSshKey()
ppkText = puttyKey.loadText("C:/Users/Ruoying/.ssh/putty_rsa_private.ppk")
puttyKey.put_Password("")
success = puttyKey.FromPuttyPrivateKey(ppkText)
if (success != True):
    print(puttyKey.lastErrorText())
    sys.exit()
sshHostname = "169.229.154.119"
sshPort = 22
success = sftp.Connect(sshHostname,sshPort)
if (success != True):
    print(sftp.lastErrorText())
    sys.exit()
    
success = sftp.AuthenticatePk("cy290e",puttyKey)
if (success != True):
    print(sftp.lastErrorText())
    sys.exit()
    
In [17]:
    
#https://www.example-code.com/python/sftp_use_ppk_putty_private_key.asp
import sys
import chilkat
ssh = chilkat.CkSsh()
success = ssh.UnlockComponent("Anything for 30-day trial.")
if (success != True):
    print(ssh.lastErrorText())
    sys.exit()
#  Load a .ppk PuTTY private key.
puttyKey = chilkat.CkSshKey()
ppkText = puttyKey.loadText("C:/Users/Ruoying/.ssh/putty_rsa_private.ppk")
puttyKey.put_Password("")
success = puttyKey.FromPuttyPrivateKey(ppkText)
if (success != True):
    print(puttyKey.lastErrorText())
    sys.exit()
sshHostname = "169.229.154.119"
sshPort = 22
success = ssh.Connect(sshHostname,sshPort)
if (success != True):
    print(ssh.lastErrorText())
    sys.exit()
    
success = ssh.AuthenticatePk("cy290e",puttyKey)
if (success != True):
    print(ssh.lastErrorText())
    sys.exit()
    
In [31]:
    
# open a file on the server
# After authenticating, the SFTP subsystem must be initialized:
#https://www.example-code.com/python/sftp_downloadFile.asp
success = sftp.InitializeSftp()
if (success != True):
    print(sftp.lastErrorText())
    sys.exit()
    
In [34]:
    
newfile = sftp.openFile("craigslist_census_02202017/cl_census_CA.csv","readOnly","openExisting")  # remote paths use forward slashes
#newfile = sftp.openFile("test.py","readOnly","openExisting")
    
In [37]:
    
success=sftp.DownloadFile(newfile,"C:/Users/Ruoying/notebook/cp290/local_data/cl_census_CA.csv")
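    
As in the Chilkat download example linked above, the transfer result should be checked and the remote handle closed; a short sketch:
In [ ]:
    
# Sketch: report a failed transfer and release the remote file handle
# (CloseHandle takes the handle string returned by openFile).
if (success != True):
    print(sftp.lastErrorText())
success = sftp.CloseHandle(newfile)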
    
In [40]:
    
len(newfile)  # newfile is the handle string returned by openFile, not the file contents
    
    Out[40]:
In [7]:
    
def read_listings_file(fname):
    """Read csv file via SFTP and return as dataframe."""
    with sftp.open(listings_dir+fname) as f:
        df = pd.read_csv(f, delimiter=',', dtype={'date':str,'fips_block':str,'state':str,'mpo_id':str})
        # TODO: parse dates. 
    return df
def filter_outliers(df, rent_range=(100,10000),sqft_range=(10,5000)):
    """Drop outliers from listings dataframe. For now, only need to filter out rent and sq ft. 
    Args: 
        df: DataFrame with listings. Column names include ['rent','sqft']
        rent_range (tuple): min and max rent
        sqft_range (tuple): min and max sqft
    Returns: 
        DataFrame: listings data without outliers. 
    """
    n0=len(df)
    df=df[(df.rent>=rent_range[0])&(df.rent<rent_range[1])]
    n1=len(df)
    print('Dropped {} outside rent range ${}-${}'.format(n0-n1,rent_range[0],rent_range[1]))
    df=df[(df.sqft>=sqft_range[0])&(df.sqft<sqft_range[1])]
    n2=len(df)
    print('Dropped {} outside sqft range {}-{} sqft. {} rows remaining'.format(n1-n2,sqft_range[0],sqft_range[1],len(df)))
    return df
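    
A sketch for the date-parsing TODO in read_listings_file above, using a hypothetical helper and assuming the 'date' strings are in a format pandas can infer:
In [ ]:
    
# Sketch: parse the 'date' column after loading; errors='coerce' turns
# unparseable values into NaT rather than raising.
def parse_listing_dates(df, date_col='date'):
    df[date_col] = pd.to_datetime(df[date_col], errors='coerce')
    return df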
    
In [10]:
    
# get list of files and load. 
# TODO: decide what to load. 
state='NY'
infile='cl_census_{}.csv'.format(state)
df = read_listings_file(infile)
df.head()
    
    
In [2]:
    
# local data
df = pd.read_csv('C:/Users/Ruoying/notebook/cp290/rental-listings-census/data/cl_census_CA.csv')
    
In [3]:
    
df.head()
    
    Out[3]:
In [4]:
    
# for census vars, NA really means 0...
census_cols = ['cars_tot', 'children_tot', 'persons_tot', 'workers_tot',
               'age_of_head_med', 'income_med', 'hhs_tot',
               'race_of_head_1', 'race_of_head_2', 'race_of_head_3',
               'race_of_head_4', 'race_of_head_5', 'race_of_head_6',
               'race_of_head_7', 'race_of_head_8', 'race_of_head_9',
               'recent_mover_0', 'recent_mover_1', 'tenure_1', 'tenure_2']
for col in census_cols:
    df[col] = df[col].fillna(0)
    
In [5]:
    
df[census_cols].head()
    
    Out[5]:
In [6]:
    
# make meaningful variables.
df['pct_race_1'] = df['race_of_head_1']/df['hhs_tot']
df['pct_race_2'] = df['race_of_head_2']/df['hhs_tot']
df['pct_race_3'] = df['race_of_head_3']/df['hhs_tot']
df['pct_mover'] = df['recent_mover_1']/df['hhs_tot']
df['pct_tenure_1'] = df['tenure_1']/df['hhs_tot']
df['avg_hh_size'] = df['persons_tot']/df['hhs_tot']
df['cars_per_hh'] = df['cars_tot']/df['hhs_tot']
df['ln_rent'] = np.log(df['rent'])
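    
Because the census NAs were filled with 0 above, blocks with hhs_tot == 0 produce infinite ratios; a minimal sketch of one way to handle that before the dropna() step below:
In [ ]:
    
# Sketch: ratios computed with hhs_tot == 0 come out as inf; converting
# inf to NaN lets the later dropna() drop those rows.
df = df.replace([np.inf, -np.inf], np.nan)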
    
In [7]:
    
features_to_examine = ['rent', 'ln_rent', 'bedrooms', 'sqft',
                       'pct_race_1', 'pct_race_2', 'pct_race_3',
                       'pct_mover', 'pct_tenure_1', 'income_med',
                       'age_of_head_med', 'avg_hh_size', 'cars_per_hh']
df[features_to_examine].describe()
    
    Out[7]:
In [8]:
    
rent_range=(100,10000)
sqft_range=(10,5000)
"""Ranges to define outliers in CL data. I defined based on inspection of distributions."""
    
    Out[8]:
In [27]:
    
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
# use these plots to define the outlier ranges.
g = sns.distplot(df['rent'], kde=False)   # kde='false' is a truthy string; use the boolean
g.set_xlim(0, 100000)
plt.figure()                              # plot sqft on a separate figure
g = sns.distplot(df['sqft'], kde=False)
g.set_xlim(0, 10000)
    
    Out[27]:
    
In [15]:
    
df = filter_outliers(df)
    
    
In [16]:
    
df = df[features_to_examine]
print(len(df))
print(len(df[pd.isnull(df).any(axis=1)]))
df = df.dropna()
    
    
In [17]:
    
sns.distplot(df.rent)
    
    Out[17]:
    
In [18]:
    
sns.distplot(df.ln_rent)
    
    Out[18]:
    
In [26]:
    
plot_rows = int(math.ceil(len(features_to_examine) / 2.0))  # float division so ceil rounds up
f, axes = plt.subplots(plot_rows, 2, figsize=(8, 12))
sns.despine(left=True)
for i, col in enumerate(features_to_examine):
    row_position = i // 2   # integer row index
    col_position = i % 2
    p = sns.distplot(df[col], ax=axes[row_position, col_position])
    p.set_title('{}'.format(col))  # the format string was empty, which is why the titles were blank
plt.show()
    
    
    
In [ ]:
    
    
In [45]:
    
# correlation heatmap
corrmat=df[features_to_examine].corr()
corrmat.head()
f, ax = plt.subplots(figsize=(12, 9))
sns.heatmap(corrmat, vmax=.8, square=True)
f.tight_layout()
    
    
In [21]:
    
df.head()
    
    Out[21]:
In [24]:
    
import statsmodels.api as sm
from patsy import dmatrices
    
In [25]:
    
y, X = dmatrices('ln_rent ~ bedrooms+sqft+pct_race_1+pct_race_2+pct_race_3+pct_mover+pct_tenure_1+income_med+cars_per_hh', 
                 data=df, return_type='dataframe')
mod = sm.OLS(y, X)
res = mod.fit() 
print res.summary() 
# tenure_1: owner
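    
For a rough comparison with the cross-validated random forest error computed below, the in-sample RMSE of this OLS fit on ln_rent can be computed; a sketch using the fitted results object:
In [ ]:
    
# Sketch: in-sample RMSE of the OLS model (optimistic, since it is
# evaluated on the data used to fit it).
print(sqrt(mean_squared_error(y.values.ravel(), res.fittedvalues)))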
    
    
Random forest regressor
In [37]:
    
from sklearn.ensemble import RandomForestRegressor
from sklearn.cross_validation import KFold
    
    
In [38]:
    
df.columns.values
    
    Out[38]:
In [64]:
    
cols_all = ['ln_rent','bedrooms', 'sqft','pct_race_1', 'pct_race_2',
       'pct_race_3', 'pct_mover', 'pct_tenure_1', 'income_med',
       'age_of_head_med', 'avg_hh_size', 'cars_per_hh']
cols = ['bedrooms', 'sqft','pct_race_1', 'pct_race_2',
       'pct_race_3', 'pct_mover', 'pct_tenure_1', 'income_med',
       'age_of_head_med', 'avg_hh_size', 'cars_per_hh']
df_val=df[cols_all].values
X_df= df[cols].values
y_df = df['ln_rent'].values
    
In [65]:
    
#xx=X_df.as_matrix()
#yy=y_df.as_matrix()
X_df1=np.array(X_df)
y_df1=np.array(y_df)
df_np=np.array(df_val)
    
In [73]:
    
X_df2=df_np[:,1:]
print X_df2.shape
    
    
In [74]:
    
y_df2=df_np[:,:1]
print y_df2.shape
    
    
In [78]:
    
random_forest = RandomForestRegressor(n_estimators=100, criterion='mse', max_depth=None)
random_forest.fit(X_df2, y_df2.ravel())  # sklearn expects a 1-D target array
    
    
    Out[78]:
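    
A quick in-sample check of the forest fit above; this training-set error is optimistic relative to the cross-validated error computed later:
In [ ]:
    
# Sketch: RMSE of the forest on its own training data (for orientation
# only; see the k-fold cross-validation below for a fairer estimate).
print(sqrt(mean_squared_error(y_df2.ravel(), random_forest.predict(X_df2))))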
In [67]:
    
print df_np.shape
    
    
In [59]:
    
print X_df1.shape
    
    
In [60]:
    
random_forest = RandomForestRegressor(n_estimators=10, criterion='mse', max_depth=None)
random_forest.fit(X_df1, y_df1)
#random_forest.fit(X_df, y_df)
#y_test_rf = random_forest.predict(X_test_rf)
    
    Out[60]:
In [61]:
    
rf_o = pd.DataFrame({'features':cols,'importance':random_forest.feature_importances_})
rf_o = rf_o.sort_values('importance', ascending=False)
pylab.figure(1,figsize=(12, 6))
pylab.xticks(range(len(rf_o)), rf_o.features,rotation=45)
pylab.plot(range(len(rf_o)),rf_o.importance,"o")
pylab.show()
    
    
In [80]:
    
#RMSE
def RMSE(y_actual, y_predicted):
    return sqrt(mean_squared_error(y_actual, y_predicted))
# cross validation 
def cross_val(Traindata, est):
    kf=KFold(len(Traindata),n_folds=10)
    E = np.ones(10)
    i = 0
    for train,test in kf:
        train_data = Traindata[train,:]
        train_data_x = train_data[:,1:]
        train_data_y = train_data[:,:1]
        x = Traindata[test,:]
        test_data_x = x[:,1:]
        real_test_Y = x[:,:1]
        
        #all_predict=[]
        random_forest = RandomForestRegressor(n_estimators=est, criterion='mse', max_depth=None)
        random_forest.fit(train_data_x, train_data_y.ravel())  # 1-D target for sklearn
        predict_Y = random_forest.predict(test_data_x)
        error = RMSE(real_test_Y, predict_Y)
        E[i] = error
        i = i + 1
    return np.mean(E)
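    
A quick usage example of the helper above; runtime grows with the number of trees:
In [ ]:
    
# Sketch: mean 10-fold RMSE on ln_rent with 20 trees, using the df_np
# matrix built above (first column is ln_rent, the rest are features).
print(cross_val(df_np, est=20))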
    
In [86]:
    
# find the optimal number of trees (n_estimators) for the random forest
def opt_est(data):
    
    #initialize est
    est_max=30
    est=20
    est_list = []
    error_list = []
    
    while est<=est_max:
           
        err=cross_val(data,est)
        error_list.append(err)
        est_list.append(est)
        est+=1
    best_i = int(np.argmin(error_list))
    return error_list[best_i], est_list[best_i]
    
In [87]:
    
print "min RMSE", "   opt #est"
opt_est(df_np)
    
    
    
    Out[87]:
In [28]:
    
df1 = pd.read_csv('C:/Users/Ruoying/notebook/cp290/rental-listings-census/data/cl_census_CA.csv')
df1.head()
    
    Out[28]:
In [35]:
    
df1['rent']  # select the rent column (.loc['rent'] would look for a row label)