In [84]:
import pandas as pd
import paramiko
import os
import numpy as np
import math
import pylab
from math import sqrt
from sklearn.metrics import mean_squared_error
In [4]:
hostname = "169.229.154.119"
username = "cy290e"
password = "" # local password
#local_key_dir = '~/.ssh/known_hosts'
local_key_dir = 'C:/Users/Ruoying/.ssh'
"""SSH connection parameters"""
listings_dir = 'craigslist_census_02202017/'
"""Remote directory for results"""
# estbalish SSH connection
ssh = paramiko.SSHClient()
ssh.load_host_keys(local_key_dir)
ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy())
ssh.connect(hostname,username=username, password=password)
sftp = ssh.open_sftp()
In [41]:
import pysftp
In [42]:
srv = pysftp.Connection(host="169.229.154.119", username="cy290e", password="")
In [26]:
#https://www.example-code.com/python/sftp_use_ppk_putty_private_key.asp
import sys
import chilkat
sftp = chilkat.CkSFtp()
success = sftp.UnlockComponent("Anything for 30-day trial.")
if (success != True):
print(sftp.lastErrorText())
sys.exit()
# Load a .ppk PuTTY private key.
puttyKey = chilkat.CkSshKey()
ppkText = puttyKey.loadText("C:/Users/Ruoying/.ssh/putty_rsa_private.ppk")
puttyKey.put_Password("")
success = puttyKey.FromPuttyPrivateKey(ppkText)
if (success != True):
print(puttyKey.lastErrorText())
sys.exit()
sshHostname = "169.229.154.119"
sshPort = 22
success = sftp.Connect(sshHostname,sshPort)
if (success != True):
print(sftp.lastErrorText())
sys.exit()
success = sftp.AuthenticatePk("cy290e",puttyKey)
if (success != True):
print(sftp.lastErrorText())
sys.exit()
In [17]:
#https://www.example-code.com/python/sftp_use_ppk_putty_private_key.asp
import sys
import chilkat
ssh = chilkat.CkSsh()
success = ssh.UnlockComponent("Anything for 30-day trial.")
if (success != True):
print(ssh.lastErrorText())
sys.exit()
# Load a .ppk PuTTY private key.
puttyKey = chilkat.CkSshKey()
ppkText = puttyKey.loadText("C:/Users/Ruoying/.ssh/putty_rsa_private.ppk")
puttyKey.put_Password("")
success = puttyKey.FromPuttyPrivateKey(ppkText)
if (success != True):
print(puttyKey.lastErrorText())
sys.exit()
sshHostname = "169.229.154.119"
sshPort = 22
success = ssh.Connect(sshHostname,sshPort)
if (success != True):
print(ssh.lastErrorText())
sys.exit()
success = ssh.AuthenticatePk("cy290e",puttyKey)
if (success != True):
print(ssh.lastErrorText())
sys.exit()
In [31]:
# open a file on the server
# After authenticating, the SFTP subsystem must be initialized:
#https://www.example-code.com/python/sftp_downloadFile.asp
success = sftp.InitializeSftp()
if (success != True):
print(sftp.lastErrorText())
sys.exit()
In [34]:
newfile = sftp.openFile("craigslist_census_02202017\cl_census_CA.csv","readOnly","openExisting")
#newfile = sftp.openFile("test.py","readOnly","openExisting")
In [37]:
success=sftp.DownloadFile(newfile,"C:/Users/Ruoying/notebook/cp290/local_data/cl_census_CA.csv")
In [40]:
len(newfile)
Out[40]:
In [7]:
def read_listings_file(fname):
"""Read csv file via SFTP and return as dataframe."""
with sftp.open(listings_dir+fname) as f:
df = pd.read_csv(f, delimiter=',', dtype={'date':str,'fips_block':str,'state':str,'mpo_id':str})
# TODO: parse dates.
return df
def filter_outliers(df, rent_range=(100,10000),sqft_range=(10,5000)):
"""Drop outliers from listings dataframe. For now, only need to filter out rent and sq ft.
Args:
df: Dataframe with listings. Cols names include ['rent','sqft']
rent_range (tuple): min and max rent
sqft_range (tuple): min and max sqft
Returns:
DataFrame: listings data without outliers.
"""
n0=len(df)
df=df[(df.rent>=rent_range[0])&(df.rent<rent_range[1])]
n1=len(df)
print('Dropped {} outside rent range ${}-${}'.format(n0-n1,rent_range[0],rent_range[1]))
df=df[(df.sqft>=sqft_range[0])&(df.sqft<sqft_range[1])]
n2=len(df)
print('Dropped {} outside sqft range {}-{} sqft. {} rows remaining'.format(n1-n2,sqft_range[0],sqft_range[1],len(df)))
return(df)
In [14]:
def filter_outliers(df, rent_range=(100,10000),sqft_range=(10,5000)):
"""Drop outliers from listings dataframe. For now, only need to filter out rent and sq ft.
Args:
df: Dataframe with listings. Cols names include ['rent','sqft']
rent_range (tuple): min and max rent
sqft_range (tuple): min and max sqft
Returns:
DataFrame: listings data without outliers.
"""
n0=len(df)
df=df[(df.rent>=rent_range[0])&(df.rent<rent_range[1])]
n1=len(df)
print('Dropped {} outside rent range ${}-${}'.format(n0-n1,rent_range[0],rent_range[1]))
df=df[(df.sqft>=sqft_range[0])&(df.sqft<sqft_range[1])]
n2=len(df)
print('Dropped {} outside sqft range {}-{} sqft. {} rows remaining'.format(n1-n2,sqft_range[0],sqft_range[1],len(df)))
return(df)
In [10]:
# get list of files and load.
# TODO: decide what to load.
state='NY'
infile='cl_census_{}.csv'.format(state)
df = read_listings_file(infile)
df.head()
In [2]:
# local data
df = pd.read_csv('C:/Users/Ruoying/notebook/cp290/rental-listings-census/data/cl_census_CA.csv')
In [3]:
df.head()
Out[3]:
In [4]:
# for census vars, NA really means 0...
census_cols = ['cars_tot', 'children_tot','persons_tot', 'workers_tot', 'age_of_head_med', 'income_med','hhs_tot', 'race_of_head_1', 'race_of_head_2', 'race_of_head_3','race_of_head_4', 'race_of_head_5', 'race_of_head_6', 'race_of_head_7','race_of_head_8', 'race_of_head_9', 'recent_mover_0', 'recent_mover_1','tenure_1', 'tenure_2']
for col in census_cols:
df[col] = df[col].fillna(0)
In [5]:
df[census_cols].head()
Out[5]:
In [6]:
# make meaningful variables.
df['pct_race_1'] = df['race_of_head_1']/df['hhs_tot']
df['pct_race_2'] = df['race_of_head_2']/df['hhs_tot']
df['pct_race_3'] = df['race_of_head_3']/df['hhs_tot']
df['pct_mover'] = df['recent_mover_1']/df['hhs_tot']
df['pct_tenure_1'] = df['tenure_1']/df['hhs_tot']
df['avg_hh_size'] = df['persons_tot']/df['hhs_tot']
df['cars_per_hh'] = df['cars_tot']/df['hhs_tot']
df['ln_rent'] = np.log(df['rent'])
In [7]:
features_to_examine = ['rent','ln_rent', 'bedrooms', 'sqft','pct_race_1','pct_race_2','pct_race_3','pct_mover','pct_tenure_1','income_med','age_of_head_med','avg_hh_size','cars_per_hh']
df[features_to_examine].describe()
Out[7]:
In [8]:
rent_range=(100,10000)
sqft_range=(10,5000)
"""Ranges to define outliers in CL data. I defined based on inspection of distributions."""
Out[8]:
In [27]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
# use this to define outliers.
g=sns.distplot(df['rent'],kde='false')
g.set_xlim(0,100000)
g=sns.distplot(df['sqft'],kde='false')
g.set_xlim(0,10000)
Out[27]:
In [15]:
df = filter_outliers(df)
In [16]:
df = df[features_to_examine]
print(len(df))
print(len(df[pd.isnull(df).any(axis=1)]))
df = df.dropna()
In [17]:
sns.distplot(df.rent)
Out[17]:
In [18]:
sns.distplot(df.ln_rent)
Out[18]:
In [26]:
plot_rows = math.ceil(len(features_to_examine)/2)
f, axes = plt.subplots(plot_rows,2, figsize=(8,12))
sns.despine(left=True)
for i,col in enumerate(features_to_examine):
row_position = round(i/2)
col_position = i%2
p=sns.distplot(df[col], ax=axes[row_position, col_position])
p.set_title(''.format(col)) # while isn't the subplot title working?
plt.show()
In [ ]:
In [45]:
# correlation heatmap
corrmat=df[features_to_examine].corr()
corrmat.head()
f, ax = plt.subplots(figsize=(12, 9))
sns.heatmap(corrmat, vmax=.8, square=True)
f.tight_layout()
In [21]:
df.head()
Out[21]:
In [24]:
import statsmodels.api as sm
from patsy import dmatrices
In [25]:
y, X = dmatrices('ln_rent ~ bedrooms+sqft+pct_race_1+pct_race_2+pct_race_3+pct_mover+pct_tenure_1+income_med+cars_per_hh',
data=df, return_type='dataframe')
mod = sm.OLS(y, X)
res = mod.fit()
print res.summary()
# tenure_1: owner
random forest generator
In [37]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.cross_validation import KFold
In [38]:
df.columns.values
Out[38]:
In [64]:
cols_all = ['ln_rent','bedrooms', 'sqft','pct_race_1', 'pct_race_2',
'pct_race_3', 'pct_mover', 'pct_tenure_1', 'income_med',
'age_of_head_med', 'avg_hh_size', 'cars_per_hh']
cols = ['bedrooms', 'sqft','pct_race_1', 'pct_race_2',
'pct_race_3', 'pct_mover', 'pct_tenure_1', 'income_med',
'age_of_head_med', 'avg_hh_size', 'cars_per_hh']
df_val=df[cols_all].values
X_df= df[cols].values
y_df = df['ln_rent'].values
In [65]:
#xx=X_df.as_matrix()
#yy=y_df.as_matrix()
X_df1=np.array(X_df)
y_df1=np.array(y_df)
df_np=np.array(df_val)
In [73]:
X_df2=df_np[:,1:]
print X_df2.shape
In [74]:
y_df2=df_np[:,:1]
print y_df2.shape
In [78]:
random_forest = RandomForestRegressor(n_estimators=100, criterion='mse', max_depth=None)
random_forest.fit(X_df2, y_df2)
Out[78]:
In [67]:
print df_np.shape
In [59]:
print X_df1.shape
In [60]:
random_forest = RandomForestRegressor(n_estimators=10, criterion='mse', max_depth=None)
random_forest.fit(X_df1, y_df1)
#random_forest.fit(X_df, y_df)
#y_test_rf = random_forest.predict(X_test_rf)
Out[60]:
In [61]:
rf_o = pd.DataFrame({'features':cols,'importance':random_forest.feature_importances_})
rf_o= rf_o.sort('importance',ascending=False)
pylab.figure(1,figsize=(12, 6))
pylab.xticks(range(len(rf_o)), rf_o.features,rotation=45)
pylab.plot(range(len(rf_o)),rf_o.importance,"o")
pylab.show()
In [80]:
#RMSE
def RMSE(y_actual, y_predicted):
return sqrt(mean_squared_error(y_actual, y_predicted))
# cross validation
def cross_val(Traindata, est):
kf=KFold(len(Traindata),n_folds=10)
E = np.ones(10)
i = 0
for train,test in kf:
train_data = Traindata[train,:]
train_data_x = train_data[:,1:]
train_data_y = train_data[:,:1]
x = Traindata[test,:]
test_data_x = x[:,1:]
real_test_Y = x[:,:1]
#all_predict=[]
random_forest = RandomForestRegressor(n_estimators=est, criterion='mse', max_depth=None)
random_forest.fit(train_data_x,train_data_y)
predict_Y=random_forest.predict(test_data_x)
error = RMSE(real_test_Y, predict_Y)
E[i] = error
i = i + 1
return np.mean(E)
In [86]:
# find the optimal parameters for random forest prediction
def opt_est(data):
#initialize est
est_max=30
est=20
est_list = []
error_list = []
while est<=est_max:
err=cross_val(data,est)
error_list.append(err)
est_list.append(est)
est+=1
return min(error_list), est_list[[i for i, x in enumerate(error_list) if x == min(error_list)][0]]
In [87]:
print "min RMSE", " opt #est"
opt_est(df_np)
Out[87]:
In [28]:
df1 = pd.read_csv('C:/Users/Ruoying/notebook/cp290/rental-listings-census/data/cl_census_CA.csv')
df1.head()
Out[28]:
In [35]:
df1.loc['rent']