In [3]:
import pandas as pd
import paramiko
import os
import numpy as np
import math
In [4]:
hostname = "169.229.154.119"
username = "cy290e"
password = "" # local password
#local_key_dir = '~/.ssh/known_hosts'
local_key_dir = 'C:/Users/Ruoying/.ssh'
"""SSH connection parameters"""
listings_dir = 'craigslist_census_02202017/'
"""Remote directory for results"""
# estbalish SSH connection
ssh = paramiko.SSHClient()
ssh.load_host_keys(local_key_dir)
ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy())
ssh.connect(hostname,username=username, password=password)
sftp = ssh.open_sftp()
In [1]:
#https://www.example-code.com/python/sftp_use_ppk_putty_private_key.asp
import sys
import chilkat
In [2]:
sftp = chilkat.CkSFtp()
success = sftp.UnlockComponent("Anything for 30-day trial.")
if (success != True):
print(sftp.lastErrorText())
sys.exit()
In [3]:
# Load a .ppk PuTTY private key.
puttyKey = chilkat.CkSshKey()
ppkText = puttyKey.loadText("C:/Users/Ruoying/.ssh/putty_rsa_private.ppk")
In [4]:
puttyKey.put_Password("")
success = puttyKey.FromPuttyPrivateKey(ppkText)
if (success != True):
print(puttyKey.lastErrorText())
sys.exit()
In [5]:
sshHostname = "169.229.154.119"
sshPort = 22
success = sftp.Connect(sshHostname,sshPort)
if (success != True):
print(sftp.lastErrorText())
sys.exit()
In [6]:
success = sftp.AuthenticatePk("cy290e",puttyKey)
if (success != True):
print(sftp.lastErrorText())
sys.exit()
In [7]:
def read_listings_file(fname):
"""Read csv file via SFTP and return as dataframe."""
with sftp.open(listings_dir+fname) as f:
df = pd.read_csv(f, delimiter=',', dtype={'date':str,'fips_block':str,'state':str,'mpo_id':str})
# TODO: parse dates.
return df
def filter_outliers(df, rent_range=(100,10000),sqft_range=(10,5000)):
"""Drop outliers from listings dataframe. For now, only need to filter out rent and sq ft.
Args:
df: Dataframe with listings. Cols names include ['rent','sqft']
rent_range (tuple): min and max rent
sqft_range (tuple): min and max sqft
Returns:
DataFrame: listings data without outliers.
"""
n0=len(df)
df=df[(df.rent>=rent_range[0])&(df.rent<rent_range[1])]
n1=len(df)
print('Dropped {} outside rent range ${}-${}'.format(n0-n1,rent_range[0],rent_range[1]))
df=df[(df.sqft>=sqft_range[0])&(df.sqft<sqft_range[1])]
n2=len(df)
print('Dropped {} outside sqft range {}-{} sqft. {} rows remaining'.format(n1-n2,sqft_range[0],sqft_range[1],len(df)))
return(df)
In [8]:
# get list of files and load.
# TODO: decide what to load.
state='NY'
infile='cl_census_{}.csv'.format(state)
df = read_listings_file(infile)
df.head()
In [92]:
# for census vars, NA really means 0...
census_cols = ['cars_tot', 'children_tot','persons_tot', 'workers_tot', 'age_of_head_med', 'income_med','hhs_tot', 'race_of_head_1', 'race_of_head_2', 'race_of_head_3','race_of_head_4', 'race_of_head_5', 'race_of_head_6', 'race_of_head_7','race_of_head_8', 'race_of_head_9', 'recent_mover_0', 'recent_mover_1','tenure_1', 'tenure_2']
for col in census_cols:
df[col] = df[col].fillna(0)
In [93]:
df[census_cols].head()
Out[93]:
In [94]:
# make meaningful variables.
df['pct_race_1'] = df['race_of_head_1']/df['hhs_tot']
df['pct_race_2'] = df['race_of_head_2']/df['hhs_tot']
df['pct_race_3'] = df['race_of_head_3']/df['hhs_tot']
df['pct_mover'] = df['recent_mover_1']/df['hhs_tot']
df['pct_tenure_1'] = df['tenure_1']/df['hhs_tot']
df['avg_hh_size'] = df['persons_tot']/df['hhs_tot']
df['cars_per_hh'] = df['cars_tot']/df['hhs_tot']
df['ln_rent'] = np.log(df['rent'])
In [95]:
features_to_examine = ['rent','ln_rent', 'bedrooms', 'sqft','pct_race_1','pct_race_2','pct_race_3','pct_mover','pct_tenure_1','income_med','age_of_head_med','avg_hh_size','cars_per_hh']
df[features_to_examine].describe()
Out[95]:
In [96]:
rent_range=(100,10000)
sqft_range=(10,5000)
"""Ranges to define outliers in CL data. I defined based on inspection of distributions."""
Out[96]:
In [119]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
# use this to define outliers.
g=sns.distplot(df['rent'])
g.set_xlim(0,100000)
g=sns.distplot(df['sqft'])
g.set_xlim(0,10000)
Out[119]:
In [97]:
df = filter_outliers(df)
In [107]:
df = df[features_to_examine]
print(len(df))
print(len(df[pd.isnull(df).any(axis=1)]))
df = df.dropna()
In [109]:
sns.distplot(df.rent)
Out[109]:
In [110]:
sns.distplot(df.ln_rent)
Out[110]:
In [121]:
plot_rows = math.ceil(len(features_to_examine)/2)
f, axes = plt.subplots(plot_rows,2, figsize=(8,12))
sns.despine(left=True)
for i,col in enumerate(features_to_examine):
row_position = round(i/2)
col_position = i%2
p=sns.distplot(df[col], ax=axes[row_position, col_position])
p.set_title(''.format(col)) # while isn't the subplot title working?
plt.show()
In [ ]:
In [45]:
# correlation heatmap
corrmat=df[features_to_examine].corr()
corrmat.head()
f, ax = plt.subplots(figsize=(12, 9))
sns.heatmap(corrmat, vmax=.8, square=True)
f.tight_layout()
In [ ]: