In [3]:
import pandas as pd
import paramiko
import os
import numpy as np
import math

Remote connection parameters


In [4]:
hostname = "169.229.154.119"
username = "cy290e"
password = ""  # local password 
#local_key_dir = '~/.ssh/known_hosts'
local_key_dir = 'C:/Users/Ruoying/.ssh'
"""SSH connection parameters"""

listings_dir = 'craigslist_census_02202017/'
"""Remote directory for results"""

# estbalish SSH connection
ssh = paramiko.SSHClient() 
ssh.load_host_keys(local_key_dir)
ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy())
ssh.connect(hostname,username=username, password=password)
sftp = ssh.open_sftp()


---------------------------------------------------------------------------
IOError                                   Traceback (most recent call last)
<ipython-input-4-49579fcafc92> in <module>()
     11 # estbalish SSH connection
     12 ssh = paramiko.SSHClient()
---> 13 ssh.load_host_keys(local_key_dir)
     14 ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy())
     15 ssh.connect(hostname,username=username, password=password)

C:\Anaconda\lib\site-packages\paramiko\client.pyc in load_host_keys(self, filename)
    122         """
    123         self._host_keys_filename = filename
--> 124         self._host_keys.load(filename)
    125 
    126     def save_host_keys(self, filename):

C:\Anaconda\lib\site-packages\paramiko\hostkeys.pyc in load(self, filename)
     93         :raises IOError: if there was an error reading the file
     94         """
---> 95         with open(filename, 'r') as f:
     96             for lineno, line in enumerate(f, 1):
     97                 line = line.strip()

IOError: [Errno 13] Permission denied: 'C:/Users/Ruoying/.ssh'

In [1]:
#https://www.example-code.com/python/sftp_use_ppk_putty_private_key.asp
import sys
import chilkat

In [2]:
sftp = chilkat.CkSFtp()
success = sftp.UnlockComponent("Anything for 30-day trial.")
if (success != True):
    print(sftp.lastErrorText())
    sys.exit()

In [3]:
#  Load a .ppk PuTTY private key.
puttyKey = chilkat.CkSshKey()
ppkText = puttyKey.loadText("C:/Users/Ruoying/.ssh/putty_rsa_private.ppk")

In [4]:
puttyKey.put_Password("")
success = puttyKey.FromPuttyPrivateKey(ppkText)
if (success != True):
    print(puttyKey.lastErrorText())
    sys.exit()

In [5]:
sshHostname = "169.229.154.119"
sshPort = 22
success = sftp.Connect(sshHostname,sshPort)
if (success != True):
    print(sftp.lastErrorText())
    sys.exit()

In [6]:
success = sftp.AuthenticatePk("cy290e",puttyKey)
if (success != True):
    print(sftp.lastErrorText())
    sys.exit()

In [7]:
def read_listings_file(fname):
    """Read csv file via SFTP and return as dataframe."""
    with sftp.open(listings_dir+fname) as f:
        df = pd.read_csv(f, delimiter=',', dtype={'date':str,'fips_block':str,'state':str,'mpo_id':str})
        # TODO: parse dates. 
    return df

def filter_outliers(df, rent_range=(100,10000),sqft_range=(10,5000)):
    """Drop outliers from listings dataframe. For now, only need to filter out rent and sq ft. 
    Args: 
        df: Dataframe with listings. Cols names include ['rent','sqft']
        rent_range (tuple): min and max rent
        sqft_range (tuple): min and max sqft
    Returns: 
        DataFrame: listings data without outliers. 
    """
    n0=len(df)
    df=df[(df.rent>=rent_range[0])&(df.rent<rent_range[1])]
    n1=len(df)
    print('Dropped {} outside rent range ${}-${}'.format(n0-n1,rent_range[0],rent_range[1]))
    df=df[(df.sqft>=sqft_range[0])&(df.sqft<sqft_range[1])]
    n2=len(df)
    print('Dropped {} outside sqft range {}-{} sqft. {} rows remaining'.format(n1-n2,sqft_range[0],sqft_range[1],len(df)))
    return(df)

Load data


In [8]:
# get list of files and load. 
# TODO: decide what to load. 
state='NY'
infile='cl_census_{}.csv'.format(state)
df = read_listings_file(infile)
df.head()


---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-8-259ec032052a> in <module>()
      3 state='NY'
      4 infile='cl_census_{}.csv'.format(state)
----> 5 df = read_listings_file(infile)
      6 df.head()

<ipython-input-7-d71d48036a4d> in read_listings_file(fname)
      1 def read_listings_file(fname):
      2     """Read csv file via SFTP and return as dataframe."""
----> 3     with sftp.open(listings_dir+fname) as f:
      4         df = pd.read_csv(f, delimiter=',', dtype={'date':str,'fips_block':str,'state':str,'mpo_id':str})
      5         # TODO: parse dates.

C:\Users\Ruoying\AppData\Roaming\Python\Python27\site-packages\chilkat.pyc in <lambda>(self, name)
  17068     __setattr__ = lambda self, name, value: _swig_setattr(self, CkSFtp, name, value)
  17069     __swig_getmethods__ = {}
> 17070     __getattr__ = lambda self, name: _swig_getattr(self, CkSFtp, name)
  17071     __repr__ = _swig_repr
  17072 

C:\Users\Ruoying\AppData\Roaming\Python\Python27\site-packages\chilkat.pyc in _swig_getattr(self, class_type, name)
     72 
     73 def _swig_getattr(self, class_type, name):
---> 74     return _swig_getattr_nondynamic(self, class_type, name, 0)
     75 
     76 

C:\Users\Ruoying\AppData\Roaming\Python\Python27\site-packages\chilkat.pyc in _swig_getattr_nondynamic(self, class_type, name, static)
     67         return method(self)
     68     if (not static):
---> 69         return object.__getattr__(self, name)
     70     else:
     71         raise AttributeError(name)

AttributeError: type object 'object' has no attribute '__getattr__'

In [92]:
# for census vars, NA really means 0...
census_cols = ['cars_tot', 'children_tot','persons_tot', 'workers_tot', 'age_of_head_med', 'income_med','hhs_tot', 'race_of_head_1', 'race_of_head_2', 'race_of_head_3','race_of_head_4', 'race_of_head_5', 'race_of_head_6', 'race_of_head_7','race_of_head_8', 'race_of_head_9', 'recent_mover_0', 'recent_mover_1','tenure_1', 'tenure_2']
for col in census_cols:
    df[col] = df[col].fillna(0)

In [93]:
df[census_cols].head()


Out[93]:
cars_tot children_tot persons_tot workers_tot age_of_head_med income_med hhs_tot race_of_head_1 race_of_head_2 race_of_head_3 race_of_head_4 race_of_head_5 race_of_head_6 race_of_head_7 race_of_head_8 race_of_head_9 recent_mover_0 recent_mover_1 tenure_1 tenure_2
0 621 119 718 345 56 57000 306 306 0 0 0 0 0 0 0 0 306 0 278 28
1 695 155 877 517 58 67500 356 334 22 0 0 0 0 0 0 0 350 6 348 8
2 872 264 1128 606 51 103000 477 425 0 3 0 1 40 0 1 7 425 52 264 213
3 2032 328 2464 1441 52 51000 1203 1128 0 0 0 0 75 0 0 0 861 342 614 589
4 1389 106 1671 809 64 71620 1023 997 22 0 0 0 4 0 0 0 847 176 650 373

In [94]:
# make meaningful variables.
df['pct_race_1'] = df['race_of_head_1']/df['hhs_tot']
df['pct_race_2'] = df['race_of_head_2']/df['hhs_tot']
df['pct_race_3'] = df['race_of_head_3']/df['hhs_tot']
df['pct_mover'] = df['recent_mover_1']/df['hhs_tot']
df['pct_tenure_1'] = df['tenure_1']/df['hhs_tot']
df['avg_hh_size'] = df['persons_tot']/df['hhs_tot']
df['cars_per_hh'] = df['cars_tot']/df['hhs_tot']


df['ln_rent'] = np.log(df['rent'])

In [95]:
features_to_examine = ['rent','ln_rent', 'bedrooms', 'sqft','pct_race_1','pct_race_2','pct_race_3','pct_mover','pct_tenure_1','income_med','age_of_head_med','avg_hh_size','cars_per_hh']
df[features_to_examine].describe()


Out[95]:
rent ln_rent bedrooms sqft pct_race_1 pct_race_2 pct_race_3 pct_mover pct_tenure_1 income_med age_of_head_med avg_hh_size cars_per_hh
count 54934.000000 54934.000000 54934.000000 54934.000000 54399.000000 54399.000000 54399.000000 54399.000000 54399.000000 54934.000000 54934.000000 54399.000000 54399.000000
mean 4397.204973 7.256942 2.031693 1125.127808 0.785019 0.112677 0.001894 0.204687 0.458737 59514.072806 48.418520 2.211441 1.208953
std 178219.456117 0.686124 1.124379 4769.754000 0.255684 0.213344 0.004698 0.145567 0.297378 33497.944359 9.401062 0.502620 0.650050
min 1.000000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000
25% 885.000000 6.785588 1.000000 778.000000 0.710448 0.000000 0.000000 0.098214 0.199211 36800.000000 45.000000 1.868852 0.577375
50% 1345.000000 7.204149 2.000000 1000.000000 0.880579 0.020938 0.000000 0.181271 0.464939 52200.000000 50.000000 2.182510 1.369110
75% 2200.000000 7.696213 3.000000 1240.000000 0.973856 0.099259 0.002454 0.284289 0.681514 75000.000000 54.000000 2.535857 1.751625
max 23772584.000000 16.984044 9.000000 850825.000000 1.000000 1.000000 0.390339 0.920128 1.000000 269600.000000 76.000000 5.700608 2.692771

Filter outliers

Can use the following to examine outliers...


In [96]:
rent_range=(100,10000)
sqft_range=(10,5000)
"""Ranges to define outliers in CL data. I defined based on inspection of distributions."""


Out[96]:
'Ranges to define outliers in CL data. I defined based on inspection of distributions.'

In [119]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
# use this to define outliers. 
g=sns.distplot(df['rent'])
g.set_xlim(0,100000)

g=sns.distplot(df['sqft'])
g.set_xlim(0,10000)


//anaconda/lib/python3.5/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[119]:
(0, 10000)

In [97]:
df = filter_outliers(df)


Dropped 314 outside rent range $100-$10000
Dropped 181 outside sqft range 10-5000 sqft. 54439 rows remaining

drop NA


In [107]:
df = df[features_to_examine]
print(len(df))
print(len(df[pd.isnull(df).any(axis=1)]))
df = df.dropna()


54439
532

look at distributions

Since rent has a more or less logarithmic distribution, we use ln_rent instead


In [109]:
sns.distplot(df.rent)


//anaconda/lib/python3.5/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[109]:
<matplotlib.axes._subplots.AxesSubplot at 0x10c424048>

In [110]:
sns.distplot(df.ln_rent)


//anaconda/lib/python3.5/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[110]:
<matplotlib.axes._subplots.AxesSubplot at 0x10dca3240>

In [121]:
plot_rows = math.ceil(len(features_to_examine)/2)

f, axes = plt.subplots(plot_rows,2, figsize=(8,12))
sns.despine(left=True)

for i,col in enumerate(features_to_examine):
    row_position = round(i/2)
    col_position = i%2
    p=sns.distplot(df[col], ax=axes[row_position, col_position])
    p.set_title(''.format(col))  # while isn't the subplot title working? 
plt.show()


//anaconda/lib/python3.5/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j

In [ ]:

look at correlations


In [45]:
# correlation heatmap
corrmat=df[features_to_examine].corr()
corrmat.head()

f, ax = plt.subplots(figsize=(12, 9))
sns.heatmap(corrmat, vmax=.8, square=True)

f.tight_layout()



In [ ]: