In [38]:
import pandas as pd
import h5py
import numpy as np
import matplotlib.pyplot as plt

In [39]:
# Read the SF Bay listings CSV into a DataFrame and preview its first rows.
# The relative path uses forward slashes so it resolves on Windows, macOS and
# Linux alike; the previous backslash form was only a valid path on Windows.
filename = '../data/sfbay_listings_03172017.csv'
data = pd.read_csv(filename)
data.head()


Out[39]:
listing_id date rent bedrooms bathrooms sqft rent_sqft fips_block state region ... race_of_head_4 race_of_head_5 race_of_head_6 race_of_head_7 race_of_head_8 race_of_head_9 recent_mover_0 recent_mover_1 tenure_1 tenure_2
0 5915417519 2016-12-11 2586.0 2.0 NaN 1039.0 2.488932 60971530013001 CA sfbay ... NaN 2.0 23.0 NaN 115.0 29.0 1031.0 145.0 560.0 616.0
1 5915416703 2016-12-11 3178.0 1.0 NaN 574.0 5.536585 60750615003012 CA sfbay ... NaN NaN 472.0 NaN 5.0 2.0 697.0 513.0 613.0 597.0
2 5915414697 2016-12-11 3029.0 1.0 NaN 771.0 3.928664 60816080021000 CA sfbay ... NaN NaN 293.0 3.0 9.0 16.0 636.0 70.0 526.0 180.0
3 5915384383 2016-12-11 2395.0 1.0 NaN 770.0 3.110390 60855081021004 CA sfbay ... NaN NaN 704.0 NaN 1.0 21.0 1005.0 265.0 660.0 610.0
4 5915414020 2016-12-11 2735.0 2.0 NaN 922.0 2.966377 60411042002000 CA sfbay ... NaN 8.0 50.0 2.0 75.0 47.0 898.0 113.0 504.0 507.0

5 rows × 33 columns


In [41]:
# Keep only the rows that have a value in every column: any row containing
# at least one NaN is dropped. `data` itself is left unchanged.
sub = data.dropna(axis=0, how='any')

In [42]:
# Number of rows that survived the NaN filter (first entry of the frame's shape).
sub.shape[0]


Out[42]:
1300

In [44]:
# Persist the `sub` frame as CSV in the shared data directory.
# Forward slashes keep the relative path valid on every OS; with backslashes a
# POSIX system would write a file literally named "..\data\..." into the
# current working directory instead of ../data/.
# NOTE(review): to_csv writes the row index as an unnamed first column by
# default; pass index=False if downstream readers should not see it — confirm
# against consumers of this file before changing.
sub.to_csv('../data/sfbay_listings_03172017_sub.csv')

In [6]:



---------------------------------------------------------------------------
ImportError                               Traceback (most recent call last)
<ipython-input-6-6471456e2c1e> in <module>()
      1 import sys
----> 2 import chilkat
      3 
      4 sftp = chilkat.CkSFtp()
      5 

ImportError: No module named 'chilkat'

In [9]:
# Read the cl_census_CA CSV into a DataFrame and preview its first rows.
# Forward slashes make the relative path portable; the previous backslash
# form was only a valid path on Windows.
# NOTE(review): this rebinds `data`, so the listings frame loaded earlier in
# the notebook is no longer reachable under that name after this cell runs.
data = pd.read_csv('../data/cl_census_CA.csv')
data.head()


Out[9]:
listing_id date rent bedrooms sqft rent_sqft fips_block state mpo_id cars_tot ... race_of_head_4 race_of_head_5 race_of_head_6 race_of_head_7 race_of_head_8 race_of_head_9 recent_mover_0 recent_mover_1 tenure_1 tenure_2
0 5873877617 2016-11-13 925.0 2.0 874.0 1.058352 60470010021060 CA 6197202.0 4603.0 ... NaN 8.0 527.0 1.0 198.0 42.0 1516.0 628.0 1310.0 834.0
1 5873876292 2016-11-13 735.0 0.0 650.0 1.130769 60070012001017 CA 6198000.0 646.0 ... NaN NaN 6.0 1.0 3.0 16.0 371.0 44.0 98.0 317.0
2 5873889346 2016-11-13 1675.0 3.0 1000.0 1.675000 60790125023017 CA 6199200.0 1902.0 ... NaN NaN 10.0 NaN 30.0 21.0 712.0 257.0 477.0 492.0
3 5873893871 2016-11-13 1818.0 2.0 1084.0 1.677122 60830020061208 CA 6196600.0 1058.0 ... NaN 2.0 21.0 1.0 8.0 8.0 427.0 41.0 420.0 48.0
4 5849643168 2016-11-13 1050.0 2.0 1100.0 0.954545 60890113004008 CA 6198100.0 786.0 ... NaN 3.0 NaN 2.0 26.0 23.0 287.0 205.0 202.0 290.0

5 rows × 29 columns


In [8]:
# Show the column labels of `data` as a plain NumPy array.
column_names = data.columns.values
column_names


Out[8]:
array(['listing_id', 'date', 'rent', 'bedrooms', 'sqft', 'rent_sqft',
       'fips_block', 'state', 'mpo_id', 'cars_tot', 'children_tot',
       'persons_tot', 'workers_tot', 'age_of_head_med', 'income_med',
       'hhs_tot', 'race_of_head_1', 'race_of_head_2', 'race_of_head_3',
       'race_of_head_4', 'race_of_head_5', 'race_of_head_6',
       'race_of_head_7', 'race_of_head_8', 'race_of_head_9',
       'recent_mover_0', 'recent_mover_1', 'tenure_1', 'tenure_2'], dtype=object)