In [1]:
from datetime import datetime, timedelta
import itertools
from IPython.display import clear_output
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:90% !important; }</style>"))
import time
import numpy as np
import pandas as pd
pd.options.display.max_columns = None
pd.options.display.max_rows = None
import matplotlib.pyplot as plt
%matplotlib inline
from collections import Counter, defaultdict
import seaborn as sns
import re
import math
import copy


functions


In [2]:
# price sanity range in dollars: anything outside [10000, 4000000] is treated as NaN
PRICE_CONST = [10000, 4000000]

def onlyNumber(inputArr):
    """Strip every character except digits and '.' from each element, e.g. '$1,200' -> '1200'."""
    outputArr = []
    for x in inputArr:
        try:
            r = re.sub("[^0-9.]", "", str(x))
        except:
            r = np.nan
        outputArr.append(r)
    return outputArr

def convertInt(inputArr, valueConstraint=None):
    """Convert each element to int; values that fail or fall outside [lower, upper] become NaN."""
    outputArr = []
    for x in inputArr:
        try:
            r = int(x)
            if valueConstraint is not None:
                lowerBound = valueConstraint[0]
                upperBound = valueConstraint[1]
                if r < lowerBound or r > upperBound:
                    r = np.nan
        except:
            r = np.nan
        outputArr.append(r)
    return outputArr

def convertFloat(inputArr, valueConstraint=None):
    """Convert each element to float; values that fail or fall outside [lower, upper] become NaN."""
    outputArr = []
    for x in inputArr:
        try:
            r = float(x)
            if valueConstraint is not None:
                lowerBound = valueConstraint[0]
                upperBound = valueConstraint[1]
                if r < lowerBound or r > upperBound:
                    r = np.nan
        except:
            r = np.nan
        outputArr.append(r)
    return outputArr

def convertDatetime(inputArr):
    """Parse 'mm/dd/yy' strings to datetime; anything unparseable becomes NaN."""
    outputArr = []
    for x in inputArr:
        try:
            r = datetime.strptime(x, '%m/%d/%y')
        except:
            r = np.nan
        outputArr.append(r)
    return outputArr

def nanCounter(inputArr):
    """Counter over the non-NaN entries; falls back to a plain Counter for non-numeric input."""
    try:
        c = Counter(inputArr[~np.isnan(inputArr)])
    except:
        c = Counter(inputArr)
    return c

def nanCounterMostCommon(inputArr):
    """Same as nanCounter, but returns (value, count) pairs sorted by frequency."""
    try:
        c = Counter(inputArr[~np.isnan(inputArr)]).most_common()
    except:
        c = Counter(inputArr).most_common()
    return c

def nanDistPlot(inputArr):
    """Distribution plot of the non-NaN entries."""
    x = inputArr[~np.isnan(inputArr)]
    sns.distplot(x)
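
A quick, illustrative check of how these helpers behave (toy values, not rows from the scraped data): anything that fails to parse or falls outside the constraint comes back as NaN.

# illustrative only -- toy inputs, not part of the dataset
sample = ["$1,250,000", "950000", "no data", "5"]
print convertFloat(onlyNumber(sample), valueConstraint=PRICE_CONST)
# [1250000.0, 950000.0, nan, nan]  ("no data" cannot be parsed; 5 is below the 10000 floor)
print convertDatetime(["04/28/17", "38400"])
# [datetime.datetime(2017, 4, 28, 0, 0), nan]  (bare serial numbers do not match '%m/%d/%y')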

load data


In [3]:
# load the three raw scrapes; each has the same 31-column schema
data1 = pd.read_csv("./data/raw/Zillow-Fremont_20170716.csv")
data2 = pd.read_csv("./data/raw/zillow_1234city_20170811.csv")
data3 = pd.read_csv("./data/raw/city5678.csv")
# data1 spells the "bed" column as "bet"; align it with the other two files
data1.rename(columns={"bet":"bed"}, inplace=True)
print data1.shape
print data2.shape
print data3.shape


(6319, 31)
(8402, 31)
(34829, 31)
/Users/qiweili/anaconda/lib/python2.7/site-packages/IPython/core/interactiveshell.py:2717: DtypeWarning: Columns (3) have mixed types. Specify dtype option on import or set low_memory=False.
  interactivity=interactivity, compiler=compiler, result=result)

In [4]:
# do they have the same column names?
print sum(data1.columns != data2.columns)
print sum(data1.columns != data3.columns)
print sum(data2.columns != data3.columns)


0
0
0

In [5]:
# merge them into one dataframe
data = pd.concat([data1, data2, data3])
data.reset_index(drop=True, inplace=True)
print data.shape


(49550, 31)
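
If it later matters which raw file a row came from, pd.concat can tag the source at merge time. A minimal sketch; the source labels and the resulting "source" column are illustrative assumptions, not part of this notebook:

# optional variant: keep a record of which scrape each row came from
tagged = pd.concat([data1, data2, data3], keys=["fremont", "city1234", "city5678"])
tagged = tagged.reset_index(level=0).rename(columns={"level_0": "source"})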

missing data percentages


In [6]:
plt.figure(figsize=(5,10))
sns.barplot(y=data.isnull().sum().index, x=data.isnull().sum().values*1.0/data.shape[0])


Out[6]:
<matplotlib.axes._subplots.AxesSubplot at 0x103c63f10>
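
The same information as the bar plot, viewed numerically; a small sketch for inspecting the worst columns directly:

# numeric view of the missing-data fractions plotted above
missing_pct = data.isnull().sum() * 1.0 / data.shape[0]
print missing_pct.sort_values(ascending=False).head(10)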

In [7]:
useful_columns = ["city","zipcode","area","bed","bath","sqft","price","Zestimate","type","built","Parking","Lastremodel","year1","price1","year2","price2","year3","price3","year4","price4","year5","price5"]
data = data.loc[:, useful_columns]
print data.shape
data.head()


(49550, 22)
Out[7]:
  | city    | zipcode | area           | bed | bath | sqft | price    | Zestimate   | type          | built | Parking           | Lastremodel | year1    | price1 | year2    | price2 | year3    | price3 | year4    | price4 | year5 | price5
0 | Fremont | 94555   | Northgate      | 4   | 1    | 1400 | 140000.0 | 986436      | Single Family | 1970  | Garage - Attached | 1970.0      | 06/01/99 | 292000 | 38400    | 140000 | NaN      | NaN    | NaN      | NaN    | NaN   | NaN
1 | Fremont | 94538   | Downtown       | 2   | 2    | 1042 | 160000.0 | 535006      | Condo         | 1972  | No Data           | 1972.0      | 37570    | 225000 | 04/28/17 | 160000 | NaN      | NaN    | NaN      | NaN    | NaN   | NaN
2 | Fremont | 94539   | Mission Valley | 3   | 2    | 1434 | 130000.0 | 1.32095e+06 | Single Family | 1961  | Garage - Attached | 1962.0      | 04/25/17 | 130000 | NaN      | NaN    | NaN      | NaN    | NaN   | NaN
3 | Fremont | 94538   | Sundale        | 3   | 1    | 948  | 141500.0 | 668271      | Single Family | 1959  | Garage - Attached | 1961.0      | 06/26/00 | 329000 | 04/21/17 | 141500 | NaN      | NaN    | NaN      | NaN    | NaN   | NaN
4 | Fremont | 94538   | Blacow         | 2   | 2.5  | 1157 | 60000.0  | 710981      | Condo         | 1994  | 1 space           | 1994.0      | 02/17/95 | 173000 | 02/29/00 | 244500 | 07/21/15 | 605000 | 03/31/17 | 60000  | NaN   | NaN

city


In [8]:
count = Counter(data.city)
usefulValues = np.array(count.keys())[np.array(count.values()) > 1000]
data.city = np.array([x if x in usefulValues else np.nan for x in data.city])
nanCounterMostCommon(data.city)


Out[8]:
[('San Jose', 26850),
 ('Fremont', 6317),
 ('Sunnyvale', 3312),
 ('Santa Clara', 2987),
 ('San Mateo', 2916),
 ('Redwood City', 2027),
 ('Mountain View', 1964),
 ('Union City', 1754),
 ('Newark', 1398),
 ('nan', 25)]
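
The step above (keep only values seen more than N times, map the rest to NaN) is repeated below for zipcode, area, bed, bath, and type, so it could be factored into a single helper; keepFrequent below is a hypothetical refactor, not code used in this notebook. Note also that wrapping the result in np.array coerces np.nan to the string 'nan' for string columns, which is why 'nan' shows up as its own category in the output above (and later for type).

# hypothetical helper capturing the recurring threshold filter
def keepFrequent(values, minCount):
    count = Counter(values)
    frequent = set(k for k, v in count.items() if v > minCount)
    # returning a plain list keeps np.nan as a real NaN instead of the string 'nan'
    return [x if x in frequent else np.nan for x in values]

# e.g. data.city = keepFrequent(data.city, 1000)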

zipcode


In [9]:
data.zipcode = convertInt(data.zipcode)
count = Counter(data.zipcode)
usefulValues = np.array(count.keys())[np.array(count.values()) > 100]   
data.zipcode = [x if x in usefulValues else np.nan for x in data.zipcode]
nanCounterMostCommon(data.zipcode)


Out[9]:
[(95123.0, 2511),
 (95125.0, 2150),
 (94536.0, 2129),
 (94587.0, 1746),
 (95124.0, 1634),
 (94538.0, 1615),
 (95051.0, 1466),
 (94539.0, 1442),
 (94560.0, 1398),
 (95136.0, 1382),
 (95112.0, 1281),
 (94087.0, 1267),
 (95111.0, 1189),
 (95120.0, 1153),
 (94555.0, 1130),
 (95126.0, 1108),
 (94403.0, 1090),
 (95118.0, 1083),
 (95131.0, 1032),
 (94401.0, 1006),
 (95132.0, 1004),
 (94086.0, 974),
 (95116.0, 953),
 (95135.0, 927),
 (95148.0, 926),
 (94043.0, 923),
 (95050.0, 920),
 (95122.0, 878),
 (95128.0, 878),
 (94061.0, 855),
 (95127.0, 823),
 (95129.0, 807),
 (95121.0, 806),
 (95133.0, 747),
 (95138.0, 746),
 (95110.0, 736),
 (94040.0, 698),
 (94402.0, 633),
 (95054.0, 607),
 (95117.0, 580),
 (94085.0, 571),
 (94089.0, 518),
 (94065.0, 458),
 (94062.0, 423),
 (95119.0, 363),
 (94041.0, 335),
 (95130.0, 310),
 (95139.0, 273),
 (94063.0, 250),
 (95134.0, 197),
 (94404.0, 181),
 (95113.0, 127)]

area


In [10]:
count = Counter(data.area)
usefulValues = np.array(count.keys())[np.array(count.values()) > 10] 
data.area = [x if x in usefulValues else np.nan for x in data.area]
nanCounterMostCommon(data.area)


/Users/qiweili/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:3: FutureWarning: elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison
  app.launch_new_instance()
Out[10]:
[('Willow Glen', 3304),
 ('Evergreen', 2511),
 ('Downtown', 2500),
 ('Blossom Valley', 2281),
 ('West San Jose', 2127),
 ('East San Jose', 2007),
 ('Santa Teresa', 1986),
 ('North Valley', 1846),
 ('Cambrian Park', 1805),
 ('94587', 1745),
 ('Almaden Valley', 1577),
 ('Fairgrounds', 1510),
 ('95051', 1464),
 ('94560', 1397),
 ('Berryessa', 1045),
 ('95050', 913),
 ('Alum Rock-East Foothills', 744),
 ('Edenvale - Seven Trees', 724),
 ('North San Jose', 621),
 ('95054', 607),
 ('Ponderosa', 533),
 ('Lakewood', 524),
 ('Ardenwood', 511),
 ('Northgate', 471),
 ('Centerville', 466),
 ('Rose Garden', 458),
 ('De Anza', 445),
 ('94043', 436),
 ('Cherry-Guardino', 420),
 ('West Murphy', 409),
 ('Warm Springs', 371),
 ('94040', 358),
 ('Glenmoor', 355),
 ('Ortega', 326),
 ('Cabrillo', 325),
 ('Washington', 321),
 ('East Murphy', 320),
 ('Serra', 299),
 ('Central', 293),
 ('Irvington', 288),
 ('Niles', 265),
 ('Grimmer', 252),
 ('North Central', 237),
 ('Sundale', 221),
 ('Mission Valley', 219),
 ('Blacow', 212),
 ('Mission San Jose', 204),
 ('Kimber-Gomes', 193),
 ('South Sundale', 190),
 ('Woodside Plaza', 188),
 ('Steinberger', 177),
 ('Old Mountain View', 175),
 ('Brookvale', 166),
 ('San Mateo Heights', 161),
 ('South Shoreview', 159),
 ('Parkmont', 153),
 ('Baywood', 151),
 ('Eagle Hill', 148),
 ('Lakes and Birds', 140),
 ('Hayward Park', 140),
 ('Westport', 137),
 ('Mariners Island', 135),
 ('Raynor', 134),
 ('Oakwood', 132),
 ('Slater', 131),
 ('Farm Hill', 129),
 ('San Mateo Village', 129),
 ('Roosevelt', 127),
 ('Twenty-Fifth Ave', 126),
 ('Vineyards-Avalon', 120),
 ('North Shoreview', 116),
 ('28 Palms', 114),
 ('Palm', 113),
 ('Laurelwood', 111),
 ('Aragon', 110),
 ('Weibel', 109),
 ('Moffett Boulevard', 107),
 ('Southeast Hillsdale', 106),
 ('Canyon Heights', 102),
 ('Cuesta Park', 101),
 ('San Mateo Park', 99),
 ('East Industrial', 98),
 (nan, 97),
 ('Cameron Hills', 96),
 ('Fiesta Gardens', 95),
 ('Parkside', 93),
 ('Canyon', 93),
 ('Central Business District', 91),
 ('Wagon Wheel', 90),
 ('94041', 83),
 ('Northeast Hillsdale', 83),
 ('Greater San Antonio', 77),
 ('Friendly Acres', 76),
 ('Arlington', 75),
 ('Sunnybrae', 75),
 ('Shoreline West', 69),
 ('Monta Loma', 67),
 ('Los Prados', 65),
 ('Marlin', 65),
 ('Lauriedale', 64),
 ('Rex Manor', 57),
 ('Sequoia', 56),
 ('Country Club Heights', 53),
 ('Centennial', 52),
 ('Marina Park', 50),
 ('Dolphin', 47),
 ('Edgewater Isle', 46),
 ('Bay Meadows', 46),
 ('Waverly Park', 45),
 ('East San Mateo', 42),
 ('Southwest Hillsdale', 41),
 ('Northwest Hillsdale', 39),
 ('Middlefield', 39),
 ('Westshore', 39),
 ('Foothill Terrace', 39),
 ('Shearwater', 36),
 ('North Wishman', 35),
 ('Baywood Knolls', 34),
 ('94062', 33),
 ('95037', 32),
 ('College Heights', 31),
 ('Gemello Park', 30),
 ('Mission Hills', 27),
 ('Nineteenth Avenue', 27),
 ('Lido', 24),
 ('Saint Francis Acres', 24),
 ('Baywood Park', 23),
 ('Ampex', 17),
 ('Sylvan Park', 15),
 ('Eastern Varsity Park', 13),
 ('Spring Meadows', 13),
 ('Martens-Carmelita', 11)]

bed


In [11]:
data.bed = convertInt(data.bed)
count = Counter(data.bed)
usefulValues = np.array(count.keys())[np.array(count.values()) > 100] 
data.bed = [x if x in usefulValues else np.nan for x in data.bed]
nanCounterMostCommon(data.bed)


Out[11]:
[(3.0, 19677),
 (4.0, 12343),
 (2.0, 10798),
 (5.0, 2731),
 (1.0, 2199),
 (6.0, 390)]

bath


In [12]:
data.bath = convertFloat(data.bath)
data.bath[(data.bath > 2) & (data.bath < 2.5)] = 2.0
count = Counter(data.bath)
usefulValues = np.array(count.keys())[np.array(count.values()) > 100] 
data.bath = [x if x in usefulValues else np.nan for x in data.bath]
nanCounterMostCommon(data.bath)


/Users/qiweili/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app
Out[12]:
[(2.0, 21892),
 (3.0, 8882),
 (1.0, 7532),
 (2.5, 5374),
 (1.5, 1509),
 (4.0, 1439),
 (3.5, 947),
 (5.0, 280),
 (4.5, 168)]
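
The SettingWithCopyWarning above is triggered by the chained indexing in data.bath[...] = 2.0. A sketch of the equivalent .loc assignment that avoids the warning (behaviour is otherwise the same):

# equivalent assignment without chained indexing
mask = (data.bath > 2) & (data.bath < 2.5)
data.loc[mask, "bath"] = 2.0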

sqft


In [13]:
data.sqft = convertFloat(data.sqft, valueConstraint=[0, 8000])
nanDistPlot(data.sqft)


price


In [14]:
data.price = convertFloat(data.price, valueConstraint=PRICE_CONST)
nanDistPlot(data.price)


Zestimate


In [15]:
data.Zestimate = convertFloat(onlyNumber(data.Zestimate), valueConstraint=PRICE_CONST)
nanDistPlot(data.Zestimate)


type


In [16]:
count = Counter(data.type)
usefulValues = np.array(count.keys())[np.array(count.values()) > 300]
data.type = np.array([x if x in usefulValues else np.nan for x in data.type])
nanCounterMostCommon(data.type)


/Users/qiweili/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:3: FutureWarning: elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison
  app.launch_new_instance()
Out[16]:
[('Single Family', 32223),
 ('Condo', 10862),
 ('Townhouse', 4327),
 ('Multi Family', 943),
 ('nan', 696),
 ('Mobile / Manufactured', 499)]

built


In [17]:
data.built = convertInt(data.built, valueConstraint=[1850, 2018])
nanDistPlot(data.built)


Parking


In [18]:
data = data.drop('Parking', axis=1)

Lastremodel


In [19]:
data.Lastremodel = convertInt(data.Lastremodel, valueConstraint=[1850, 2018])
nanDistPlot(data.Lastremodel)


year1 year2 year3 year4 year5


In [20]:
data.year1 = convertDatetime(data.year1)
data.year2 = convertDatetime(data.year2)
data.year3 = convertDatetime(data.year3)
data.year4 = convertDatetime(data.year4)
data.year5 = convertDatetime(data.year5)
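
convertDatetime only accepts the mm/dd/yy pattern, so entries stored as bare serial numbers (like the 38400 visible in the head() output earlier) become NaN. If those serials should be recovered, one hedged alternative is sketched below; it assumes they are Excel-style day counts, which is a guess about how the raw data was produced, and parseMixedDates is not a function used in this notebook.

# sketch: parse mm/dd/yy strings, then fall back to treating leftovers as Excel day serials
def parseMixedDates(col):
    parsed = pd.to_datetime(col, format="%m/%d/%y", errors="coerce")
    serial = pd.to_numeric(col, errors="coerce")
    fallback = pd.to_datetime(serial, unit="D", origin="1899-12-30", errors="coerce")
    return parsed.fillna(fallback)

# e.g. data.year1 = parseMixedDates(data.year1)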

price1 price2 price3 price4 price5


In [21]:
data.price1 = convertFloat(onlyNumber(data.price1), valueConstraint=PRICE_CONST)
data.price2 = convertFloat(onlyNumber(data.price2), valueConstraint=PRICE_CONST)
data.price3 = convertFloat(onlyNumber(data.price3), valueConstraint=PRICE_CONST)
data.price4 = convertFloat(onlyNumber(data.price4), valueConstraint=PRICE_CONST)
data.price5 = convertFloat(onlyNumber(data.price5), valueConstraint=PRICE_CONST)

check and save


In [22]:
plt.figure(figsize=(5,10))
sns.barplot(y=data.isnull().sum().index, x=data.isnull().sum().values*1.0/data.shape[0])


Out[22]:
<matplotlib.axes._subplots.AxesSubplot at 0x1119a8050>

In [23]:
data.to_csv("./data/clean/cleanData.csv", index=False)
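
When the cleaned file is read back for analysis, the sale-date columns will load as plain strings; a small sketch of reloading them as datetimes (assuming downstream code wants them parsed):

# sketch: reload the cleaned data with the sale-date columns parsed back into datetimes
clean = pd.read_csv("./data/clean/cleanData.csv",
                    parse_dates=["year1", "year2", "year3", "year4", "year5"])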
