In [1]:
from datetime import datetime, timedelta
import itertools
import time
import re
import math
import copy
from collections import Counter, defaultdict

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from IPython.display import clear_output, display, HTML

pd.options.display.max_columns = None
pd.options.display.max_rows = None
display(HTML("<style>.container { width:90% !important; }</style>"))
%matplotlib inline
In [2]:
PRICE_CONST = [10000, 4000000]  # plausible sale-price range in USD; anything outside becomes NaN

def onlyNumber(inputArr):
    # strip every character except digits and '.' (e.g. "$1,234" -> "1234")
    outputArr = []
    for x in inputArr:
        try:
            r = re.sub("[^0-9.]", "", str(x))
        except Exception:
            r = np.nan
        outputArr.append(r)
    return outputArr

def convertInt(inputArr, valueConstraint=None):
    # cast each element to int; unparseable or out-of-range values become NaN
    outputArr = []
    for x in inputArr:
        try:
            r = int(x)
            if valueConstraint is not None:
                lowerBound, upperBound = valueConstraint
                if r < lowerBound or r > upperBound:
                    r = np.nan
        except (TypeError, ValueError):
            r = np.nan
        outputArr.append(r)
    return outputArr

def convertFloat(inputArr, valueConstraint=None):
    # cast each element to float; unparseable or out-of-range values become NaN
    outputArr = []
    for x in inputArr:
        try:
            r = float(x)
            if valueConstraint is not None:
                lowerBound, upperBound = valueConstraint
                if r < lowerBound or r > upperBound:
                    r = np.nan
        except (TypeError, ValueError):
            r = np.nan
        outputArr.append(r)
    return outputArr

def convertDatetime(inputArr):
    # parse 'MM/DD/YY' strings; anything else becomes NaN
    outputArr = []
    for x in inputArr:
        try:
            r = datetime.strptime(x, '%m/%d/%y')
        except (TypeError, ValueError):
            r = np.nan
        outputArr.append(r)
    return outputArr

def nanCounter(inputArr):
    # count values ignoring NaNs; np.isnan raises TypeError on non-numeric
    # data, in which case we fall back to counting everything as-is
    try:
        c = Counter(inputArr[~np.isnan(inputArr)])
    except TypeError:
        c = Counter(inputArr)
    return c

def nanCounterMostCommon(inputArr):
    # same as nanCounter, but return (value, count) pairs sorted by frequency
    return nanCounter(inputArr).most_common()

def nanDistPlot(inputArr):
    # distribution plot of the non-NaN values
    x = inputArr[~np.isnan(inputArr)]
    sns.distplot(x)
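A quick smoke test of these helpers on toy inputs (added here as a sketch; not a cell from the original notebook):

# hypothetical smoke test for the cleaning helpers (sketch)
print(convertInt(["3", "x", 9999], valueConstraint=[0, 10]))  # [3, nan, nan]
print(convertFloat(["2.5", None]))                            # [2.5, nan]
print(onlyNumber(["$1,234", "560 sqft"]))                     # ['1234', '560']
print(convertDatetime(["07/16/17", "bad"]))                   # [datetime(2017, 7, 16, 0, 0), nan]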
In [3]:
data1 = pd.read_csv("./data/raw/Zillow-Fremont_20170716.csv")
data2 = pd.read_csv("./data/raw/zillow_1234city_20170811.csv")
data3 = pd.read_csv("./data/raw/city5678.csv")
data1.rename(columns={"bet": "bed"}, inplace=True)  # the first file's header has "bet" instead of "bed"
print(data1.shape)
print(data2.shape)
print(data3.shape)
In [4]:
# do the three files have the same column names, in the same order?
print(sum(data1.columns != data2.columns))
print(sum(data1.columns != data3.columns))
print(sum(data2.columns != data3.columns))
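The elementwise `!=` comparison above only works because the columns happen to be in the same order; an order-insensitive check (an added sketch, not from the original notebook) would compare sets:

# order-insensitive version of the same check (sketch)
print(set(data1.columns) == set(data2.columns) == set(data3.columns))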
In [5]:
# merge them into one dataframe
data = pd.concat([data1, data2, data3])
data.reset_index(drop=True, inplace=True)
print(data.shape)
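A quick sanity check (an added sketch) that the concatenation kept every row:

assert data.shape[0] == data1.shape[0] + data2.shape[0] + data3.shape[0]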
In [6]:
plt.figure(figsize=(5, 10))
null_frac = data.isnull().mean()  # fraction of missing values per column
sns.barplot(y=null_frac.index, x=null_frac.values)
Out[6]:
In [7]:
useful_columns = ["city", "zipcode", "area", "bed", "bath", "sqft", "price", "Zestimate",
                  "type", "built", "Parking", "Lastremodel",
                  "year1", "price1", "year2", "price2", "year3", "price3",
                  "year4", "price4", "year5", "price5"]
data = data.loc[:, useful_columns]
print(data.shape)
data.head()
Out[7]:
In [8]:
count = Counter(data.city)
usefulValues = np.array(list(count.keys()))[np.array(list(count.values())) > 1000]
data.city = np.array([x if x in usefulValues else np.nan for x in data.city])
nanCounterMostCommon(data.city)
Out[8]:
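The Counter-and-threshold pattern in this cell (and the similar cells below) can also be expressed with pandas built-ins; an equivalent sketch, not the notebook's own code:

# equivalent rare-category filter via value_counts (sketch)
vc = data.city.value_counts()
data.city = data.city.where(data.city.isin(vc.index[vc > 1000]))  # rare cities -> NaN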
In [9]:
data.zipcode = convertInt(data.zipcode)
count = Counter(data.zipcode)
usefulValues = np.array(list(count.keys()))[np.array(list(count.values())) > 100]
data.zipcode = [x if x in usefulValues else np.nan for x in data.zipcode]
nanCounterMostCommon(data.zipcode)
Out[9]:
In [10]:
count = Counter(data.area)
usefulValues = np.array(list(count.keys()))[np.array(list(count.values())) > 10]
data.area = [x if x in usefulValues else np.nan for x in data.area]
nanCounterMostCommon(data.area)
Out[10]:
In [11]:
data.bed = convertInt(data.bed)
count = Counter(data.bed)
usefulValues = np.array(list(count.keys()))[np.array(list(count.values())) > 100]
data.bed = [x if x in usefulValues else np.nan for x in data.bed]
nanCounterMostCommon(data.bed)
Out[11]:
In [12]:
data.bath = convertFloat(data.bath)
data.loc[(data.bath > 2) & (data.bath < 2.5), "bath"] = 2.0  # .loc avoids chained assignment; folds e.g. 2.25 into 2.0
count = Counter(data.bath)
usefulValues = np.array(list(count.keys()))[np.array(list(count.values())) > 100]
data.bath = [x if x in usefulValues else np.nan for x in data.bath]
nanCounterMostCommon(data.bath)
Out[12]:
In [13]:
data.sqft = convertFloat(data.sqft, valueConstraint=[0, 8000])
nanDistPlot(data.sqft)
In [14]:
data.price = convertFloat(data.price, valueConstraint=PRICE_CONST)
nanDistPlot(data.price)
In [15]:
data.Zestimate = convertFloat(onlyNumber(data.Zestimate), valueConstraint=PRICE_CONST)
nanDistPlot(data.Zestimate)
In [16]:
count = Counter(data.type)
usefulValues = np.array(list(count.keys()))[np.array(list(count.values())) > 300]
data.type = np.array([x if x in usefulValues else np.nan for x in data.type])
nanCounterMostCommon(data.type)
Out[16]:
In [17]:
data.built = convertInt(data.built, valueConstraint=[1850, 2018])
nanDistPlot(data.built)
In [18]:
data = data.drop("Parking", axis=1)  # Parking is not used in the rest of the analysis, so drop it
In [19]:
data.Lastremodel = convertInt(data.Lastremodel, valueConstraint=[1850, 2018])
nanDistPlot(data.Lastremodel)
In [20]:
data.year1 = convertDatetime(data.year1)
data.year2 = convertDatetime(data.year2)
data.year3 = convertDatetime(data.year3)
data.year4 = convertDatetime(data.year4)
data.year5 = convertDatetime(data.year5)
In [21]:
data.price1 = convertFloat(onlyNumber(data.price1), valueConstraint=PRICE_CONST)
data.price2 = convertFloat(onlyNumber(data.price2), valueConstraint=PRICE_CONST)
data.price3 = convertFloat(onlyNumber(data.price3), valueConstraint=PRICE_CONST)
data.price4 = convertFloat(onlyNumber(data.price4), valueConstraint=PRICE_CONST)
data.price5 = convertFloat(onlyNumber(data.price5), valueConstraint=PRICE_CONST)
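The five year/price column pairs get identical treatment, so the two cells above could be collapsed into a loop; an equivalent sketch:

# equivalent loop over the five sale-history column pairs (sketch)
for i in range(1, 6):
    data[f"year{i}"] = convertDatetime(data[f"year{i}"])
    data[f"price{i}"] = convertFloat(onlyNumber(data[f"price{i}"]), valueConstraint=PRICE_CONST)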
In [22]:
plt.figure(figsize=(5, 10))
null_frac = data.isnull().mean()  # re-check missingness after cleaning
sns.barplot(y=null_frac.index, x=null_frac.values)
Out[22]:
In [23]:
data.to_csv("./data/clean/cleanData.csv", index=False)
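Note that to_csv writes the datetime columns back out as strings; if the clean file is re-read later, re-parsing them on load (an added sketch, assuming a downstream reader) would look like:

# hypothetical reload that restores the date dtypes (sketch)
clean = pd.read_csv("./data/clean/cleanData.csv",
                    parse_dates=["year1", "year2", "year3", "year4", "year5"])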