In [1]:
    
# Imports and notebook-wide display configuration for the whole analysis.
from datetime import datetime, timedelta
import itertools
from IPython.display import clear_output
from IPython.core.display import display, HTML
# Inject a CSS rule that sets the notebook's ".container" element to 90% width.
display(HTML("<style>.container { width:90% !important; }</style>"))
import time
import numpy as np
import pandas as pd
# Set the pandas display limits for columns and rows to None.
pd.options.display.max_columns = None
pd.options.display.max_rows = None
import matplotlib.pyplot as plt
# IPython line magic selecting the inline matplotlib backend.
%matplotlib inline
from collections import Counter, defaultdict
import seaborn as sns
import re
import math
# NOTE(review): datetime is already imported on the first line of this cell;
# this line repeats that import.
from datetime import datetime
    
    
In [2]:
    
# Load the cleaned CSV into `data` and report its (rows, columns) shape.
CLEAN_DATA_PATH = "./data/clean/cleanData.csv"
data = pd.read_csv(CLEAN_DATA_PATH)
print(data.shape)
    
    
In [3]:
    
# Replace "built" and "Lastremodel" with their difference from 2017.
# NOTE(review): presumably both columns hold calendar years, which would make
# the results ages in years as of 2017 -- confirm against the cleaning step.
# Re-running this cell applies the subtraction a second time.
REFERENCE_YEAR = 2017
for year_col in ("built", "Lastremodel"):
    data[year_col] = REFERENCE_YEAR - data[year_col]
data.head()
    
    Out[3]:
In [4]:
    
# Scatter the "price" column (x) against "Zestimate" (y), then overlay
# price-vs-price in "." markers as a y = x reference diagonal.
price = data["price"]
zestimate = data["Zestimate"]
plt.scatter(x=price, y=zestimate)
plt.xlabel("price")
plt.ylabel("Zestimate")
plt.scatter(x=price, y=price, marker=".")
    
    Out[4]:
    
In [5]:
    
# Keep only the columns used from here on, then plot the fraction of missing
# values per retained column (null count divided by the number of rows).
use_columns = [
    "city", "zipcode", "area", "bed", "bath",
    "sqft", "Zestimate", "type", "built",
]
data = data.loc[:, use_columns]
# "* 1.0" forces float division so the ratio is not truncated under Python 2.
null_fraction = data.isnull().sum() * 1.0 / data.shape[0]
sns.barplot(y=null_fraction.index, x=null_fraction.values)
    
    Out[5]:
    
In [6]:
    
# Keep only rows with no missing values, printing the shape before and after.
print(data.shape)
null_counts = data.isnull().sum(axis=1)
ind = null_counts == 0  # True for rows with zero nulls
data = data.loc[ind, :]
print(data.shape)
    
    
In [7]:
    
# Preview the first five rows of the filtered frame.
data.head(n=5)
    
    Out[7]:
In [8]:
    
# Split the retained columns into a categorical frame (to be one-hot encoded)
# and a continuous frame (used as-is, including the "Zestimate" target).
#
# The bedroom-count column is named "bed", matching `use_columns`. Selecting a
# label that does not exist would either raise KeyError or, on old pandas,
# silently yield an all-NaN column that turns into a constant dummy feature.
categorical_columns = ["city", "zipcode", "area", "bed", "bath", "type"]
continuous_columns = ["sqft", "Zestimate", "built"]

# Cast categoricals to str so get_dummies treats numeric codes (zipcode, bed,
# bath) as discrete levels rather than as numbers.
cate_df = data.loc[:, categorical_columns].astype("str")
cont_df = data.loc[:, continuous_columns]
    
In [9]:
    
# One-hot encode the string-typed categorical frame and report the
# resulting (rows, columns) shape.
dummy_df = pd.get_dummies(data=cate_df)
print(dummy_df.shape)
    
    
In [10]:
    
# Place the continuous columns and the one-hot columns side by side
# (column-wise concat), then show the shape and the first rows.
frames = [cont_df, dummy_df]
df_full = pd.concat(frames, axis=1)
print(df_full.shape)
df_full.head()
    
    
    Out[10]:
In [11]:
    
from sklearn.model_selection import cross_val_predict, cross_val_score
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
    
In [12]:
    
# Separate the feature matrix X (every column except the target) from the
# target vector y ("Zestimate"), and print both shapes.
target_column = "Zestimate"
# axis is passed by keyword: positional arguments to DataFrame.drop other than
# `labels` are deprecated in pandas 1.3 and rejected from pandas 2.0.
X = df_full.drop(target_column, axis=1)
y = df_full[target_column]
print(X.shape)
print(y.shape)
    
    
In [13]:
    
# Repeat a 5-fold cross-validated random forest `rep` times on all features.
#   mat   -- one array of out-of-fold predictions per repetition
#   ip    -- feature importances SUMMED over repetitions (not averaged), taken
#            from a forest fit on the full data in each repetition
#   preds -- per-sample mean of the out-of-fold predictions
#   err   -- RMSE of `preds` against the true target
rep = 10
mat = []
ip = np.zeros(X.shape[1])
for i in range(rep):
    # Seed each repetition with its own index: the forests still differ from
    # one repetition to the next, but re-running the cell reproduces the same
    # predictions, error and importance ranking.
    RF = RandomForestRegressor(random_state=i)
    pred = cross_val_predict(RF, X, y, cv=5)
    mat.append(pred)
    # cross_val_predict fits clones, so fit RF itself to read its importances.
    RF.fit(X, y)
    ip += RF.feature_importances_
    print(i)

preds = np.array(mat).mean(0)
err = np.sqrt(mean_squared_error(y_pred=preds, y_true=y))
# Predictions against truth, with the y = x diagonal drawn as "." markers.
plt.scatter(x=y, y=preds)
plt.scatter(x=y, y=y, marker=".")
plt.xlabel("true")
plt.ylabel("prediction")
plt.title("RF error: " + str(err))
    
    
    Out[13]:
    
In [14]:
    
# Pair each feature name with its accumulated importance, order the table from
# most to least important, and show the top 20 rows.
feature_importance_pairs = list(zip(X.columns, ip))
ipdf = pd.DataFrame(feature_importance_pairs, columns=["feature", "importance"])
ipdf = ipdf.sort_values("importance", ascending=False)
ipdf.head(20)
    
    Out[14]:
In [15]:
    
# Plot the running total of importances taken in descending order.
ranked_importances = sorted(ipdf.importance, reverse=True)
cumulative_importance = np.cumsum(ranked_importances)
plt.plot(cumulative_importance)
    
    Out[15]:
    
In [16]:
    
# Restrict X to the first 20 features of the importance table (`ipdf` is
# sorted by descending importance) and rebuild y from the full frame.
N_TOP_FEATURES = 20
top_features = ipdf["feature"][:N_TOP_FEATURES]
X = X.loc[:, top_features]
y = df_full["Zestimate"]
print(X.shape)
print(y.shape)
    
    
In [17]:
    
# Repeat a 5-fold cross-validated random forest `rep` times on the current X.
#   mat   -- one array of out-of-fold predictions per repetition
#   ip    -- feature importances SUMMED over repetitions (not averaged), taken
#            from a forest fit on the full data in each repetition
#   preds -- per-sample mean of the out-of-fold predictions
#   err   -- RMSE of `preds` against the true target
rep = 10
mat = []
ip = np.zeros(X.shape[1])
for i in range(rep):
    # Seed each repetition with its own index: the forests still differ from
    # one repetition to the next, but re-running the cell reproduces the same
    # predictions, error and importance ranking.
    RF = RandomForestRegressor(random_state=i)
    pred = cross_val_predict(RF, X, y, cv=5)
    mat.append(pred)
    # cross_val_predict fits clones, so fit RF itself to read its importances.
    RF.fit(X, y)
    ip += RF.feature_importances_
    print(i)

preds = np.array(mat).mean(0)
err = np.sqrt(mean_squared_error(y_pred=preds, y_true=y))
# Predictions against truth, with the y = x diagonal drawn as "." markers.
plt.scatter(x=y, y=preds)
plt.scatter(x=y, y=y, marker=".")
plt.xlabel("true")
plt.ylabel("prediction")
plt.title("RF error: " + str(err))
    
    
    Out[17]:
    
In [18]:
    
# Tabulate the accumulated importances for the current columns of X and show
# the 20 largest, most important first.
(
    pd.DataFrame(list(zip(X.columns, ip)), columns=["feature", "importance"])
    .sort_values("importance", ascending=False)
    .head(20)
)
    
    Out[18]:
In [19]:
    
# Scatter the "sqft" column (x) against "Zestimate" (y).
sqft = data["sqft"]
zestimate = data["Zestimate"]
plt.scatter(x=sqft, y=zestimate)
plt.xlabel("sqft")
plt.ylabel("Zestimate")
    
    Out[19]:
    
In [20]:
    
# Box plot of Zestimate for each zipcode on a wide (40 x 10) figure.
wide_figsize = (40, 10)
plt.figure(figsize=wide_figsize)
sns.boxplot(x=data["zipcode"], y=data["Zestimate"])
# NOTE(review): this rotates the y tick labels; the crowded axis is the
# zipcode (x) axis, so plt.xticks may have been intended -- confirm.
plt.yticks(rotation=90)
    
    Out[20]:
    
In [25]:
    
# Box plot of Zestimate divided by sqft for each zipcode on a wide figure.
plt.figure(figsize=(40,10))
zestimate_per_sqft = data.Zestimate / data.sqft
sns.boxplot(x=data.zipcode, y=zestimate_per_sqft)
plt.yticks(rotation=90)
# Clip the y range to 0-2000 so extreme ratios do not flatten the boxes.
plt.ylim(0,2000)
# The ratio of two differently named Series carries no name, so the y axis
# would otherwise be unlabeled; label it explicitly.
plt.ylabel("Zestimate / sqft")
plt.title("Zestimate per square foot by zip code")
    
    Out[25]:
    
In [21]:
    
# Box plot of Zestimate for each value of the "bed" column.
sns.boxplot(x=data["bed"], y=data["Zestimate"])
    
    Out[21]:
    
In [22]:
    
# Box plot of Zestimate for each value of the "bath" column.
sns.boxplot(x=data["bath"], y=data["Zestimate"])
    
    Out[22]:
    
In [ ]: