In [1]:
# Notebook-wide imports and display configuration.
from datetime import datetime, timedelta
import itertools
from IPython.display import clear_output
# NOTE(review): IPython.core.display is a legacy import path; IPython.display
# exposes the same names -- confirm against the installed IPython before changing.
from IPython.core.display import display, HTML
# Inject CSS so the notebook container uses 90% of the browser width.
display(HTML("<style>.container { width:90% !important; }</style>"))
import time
import numpy as np
import pandas as pd
# None lifts the display limits, so frames are rendered with every column and row.
pd.options.display.max_columns = None
pd.options.display.max_rows = None
import matplotlib.pyplot as plt
# Render matplotlib figures inline in the cell output.
%matplotlib inline
from collections import Counter, defaultdict
import seaborn as sns
import re
import math
# NOTE(review): datetime is already imported at the top of this cell; this line is redundant.
from datetime import datetime
In [2]:
# Read the CSV at ./data/clean/cleanData.csv and report its (rows, columns) shape.
csv_path = "./data/clean/cleanData.csv"
data = pd.read_csv(csv_path)
print(data.shape)
In [3]:
# Replace each value in `built` and `Lastremodel` with 2017 minus that value.
# Presumably both columns hold calendar years, making the result an age in
# years as of 2017 -- TODO confirm against the data dictionary.
# NOTE: this overwrites the columns in place, so running the cell a second
# time turns the values back into the originals.
for year_column in ("built", "Lastremodel"):
    data[year_column] = 2017 - data[year_column]
data.head()
Out[3]:
In [4]:
# Scatter Zestimate against price; the second, dotted series plots price
# against itself so the identity line is visible as a reference.
plt.scatter(data["price"], data["Zestimate"])
plt.xlabel("price")
plt.ylabel("Zestimate")
plt.scatter(data["price"], data["price"], marker=".")
Out[4]:
In [5]:
# Keep only the columns used from here on, then chart the fraction of
# missing values per column (null count divided by the number of rows).
use_columns = ["city","zipcode","area","bed","bath","sqft","Zestimate","type","built"]
data = data.loc[:, use_columns]
null_counts = data.isnull().sum()
null_fraction = null_counts.values * 1.0 / len(data)
sns.barplot(y=null_counts.index, x=null_fraction)
Out[5]:
In [6]:
# Keep only the rows with no missing value in any column, printing the
# frame's shape before and after so the number of dropped rows is visible.
print(data.shape)
ind = data.notnull().all(axis=1)
data = data.loc[ind, :]
print(data.shape)
In [7]:
# Preview the first five rows of the filtered frame.
data.head(5)
Out[7]:
In [8]:
# Split the columns by how they are encoded for the model.
# Categorical columns are cast to strings so that every distinct value
# (including the numeric-looking zipcode, bed and bath values) is treated as a
# label when the frame is one-hot encoded.
categorical_columns = ["city", "zipcode", "area", "bed", "bath", "type"]
# Continuous columns are used as-is; Zestimate stays in this frame and is
# separated out as the regression target further down.
continuous_columns = ["sqft", "Zestimate", "built"]

cate_df = data.loc[:, categorical_columns].astype("str")
cont_df = data.loc[:, continuous_columns]
In [9]:
# One-hot encode the string-typed categorical frame and report the
# resulting (rows, indicator columns) shape.
dummy_df = pd.get_dummies(data=cate_df)
print(dummy_df.shape)
In [10]:
# Place the continuous columns and the indicator columns side by side
# (column-wise concatenation, aligned on the row index).
frames = [cont_df, dummy_df]
df_full = pd.concat(frames, axis=1)
print(df_full.shape)
df_full.head()
Out[10]:
In [11]:
from sklearn.model_selection import cross_val_predict, cross_val_score
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
In [12]:
# Target is the Zestimate column; every other column of df_full is a feature.
y = df_full["Zestimate"]
X = df_full.drop("Zestimate", axis=1)
print(X.shape)
print(y.shape)
In [13]:
# Repeated 5-fold cross-validation of a random forest on the full feature set.
#
# Each repetition
#   * collects cross-validated predictions for every row, and
#   * refits a forest on all rows to read its feature importances.
# The predictions and importances are then averaged over the repetitions.
rep = 10
mat = []                      # one prediction vector per repetition
ip = np.zeros(X.shape[1])     # running sum of feature importances
for i in range(rep):
    # A distinct, fixed seed per repetition: the forests still differ from
    # one repetition to the next, but re-running the cell gives the same result.
    RF = RandomForestRegressor(random_state=i)
    pred = cross_val_predict(RF, X, y, cv=5)
    mat.append(pred)
    # cross_val_predict fits clones, so RF itself is still unfitted here.
    RF.fit(X, y)
    ip += RF.feature_importances_
    print(i)                  # progress indicator

# Turn the running sum into a mean so the importances are on the same scale
# as a single forest's feature_importances_.
ip /= rep

# RMSE of the repetition-averaged predictions against the true target.
preds = np.array(mat).mean(0)
err = np.sqrt(mean_squared_error(y_pred=preds, y_true=y))

# Predicted vs. true values; the dotted y-vs-y series marks the identity line.
plt.scatter(x=y, y=preds)
plt.scatter(x=y, y=y, marker=".")
plt.xlabel("true")
plt.ylabel("prediction")
plt.title("RF error: " + str(err))
Out[13]:
In [14]:
# Pair every feature name with its accumulated importance and rank the
# features from most to least important; show the top 20.
importance_pairs = list(zip(X.columns, ip))
ipdf = pd.DataFrame(importance_pairs, columns=["feature", "importance"])
ipdf = ipdf.sort_values(by="importance", ascending=False)
ipdf.head(20)
Out[14]:
In [15]:
# Cumulative importance with the features ordered from largest to smallest,
# to see how many features account for most of the total.
descending_importance = np.sort(ipdf["importance"].values)[::-1]
plt.plot(descending_importance.cumsum())
Out[15]:
In [16]:
# Restrict the feature matrix to the first 20 entries of the ranked
# importance table (ipdf is sorted by importance, descending); the target is
# unchanged.
top_features = ipdf["feature"].head(20)
X = X.loc[:, top_features]
y = df_full["Zestimate"]
print(X.shape)
print(y.shape)
In [17]:
# Repeated 5-fold cross-validation of a random forest on the current
# (reduced) feature matrix X.
#
# Each repetition
#   * collects cross-validated predictions for every row, and
#   * refits a forest on all rows to read its feature importances.
# The predictions and importances are then averaged over the repetitions.
rep = 10
mat = []                      # one prediction vector per repetition
ip = np.zeros(X.shape[1])     # running sum of feature importances
for i in range(rep):
    # A distinct, fixed seed per repetition: the forests still differ from
    # one repetition to the next, but re-running the cell gives the same result.
    RF = RandomForestRegressor(random_state=i)
    pred = cross_val_predict(RF, X, y, cv=5)
    mat.append(pred)
    # cross_val_predict fits clones, so RF itself is still unfitted here.
    RF.fit(X, y)
    ip += RF.feature_importances_
    print(i)                  # progress indicator

# Turn the running sum into a mean so the importances are on the same scale
# as a single forest's feature_importances_.
ip /= rep

# RMSE of the repetition-averaged predictions against the true target.
preds = np.array(mat).mean(0)
err = np.sqrt(mean_squared_error(y_pred=preds, y_true=y))

# Predicted vs. true values; the dotted y-vs-y series marks the identity line.
plt.scatter(x=y, y=preds)
plt.scatter(x=y, y=y, marker=".")
plt.xlabel("true")
plt.ylabel("prediction")
plt.title("RF error: " + str(err))
Out[17]:
In [18]:
# Feature importances of the model fitted on the current X, ranked from
# most to least important (top 20 rows shown).
importance_table = pd.DataFrame(list(zip(X.columns, ip)), columns=["feature", "importance"])
importance_table.sort_values(by="importance", ascending=False).head(20)
Out[18]:
In [19]:
# Scatter of Zestimate against the sqft column.
plt.scatter(data["sqft"], data["Zestimate"])
plt.xlabel("sqft")
plt.ylabel("Zestimate")
Out[19]:
In [20]:
# Distribution of Zestimate within each zip code, on an extra-wide figure
# because there is one box per distinct zipcode value.
plt.figure(figsize=(40, 10))
sns.boxplot(x=data.zipcode, y=data.Zestimate)
# The zip codes are the crowded categorical labels on the x axis; turn them
# vertical so neighbouring labels do not overlap.
plt.xticks(rotation=90)
Out[20]:
In [25]:
# Distribution of Zestimate divided by sqft within each zip code, on an
# extra-wide figure because there is one box per distinct zipcode value.
plt.figure(figsize=(40, 10))
sns.boxplot(x=data.zipcode, y=data.Zestimate / data.sqft)
# The zip codes are the crowded categorical labels on the x axis; turn them
# vertical so neighbouring labels do not overlap.
plt.xticks(rotation=90)
# Clip the value axis to 0-2000 so extreme ratios do not flatten the boxes.
plt.ylim(0, 2000)
plt.title("price per square by zip code")
Out[25]:
In [21]:
# Distribution of Zestimate for each distinct value of the bed column.
sns.boxplot(x="bed", y="Zestimate", data=data)
Out[21]:
In [22]:
# Distribution of Zestimate for each distinct value of the bath column.
sns.boxplot(x="bath", y="Zestimate", data=data)
Out[22]:
In [ ]: