In [1]:
from datetime import datetime, timedelta
import itertools
from IPython.display import clear_output
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:90% !important; }</style>"))
import time
import numpy as np
import pandas as pd
pd.options.display.max_columns = None
pd.options.display.max_rows = None
import matplotlib.pyplot as plt
%matplotlib inline
from collections import Counter, defaultdict
import seaborn as sns
import re
import math
from datetime import datetime
In [2]:
# # functions
# def diff_month(d1, d2):
# return (d1.year - d2.year) * 12 + d1.month - d2.month
# def convertMonthToNow(inputArr, now="2017-08-24"):
# outputArr = []
# for x in inputArr:
# try:
# d1 = datetime.strptime(now, "%Y-%m-%d")
# d2 = datetime.strptime(x, "%Y-%m-%d")
# r = diff_month(d1, d2)
# except:
# r = np.nan
# outputArr.append(r)
# return outputArr
# # load data
# data = pd.read_csv("./data/clean/cleanData.csv")
# data.built = 2017 - data.built
# data.Lastremodel = 2017 - data.Lastremodel
# # add current information
# data.rename(columns={"price":"year0"}, inplace=True)
# data.rename(columns={"Zestimate":"price0"}, inplace=True)
# data.year0 = np.zeros(data.shape[0])
# # convert datetime to monthToNow
# data.year1 = convertMonthToNow(data.year1)
# data.year2 = convertMonthToNow(data.year2)
# data.year3 = convertMonthToNow(data.year3)
# data.year4 = convertMonthToNow(data.year4)
# data.year5 = convertMonthToNow(data.year5)
# # manipulate data from wide to long
# data.loc[:, "houseId"] = data.index
# from IPython.display import clear_output
# dfs = []
# for name, group in data.groupby("houseId"):
# group.reset_index(drop=True, inplace=True)
# static = group.loc[:, ["city","zipcode","area","bed","bath","sqft","type","built"]]
# staticDf = pd.DataFrame(np.array(list(np.array(static))*6)).astype("str")
# dynamicDf = pd.DataFrame([[group.year0[0], group.price0[0]], [group.year1[0], group.price1[0]], [group.year2[0], group.price2[0]],
# [group.year3[0], group.price3[0]], [group.year4[0], group.price4[0]],[group.year5[0], group.price5[0]]])
# staticDf.rename(columns={0:'city', 1:'zipcode', 2:'area', 3:'bed', 4:'bath', 5:'sqft', 6:'type', 7:'built'}, inplace=True)
# dynamicDf.rename(columns={0:'monthToNow', 1:'price'}, inplace=True)
# dfs.append(pd.concat([staticDf, dynamicDf], axis=1))
# print name
# clear_output(wait=True)
# full_df = pd.concat(dfs, axis=0)
# full_df.shape
# full_df.to_csv("./data/clean/cleanDataLong.csv", index=False)
# full_df.head()
In [3]:
full_df = pd.read_csv("./data/clean/cleanDataLong.csv")
cleanDf = full_df.dropna(axis=0)
print cleanDf.shape
cleanDf.head()
Out[3]:
In [4]:
Counter(full_df.city)
Out[4]:
In [7]:
plt.figure(figsize = (20,10))
ax1 = plt.subplot(1,2,1)
sns.regplot(x="monthToNow", y="price", data=cleanDf, x_estimator=np.mean, ax=ax1)
sns.plt.xlim(0,250)
sns.plt.ylim(0,1500000)
plt.title("value trend (mean) - all data")
ax2 = plt.subplot(1,2,2)
sns.regplot(x="monthToNow", y="price", data=cleanDf, x_estimator=np.median, ax=ax2)
sns.plt.xlim(0,250)
sns.plt.ylim(0,1500000)
plt.title("value trend (median) - all data")
Out[7]:
In [14]:
plt.figure(figsize = (30,30))
ax1 = plt.subplot(3,3,1)
ax2 = plt.subplot(3,3,2)
ax3 = plt.subplot(3,3,3)
ax4 = plt.subplot(3,3,4)
ax5 = plt.subplot(3,3,5)
ax6 = plt.subplot(3,3,6)
ax7 = plt.subplot(3,3,7)
ax8 = plt.subplot(3,3,8)
ax9 = plt.subplot(3,3,9)
sns.regplot(x="monthToNow", y="price", data=cleanDf.loc[cleanDf.city=='Union City',:], x_estimator=np.mean, ax=ax1)
ax1.set_xlim([0, 250])
ax1.set_ylim([0, 1800000])
ax1.set_title("value trend (mean) - Union City")
sns.regplot(x="monthToNow", y="price", data=cleanDf.loc[cleanDf.city=='Fremont',:], x_estimator=np.mean, ax=ax2)
ax2.set_xlim([0, 250])
ax2.set_ylim([0, 1800000])
ax2.set_title("value trend (mean) - Fremont")
sns.regplot(x="monthToNow", y="price", data=cleanDf.loc[cleanDf.city=='Newark',:], x_estimator=np.mean, ax=ax3)
ax3.set_xlim([0, 250])
ax3.set_ylim([0, 1800000])
ax3.set_title("value trend (mean) - Newark")
sns.regplot(x="monthToNow", y="price", data=cleanDf.loc[cleanDf.city=='San Mateo',:], x_estimator=np.mean, ax=ax4)
ax4.set_xlim([0, 250])
ax4.set_ylim([0, 1800000])
ax4.set_title("value trend (mean) - San Mateo")
sns.regplot(x="monthToNow", y="price", data=cleanDf.loc[cleanDf.city=='Redwood City',:], x_estimator=np.mean, ax=ax5)
ax5.set_xlim([0, 250])
ax5.set_ylim([0, 1800000])
ax5.set_title("value trend (mean) - Redwood City")
sns.regplot(x="monthToNow", y="price", data=cleanDf.loc[cleanDf.city=='Mountain View',:], x_estimator=np.mean, ax=ax6)
ax6.set_xlim([0, 250])
ax6.set_ylim([0, 1800000])
ax6.set_title("value trend (mean) - Mountain View")
sns.regplot(x="monthToNow", y="price", data=cleanDf.loc[cleanDf.city=='Santa Clara',:], x_estimator=np.mean, ax=ax7)
ax7.set_xlim([0, 250])
ax7.set_ylim([0, 1800000])
ax7.set_title("value trend (mean) - Santa Clara")
sns.regplot(x="monthToNow", y="price", data=cleanDf.loc[cleanDf.city=='Sunnyvale',:], x_estimator=np.mean, ax=ax8)
ax8.set_xlim([0, 250])
ax8.set_ylim([0, 1800000])
ax8.set_title("value trend (mean) - Sunnyvale")
sns.regplot(x="monthToNow", y="price", data=cleanDf.loc[cleanDf.city=='San Jose',:], x_estimator=np.mean, ax=ax9)
ax9.set_xlim([0, 250])
ax9.set_ylim([0, 1800000])
ax9.set_title("value trend (mean) - San Jose")
Out[14]:
In [15]:
plt.figure(figsize = (30,30))
ax1 = plt.subplot(3,3,1)
ax2 = plt.subplot(3,3,2)
ax3 = plt.subplot(3,3,3)
ax4 = plt.subplot(3,3,4)
ax5 = plt.subplot(3,3,5)
ax6 = plt.subplot(3,3,6)
ax7 = plt.subplot(3,3,7)
ax8 = plt.subplot(3,3,8)
ax9 = plt.subplot(3,3,9)
sns.regplot(x="monthToNow", y="price", data=cleanDf.loc[cleanDf.city=='Union City',:], x_estimator=np.median, ax=ax1)
ax1.set_xlim([0, 250])
ax1.set_ylim([0, 1800000])
ax1.set_title("value trend (median) - Union City")
sns.regplot(x="monthToNow", y="price", data=cleanDf.loc[cleanDf.city=='Fremont',:], x_estimator=np.median, ax=ax2)
ax2.set_xlim([0, 250])
ax2.set_ylim([0, 1800000])
ax2.set_title("value trend (median) - Fremont")
sns.regplot(x="monthToNow", y="price", data=cleanDf.loc[cleanDf.city=='Newark',:], x_estimator=np.median, ax=ax3)
ax3.set_xlim([0, 250])
ax3.set_ylim([0, 1800000])
ax3.set_title("value trend (median) - Newark")
sns.regplot(x="monthToNow", y="price", data=cleanDf.loc[cleanDf.city=='San Mateo',:], x_estimator=np.median, ax=ax4)
ax4.set_xlim([0, 250])
ax4.set_ylim([0, 1800000])
ax4.set_title("value trend (median) - San Mateo")
sns.regplot(x="monthToNow", y="price", data=cleanDf.loc[cleanDf.city=='Redwood City',:], x_estimator=np.median, ax=ax5)
ax5.set_xlim([0, 250])
ax5.set_ylim([0, 1800000])
ax5.set_title("value trend (median) - Redwood City")
sns.regplot(x="monthToNow", y="price", data=cleanDf.loc[cleanDf.city=='Mountain View',:], x_estimator=np.median, ax=ax6)
ax6.set_xlim([0, 250])
ax6.set_ylim([0, 1800000])
ax6.set_title("value trend (median) - Mountain View")
sns.regplot(x="monthToNow", y="price", data=cleanDf.loc[cleanDf.city=='Santa Clara',:], x_estimator=np.median, ax=ax7)
ax7.set_xlim([0, 250])
ax7.set_ylim([0, 1800000])
ax7.set_title("value trend (median) - Santa Clara")
sns.regplot(x="monthToNow", y="price", data=cleanDf.loc[cleanDf.city=='Sunnyvale',:], x_estimator=np.median, ax=ax8)
ax8.set_xlim([0, 250])
ax8.set_ylim([0, 1800000])
ax8.set_title("value trend (median) - Sunnyvale")
sns.regplot(x="monthToNow", y="price", data=cleanDf.loc[cleanDf.city=='San Jose',:], x_estimator=np.median, ax=ax9)
ax9.set_xlim([0, 250])
ax9.set_ylim([0, 1800000])
ax9.set_title("value trend (median) - San Jose")
Out[15]:
In [18]:
sns.lmplot(x="monthToNow", y="price", data=cleanDf, x_estimator=np.mean, scatter=False, hue="city")
sns.plt.xlim(0,250)
sns.plt.ylim(0,)
plt.title("value trend (mean) - by city")
sns.lmplot(x="monthToNow", y="price", data=cleanDf, x_estimator=np.median, scatter=False, hue="city")
sns.plt.xlim(0,250)
sns.plt.ylim(0,)
plt.title("value trend (median) - by city")
Out[18]:
In [19]:
sns.lmplot(x="monthToNow", y="price", data=cleanDf, x_estimator=np.median, scatter=False, hue="type")
sns.plt.xlim(0,250)
sns.plt.ylim(0,)
plt.title("value trend (median) - by type")
Out[19]: