In [1]:
from datetime import datetime, timedelta
import itertools
from IPython.display import clear_output
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:90% !important; }</style>"))
import time
import numpy as np
import pandas as pd
pd.options.display.max_columns = None
pd.options.display.max_rows = None
import matplotlib.pyplot as plt
%matplotlib inline
from collections import Counter, defaultdict
import seaborn as sns
import re
import math
from datetime import datetime


manipulate data from wide to long


In [2]:
# # functions
# def diff_month(d1, d2):
#     return (d1.year - d2.year) * 12 + d1.month - d2.month

# def convertMonthToNow(inputArr, now="2017-08-24"):
#     outputArr = []
#     for x in inputArr:
#         try:
#             d1 = datetime.strptime(now, "%Y-%m-%d")
#             d2 = datetime.strptime(x, "%Y-%m-%d")
#             r = diff_month(d1, d2)
#         except:
#             r = np.nan
#         outputArr.append(r)
#     return outputArr

# # load data
# data = pd.read_csv("./data/clean/cleanData.csv")
# data.built = 2017 - data.built
# data.Lastremodel = 2017 - data.Lastremodel

# # add current information
# data.rename(columns={"price":"year0"}, inplace=True)
# data.rename(columns={"Zestimate":"price0"}, inplace=True)
# data.year0 = np.zeros(data.shape[0])

# # convert datetime to monthToNow
# data.year1 = convertMonthToNow(data.year1)
# data.year2 = convertMonthToNow(data.year2)
# data.year3 = convertMonthToNow(data.year3)
# data.year4 = convertMonthToNow(data.year4)
# data.year5 = convertMonthToNow(data.year5)

# # manipulate data from wide to long
# data.loc[:, "houseId"] = data.index
# from IPython.display import clear_output

# dfs = []
# for name, group in data.groupby("houseId"):
#     group.reset_index(drop=True, inplace=True)
#     static = group.loc[:, ["city","zipcode","area","bed","bath","sqft","type","built"]]
#     staticDf = pd.DataFrame(np.array(list(np.array(static))*6)).astype("str")
#     dynamicDf = pd.DataFrame([[group.year0[0], group.price0[0]], [group.year1[0], group.price1[0]], [group.year2[0], group.price2[0]],
#                               [group.year3[0], group.price3[0]], [group.year4[0], group.price4[0]],[group.year5[0], group.price5[0]]])
#     staticDf.rename(columns={0:'city', 1:'zipcode', 2:'area', 3:'bed', 4:'bath', 5:'sqft', 6:'type', 7:'built'}, inplace=True)
#     dynamicDf.rename(columns={0:'monthToNow', 1:'price'}, inplace=True)
#     dfs.append(pd.concat([staticDf, dynamicDf], axis=1))
#     print name
#     clear_output(wait=True)

# full_df = pd.concat(dfs, axis=0)
# full_df.shape
# full_df.to_csv("./data/clean/cleanDataLong.csv", index=False)
# full_df.head()

In [3]:
full_df = pd.read_csv("./data/clean/cleanDataLong.csv")
cleanDf = full_df.dropna(axis=0)
print cleanDf.shape
cleanDf.head()


(155273, 10)
Out[3]:
city zipcode area bed bath sqft type built monthToNow price
0 Fremont 94555.0 Northgate 4.0 1.0 1400.0 Single Family 47.0 0.0 986436.0
1 Fremont 94555.0 Northgate 4.0 1.0 1400.0 Single Family 47.0 218.0 292000.0
6 Fremont 94538.0 Downtown 2.0 2.0 1042.0 Condo 45.0 0.0 535006.0
8 Fremont 94538.0 Downtown 2.0 2.0 1042.0 Condo 45.0 4.0 160000.0
12 Fremont 94539.0 Mission Valley 3.0 2.0 1434.0 Single Family 56.0 0.0 1320951.0

In [4]:
Counter(full_df.city)


Out[4]:
Counter({nan: 150,
         'Fremont': 37902,
         'Mountain View': 11784,
         'Newark': 8388,
         'Redwood City': 12162,
         'San Jose': 161100,
         'San Mateo': 17496,
         'Santa Clara': 17922,
         'Sunnyvale': 19872,
         'Union City': 10524})

In [7]:
plt.figure(figsize = (20,10))
ax1 = plt.subplot(1,2,1)
sns.regplot(x="monthToNow", y="price", data=cleanDf, x_estimator=np.mean, ax=ax1)
sns.plt.xlim(0,250)
sns.plt.ylim(0,1500000)
plt.title("value trend (mean) - all data")

ax2 = plt.subplot(1,2,2)
sns.regplot(x="monthToNow", y="price", data=cleanDf, x_estimator=np.median, ax=ax2)
sns.plt.xlim(0,250)
sns.plt.ylim(0,1500000)
plt.title("value trend (median) - all data")


Out[7]:
<matplotlib.text.Text at 0x1239ffb90>

In [14]:
plt.figure(figsize = (30,30))
ax1 = plt.subplot(3,3,1)
ax2 = plt.subplot(3,3,2)
ax3 = plt.subplot(3,3,3)
ax4 = plt.subplot(3,3,4)
ax5 = plt.subplot(3,3,5)
ax6 = plt.subplot(3,3,6)
ax7 = plt.subplot(3,3,7)
ax8 = plt.subplot(3,3,8)
ax9 = plt.subplot(3,3,9)

sns.regplot(x="monthToNow", y="price", data=cleanDf.loc[cleanDf.city=='Union City',:], x_estimator=np.mean, ax=ax1)
ax1.set_xlim([0, 250])
ax1.set_ylim([0, 1800000])
ax1.set_title("value trend (mean) - Union City")

sns.regplot(x="monthToNow", y="price", data=cleanDf.loc[cleanDf.city=='Fremont',:], x_estimator=np.mean, ax=ax2)
ax2.set_xlim([0, 250])
ax2.set_ylim([0, 1800000])
ax2.set_title("value trend (mean) - Fremont")

sns.regplot(x="monthToNow", y="price", data=cleanDf.loc[cleanDf.city=='Newark',:], x_estimator=np.mean, ax=ax3)
ax3.set_xlim([0, 250])
ax3.set_ylim([0, 1800000])
ax3.set_title("value trend (mean) - Newark")

sns.regplot(x="monthToNow", y="price", data=cleanDf.loc[cleanDf.city=='San Mateo',:], x_estimator=np.mean, ax=ax4)
ax4.set_xlim([0, 250])
ax4.set_ylim([0, 1800000])
ax4.set_title("value trend (mean) - San Mateo")

sns.regplot(x="monthToNow", y="price", data=cleanDf.loc[cleanDf.city=='Redwood City',:], x_estimator=np.mean, ax=ax5)
ax5.set_xlim([0, 250])
ax5.set_ylim([0, 1800000])
ax5.set_title("value trend (mean) - Redwood City")

sns.regplot(x="monthToNow", y="price", data=cleanDf.loc[cleanDf.city=='Mountain View',:], x_estimator=np.mean, ax=ax6)
ax6.set_xlim([0, 250])
ax6.set_ylim([0, 1800000])
ax6.set_title("value trend (mean) - Mountain View")

sns.regplot(x="monthToNow", y="price", data=cleanDf.loc[cleanDf.city=='Santa Clara',:], x_estimator=np.mean, ax=ax7)
ax7.set_xlim([0, 250])
ax7.set_ylim([0, 1800000])
ax7.set_title("value trend (mean) - Santa Clara")

sns.regplot(x="monthToNow", y="price", data=cleanDf.loc[cleanDf.city=='Sunnyvale',:], x_estimator=np.mean, ax=ax8)
ax8.set_xlim([0, 250])
ax8.set_ylim([0, 1800000])
ax8.set_title("value trend (mean) - Sunnyvale")

sns.regplot(x="monthToNow", y="price", data=cleanDf.loc[cleanDf.city=='San Jose',:], x_estimator=np.mean, ax=ax9)
ax9.set_xlim([0, 250])
ax9.set_ylim([0, 1800000])
ax9.set_title("value trend (mean) - San Jose")


Out[14]:
<matplotlib.text.Text at 0x12d5897d0>

In [15]:
plt.figure(figsize = (30,30))
ax1 = plt.subplot(3,3,1)
ax2 = plt.subplot(3,3,2)
ax3 = plt.subplot(3,3,3)
ax4 = plt.subplot(3,3,4)
ax5 = plt.subplot(3,3,5)
ax6 = plt.subplot(3,3,6)
ax7 = plt.subplot(3,3,7)
ax8 = plt.subplot(3,3,8)
ax9 = plt.subplot(3,3,9)

sns.regplot(x="monthToNow", y="price", data=cleanDf.loc[cleanDf.city=='Union City',:], x_estimator=np.median, ax=ax1)
ax1.set_xlim([0, 250])
ax1.set_ylim([0, 1800000])
ax1.set_title("value trend (median) - Union City")

sns.regplot(x="monthToNow", y="price", data=cleanDf.loc[cleanDf.city=='Fremont',:], x_estimator=np.median, ax=ax2)
ax2.set_xlim([0, 250])
ax2.set_ylim([0, 1800000])
ax2.set_title("value trend (median) - Fremont")

sns.regplot(x="monthToNow", y="price", data=cleanDf.loc[cleanDf.city=='Newark',:], x_estimator=np.median, ax=ax3)
ax3.set_xlim([0, 250])
ax3.set_ylim([0, 1800000])
ax3.set_title("value trend (median) - Newark")

sns.regplot(x="monthToNow", y="price", data=cleanDf.loc[cleanDf.city=='San Mateo',:], x_estimator=np.median, ax=ax4)
ax4.set_xlim([0, 250])
ax4.set_ylim([0, 1800000])
ax4.set_title("value trend (median) - San Mateo")

sns.regplot(x="monthToNow", y="price", data=cleanDf.loc[cleanDf.city=='Redwood City',:], x_estimator=np.median, ax=ax5)
ax5.set_xlim([0, 250])
ax5.set_ylim([0, 1800000])
ax5.set_title("value trend (median) - Redwood City")

sns.regplot(x="monthToNow", y="price", data=cleanDf.loc[cleanDf.city=='Mountain View',:], x_estimator=np.median, ax=ax6)
ax6.set_xlim([0, 250])
ax6.set_ylim([0, 1800000])
ax6.set_title("value trend (median) - Mountain View")

sns.regplot(x="monthToNow", y="price", data=cleanDf.loc[cleanDf.city=='Santa Clara',:], x_estimator=np.median, ax=ax7)
ax7.set_xlim([0, 250])
ax7.set_ylim([0, 1800000])
ax7.set_title("value trend (median) - Santa Clara")

sns.regplot(x="monthToNow", y="price", data=cleanDf.loc[cleanDf.city=='Sunnyvale',:], x_estimator=np.median, ax=ax8)
ax8.set_xlim([0, 250])
ax8.set_ylim([0, 1800000])
ax8.set_title("value trend (median) - Sunnyvale")

sns.regplot(x="monthToNow", y="price", data=cleanDf.loc[cleanDf.city=='San Jose',:], x_estimator=np.median, ax=ax9)
ax9.set_xlim([0, 250])
ax9.set_ylim([0, 1800000])
ax9.set_title("value trend (median) - San Jose")


Out[15]:
<matplotlib.text.Text at 0x1395d9510>

In [18]:
sns.lmplot(x="monthToNow", y="price", data=cleanDf, x_estimator=np.mean, scatter=False, hue="city")
sns.plt.xlim(0,250)
sns.plt.ylim(0,)
plt.title("value trend (mean) - by city")

sns.lmplot(x="monthToNow", y="price", data=cleanDf, x_estimator=np.median, scatter=False, hue="city")
sns.plt.xlim(0,250)
sns.plt.ylim(0,)
plt.title("value trend (median) - by city")


Out[18]:
<matplotlib.text.Text at 0x138a51550>

In [19]:
sns.lmplot(x="monthToNow", y="price", data=cleanDf, x_estimator=np.median, scatter=False, hue="type")
sns.plt.xlim(0,250)
sns.plt.ylim(0,)
plt.title("value trend (median) - by type")


Out[19]:
<matplotlib.text.Text at 0x133bd5810>