Implements two models for forecasting the 6-month change in the Zillow Home Value Index (ZHVI): a naive "diff with last month" baseline that extrapolates the latest monthly change over 6 months, and a rolling lagged OLS regressor (with optional rescaled features).
Surprisingly, the OLS model with rescaled (percent-change) features does worse than the diff model in Atlanta. This may be because we are overfitting, i.e. we should be using fewer or different lags (a lag-sweep sketch is included at the end of this notebook).
In [1]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
import statsmodels.formula.api as smf
import statsmodels.tsa.api as smt
import matplotlib.pyplot as plt
from matplotlib.pyplot import cm
%matplotlib inline
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
import sys, os, copy
print("Using environment in " + sys.prefix)
print("Python version " + sys.version)
In [2]:
fname = 'Metro_Zhvi_AllHomes.csv'
# Map Zillow RegionID to metro area string
metroRegionID = {'Dallas-Fort Worth, TX': 394514,
                 'Atlanta, GA': 394347,
                 'Phoenix, AZ': 394976,
                 'Las Vegas, NV': 394775}
metroString = {'Dallas': u'Dallas-Fort Worth, TX',
               'Atlanta': u'Atlanta, GA',
               'Phoenix': u'Phoenix, AZ',
               'Vegas': u'Las Vegas, NV'}
In [3]:
df = pd.read_csv('Data/ZHVI/'+fname,index_col=[1])
df = df.drop(['RegionID','SizeRank'],axis=1)
df.columns = pd.DatetimeIndex(df.columns)
In [4]:
def generateResults(myfunc, funcNameString, startDate=None, endDate=None):
    # Wrapper that takes a prediction function defined like the ones below.
    # It generates plots of the results for each of the metro areas and
    # returns a Series with the RMSE (in percentage points of housing price change).
    cityList = list(metroString.keys())
    cityList.sort()
    cityResults = pd.Series(name=funcNameString, dtype=float)
    fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(12, 8))
    for i in range(len(metroString)):
        # Map the loop index to a subplot position via its two-bit representation
        twobits = '{0:02b}'.format(i)
        row = int(twobits[-1])
        col = int(twobits[-2])
        city = cityList[i]
        # Generate results (the prediction functions already return percentages)
        ts = df.loc[metroString[city], startDate:endDate]
        myResults = myfunc(ts)
        myResults['error'] = myResults['predicted'] - myResults['actual']
        rmse = np.sqrt(myResults['error'].pow(2).sum() / float(myResults.shape[0]))
        cityResults[city] = rmse
        # Plot results
        myResults.plot(style=[None, None, 'r:'], ax=axes[row, col], title='%s RMSE= %.3f ' % (city, rmse))
        axes[row, col].set_ylabel('6-month percent change in ZHVI (%)')
    # Clean up plot
    plt.suptitle("Prediction using " + funcNameString, fontsize=16)
    return cityResults
In [5]:
# Simplest possible model: take the latest month-over-month change and assume that monthly rate
# continues for the next 6 months. For example, if ZHVI rose 0.5% from last month, the model
# predicts a 3% change over the next 6 months.
def lastMeasure(ts):
    ts = ts['2006-01':]
    results = pd.DataFrame(columns=['predicted', 'actual'])
    results['actual'] = ts.shift(-6) / ts - 1           # realized 6-month change
    results['predicted'] = (ts / ts.shift(1) - 1) * 6   # last month's change, extrapolated over 6 months
    results = results.dropna()
    return results * 100                                # convert to percent
In [6]:
cityResults = generateResults(lastMeasure,'Diff with Last Month')
pd.DataFrame(cityResults).T
Out[6]:
(table: RMSE per metro area for the diff-with-last-month model)
In [7]:
def laggedRegressor(ts):
    # Takes in a time series of raw housing price data.
    # Creates the target and all features needed for the regressor, then makes rolling
    # (expanding-window) predictions after the training cutoff.
    # Returns a DataFrame with a DatetimeIndex and columns ['predicted', 'actual'],
    # both expressed as 6-month percentage changes in housing value.
    # Note: nlags is set at module level in the loop below.
    trainBefore = '2006-01'  # Use all dates before this as training data. Start rolling predictions after this.
    results = pd.DataFrame(columns=['predicted', 'actual'])
    ## Toggle between these two lines to switch between the original and rescaled OLS features
    X = pd.concat([ts.shift(i) for i in range(nlags)], axis=1, keys=['L%s' % i for i in range(nlags)])  # Original data (dollars)
    # X = pd.concat([((ts/ts.shift(i)-1)*(6./i)) for i in range(1, nlags+1)], axis=1, keys=['L%s' % i for i in range(nlags)])  # Scaled data (% change)
    X['y'] = ts.shift(-6) / ts - 1  # Target: forward 6-month relative change (0 is neutral)
    X['y'] = X['y'] * 100           # Convert to percentage
    X = X.dropna()
    X = X.assign(trend=np.arange(len(X)))
    # Convert the training date into start and stop positions for our training set. Klugey, but it works (sort of).
    trainEndIndex = X.index.get_loc(trainBefore).stop
    trainingEnds = X.index[trainEndIndex - 1]
    testDate = X.index[trainEndIndex]
    testDateList = X.loc[testDate:, :].index  # predict every date after the initial training window
    for testDate in testDateList:
        trainSet = X.loc[:trainingEnds, :]
        testSet = X.loc[testDate:testDate, :]
        results.loc[testDate, 'actual'] = X.loc[testDate, 'y']
        eqn = 'y ~ trend + ' + ' + '.join(['L%s' % i for i in range(nlags)])
        mod_lagged = smf.ols(eqn, data=trainSet)
        res_lagged = mod_lagged.fit()
        results.loc[testDate, 'predicted'] = res_lagged.predict(testSet.drop('y', axis=1)).values[0]
        trainingEnds = testDate  # expand the training window to include the date just predicted
    return results
In [8]:
# Loop over a small set of lag counts to see how the results compare.
# The model seems very prone to overfitting, so comparing results across lag counts is helpful.
cityResults = pd.DataFrame()
for nlags in range(2, 4):
    modelString = 'Lagged OLS, nlags=%s' % nlags
    cityResults[modelString] = generateResults(laggedRegressor, modelString)
    fname = 'Data/Plots/Eric/LaggedOLS/LaggedOLS' + str(nlags) + '.pdf'
    plt.savefig(fname)
In [9]:
cityResults.T
Out[9]:
(table: RMSE per metro area for each lag count)
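To follow up on the overfitting concern above, one rough check is to sweep a wider range of lag counts with the same rolling setup and compare out-of-sample RMSE. The cell below is a minimal sketch reusing generateResults and laggedRegressor as defined above; the lag range of 1 through 6 is an arbitrary illustrative choice, not a tuned value.
In [ ]:
# Sketch: sweep the number of lags and compare average out-of-sample RMSE per metro.
# laggedRegressor reads nlags from module-level scope, just like the loop above.
lagSweep = pd.DataFrame()
for nlags in range(1, 7):
    modelString = 'Lagged OLS, nlags=%s' % nlags
    lagSweep[modelString] = generateResults(laggedRegressor, modelString)
    plt.close('all')  # suppress the per-sweep figures so only the summary table is shown
# Average the four metros' RMSE for each lag count; a lower mean suggests less overfitting
lagSweepT = lagSweep.T.astype(float)
lagSweepT['mean_rmse'] = lagSweepT.mean(axis=1)
lagSweepT.sort_values('mean_rmse')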
In [ ]: