Initial Regressors and Predictors

Implements two models:

  • A simple model in which we project forwards based on the difference with last month (surprisingly effective)
  • A lagged OLS model with 6 lag periods, which was tested in two ways:
    • Raw data ($ZHVI), without any rescaling
    • Rescaled data (percent change between months), which rescales the data to be all relative to the current month.

Surprisingly, the OLS model with the rescaled model does worse than the diff model in Atlanta. This may be because we are overfitting our model, i.e. we should be using fewer/different lags.


In [1]:
import pandas as pd
import numpy  as np
import statsmodels.api as sm
import matplotlib.pyplot as plt
from matplotlib.pyplot import cm 
% matplotlib inline

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

import statsmodels.formula.api as smf
import statsmodels.tsa.api as smt
import statsmodels.api as sm

import sys, os, copy
print("Using environment in "+sys.prefix)
print("Python version "+sys.version)


/Users/emunsing/Documents/Coding/github/cdips_hpi_forecast/env/lib/python3.5/site-packages/statsmodels/compat/pandas.py:56: FutureWarning: The pandas.core.datetools module is deprecated and will be removed in a future version. Please use the pandas.tseries module instead.
  from pandas.core import datetools
Using environment in /Users/emunsing/Documents/Coding/github/cdips_hpi_forecast/env
Python version 3.5.2 (default, Oct 31 2016, 16:50:28) 
[GCC 4.2.1 Compatible Apple LLVM 7.3.0 (clang-703.0.31)]

In [2]:
fname = 'Metro_Zhvi_AllHomes.csv'

#  Map Zillow RegionID to metro area string
metroRegionID = {'Dallas-Fort Worth, TX':394514,
                     'Atlanta, GA':394347,
                     'Phoenix, AZ':394976,
                     'Las Vegas, NV':394775 }

metroString = {'Dallas':u'Dallas-Fort Worth, TX',
               'Atlanta':u'Atlanta, GA',
               'Phoenix':u'Phoenix, AZ',
               'Vegas':u'Las Vegas, NV'}

In [3]:
df = pd.read_csv('Data/ZHVI/'+fname,index_col=[1])
df = df.drop(['RegionID','SizeRank'],axis=1)
df.columns = pd.DatetimeIndex(df.columns)

In [4]:
def generateResults(myfunc,funcNameString, startDate=None, endDate=None):
    # Wrapper that will take a prediction function which is defined similar to the above.
    #  This function then generate plots of the results for each of the metro areas,
    #  Returns a Series with the RMSE (in percentage of housing price change)
    
    cityList = list(metroString.keys())
    cityList.sort()
    cityResults = pd.Series(name=funcNameString)
    fig, axes = plt.subplots(nrows=2, ncols=2,figsize=(12,8))

    for i in range( len(metroString) ):
        # Prep index data for plotting
        twobits = '{0:02b}'.format(i)
        row = int(twobits[-1])
        col = int(twobits[-2])
        city = cityList[i]

        # Generate results
        ts = df.loc[metroString[city], startDate:endDate ]
        myResults = myfunc(ts)
        myResults = myResults # Scale to be percentages
        myResults['error'] = myResults['predicted'] - myResults['actual']
        rmse = np.sqrt(myResults['error'].pow(2).sum()/float(myResults.shape[0]))
        cityResults[city] = rmse

        # Plot results
        myResults.plot(style=[None,None,'r:'],ax=axes[row,col], title='%s RMSE= %.3f '%(city,rmse))
        axes[row,col].set_ylabel('6-month percent change in ZHVI(%)')

    # Clean up plot
    plt.suptitle("Prediction using "+funcNameString,fontsize=16)
    
    return cityResults

Actual Models


In [5]:
# Simplest possible model: Just look at the current slope with respect to last month's measurement, and assume that this will continue till

def lastMeasure(ts):
    ts = ts['2006-01':]
    results = pd.DataFrame(columns = ['predicted','actual'])
    results['actual'] = ts.shift(-6)/ts - 1
    results['predicted']= (ts/ts.shift(1) - 1) * 6
    results = results.dropna()
    
    return results * 100

In [6]:
cityResults = generateResults(lastMeasure,'Diff with Last Month')
pd.DataFrame(cityResults).T


Out[6]:
Atlanta Dallas Phoenix Vegas
Diff with Last Month 2.077602 2.3345 3.257202 3.474052

In [7]:
def laggedRegressor(ts):  
    # Takes in a time series of raw housing price data
    # Creates the target and all variables needed for the regressor.
    # Returns a dataframe with DateTimeIndex and columns ['predicted','actual'] where these are relative housing values
    
#     nlags = 3
    trainBefore = '2006-01'  # Use all dates before this as training data. Start rolling predictions after this.
    results = pd.DataFrame(columns = ['predicted','actual'])

    ## Toggle between these two lines to switch from original and rescaled OLS
    X = pd.concat([ts.shift(i) for i in range(nlags)], axis=1,keys = ['L%s'%i for i in range(nlags)])  # Original data (dollars)
#     X = pd.concat([ ((ts/ts.shift(i)-1)*(6. /i)) for i in range(1,nlags+1)], axis=1,keys = ['L%s'%i for i in range(nlags)]) # Scaled data (% change)

    X['y'] = ts.shift(-6) / ts - 1  # De-mean (i.e. want everything to be +/- percentages, with 0 being neutral)
    X['y'] = X['y'] * 100  # Convert to percentage
    X = X.dropna()
    X = X.assign(trend=np.arange(len(X)))
    X.head()
    
    # Convert that training date into a start and top time for our training set. Klugey, but it works (sort of)
    trainEndIndex = X.index.get_loc(trainBefore).stop
    trainingEnds  = X.index[trainEndIndex-1]
    testDate = X.index[trainEndIndex]
    testDateList = X.loc[trainingEnds:,:].index

    for testDate in testDateList:

        trainSet = X.loc[:trainingEnds,:]
        testSet = X.loc[testDate:testDate,:]

        results.loc[testDate,'actual']  = X.loc[testDate,'y']

        eqn = 'y ~ trend + ' + ' + '.join(['L%s'%i for i in range(nlags)])
        mod_lagged = smf.ols(eqn, data=trainSet)
        res_lagged = mod_lagged.fit()
        results.loc[testDate,'predicted'] = res_lagged.predict(testSet.drop('y',axis=1)).values[0]

        trainingEnds = testDate
    
    return results

In [8]:
# Loop through a potential set of lags to see what the results are like: 
#  The model seems very prone to overfitting, so having a good set of results is helpful

cityResults = pd.DataFrame()

for nlags in range(2,4):
    modelString = 'Lagged OLS, nlags=%s'%nlags
    cityResults[modelString] = generateResults(laggedRegressor,modelString)
    fname = 'Data/Plots/Eric/LaggedOLS/LaggedOLS'+str(nlags)+'.pdf'
    plt.savefig(fname)



In [9]:
cityResults.T


Out[9]:
Atlanta Dallas Phoenix Vegas
Lagged OLS, nlags=2 1.949307 2.716096 3.738499 3.872352
Lagged OLS, nlags=3 1.964010 2.730967 3.682500 4.163534

In [ ]: