# Tutorial source: https://youtu.be/uLqmM6ExPvo?list=PLQVvvaa0QuDc-3szzjeP6N6b0aDrrKyL-
In [31]:
import quandl;
import pandas as pd;
import pickle;
import matplotlib.pyplot as plt;
from matplotlib import style;
style.use("ggplot");
import numpy as np;
from statistics import mean;
In [32]:
# Read the Quandl API key from disk. A context manager guarantees the
# file handle is closed (the original leaked an open file object).
with open("quandlapikey.txt", "r") as key_file:
    api_key = key_file.read()
def mortgage_30y_resampled():
    """Fetch the 30-year mortgage-rate series (FMAC/MORTG) from 1975 on,
    express it as percent change from the first observation, rename the
    column to "M30", and return monthly means.
    """
    rates = quandl.get("FMAC/MORTG", trim_start="1975-01-01", authtoken=api_key)
    baseline = rates["Value"][0]
    rates["Value"] = (rates["Value"] - baseline) / baseline * 100.0
    rates.columns = ["M30"]
    return rates.resample("M").mean()
def state_list():
    """Scrape US state abbreviations from simple Wikipedia.

    The first table on the page holds the abbreviations in its first
    column; row 0 is the header row, so it is skipped.
    """
    tables = pd.read_html("https://simple.wikipedia.org/wiki/List_of_U.S._states")
    abbreviations = tables[0][0]
    return abbreviations[1:]
def grap_initial_state_data_start_pct():
    """Download the Freddie Mac HPI series for every state, convert each
    to percent change from its first observation, join them into a single
    frame, and pickle the result to ./data/fiddy_states.pickle.

    NOTE(review): name keeps the original spelling ("grap") so existing
    callers are unaffected.
    """
    states = state_list()
    main_df = pd.DataFrame()
    for abbrev in states:
        query = "FMAC/HPI_" + abbrev
        df = quandl.get(query, authtoken=api_key)
        df.columns = [abbrev]
        # Normalize to percent change relative to the first observation.
        df[abbrev] = (df[abbrev] - df[abbrev][0]) / df[abbrev][0] * 100.0
        if main_df.empty:
            main_df = df
        else:
            main_df = main_df.join(df)
    # Context manager closes the file even if pickling raises
    # (the original relied on an explicit close() call).
    with open("./data/fiddy_states.pickle", "wb") as pickle_out:
        pickle.dump(main_df, pickle_out)
def HPI_Benchmark():
    """Return the national (FMAC/HPI_USA) house-price index as percent
    change from its first value, in a one-column frame named "US".
    """
    us = quandl.get("FMAC/HPI_USA", authtoken=api_key)
    us.columns = ["US"]
    first = us["US"][0]
    us["US"] = (us["US"] - first) / first * 100.0
    return us
In [33]:
def sp500_data():
    """Return the S&P 500 adjusted close from 1975 on as a monthly-mean
    Series named "sp500", expressed as percent change from the first
    observation.
    """
    raw = quandl.get("YAHOO/INDEX_GSPC", trim_start="1975-01-01", authtoken=api_key)
    start = raw["Adjusted Close"][0]
    raw["Adjusted Close"] = (raw["Adjusted Close"] - start) / start * 100.0
    monthly = raw.resample("M").mean()
    monthly.rename(columns={"Adjusted Close": "sp500"}, inplace=True)
    return monthly["sp500"]
In [34]:
# Quick sanity check of the S&P 500 feed.
df = sp500_data()
print(df.head())
In [35]:
def gdp_data():
    """Return the BCB/4385 GDP series from 1975 on as a monthly-mean
    Series named "GDP", expressed as percent change from the first
    observation.

    NOTE(review): BCB/4385 is a Banco Central do Brasil dataset — confirm
    it is the intended GDP proxy for this analysis.
    """
    gdp = quandl.get("BCB/4385", trim_start="1975-01-01", authtoken=api_key)
    base = gdp["Value"][0]
    gdp["Value"] = (gdp["Value"] - base) / base * 100.0
    gdp = gdp.resample("M").mean()
    gdp.rename(columns={"Value": "GDP"}, inplace=True)
    return gdp["GDP"]
In [36]:
def us_unemployment():
    """Return the ECPI/JOB_G unemployment-rate series from 1975 on as
    percent change from the first value, resampled to monthly means.

    The intermediate daily resample upsamples the sparse series before
    the monthly mean is taken (kept exactly as in the original).
    """
    unemp = quandl.get("ECPI/JOB_G", trim_start="1975-01-01", authtoken=api_key)
    base = unemp["Unemployment Rate"][0]
    unemp["Unemployment Rate"] = (unemp["Unemployment Rate"] - base) / base * 100.0
    daily = unemp.resample("1D").mean()
    return daily.resample("M").mean()
In [37]:
# Pull every feature series plus the pickled per-state HPI data.
sp500 = sp500_data()
US_GDP = gdp_data()
US_uneployment = us_unemployment()  # sic: misspelled name kept — later cells use it
m30 = mortgage_30y_resampled()
HPI_data = pd.read_pickle("./data/fiddy_states.pickle")
HPI_bench = HPI_Benchmark()
In [38]:
# Join the benchmark and macro series onto the per-state HPI frame.
# Differing start dates/frequencies introduce NaNs in the result.
HPI = HPI_data.join([HPI_bench, m30, US_uneployment, US_GDP, sp500])
print(HPI.head())
print(HPI.corr().head())
In [39]:
# Drop the rows left incomplete by the join above.
HPI.dropna(inplace=True)
print(HPI.head())
print(HPI.corr().head())
In [40]:
# Persist the cleaned, joined frame for the later cells.
HPI.to_pickle("./data/HPI.pickle")
In [41]:
# Reload and convert every column to period-over-period percent change.
housing_data = pd.read_pickle("./data/HPI.pickle")
housing_data = housing_data.pct_change()
print(housing_data.head())
In [42]:
# pct_change can emit +/-inf (division by zero); convert those to NaN
# and drop them along with the leading NaN row.
housing_data.replace([np.inf, -np.inf], np.nan, inplace=True)
housing_data.dropna(inplace=True)
print(housing_data.head())
In [43]:
# Next period's national HPI becomes the prediction target.
housing_data["US_HPI_future"] = housing_data["US"].shift(-1)
print(housing_data[["US_HPI_future", "US"]].head())
In [44]:
def create_labels(cur_hpi, fut_hpi):
    """Binary classification label.

    Returns 1 when the future HPI strictly exceeds the current one,
    otherwise 0 (ties count as no rise).
    """
    return 1 if fut_hpi > cur_hpi else 0
# Apply the labeler pairwise across the current/future HPI columns.
housing_data["label"] = list(map(create_labels, housing_data["US"], housing_data["US_HPI_future"]))
# pd.Series.map may be useful also
print(housing_data[["US_HPI_future", "US", "label"]].head())
In [45]:
def moving_average(values):
    """Arithmetic mean of *values* (used as a rolling-window reducer)."""
    return mean(values)
# Demo of rolling().apply: a 10-period moving average of the M30 column.
housing_data["ma_apply_example"] = housing_data["M30"].rolling(window=10).apply(moving_average)
print(housing_data[["M30", "ma_apply_example"]])
In [ ]: