Tutorial video: https://youtu.be/t4319ffzRg0?list=PLQVvvaa0QuDc-3szzjeP6N6b0aDrrKyL-
scikit-learn algorithm cheat-sheet: http://scikit-learn.org/stable/tutorial/machine_learning_map/index.html
In [1]:
# Standard library.
import pickle
from statistics import mean

# Third-party.
import quandl
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import style
from sklearn import svm, preprocessing

# sklearn.cross_validation was removed in scikit-learn 0.20; its contents
# (including train_test_split) moved to sklearn.model_selection. Alias the
# new module under the old name so the rest of this script works on both
# old and new scikit-learn versions.
try:
    from sklearn import cross_validation  # scikit-learn < 0.20
except ImportError:
    from sklearn import model_selection as cross_validation

style.use("ggplot")  # ggplot look for all plots produced below
In [2]:
api_key = open("quandlapikey.txt", "r").read();
def mortgage_30y_resampled():
    """Fetch the Freddie Mac 30-year mortgage-rate series from Quandl.

    The series is rebased to percent change from its first (1975-01)
    observation, the column is renamed to "M30", and the result is
    resampled to monthly means.
    """
    rates = quandl.get("FMAC/MORTG", trim_start="1975-01-01", authtoken=api_key)
    baseline = rates["Value"][0]
    rates["Value"] = (rates["Value"] - baseline) / baseline * 100.0
    rates.columns = ["M30"]
    return rates.resample("M").mean()
def state_list():
    """Scrape US state abbreviations from simple.wikipedia.org.

    Returns the first column of the first HTML table on the page,
    skipping the first row (the header).
    """
    tables = pd.read_html("https://simple.wikipedia.org/wiki/List_of_U.S._states")
    first_table = tables[0]
    return first_table[0][1:]
def grap_initial_state_data_start_pct():
    """Download the Freddie Mac HPI for every state and pickle the result.

    Each state's series is rebased to percent change from its first value
    so all states share a comparable scale; the columns are joined into a
    single DataFrame and written to ./data/fiddy_states.pickle.

    NOTE(review): the misspelled name ("grap") is kept intentionally —
    other cells may call it, so renaming would break them.
    """
    states = state_list()
    main_df = pd.DataFrame()
    for ab in states:
        query = "FMAC/HPI_" + ab
        df = quandl.get(query, authtoken=api_key)
        df.columns = [ab]
        # Percent change from the series' first observation.
        df[ab] = (df[ab] - df[ab][0]) / df[ab][0] * 100.0
        if main_df.empty:
            main_df = df
        else:
            main_df = main_df.join(df)
    # Context manager guarantees the file is closed even if dump() raises;
    # the original open/dump/close leaked the handle on exceptions.
    with open("./data/fiddy_states.pickle", "wb") as pickle_out:
        pickle.dump(main_df, pickle_out)
def HPI_Benchmark():
    """National (whole-US) house-price index from Quandl.

    Returned as a one-column DataFrame named "US", rebased to percent
    change from the first observation.
    """
    benchmark = quandl.get("FMAC/HPI_USA", authtoken=api_key)
    benchmark.columns = ["US"]
    start = benchmark["US"][0]
    benchmark["US"] = (benchmark["US"] - start) / start * 100.0
    return benchmark
In [3]:
def sp500_data():
    """S&P 500 adjusted close from Quandl, as a monthly Series.

    The series is rebased to percent change from its first value,
    resampled to monthly means, and returned as a Series named "sp500".
    """
    sp = quandl.get("YAHOO/INDEX_GSPC", trim_start="1975-01-01", authtoken=api_key)
    start = sp["Adjusted Close"][0]
    sp["Adjusted Close"] = (sp["Adjusted Close"] - start) / start * 100.0
    sp = sp.resample("M").mean()
    sp.rename(columns={"Adjusted Close": "sp500"}, inplace=True)
    return sp["sp500"]
In [4]:
# Smoke test: fetch the S&P 500 series and eyeball the first rows.
df = sp500_data();
print(df.head());
In [5]:
def gdp_data():
    """GDP series (Quandl BCB/4385) as a monthly Series named "GDP".

    Rebased to percent change from the first observation, then resampled
    to monthly means.
    """
    gdp = quandl.get("BCB/4385", trim_start="1975-01-01", authtoken=api_key)
    start = gdp["Value"][0]
    gdp["Value"] = (gdp["Value"] - start) / start * 100.0
    gdp = gdp.resample("M").mean()
    gdp.rename(columns={"Value": "GDP"}, inplace=True)
    return gdp["GDP"]
In [6]:
def us_unemployment():
    """US unemployment rate (Quandl ECPI/JOB_G) as a monthly DataFrame.

    The rate is rebased to percent change from the first observation.
    The series is resampled to daily means first and then to monthly
    means — both steps are kept from the original code.
    """
    unemp = quandl.get("ECPI/JOB_G", trim_start="1975-01-01", authtoken=api_key)
    start = unemp["Unemployment Rate"][0]
    unemp["Unemployment Rate"] = (unemp["Unemployment Rate"] - start) / start * 100.0
    daily = unemp.resample("1D").mean()
    return daily.resample("M").mean()
In [7]:
# Fetch every input series once; each call hits the Quandl API.
sp500 = sp500_data();
US_GDP = gdp_data();
# NOTE(review): "uneployment" typo kept — the name is reused when joining below.
US_uneployment = us_unemployment();
m30 = mortgage_30y_resampled();
# The state HPI frame was pickled earlier by grap_initial_state_data_start_pct().
HPI_data = pd.read_pickle("./data/fiddy_states.pickle");
HPI_bench = HPI_Benchmark();
In [8]:
# Combine the state HPI columns with the macro series on their shared
# monthly DatetimeIndex.
HPI = HPI_data.join([HPI_bench, m30, US_uneployment, US_GDP, sp500]);
print(HPI.head());
print(HPI.corr().head());
# we have nans!!
In [9]:
# Drop rows where any joined series is missing so corr() sees a complete matrix.
HPI.dropna(inplace = True);
print(HPI.head());
print(HPI.corr().head());
In [10]:
HPI.to_pickle("./data/HPI.pickle");
In [11]:
# Reload the cached frame and switch from index levels to
# month-over-month percent change.
housing_data = pd.read_pickle("./data/HPI.pickle");
housing_data = housing_data.pct_change();
print(housing_data.head());
In [12]:
# pct_change() yields inf where the previous value was 0 and NaN on the
# first row; turn infs into NaN, then drop every incomplete row.
housing_data.replace([np.inf, -np.inf], np.nan, inplace = True);
housing_data.dropna(inplace = True);
print(housing_data.head());
In [13]:
# Prediction target: next month's US HPI change (shift(-1) looks one row ahead).
housing_data["US_HPI_future"] = housing_data["US"].shift(-1);
print(housing_data[["US_HPI_future", "US"]].head());
In [14]:
def create_labels(cur_hpi, fut_hpi):
    """Return 1 when the future HPI is above the current HPI, else 0."""
    return 1 if fut_hpi > cur_hpi else 0
# Binary label per row: will the US HPI rise next month?
housing_data["label"] = list(map(create_labels, housing_data["US"], housing_data["US_HPI_future"])); # wow
#pd.Series.map may be useful also
print(housing_data[["US_HPI_future", "US", "label"]].head());
In [15]:
def moving_average(values):
    """Arithmetic mean of *values*; demo callback for rolling().apply()."""
    avg = mean(values)
    return avg
# Demo of rolling().apply() with a custom function; the first 9 rows are
# NaN until the 10-sample window fills.
housing_data["ma_apply_example"] = housing_data["M30"].rolling(window = 10).apply(moving_average);
print(housing_data[["M30", "ma_apply_example"]]);
In [16]:
# Feature matrix: every column except the target, the label, and the
# moving-average demo column. The bare positional axis argument to drop()
# was removed in pandas 2.0 — use the axis= keyword.
X = np.array(housing_data.drop(["US_HPI_future", "label", "ma_apply_example"], axis=1))
print(X)
# Zero-mean / unit-variance scaling; SVMs are sensitive to feature scale.
X = preprocessing.scale(X)
print(X)
In [17]:
# Label vector, row-aligned with the feature matrix X above.
y = np.array(housing_data["label"]);
print(y);
In [18]:
# sklearn.cross_validation was removed in scikit-learn 0.20;
# train_test_split now lives in sklearn.model_selection.
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation, train a linear-kernel SVM on
# the rest, and report mean accuracy on the held-out set.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
clf = svm.SVC(kernel="linear")
clf.fit(X_train, y_train)
print(clf.score(X_test, y_test))
In [ ]: