In [1]:
import fiona
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random
import geopandas as gpd
from shapely.geometry import Point, Polygon
import zipfile
import requests
import os
import shutil
from sklearn.preprocessing import *
from lnks import scl_cols
#import tensorflow as tf
%matplotlib inline
import warnings #DANGER: I triggered a ton of warnings.
warnings.filterwarnings('ignore')
np.random.seed() #NOTE: seeded with no argument, so runs are not reproducible; pass an int for repeatability.
In [2]:
#Shift and shape vars
shiftmonths = 6
shapef = 'anc'
#Assign the split for holdout data.
holdout_date = 2015.5
#Get data
filestring = './data/'+shapef+'_out.csv'
df = pd.read_csv(filestring)
df = df.sort_values(['month', 'NAME'])# , 'ANC'])
df = df.reset_index(drop=True)
len(df.NAME.unique())
Out[2]:
Now we examine the columns and lag the data.
In [3]:
df.columns
Out[3]:
In [4]:
print(df.shape)
shiftnum = len(df.NAME.unique()) * shiftmonths
#With the frame sorted by month then NAME, shifting by (regions * months) rows
#moves values by whole months within each region.
df['y'] = df['countBBL'].shift(-shiftnum) #target: the count shiftmonths ahead
#Also generate some lagged data in the opposite direction.
df['countBBL_prev_month'] = df['countBBL'].shift(len(df.NAME.unique())) #previous month
df['countBBL_prev_cycle'] = df['countBBL'].shift(shiftnum) #shiftmonths earlier
df = df[shiftnum:-(shiftnum + len(df.NAME.unique()))] #trim rows made NaN by the shifts
df = df.dropna()
df.shape
Out[4]:
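The whole-frame shift above works only because the panel is balanced (every NAME appears in every month) and is sorted by month, then NAME. A groupby-based shift makes the same lag explicit per region; below is a minimal sketch on a made-up toy panel, not the notebook's data.
In [ ]:
#Toy balanced panel: two regions over four months, sorted like df (month, then NAME).
toy = pd.DataFrame({'month': [1, 1, 2, 2, 3, 3, 4, 4],
                    'NAME': ['A', 'B'] * 4,
                    'countBBL': [10, 20, 11, 21, 12, 22, 13, 23]})
#Shift within each region: -1 looks one month ahead (the notebook shifts by shiftmonths).
toy['y_next'] = toy.groupby('NAME')['countBBL'].shift(-1)
toy['prev_month'] = toy.groupby('NAME')['countBBL'].shift(1)
toy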
The next cell drops vestigial columns, expands the categorical NAME column into dummies, casts everything to float, and drops any remaining NA rows.
In [5]:
df = pd.get_dummies(df, columns=['NAME'])
df = df.drop(['Unnamed: 0'], axis= 1)
print(df.shape)
df = df.astype('float')
df = df.dropna()
print(df.shape)
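For reference, pd.get_dummies turns each unique NAME into its own 0/1 indicator column; here is a tiny made-up example.
In [ ]:
#Made-up example of the dummy expansion applied to NAME above.
pd.get_dummies(pd.DataFrame({'NAME': ['1A', '1B', '1A'], 'countBBL': [3, 5, 4]}),
               columns=['NAME'])
#-> columns: countBBL, NAME_1A, NAME_1B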
Here we start building our grid search inputs, beginning with the splits.
In [6]:
#Flexible adaptation of Dr. Braman's interactive gridsearch script
#implementation.
#TODO Clean up and streamline
import sklearn
from sklearn.neural_network import *
from sklearn.neighbors import *
from sklearn.svm import *
from sklearn.gaussian_process import *
from sklearn.gaussian_process.kernels import *
from sklearn.tree import *
from sklearn.ensemble import *
from sklearn.naive_bayes import *
from sklearn.discriminant_analysis import *
from sklearn.linear_model import *
from sklearn.model_selection import *
from sklearn.preprocessing import *
import random
#Frame up separate DataFrames for the scaled and unscaled pipelines.
scl_data = df.copy()
data = df.copy().reset_index(drop=True)
X = data.drop(['y'], axis=1)
y = data['y']
XH_train = data[data['month'] <= holdout_date-1]
yH_train = XH_train['y']
XH_train = XH_train.drop(['y'], axis=1)
XH_val = data[data['month'] >= holdout_date-1]
XH_val = XH_val[XH_val['month'] <= holdout_date]
yH_val = XH_val['y']
XH_val = XH_val.drop(['y'], axis=1)
XH_test = data[data['month'] >= holdout_date]
yH_test = XH_test['y']
XH_test = XH_test.drop(['y'], axis=1)
#MinMaxScaler expects a 2-D array, so reshape the 1-D target before fitting.
ytr = sklearn.preprocessing.MinMaxScaler((0, 1)).fit(y.values.reshape(-1, 1))
y = ytr.transform(y.values.reshape(-1, 1))
y = pd.DataFrame(y, columns=['y'])
scl_data = scl_data.reset_index(drop=True)
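Since the target was min-max scaled, predictions against the scaled y can be mapped back to raw counts with the fitted scaler's inverse_transform. A small sketch follows; preds_scaled is a hypothetical stand-in that just reuses the first few scaled targets.
In [ ]:
#Hypothetical stand-in for model output on the scaled target.
preds_scaled = y['y'].values[:5]
#MinMaxScaler works on 2-D arrays; invert and flatten back to raw countBBL units.
ytr.inverse_transform(preds_scaled.reshape(-1, 1)).ravel()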
In [7]:
y.y
Out[7]:
In [8]:
print(scl_data.month.max())
print(scl_data.shape)
scl_data = scl_data.dropna()
print(scl_data.shape)
sXH_train = scl_data[scl_data['month'] <= holdout_date-1]
syH_train = sXH_train['y']
sXH_train = sXH_train.drop(['y'], axis=1)
sXH_val = scl_data[scl_data['month'] >= holdout_date-1]
sXH_val = sXH_val[sXH_val['month'] <= holdout_date]
syH_val = sXH_val['y']
sXH_val = sXH_val.drop(['y'], axis=1)
sXH_test = scl_data[scl_data['month'] >= holdout_date]
syH_test = sXH_test['y']
sXH_test = sXH_test.drop(['y'], axis=1)
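Because the splits use inclusive month comparisons, the boundary months land in two splits; printing each split's month range makes the overlap easy to see.
In [ ]:
#Month coverage of each split; the inclusive comparisons share the boundary months.
for label, frame in [('train', sXH_train), ('val', sXH_val), ('test', sXH_test)]:
    print(label, frame['month'].min(), '->', frame['month'].max(), len(frame), 'rows')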
In [9]:
#Scale the scl_data frame and each split, writing the results back -----------
scale_data_splits = [scl_data, sXH_train, sXH_val, sXH_test,
                     syH_train, syH_val, syH_test]
scaled_splits = []
for split in scale_data_splits:
    #The y splits are 1-D Series; the scalers need 2-D input.
    values = split.values if isinstance(split, pd.DataFrame) else split.values.reshape(-1, 1)
    #Standardize, then rescale the standardized values to [0, 1].
    scaler = sklearn.preprocessing.StandardScaler().fit(values)
    scl = scaler.transform(values)
    minmaxer = sklearn.preprocessing.MinMaxScaler((0, 1)).fit(scl)
    scl = minmaxer.transform(scl)
    try:
        split = pd.DataFrame(scl, columns=split.columns)
    except AttributeError as e:
        print(e)
        split = pd.DataFrame(scl, columns=['y'])
    print(split.shape)
    split = split.dropna()
    print(split.shape)
    assert np.all(np.isfinite(split))
    assert not np.any(np.isnan(split))
    scaled_splits.append(split)
scl_data, sXH_train, sXH_val, sXH_test, syH_train, syH_val, syH_test = scaled_splits
#----------------------------------------------------------
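The loop above fits a separate scaler to each split. A common alternative, sketched below with hypothetical alt_* names, is to fit the scaler on the training features only and reuse it on the validation and test features, so their statistics never enter the fit. This is only a pattern for comparison, not what the notebook does.
In [ ]:
#Alternative pattern: fit on the training features only, reuse on val/test.
alt_scaler = sklearn.preprocessing.MinMaxScaler((0, 1)).fit(XH_train)
alt_val = pd.DataFrame(alt_scaler.transform(XH_val), columns=XH_val.columns)
alt_test = pd.DataFrame(alt_scaler.transform(XH_test), columns=XH_test.columns)
print(alt_val.shape, alt_test.shape)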
Let's make sure our data came out of the scalers intact:
In [10]:
y;
In [11]:
print(sXH_train.shape)
print(syH_train.shape)
print(sXH_test.shape)
print(syH_test.shape)
In [12]:
scl_data.columns
Out[12]:
In [13]:
sX = scl_data.drop(['y'], axis=1)
sy = scl_data['y']
assert np.all(np.isfinite(X))
assert np.all(np.isfinite(y))
assert not np.any(np.isnan(X))
assert not np.any(np.isnan(y))
assert np.all(np.isfinite(sX))
assert np.all(np.isfinite(sy))
assert not np.any(np.isnan(sX))
assert not np.any(np.isnan(sy))
In [14]:
scl_data.columns
Out[14]:
In [15]:
scl_data.describe()
Out[15]:
This cell contains a crude RNG, a list of the regressors that benefit from scaled data, and the hardcoded names, estimators, and param_grid entries used by the grid search below.
In [16]:
#Make a short list of random states to insert into random_state params.
scrambler = []
for scram in range(0, 10):
    scrambler.append(random.randint(0, 10000))
print(scrambler)
to_scale = ['SVR']
names = ['AdaBoostRegressor',
'RandomForestRegressor',
'SVR',
#'KNeighborsRegressor',
#'BaggingRegressor',
'GradientBoostingRegressor',
#'LinearRegression',
#'MLPRegressor',
#'SGDRegressor',
'LassoLars'
]
regressors = [AdaBoostRegressor(),
RandomForestRegressor(),
SVR(),
#KNeighborsRegressor(),
#BaggingRegressor(),
GradientBoostingRegressor(),
#LinearRegression(),
#MLPRegressor(),
#SGDRegressor(),
LassoLars()
]
param_grids =[
['AdaBoostRegressor', dict(
n_estimators=[80, 60, 30],
learning_rate=[1, .5, .01],
loss=['linear', 'square', 'exponential'],
#random_state=scrambler[3:5]
)],
['RandomForestRegressor', dict(
max_depth=[5, 10, 15],
criterion=['mse', 'mae'],
#random_state=scrambler[:2]
)],
['SVR', dict( #Most params for SVR are turned off right now, too expensive
C=[1, .9],
epsilon=[.1, .05],
#kernel=['poly']
)],
['GradientBoostingRegressor', dict(
max_depth=[3, 6, 9, 12],
min_samples_split=[2, 4, 8],
presort=[False]
)],
['LassoLars', dict(
alpha=[0.1, 1, .5, .75],
#random_state=[random.randint(0, 10000)]
)],
]
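Exhaustive grids get expensive quickly (the SVR grid above is trimmed for exactly that reason). RandomizedSearchCV keeps the same fit/score interface while sampling a fixed number of candidates; below is a minimal sketch with an arbitrary budget of six candidates.
In [ ]:
#Cheaper alternative to an exhaustive grid: sample candidates instead of
#trying every combination; the budget here (n_iter=6) is arbitrary.
rand_cv = sklearn.model_selection.RandomizedSearchCV(
    SVR(), param_distributions=dict(C=[1, .9, .5], epsilon=[.1, .05, .01]),
    n_iter=6, cv=3, n_jobs=12, verbose=True)
#rand_cv.fit(sXH_train, syH_train) #same fit/score interface as GridSearchCV in the next cell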
Here we implement an iterator that runs GridSearchCV for each model and reports the validation score of the best parameter set (R2, the default scorer for regressors). The best_params_ are then extracted, used to refit the model on the full training set, and that refit model is scored on the holdout data.
Testing indicates that for some models, the refit on the full training set regularly, if modestly, outperforms the cross-validated score.
In [17]:
outcomes = []
for name, rgsr in zip(names, regressors):
    for item in param_grids:
        if item[0] == name:
            print(name + ':')
            params = item[1]
            cv = sklearn.model_selection.GridSearchCV(rgsr, param_grid=params,
                                                      verbose=True, n_jobs=12,
                                                      cv=3, pre_dispatch="2*n_jobs")
            if name not in to_scale:
                fitted = cv.fit(XH_train, yH_train)
                score = cv.score(XH_val, yH_val)
                print(score)
                best = rgsr.set_params(**cv.best_params_)
                bestfit = best.fit(XH_train, yH_train)
                bestscore = best.score(XH_test, yH_test)
            if name in to_scale:
                fitted = cv.fit(sXH_train, syH_train)
                score = cv.score(sXH_val, syH_val)
                print(score)
                best = rgsr.set_params(**cv.best_params_)
                bestfit = best.fit(sXH_train, syH_train)
                bestscore = best.score(sXH_test, syH_test)
            print(name + " R2 with best model, score:")
            print(bestscore)
            #Keep bestscore last so outcomes[i][-1] is the holdout R2 used below.
            outcomes.append((name, score, cv.cv_results_, cv.best_estimator_,
                             cv.best_params_, [yH_test, ], bestscore))
for nm in range(0, len(outcomes)):
    print()
    print(outcomes[nm][0])
    print(outcomes[nm][1])
    print()
    print('Best on real:')
    print(outcomes[nm][-1])
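GridSearchCV keeps its full cross-validation table in cv_results_, stored at index 2 of each outcomes tuple; loading it into a DataFrame makes the per-candidate scores easy to compare. Here is a quick look at the last model searched.
In [ ]:
#Per-candidate CV results for the last model searched, best rank first.
cv_table = pd.DataFrame(outcomes[-1][2])
cv_table[['params', 'mean_test_score', 'std_test_score', 'rank_test_score']
         ].sort_values('rank_test_score').head()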
In [18]:
#Adapted from https://pythonspot.com/en/matplotlib-bar-chart/
objects = [j[0] for j in outcomes]
y_pos = np.arange(len(objects))
performance = [j[-1] for j in outcomes]
#Clip negative R2 scores to zero so the bar chart stays readable.
for jm in range(len(performance)):
    if performance[jm] < 0:
        performance[jm] = 0
performance
Out[18]:
In [19]:
plt.barh(y_pos, performance, align='center', alpha=0.5)
plt.yticks(y_pos, objects)
plt.xlabel('R2 Score')
ti = "Scoring across models for "+shapef+", lagging by "+str(shiftmonths)+ " months."
plt.title(ti)
fl = './plots/' + shapef + "_shift" + str(shiftmonths)
plt.savefig(fl)
In [ ]:
In [20]:
for jm in range(0, 5):
    print(outcomes[jm][0])
    print(outcomes[jm][1])
    print(outcomes[jm][4])
In [21]:
best = AdaBoostRegressor(learning_rate=1, loss='square', n_estimators=60)
bestfit= best.fit(XH_train, yH_train)
bestscore = best.score(XH_test, yH_test)
print(outcomes[0][0])
print(bestscore)
best = RandomForestRegressor(max_depth=10)
bestfit= best.fit(XH_train, yH_train)
bestscore = best.score(XH_test, yH_test)
print(outcomes[1][0])
print(bestscore)
best = SVR(C=1, epsilon=0.1) #SVR has no max_depth; use the grid's C/epsilon instead
bestfit = best.fit(sXH_train, syH_train) #SVR is in to_scale, so use the scaled splits
bestscore = best.score(sXH_test, syH_test)
print(outcomes[2][0])
print(bestscore)
best = GradientBoostingRegressor(max_depth=10)
bestfit= best.fit(XH_train, yH_train)
bestscore = best.score(XH_test, yH_test)
print(outcomes[3][0])
print(bestscore)
In [ ]:
#NOTE: these final cells assume year-split frames (dat15, dat16) prepared elsewhere.
X16 = dat16.drop(['y'], axis=1)
y16 = dat16['y']
X15 = dat15.drop(['y'], axis=1)
y15 = dat15['y']
fitted = outcomes[-2][3].fit(X15, y15)
predicted = fitted.predict(X16)
In [ ]:
pred = pd.DataFrame(predicted, columns=['predicted'])
dat16 = dat16.reset_index()
pred['y'] = dat16['y']
In [ ]:
def flagger_ranges(pred):
    #Flag predictions that land within +/-15%, +/-5%, and +/-10% of the actual value.
    pred['flag15'] = 0
    pred.loc[pred['predicted'].between(pred['y']*0.85, pred['y']*1.15), 'flag15'] = 1
    pred['flag05'] = 0
    pred.loc[pred['predicted'].between(pred['y']*0.95, pred['y']*1.05), 'flag05'] = 1
    pred['flag10'] = 0
    pred.loc[pred['predicted'].between(pred['y']*0.90, pred['y']*1.10), 'flag10'] = 1
    pred['flag_others'] = 0
    pred.loc[pred['flag05'] == 0, 'flag_others'] = 1
    return pred
pred = flagger_ranges(pred)
pred
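With the flags in place, the share of predictions landing inside each tolerance band is just the column means.
In [ ]:
#Fraction of predictions inside each tolerance band (and outside the 5% band).
pred[['flag05', 'flag10', 'flag15', 'flag_others']].mean()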