In [48]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import validation_curve
In [2]:
# Load the cleaned feature matrix (X) and target frame (y) produced by the
# upstream data-cleaning step.
# NOTE(review): relative path — assumes the notebook runs from its own folder.
path = '../Clean Data'
X_fn = 'X.csv'
y_fn = 'y.csv'
X_path = os.path.join(path, X_fn)
y_path = os.path.join(path, y_fn)
X = pd.read_csv(X_path)
y = pd.read_csv(y_path)
In [3]:
# Sanity-check the first rows of the loaded feature matrix.
X.head()
Out[3]:
In [4]:
# Sanity-check the last rows of the feature matrix.
X.tail()
Out[4]:
In [5]:
# Inspect the last 10 target rows (reveals the stray 'Unnamed: 0' index
# column and any trailing NaNs, both handled in the next cell).
y.tail(n=10)
Out[5]:
In [6]:
# Drop the CSV's leftover index column and zero-fill missing targets.
# Chained, non-inplace form: the cell is idempotent on re-run.
y = y.drop('Unnamed: 0', axis=1).fillna(0)
In [7]:
# Feature columns used by all models below.
X_cols = ['nameplate_capacity', 'GROSS LOAD (MW)', 'ERCOT Load, MW',
          'Total Wind Installed, MW', 'Total Wind Output, MW', 'Net Load Change (MW)',
          'NG Price ($/mcf)', 'All coal', 'Lignite', 'Subbituminous']
# Chronological split restricted to cluster 0:
#   train: pre-2012, cv: 2012-2013, test: post-2013.
# Fixed: the test mask previously used Year >= 2013, so 2013 appeared in BOTH
# the CV and test sets — leakage between model selection and final evaluation.
X_train = X.loc[(X['Year'] < 2012) & (X['cluster_id_6'] == 0), X_cols]
y_train = y.loc[(X['Year'] < 2012) & (X['cluster_id_6'] == 0), :]
X_cv = X.loc[(X['Year'].isin([2012, 2013])) & (X['cluster_id_6'] == 0), X_cols]
y_cv = y.loc[(X['Year'].isin([2012, 2013])) & (X['cluster_id_6'] == 0), :]
X_test = X.loc[(X['Year'] > 2013) & (X['cluster_id_6'] == 0), X_cols]
y_test = y.loc[(X['Year'] > 2013) & (X['cluster_id_6'] == 0), 'Gen Change (MW)']
In [8]:
# Fixed: only a cell's last expression is displayed, so X_cv.shape was
# silently discarded; show both shapes together as a tuple.
X_cv.shape, y_cv.shape
Out[8]:
In [9]:
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR, LinearSVR
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingRegressor
In [10]:
# Baseline linear model.
# NOTE(review): `normalize=True` was deprecated in scikit-learn 0.24 and
# removed in 1.2 — on newer versions use a StandardScaler pipeline instead.
lr = LinearRegression(normalize=True)
In [11]:
# Fit on training years; X NaNs are zero-filled here (y was filled earlier).
lr.fit(X_train.fillna(0), y_train.loc[:,'Gen Change (MW)'])
Out[11]:
In [12]:
# Fixed: zero-fill NaNs in X_cv to match the preprocessing used at fit time
# (the model was trained on X_train.fillna(0); unfilled NaNs would raise).
lr.score(X_cv.fillna(0), y_cv.loc[:, 'Gen Change (MW)'])
Out[12]:
In [13]:
# Support vector regressor with default (RBF) kernel.
svm = SVR()
In [14]:
# Regularization grid for the SVR: 5 log-spaced C values from 1e-5 to 1e1.
parameters = dict(C=np.logspace(-5, 1, num=5))
In [15]:
# Grid search over C, using all cores with verbose progress output.
# NOTE(review): `lm` is a misleading name for an SVR grid search.
lm = GridSearchCV(svm, parameters, n_jobs=-1, verbose=3)
In [16]:
# Standardize the training features (NaNs zero-filled first).
# NOTE(review): any evaluation data must be transformed with a scaler fit on
# this training data — not re-fit on the evaluation set.
X_train_scale = StandardScaler().fit_transform(X_train.fillna(0))
In [17]:
# Run the C grid search on the standardized training features.
lm.fit(X_train_scale, y_train.loc[:,'Gen Change (MW)'])
Out[17]:
In [18]:
# Inspect per-candidate cross-validation results from the grid search.
lm.cv_results_
Out[18]:
In [19]:
# Score the SVR grid search on the CV years.
# Fixed: the scaler was previously re-fit on X_cv itself; the scaler must be
# fit on the training data and only *applied* to the CV data so both sets
# share the same transform.
cv_scaler = StandardScaler().fit(X_train.fillna(0))
lm.score(cv_scaler.transform(X_cv), y_cv.loc[:, 'Gen Change (MW)'])
Out[19]:
In [20]:
# Linear-kernel SVR (faster than kernelized SVR on larger data).
lsvr = LinearSVR()
In [21]:
# Fit the linear SVR on the standardized training features.
lsvr.fit(X_train_scale, y_train.loc[:,'Gen Change (MW)'])
Out[21]:
In [22]:
# Fixed: fit the scaler on the training data and apply it to the CV features
# (previously the scaler was re-fit on X_cv, so train and CV were scaled
# inconsistently).
X_cv_scaled = StandardScaler().fit(X_train.fillna(0)).transform(X_cv)
In [23]:
# R^2 of the linear SVR on the scaled CV features.
lsvr.score(X_cv_scaled, y_cv.loc[:,'Gen Change (MW)'])
Out[23]:
In [24]:
# Rebuild the training set over ALL clusters, adding cluster_id_6 as a feature.
# Fixed: `.fillna(..., inplace=True)` on a fresh `.loc` slice triggers pandas
# SettingWithCopyWarning; the chained non-inplace form is equivalent and safe.
X_train = X.loc[X['Year'] < 2012, X_cols + ['cluster_id_6']].fillna(0)
y_train = y.loc[X['Year'] < 2012, :].fillna(0)
In [25]:
# Verify the rebuilt training frame (now includes cluster_id_6).
X_train.head()
Out[25]:
In [26]:
# Gradient-boosted regression trees with default hyper-parameters.
gbr = GradientBoostingRegressor()
In [27]:
# Grid-search the boosted model over tree count and tree depth,
# using all cores with verbose progress output.
params = {
    'n_estimators': [100, 200, 400],
    'max_depth': [1, 3, 5],
}
gb_gs = GridSearchCV(gbr, params, n_jobs=-1, verbose=3)
In [28]:
# Run the gradient-boosting grid search on the all-cluster training set.
gb_gs.fit(X_train, y_train.loc[:,'Gen Change (MW)'])
Out[28]:
In [29]:
# Rebuild the CV set over ALL clusters to match the new training features.
# Fixed: `.fillna(..., inplace=True)` on a fresh `.loc` slice triggers pandas
# SettingWithCopyWarning; the chained non-inplace form is equivalent and safe.
X_cv = X.loc[X['Year'].isin([2012, 2013]), X_cols + ['cluster_id_6']].fillna(0)
y_cv = y.loc[X['Year'].isin([2012, 2013]), :].fillna(0)
In [30]:
# R^2 of the best grid-search model on the CV years.
gb_gs.score(X_cv, y_cv.loc[:,'Gen Change (MW)'])
Out[30]:
In [58]:
# Validation curve for the gradient-boosted model over n_estimators.
# Fixed three defects:
#  - param_name/param_range passed as keywords (keyword-only in newer sklearn);
#  - title/xlabel said "SVR"/"C" (copied from an SVR example) although this
#    curve is a GradientBoostingRegressor swept over n_estimators;
#  - the x-axis used range(0, 3) — semilogx of 0 is undefined and the ticks
#    did not correspond to the actual parameter values.
param_range = [100, 200, 400]
train_scores, valid_scores = validation_curve(
    GradientBoostingRegressor(), X_train, y_train.loc[:, "Gen Change (MW)"],
    param_name="n_estimators", param_range=param_range)
train_scores_mean = np.mean(train_scores, axis=1)
train_scores_std = np.std(train_scores, axis=1)
valid_scores_mean = np.mean(valid_scores, axis=1)
valid_scores_std = np.std(valid_scores, axis=1)
plt.title("Validation Curve - GradientBoostingRegressor")
plt.xlabel("n_estimators")
plt.ylabel("Score")
plt.ylim(0.0, 1)
lw = 2
plt.semilogx(param_range, train_scores_mean, label="Training score",
             color="darkorange", lw=lw)
plt.fill_between(param_range, train_scores_mean - train_scores_std,
                 train_scores_mean + train_scores_std, alpha=0.2,
                 color="darkorange", lw=lw)
plt.semilogx(param_range, valid_scores_mean, label="Cross-validation score",
             color="navy", lw=lw)
plt.fill_between(param_range, valid_scores_mean - valid_scores_std,
                 valid_scores_mean + valid_scores_std, alpha=0.2,
                 color="navy", lw=lw)
plt.legend(loc="best")
plt.show()
In [31]:
# Fixed: only a cell's last expression is displayed, so best_params_ was
# silently discarded; show the winning hyper-parameters and grid index together.
gb_gs.best_params_, gb_gs.best_index_
Out[31]:
In [32]:
# Refit a standalone model with chosen hyper-parameters.
# NOTE(review): presumably hand-copied from gb_gs.best_params_ — confirm they
# match, or use gb_gs.best_estimator_ directly to avoid drift.
gbr2 = GradientBoostingRegressor(max_depth= 5, n_estimators= 100)
In [33]:
# Fit the chosen model on the all-cluster training set.
gbr2.fit(X_train, y_train.loc[:,'Gen Change (MW)'])
Out[33]:
In [34]:
# R^2 of the refit model on the CV years.
gbr2.score(X_cv, y_cv.loc[:,'Gen Change (MW)'])
Out[34]:
In [35]:
# Spot-check the last CV feature rows.
X_cv.tail()
Out[35]:
In [36]:
# Predictions on the CV set, used for residual analysis below.
y_pr = gbr2.predict(X_cv)
In [37]:
# Fixed: `y_resids = y_cv` only aliased the frame, so the column assignments
# below silently mutated y_cv as well; work on an explicit copy instead.
y_resids = y_cv.copy()
y_resids.loc[:, 'y_pr'] = y_pr
In [38]:
# Verify the prediction column was attached.
y_resids.head()
Out[38]:
In [39]:
# Signed residual: prediction minus observed generation change.
y_resids['residuals'] = y_resids['y_pr'] - y_resids['Gen Change (MW)']
In [40]:
# Residual scatter per cluster: one panel (and hue) per cluster_id_6 value.
g = sns.FacetGrid(y_resids, hue='cluster_id_6', col='cluster_id_6',
col_wrap=3)
g.map(plt.scatter, 'Gen Change (MW)', 'residuals')
g.add_legend()
Out[40]:
In [41]:
# Overall residuals vs. actuals.
# Fixed: the axis labels were swapped relative to the plotted data —
# the x data is the residual (y_pr - y_va) and the y data is the actual y_va.
plt.scatter(y_pr-y_cv.loc[:,'Gen Change (MW)'], y_cv.loc[:,'Gen Change (MW)'])
plt.xlabel('y_pr - y_va')
plt.ylabel('y_va')
Out[41]:
In [42]:
# Spot-check predictions for the last few CV rows.
gbr2.predict(X_cv.tail())
Out[42]:
In [43]:
# Fixed: only a cell's last expression is displayed, so y_cv.head() was
# silently discarded; use display() (injected by IPython) to show both.
display(y_cv.head())
y_cv.tail()
Out[43]:
In [44]:
# Fixed: in Python 3 zip() is a lazy iterator and displays as '<zip object>';
# materialize it to actually show the (feature, importance) pairs.
list(zip(X_train.columns, gbr2.feature_importances_))
Out[44]:
In [46]:
# Raw importance array (same order as X_train.columns).
gbr2.feature_importances_
Out[46]:
In [ ]:
# Fixed: `gbr` itself is never fitted (GridSearchCV fits clones of it), so
# its feature_importances_ would raise NotFittedError; use the fitted gbr2.
# Also materialized zip for display (lazy iterator in Python 3).
list(zip(X_train.columns, gbr2.feature_importances_))
In [ ]:
# Fixed: GridSearchCV has no feature_importances_ attribute; the refit best
# estimator does. Also materialized zip for display (lazy iterator in Py3).
list(zip(X_train.columns, gb_gs.best_estimator_.feature_importances_))
In [ ]:
# Fixed: missing call parentheses — `gbr.get_params` displayed the bound
# method object instead of the hyper-parameter dict.
gbr.get_params()
In [ ]: