In [59]:
import pandas as pd
import numpy as np
np.random.seed(12345)
%matplotlib inline
import seaborn as sns
import matplotlib.pyplot as plt
plt.style.use('ggplot')
This notebook is a regression analysis of the Elevators dataset from http://www.dcc.fc.up.pt/~ltorgo/Regression/DataSets.html.
In [60]:
# The .data file has no header row; read it raw and attach names afterwards
data_df = pd.read_csv('Elevators/elevators.data', header=None)
# Column names come from the .domain file, one "name: type" entry per line
columns = open('Elevators/elevators.domain').readlines()
columns = [x.split(':')[0] for x in columns]
data_df.columns = columns
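As a quick sanity check (a minimal sketch, assuming the .domain file lists exactly one entry per data column), confirm that the parsed names line up with the data:
In [ ]:
# The number of parsed names should match the number of columns read
assert len(columns) == data_df.shape[1]
data_df.shape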
In [61]:
data_df.head()
In [62]:
data_df.info()
In [63]:
data_df.describe().T
In [64]:
# Distribution of the regression target
ax = data_df.loc[:, 'Goal'].plot(kind='hist')
In [65]:
corrs = data_df.corr()
_, ax = plt.subplots(1, 1, figsize=(10, 10))
ax = sns.heatmap(corrs, ax=ax, linewidths=1.0)
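The heatmap gives the full picture, but for a single target a sorted view is easier to read; a minimal sketch over the corrs frame computed above:
In [ ]:
# Features ranked by linear correlation with the target
corrs.loc[:, 'Goal'].drop('Goal').sort_values(ascending=False)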
In [66]:
_ = sns.lmplot(x='Sa', y='Goal', data=data_df)
In [67]:
cols = ['SaTime1', 'SaTime2', 'SaTime3', 'SaTime4']
figure, ax = plt.subplots(1, 4, figsize=(12, 4))
for i, col in enumerate(cols):
    sns.regplot(x=col, y='Goal', data=data_df, ax=ax[i])
In [68]:
from sklearn.decomposition import PCA
pca = PCA(n_components=2, random_state=12345)
pca = pca.fit(data_df.drop('Goal', axis=1))
pca_s = pd.Series(pca.explained_variance_ratio_)
ax = pca_s.plot(kind='bar')
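Keeping two components is an assumption; to see how much variance they actually capture, a sketch fitting PCA with all components and plotting the cumulative explained-variance ratio:
In [ ]:
# Full PCA, just to inspect the cumulative variance curve
pca_full = PCA(random_state=12345).fit(data_df.drop('Goal', axis=1))
ax = pd.Series(np.cumsum(pca_full.explained_variance_ratio_)).plot()
ax.set_ylabel('cumulative explained variance')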
In [69]:
# Project the features onto the two principal components
pca_data = pca.transform(data_df.drop('Goal', axis=1))
reduced_df = pd.DataFrame(data=pca_data, columns=['col_1', 'col_2'])
reduced_df.loc[:, 'Goal'] = data_df.Goal
In [70]:
_, ax = plt.subplots(1, 2, figsize=(20, 10))
reduced_df.plot(x='col_1', y='Goal', kind='scatter', ax=ax[0])
reduced_df.plot(x='col_2', y='Goal', kind='scatter', ax=ax[1])
In [71]:
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_validate, cross_val_predict, KFold
model = Pipeline([
    ('regressor', LinearRegression())
])
# shuffle=True is required for random_state to affect the splits
cv = KFold(n_splits=10, shuffle=True, random_state=12345)
predictions = cross_val_predict(
    model,
    data_df.drop('Goal', axis=1).values,
    data_df.Goal.values,
    cv=cv
)
results = pd.DataFrame(np.column_stack([predictions, data_df.Goal.values]), columns=['predicted', 'actual'])
figure, ax = plt.subplots(1, 2, figsize=(12, 6))
_ = results.plot(ax=ax[0])
cv = KFold(n_splits=10, shuffle=True, random_state=12345)
scores = cross_validate(
    model,
    data_df.drop('Goal', axis=1).values,
    data_df.Goal.values,
    cv=cv,
    scoring='r2',
    return_train_score=True
)
scores = pd.DataFrame.from_dict(scores)
_ = scores.loc[:, ['train_score', 'test_score']].plot(ax=ax[1], kind='bar')
In [73]:
scores.mean()
In [74]:
# Residuals: actual minus predicted, out of fold
residuals = results.actual - results.predicted
ax = residuals.plot(kind='hist')
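R-squared alone hides the error scale; a short sketch computing RMSE and MAE from the out-of-fold predictions in results:
In [ ]:
from sklearn.metrics import mean_squared_error, mean_absolute_error
rmse = np.sqrt(mean_squared_error(results.actual, results.predicted))
mae = mean_absolute_error(results.actual, results.predicted)
rmse, mae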
In [75]:
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasRegressor
from sklearn.model_selection import cross_validate, KFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
In [85]:
def baseline_model(n_features=18, n=50, optimizer='adam'):
    # One hidden ReLU layer of n units, linear output for regression
    model = Sequential()
    model.add(Dense(n, input_dim=n_features, kernel_initializer='normal', activation='relu'))
    model.add(Dense(1, kernel_initializer='normal'))
    # Compile with MSE loss, the standard choice for regression
    model.compile(loss='mean_squared_error', optimizer=optimizer)
    return model
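To confirm the architecture before training, a quick sketch printing the layer and parameter summary (assuming the default of 18 input features matches the dataset):
In [ ]:
baseline_model().summary()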
In [115]:
# Wrap the Keras model so it plugs into scikit-learn utilities
model = KerasRegressor(build_fn=baseline_model, epochs=20, batch_size=20, verbose=1)
In [117]:
from sklearn.model_selection import cross_val_score, KFold
pipeline = Pipeline([
    ('standard', StandardScaler()),
    ('model', model)
])
# Fit once on the full data; cross_val_score below clones and refits per fold
pipeline = pipeline.fit(
    data_df.drop('Goal', axis=1).values,
    data_df.Goal.values
)
kfold = KFold(n_splits=10, shuffle=True, random_state=12345)
results = cross_val_score(
    pipeline,
    data_df.drop('Goal', axis=1).values,
    data_df.Goal.values,
    scoring='r2',
    cv=kfold)
In [118]:
# scoring='r2' returns R-squared directly (not negated), so plot the raw scores
ax = pd.Series(results).plot(kind='bar')
In [120]:
results.mean()
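For a side-by-side comparison under the same 10-fold setup, a sketch placing the two mean test R-squared values next to each other (scores still holds the linear-model results from above):
In [ ]:
pd.Series({
    'linear_regression': scores.test_score.mean(),
    'keras_mlp': results.mean()
})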