In [1]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import plotly.plotly as py
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly.graph_objs as go
init_notebook_mode(connected=True)
%matplotlib inline
In [77]:
# Locate and load the simulation-time spreadsheet.
# The folder can be overridden through the SIM_TIME_DATA_DIR environment
# variable so the notebook also runs on machines other than the original
# author's; when the variable is unset the original location is used.
data_folder = os.environ.get(
    'SIM_TIME_DATA_DIR',
    r'C:\Users\ocni\PycharmProjects\delphin_6_automation\data_process\simtime_prediction\data',
)
excel_file = os.path.join(data_folder, 'sim_time.xlsx')
data = pd.read_excel(excel_file)

# Display (rows, columns) of the loaded table as the cell output.
data.shape
Out[77]:
In [89]:
# Histogram of the simulation times below the 1500 * 60 cut-off, divided by 60
# (the axis label says minutes, so 'time' is presumably stored in seconds).
plt.figure(figsize=(16, 8), dpi=80, facecolor='w', edgecolor='k')
# `kind` has to be given by keyword: recent pandas releases reject positional
# arguments to Series.plot() with a TypeError.
(data['time'][data['time'] < 1500 * 60] / 60).plot(kind='hist', bins=50, color='#003399')
plt.xlabel('Simulation Time in minutes')
# Uncomment to export the figure:
#plt.savefig('simulation_time_histogram.pdf')
In [90]:
# Summary statistics of the same filtered, /60-scaled simulation times that the
# histogram cell uses.
data.loc[data['time'] < 1500 * 60, 'time'].div(60).describe()
Out[90]:
In [85]:
# Empirical cumulative distribution of the filtered simulation times.
hist, edges = np.histogram((data['time'][data['time'] < 1500 * 60] / 60), density=True, bins=50)
# With an integer `bins` all bins share one width, so the first one suffices.
dx = edges[1] - edges[0]
# density * width = probability mass per bin; the running sum is the CDF.
cdf = np.cumsum(hist) * dx
plt.figure(figsize=(16, 8), dpi=80, facecolor='w', edgecolor='k')
plt.xlabel('Simulation Time in minutes')
plt.ylabel('Cumulative probability')
# cdf[i] includes all of bin i, i.e. it is the probability of a value up to the
# bin's RIGHT edge. Plotting against the left edges (edges[:-1]) would shift
# the whole curve one bin to the left.
plt.plot(edges[1:], cdf)
Out[85]:
In [23]:
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import PolynomialFeatures
from sklearn.neighbors import KNeighborsRegressor
In [24]:
# Target: the raw 'time' column.
y_data = data['time']

# Features: every column except the target. An explicit copy is taken so the
# assignments below modify an independent frame instead of a slice of `data`
# (which triggers pandas' SettingWithCopyWarning).
x_data = data.loc[:, data.columns != 'time'].copy()

# The exterior climate column is replaced by a constant 1.0 for every row.
x_data['exterior_climate'] = 1.0

# Missing values are encoded as 0.0 before the categorical columns are recoded.
x_data = x_data.fillna(0.0)

# Recode the known categorical labels as numbers; any other value in these
# columns is left untouched.
for column, label, code in (
    ('interior_climate', 'a', 0.0),
    ('interior_climate', 'b', 1.0),
    ('system_name', 'ClimateBoard', 1.0),
):
    x_data.loc[x_data[column] == label, column] = code

x_data.head()
Out[24]:
In [25]:
# Show the column labels of the feature frame `x_data`.
x_data.columns
Out[25]:
In [102]:
# Parallel-coordinates overview of all features together with the simulation
# time. The figure is written to 'sim_time.html' through plotly's offline
# `plot`.
processed_data = x_data.assign(time=y_data/60)

plt_data = [
    go.Parcoords(
        # Lines are coloured by the (scaled) simulation time.
        line=dict(
            color=processed_data['time'],
            colorscale='Jet',
            showscale=True,
            cmin=0,
            cmax=1500,
        ),
        # One axis per row of the table below:
        # (axis range, axis label, source column, tick format or None).
        dimensions=[
            dict(
                range=axis_range,
                label=axis_label,
                values=processed_data[column],
                # Only axes that specify a tick format get the key at all.
                **({} if tick is None else {'tickformat': tick}),
            )
            for axis_range, axis_label, column, tick in (
                ([0, 1440], 'Time', 'time', 'r'),
                ([0, 5], 'Ext. Heat\nTransfer Coef. Slope',
                 'exterior_heat_transfer_coefficient_slope', None),
                ([4 * 10 ** -9, 10 ** -8], 'Ext. Moisture Transfer Coef.',
                 'exterior_moisture_transfer_coefficient', 'e'),
                ([0.4, 0.8], 'Solar Absorption', 'solar_absorption', '.1f'),
                ([0.0, 2.0], 'Rain Scale Factor', 'rain_scale_factor', None),
                ([0.0, 1.0], 'Int. Climate', 'interior_climate', None),
                ([4.0, 11.0], 'Int. Heat Transfer Coef.',
                 'interior_heat_transfer_coefficient', None),
                ([4 * 10 ** -9, 10 ** -8], 'Int. Moisture Transfer Coef.',
                 'interior_moisture_transfer_coefficient', 'e'),
                ([0.0, 0.6], 'Int. Sd Value', 'interior_sd_value', '.1f'),
                ([0.0, 360.0], 'Wall Orientation', 'wall_orientation', None),
                ([0.0, 1.0], 'Wall Core Width', 'wall_core_width', None),
                ([0.0, 1000], 'Wall Core Material', 'wall_core_material', 'r'),
                ([0.01, 0.02], 'Plaster Width', 'plaster_width', '.2f'),
                ([0.0, 1000], 'Plaster Material', 'plaster_material', 'r'),
                ([0.0, 1.0], 'Ext. Plaster', 'exterior_plaster', None),
                ([0.0, 1.0], 'System', 'system_name', None),
                ([0.0, 1000], 'Insulation Material', 'insulation_material', 'r'),
                ([0.0, 1000], 'Finish Material', 'finish_material', 'r'),
                ([0.0, 1000], 'Detail Material', 'detail_material', 'r'),
                ([0.0, 200], 'Insulation Thickness', 'insulation_thickness', None),
            )
        ],
    )
]

layout = go.Layout(plot_bgcolor='#E5E5E5', paper_bgcolor='#E5E5E5')
fig = go.Figure(data=plt_data, layout=layout)
plot(fig, filename='sim_time.html')
Out[102]:
In [27]:
# Partition features and target into train and test subsets; the fixed
# random_state makes the partition repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    x_data,
    y_data,
    random_state=0,
)
In [12]:
# Linear Model
# Ordinary least squares on the unscaled features.
# The former `normalize=True` argument is omitted: it was deprecated in
# scikit-learn 1.0 and removed in 1.2, where passing it raises a TypeError.
# For an unpenalised least-squares fit, rescaling the features does not change
# the predictions or the R-squared scores reported below.
linreg = linear_model.LinearRegression()
linreg.fit(X_train, y_train)
print('linear model intercept: {}'.format(linreg.intercept_))
print('linear model coeff:\n{}'.format(linreg.coef_))
print('R-squared score (training): {:.3f}'.format(linreg.score(X_train, y_train)))
print('R-squared score (test): {:.3f}'.format(linreg.score(X_test, y_test)))
print('Number of non-zero features: {}'.format(np.sum(linreg.coef_ != 0)))
In [13]:
# Ridge Model
# Ridge regression with alpha = 20 on the unscaled features.
linridge = linear_model.Ridge(alpha=20.0)
linridge.fit(X_train, y_train)

print(f'ridge regression linear model intercept: {linridge.intercept_}')
print(f'ridge regression linear model coeff:\n{linridge.coef_}')
print(f'R-squared score (training): {linridge.score(X_train, y_train):.3f}')
print(f'R-squared score (test): {linridge.score(X_test, y_test):.3f}')
# Count the coefficients that are not exactly zero.
print(f'Number of non-zero features: {(linridge.coef_ != 0).sum()}')
In [14]:
# Ridge Model Normalized
# Same ridge model (alpha = 20), but on min-max scaled features. The scaler is
# fitted on the training split only and then applied to both splits.
scaler = MinMaxScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

linridge_normal = linear_model.Ridge(alpha=20.0)
linridge_normal.fit(X_train_scaled, y_train)

print(f'ridge regression linear model intercept: {linridge_normal.intercept_}')
print(f'ridge regression linear model coeff:\n{linridge_normal.coef_}')
print(f'R-squared score (training): {linridge_normal.score(X_train_scaled, y_train):.3f}')
print(f'R-squared score (test): {linridge_normal.score(X_test_scaled, y_test):.3f}')
# Count the coefficients that are not exactly zero.
print(f'Number of non-zero features: {(linridge_normal.coef_ != 0).sum()}')
In [15]:
# K-nearest regression - 5 neighbors
# Min-max scaling is learned from the training split only and applied to both
# splits before the neighbour model is fitted.
scaler = MinMaxScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

knn_reg5_uni = KNeighborsRegressor(n_neighbors=5)
knn_reg5_uni.fit(X_train_scaled, y_train)

print(f'R-squared train score: {knn_reg5_uni.score(X_train_scaled, y_train):.5f}')
print(f'R-squared test score: {knn_reg5_uni.score(X_test_scaled, y_test):.5f}')
In [16]:
# K-nearest regression - 3 neighbors
# The model is stored under its own name (knn_reg3_uni). It was previously
# assigned to `knn_reg5_uni`, which silently replaced the 5-neighbour model
# of the preceding cell with a 3-neighbour one.
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)  # scaling learned on the training split only
X_test_scaled = scaler.transform(X_test)
knn_reg3_uni = KNeighborsRegressor(n_neighbors=3).fit(X_train_scaled, y_train)
print('R-squared train score: {:.5f}'.format(knn_reg3_uni.score(X_train_scaled, y_train)))
print('R-squared test score: {:.5f}'.format(knn_reg3_uni.score(X_test_scaled, y_test)))
In [17]:
# K-nearest regression - 5 neighbors, weights = distance
# n_neighbors is 5 as the heading and the variable name state; the cell used
# to pass 3, left over from the 3-neighbour cell it was copied from.
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)  # scaling learned on the training split only
X_test_scaled = scaler.transform(X_test)
knn_reg5 = KNeighborsRegressor(n_neighbors=5, weights='distance').fit(X_train_scaled, y_train)
print('R-squared train score: {:.5f}'.format(knn_reg5.score(X_train_scaled, y_train)))
print('R-squared test score: {:.5f}'.format(knn_reg5.score(X_test_scaled, y_test)))
In [28]:
# Manual shuffle-split validation of the distance-weighted 5-neighbour model:
# five random 75/25 partitions, scaler re-fitted on each training part.
from sklearn.model_selection import ShuffleSplit

ss = ShuffleSplit(n_splits=5, test_size=0.25, random_state=47)
scaler = MinMaxScaler()
test_scores = []
for train_index, test_index in ss.split(x_data):
    # Scaling statistics come from the training part of the fold only.
    x_train = scaler.fit_transform(x_data.iloc[train_index, :])
    x_test = scaler.transform(x_data.iloc[test_index, :])
    # Fold-local target names: reusing `y_train` / `y_test` here would
    # overwrite the targets produced by train_test_split, leaving them out of
    # sync with X_train / X_test for any cell that is (re-)run afterwards.
    y_fold_train = y_data.iloc[train_index]
    y_fold_test = y_data.iloc[test_index]
    knn_reg = KNeighborsRegressor(n_neighbors=5, weights='distance').fit(x_train, y_fold_train)
    test_scores.append(knn_reg.score(x_test, y_fold_test))

mean_score = np.mean(test_scores)
print(f'Average R-squared test score: {mean_score:.5f}')
In [71]:
# Cross Validation Score
# Baseline score of the distance-weighted 5-neighbour model on all features,
# evaluated with five shuffled 75/25 splits.
from sklearn.model_selection import ShuffleSplit, cross_val_score

ss = ShuffleSplit(n_splits=5, test_size=0.25, random_state=47)
scaler = MinMaxScaler()
knn_reg = KNeighborsRegressor(n_neighbors=5, weights='distance')

# NOTE(review): the scaler is fitted on the complete data set before the
# splits are made, so the held-out rows contribute to the scaling statistics.
validated_test_scores = cross_val_score(
    estimator=knn_reg,
    X=scaler.fit_transform(x_data),
    y=y_data,
    cv=ss,
)
print(f'Accuracy: {validated_test_scores.mean():.5f} (+/- {2 * validated_test_scores.std():.5f})')
In [65]:
# Feature Importance
# Leave-one-feature-out check: re-run the cross-validation with each column
# removed in turn. A column whose removal gives a mean R-squared at least as
# high as the all-features baseline is marked for deletion.
features = x_data.columns
col_del = []
feature_scores = []

for dropped in features:
    remaining = x_data.drop(columns=dropped)
    test_scores = cross_val_score(
        knn_reg, scaler.fit_transform(remaining), y_data, cv=ss, scoring='r2'
    )
    mean_without = test_scores.mean()
    feature_scores.append((dropped, mean_without))
    if mean_without >= validated_test_scores.mean():
        col_del.append(dropped)

# Lowest score first, i.e. the columns whose removal hurts the most lead.
feature_scores.sort(key=lambda pair: pair[1])

# Column width of the printed table.
width = len('exterior heat transfer coefficient slope')
print(f"{'Feature'.ljust(width)} Accuracy")
for name, score in feature_scores:
    print(f'{name.ljust(width)} - {score:.5f}')
In [66]:
# List the columns marked for deletion, one tab-indented name per line.
print('Columns to delete:\n')
for name in col_del:
    print(f'\t{name}')
In [72]:
# Re-run the cross-validation using only the columns that were not marked for
# deletion and report mean and spread of the R-squared scores.
clean_col = x_data.columns[~x_data.columns.isin(col_del)]
cleaned_data = x_data[clean_col]
clean_scores = cross_val_score(
    knn_reg, scaler.fit_transform(cleaned_data), y_data, cv=ss, scoring='r2'
)
print(f'Accuracy: {clean_scores.mean():.5f} (+/- {2 * clean_scores.std():.5f})')