This is an analysis of the Combined Cycle Power Plant dataset in the UCI archive.
In [1]:
import numpy as np
import pandas as pd
%pylab inline
pylab.style.use('ggplot')
import seaborn as sns
In [2]:
# Read the power-plant measurements from the CSV file next to this notebook.
ccpp_csv_path = 'ccpp.csv'
pp_data = pd.read_csv(ccpp_csv_path)
In [3]:
# Preview the first five rows to check the columns that were loaded.
pp_data.head(n=5)
Out[3]:
Features consist of hourly average ambient variables
- Temperature (AT) in the range 1.81°C to 37.11°C,
- Ambient Pressure (AP) in the range 992.89-1033.30 millibar,
- Relative Humidity (RH) in the range 25.56% to 100.16%
- Exhaust Vacuum (V) in the range 25.36-81.56 cm Hg
- Net hourly electrical energy output (PE) in the range 420.26-495.76 MW
The averages are taken from various sensors located around the plant that record the ambient variables every second. The variables are given without normalization.
In [4]:
# Draw one histogram per column, each on its own figure, to inspect the
# distribution of every variable in the dataset.
for c in pp_data.columns:
    _ = pylab.figure()
    # Title each figure with its column name; without it the histograms
    # cannot be told apart when skimming the output.
    pp_data.loc[:, c].plot(kind='hist', title=c)
In [5]:
# 'AT' is the quantity being explained in this notebook; every other column
# is kept as a candidate feature.
feature_data = pp_data.drop(labels='AT', axis=1)

# Correlation of each feature with 'AT', shown largest first.
corrs = feature_data.corrwith(pp_data['AT'])
corrs.sort_values(ascending=False).plot.barh()
Out[5]:
In [6]:
# Pairwise correlations between the features themselves, rendered as an
# annotated heatmap.
f_corrs = feature_data.corr(method='pearson')
sns.heatmap(data=f_corrs, annot=True)
Out[6]:
In [7]:
# One regression scatter plot of 'AT' against each feature, laid out in a
# single row. The number of panels is taken from the feature columns instead
# of being hard-coded, so the cell keeps working if a column is added/removed.
n_panels = len(feature_data.columns)
# squeeze=False always returns a 2-D array of axes (even for one panel);
# flatten it so `axes` stays a 1-D sequence.
fig, axes = pylab.subplots(1, n_panels, figsize=(16, 8), squeeze=False)
axes = axes.ravel()
for i, c in enumerate(feature_data.columns):
    sns.regplot(x=c, y='AT', data=pp_data, ax=axes[i])
In [8]:
from sklearn.feature_selection import f_regression
In [9]:
# Univariate F-test of every feature against 'AT'; f_regression returns the
# F-statistics and their p-values, one entry per feature column.
f_scores, f_probs = f_regression(X=feature_data, y=pp_data['AT'])

# Collect both arrays in one frame indexed by feature name and plot each
# column as its own horizontal bar chart.
f_imp_df = pd.DataFrame(
    data={'scores': f_scores, 'probs': f_probs},
    index=feature_data.columns
)
f_imp_df.plot.barh(subplots=True)
Out[9]:
In [10]:
import statsmodels.formula.api as sm
# Ordinary least squares through the formula interface: 'AT' is regressed on
# PE, RH and V only (no other column appears in the formula).
ols_formula = 'AT ~ PE + RH + V'
model = sm.ols(ols_formula, pp_data)
result = model.fit()

# Last expression of the cell: display the fitted model's summary table.
result.summary()
Out[10]:
In [11]:
from sklearn.model_selection import KFold
from sklearn.metrics import r2_score
In [20]:
# 10-fold cross-validation of the OLS model: fit on each training split and
# record the R^2 score on the corresponding held-out split.
n_splits = 10
# The folds are shuffled, so fix the seed; otherwise every run produces
# different splits and therefore different scores.
fold = KFold(n_splits=n_splits, shuffle=True, random_state=0)

fold_scores = []
for train_idx, test_idx in fold.split(pp_data):
    # KFold.split yields positional row indices, so select with .iloc.
    # Label-based .loc only gives the same rows when the frame happens to
    # have a default 0..n-1 index.
    train_data = pp_data.iloc[train_idx]
    test_data = pp_data.iloc[test_idx]

    model = sm.ols(formula='AT ~ PE + RH + V', data=train_data)
    result = model.fit()

    # Predict from the held-out rows with the 'AT' column removed.
    test_features = test_data.drop('AT', axis=1)
    predictions = result.predict(test_features)
    actual = test_data['AT']

    score = r2_score(actual, predictions)
    fold_scores.append(score)

# One R^2 value per fold, as a Series so it can be plotted directly.
scores = pd.Series(fold_scores)
In [22]:
# Bar chart of the per-fold R^2 scores (one bar per fold).
scores.plot.bar()
Out[22]: