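This is a regression analysis of the UCI Computer Hardware dataset: the goal is to predict a machine's published relative performance (PRP) from its hardware characteristics.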
In [2]:
# Numerical / tabular libraries used throughout the notebook.
import numpy as np
import pandas as pd
# NOTE(review): there is no explicit `import pylab` in view, so this magic is
# presumably what brings the `pylab` name used below (and in later cells) into
# scope, in addition to rendering figures inline -- confirm before replacing it.
%pylab inline
# Apply the 'ggplot' style sheet to the figures drawn via pylab.
pylab.style.use('ggplot')
import seaborn as sns
In [3]:
# Read the raw data file straight from the UCI repository.
# header=None: the first line is read as data, not as column names.
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/cpu-performance/machine.data'
data = pd.read_csv(filepath_or_buffer=url, header=None)
In [20]:
# Preview the first five rows of the frame.
data.head(n=5)
Out[20]:
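Attribute Information:

- VENDOR: vendor name
- MODEL: model name
- MYCT: machine cycle time in nanoseconds
- MMIN: minimum main memory in kilobytes
- MMAX: maximum main memory in kilobytes
- CACH: cache memory in kilobytes
- CHMIN: minimum channels in units
- CHMAX: maximum channels in units
- PRP: published relative performance (the regression target)
- ERP: estimated relative performance from the original article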
In [5]:
# Label the ten columns of the raw frame, in file order.
data.columns = [
    'VENDOR', 'MODEL',
    'MYCT', 'MMIN', 'MMAX', 'CACH', 'CHMIN', 'CHMAX',
    'PRP', 'ERP',
]
In [6]:
# ERP is only an estimate, so it is removed from the frame.
data = data.drop(columns='ERP')
In [7]:
# Horizontal bar chart of how many rows each vendor contributes.
data['VENDOR'].value_counts().plot.barh()
Out[7]:
In [8]:
# The MODEL column is removed from the frame as well.
data = data.drop(columns='MODEL')
In [9]:
# Histogram of every remaining column except VENDOR, each on its own figure.
feature_names = data.columns.drop('VENDOR')
for fname in feature_names:
    # Open a fresh figure so successive histograms do not share axes.
    pylab.figure()
    _ = data[fname].plot.hist(title=fname)
In [10]:
# Scatter plot with fitted regression line of PRP against each candidate
# predictor, one panel per predictor, stacked vertically.
n_columns = data.columns.drop(['VENDOR', 'PRP'])

# Derive the grid size from the predictor list instead of hard-coding 6 rows,
# so the cell keeps working if columns are added or dropped upstream.
# 3.5 inches per panel reproduces the original (10, 21) size for 6 predictors.
# squeeze=False makes subplots() return a 2-D array even for a single panel;
# ravel() flattens it back to the 1-D sequence of axes used below.
_, axes = pylab.subplots(
    len(n_columns),
    figsize=(10, 3.5 * len(n_columns)),
    squeeze=False,
)
axes = axes.ravel()

for fname, ax in zip(n_columns, axes):
    sns.regplot(x=fname, y='PRP', data=data, ax=ax)

# Adjust spacing once, after all panels have been drawn.
pylab.tight_layout()
In [11]:
# Correlation of each predictor column with PRP, shown as horizontal bars.
corrs = data[n_columns].corrwith(data['PRP'])
corrs.plot.barh()
Out[11]:
In [12]:
# Pairwise correlations among the predictor columns, drawn as a heatmap
# with the coefficient written in every cell.
f_corrs = data[n_columns].corr()
sns.heatmap(f_corrs, annot=True)
Out[12]:
In [13]:
import statsmodels.formula.api as sm
In [14]:
# Fit an ordinary-least-squares model of PRP on four predictors using every
# row of the frame, then display the fit summary as the cell output.
model = sm.ols('PRP ~ MMAX + MMIN + CACH + CHMAX', data=data)
result = model.fit()
result.summary()
Out[14]:
In [15]:
from sklearn.model_selection import KFold
from sklearn.metrics import r2_score
In [26]:
# K-fold cross-validation of the OLS model: refit on each training split and
# record the out-of-sample R^2 on the held-out split.
n_splits = 3

# Fixed seed: with shuffle=True and no random_state the fold assignment -- and
# therefore every score below -- changes on each run of the cell.
cv_seed = 42
fold = KFold(n_splits=n_splits, shuffle=True, random_state=cv_seed)

fold_scores = []
for train_idx, test_idx in fold.split(data):
    # KFold.split yields positional row numbers, so rows are selected with
    # .iloc; .loc is label-based and would only be correct while the frame
    # still has its default 0..n-1 index.
    train_rows = data.iloc[train_idx]
    test_rows = data.iloc[test_idx]

    # NOTE: `model` and `result` are rebound on every iteration, so after this
    # cell they refer to the last fold's fit.
    model = sm.ols(formula='PRP ~ MMAX + MMIN + CACH + CHMAX', data=train_rows)
    result = model.fit()

    # Predict from the held-out rows with the target column removed.
    test_features = test_rows.drop('PRP', axis=1)
    predictions = result.predict(test_features)
    actual = test_rows['PRP']

    score = r2_score(actual, predictions)
    fold_scores.append(score)

# One R^2 value per fold, indexed 0..n_splits-1.
scores = pd.Series(fold_scores)
In [27]:
# Bar chart of the per-fold R^2 scores.
scores.plot.bar()
Out[27]: