Analysis of the concrete slump test dataset from UCI: seven concrete-mix ingredients (cement, slag, fly ash, water, superplasticizer, coarse aggregate, fine aggregate) are used to predict slump, flow and 28-day compressive strength.
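The notebook assumes a local concrete_slump.csv. As a sketch only (the URL and file layout below are assumptions about the UCI repository, not part of this notebook), the raw data file could be fetched and saved under that name:
# Sketch, not verified here: download the raw UCI file and save it as concrete_slump.csv.
import pandas as pd
raw_url = ('https://archive.ics.uci.edu/ml/machine-learning-databases/'
           'concrete/slump/slump_test.data')
pd.read_csv(raw_url).to_csv('concrete_slump.csv', index=False)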
In [1]:
# %pylab makes numpy and matplotlib's pylab interface available in the notebook namespace.
%pylab inline
pylab.style.use('ggplot')

import numpy as np
import pandas as pd
import seaborn as sns
In [2]:
data = pd.read_csv('concrete_slump.csv')
In [3]:
data = data.drop('No', axis=1)  # drop the running sample-number column
In [4]:
data.head()
Out[4]:
In [5]:
# One histogram per column to get a feel for the distributions.
_, axes = pylab.subplots(len(data.columns), 1, figsize=(5, 20))
for i, fname in enumerate(data.columns):
    data.loc[:, fname].plot(kind='hist', title=fname, ax=axes[i])
pylab.tight_layout()
In [6]:
# The last three columns are the targets: slump, flow and 28-day compressive strength.
target_df = data.loc[:, data.columns[-3:]]
In [7]:
target_df.head()
Out[7]:
In [8]:
# Everything that is not a target is a feature.
# Note that Index.difference returns the remaining columns in sorted order.
feature_df = data.loc[:, data.columns.difference(target_df.columns)]
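If the original column order should be preserved (it only affects plot ordering here), an order-preserving selection is a small alternative sketch:
# Alternative sketch: keep the feature columns in their original order.
feature_cols = [c for c in data.columns if c not in target_df.columns]
feature_df = data.loc[:, feature_cols]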
In [9]:
feature_df.head()
Out[9]:
In [10]:
# Pearson correlation of every feature with each of the three targets.
corrs = target_df.apply(lambda t: feature_df.corrwith(t))
In [11]:
corrs
Out[11]:
In [12]:
corrs.plot(kind='bar', subplots=True, rot=30)
Out[12]:
In [13]:
# Pairwise correlations between the features themselves, as a quick multicollinearity check.
f_corrs = feature_df.corr()
sns.heatmap(f_corrs, annot=True)
Out[13]:
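Since the correlation matrix is symmetric, the upper triangle is redundant; a small optional tweak to the heatmap call:
# Optional sketch: mask the redundant upper triangle of the symmetric matrix.
mask = np.triu(np.ones_like(f_corrs, dtype=bool))
sns.heatmap(f_corrs, annot=True, mask=mask)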
In [14]:
# Scatter plot with a fitted regression line for every feature/target pair.
_, axes = pylab.subplots(len(feature_df.columns), len(target_df.columns), figsize=(20, 30))
for i, fname in enumerate(feature_df.columns):
    for j, tname in enumerate(target_df.columns):
        sns.regplot(x=fname, y=tname, data=data, ax=axes[i][j])
In [15]:
from sklearn.svm import SVR
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
In [16]:
# RBF-kernel support vector regression on standardised features,
# evaluated with 5-fold cross-validation separately for each target.
model = SVR(kernel='rbf', C=100, gamma=0.1)
preprocessor = StandardScaler()
estimator = make_pipeline(preprocessor, model)
scores = target_df.apply(
    lambda t: pd.Series(data=cross_val_score(estimator=estimator, X=feature_df, y=t, cv=5),
                        name=t.name))
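cross_val_score with no scoring argument falls back to the estimator's own score method, which for regressors is R²; making the metric explicit (or swapping in an error metric) is a one-line change, for example:
# Explicit scoring; 'neg_mean_absolute_error' would report errors in the target's own units.
cross_val_score(estimator=estimator, X=feature_df, y=target_df.iloc[:, 0], cv=5, scoring='r2')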
In [17]:
scores
Out[17]:
In [18]:
scores.plot(kind='bar', subplots=True)
Out[18]:
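The C=100 and gamma=0.1 values above are fixed by hand. A sketch of how they could be tuned per target with a grid search (the parameter ranges are illustrative, not tuned values from this notebook):
from sklearn.model_selection import GridSearchCV

# The SVR step created by make_pipeline is named 'svr', hence the 'svr__' prefixes.
param_grid = {'svr__C': [1, 10, 100, 1000], 'svr__gamma': [0.01, 0.1, 1]}
search = GridSearchCV(estimator, param_grid=param_grid, cv=5)
search.fit(feature_df, target_df.iloc[:, 0])  # tune against the first target; repeat per target
print(search.best_params_, search.best_score_)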