In [1]:
# IPython magic for inline figures. No explicit `import pylab` appears in this
# cell, so the `pylab` name used on the next line is presumably injected by
# this magic -- keep it first.
%pylab inline
# Apply the 'ggplot' plotting style.
pylab.style.use('ggplot')
import pandas as pd
import numpy as np
import seaborn as sns
In [2]:
# Location of the raw abalone data; it is read with header=None, so columns
# come back numbered rather than named.
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/abalone/abalone.data'
data_df = pd.read_csv(filepath_or_buffer=url, header=None)
In [3]:
# Peek at the first five rows of the freshly loaded frame.
data_df.head(n=5)
Out[3]:
For each attribute, the name, data type, measurement unit, and a brief description are given below. The number of rings is the value to predict, either as a continuous value (regression) or as a class label (classification).
| Name | Data Type | Meas. | Description |
| ---- | --------- | ----- | ----------- |
| Sex | nominal | | M, F, and I (infant) |
| Length | continuous | mm | Longest shell measurement |
| Diameter | continuous | mm | perpendicular to length |
| Height | continuous | mm | with meat in shell |
| Whole weight | continuous | grams | whole abalone |
| Shucked weight | continuous | grams | weight of meat |
| Viscera weight | continuous | grams | gut weight (after bleeding) |
| Shell weight | continuous | grams | after being dried |
| Rings | integer | | +1.5 gives the age in years |
In [4]:
# Human-readable names for the nine columns, in file order.
column_names = [
    'Sex',
    'Length', 'Diameter', 'Height',
    'Whole_Weight', 'Shucked_Weight', 'Viscera_Weight', 'Shell_Weight',
    'Rings',
]
data_df.columns = column_names
In [5]:
# One histogram of 'Rings' per value of 'Sex', laid out as columns.
g = sns.FacetGrid(data=data_df, col='Sex').map(pylab.hist, 'Rings')
In [6]:
# Every column except the categorical 'Sex' and the target 'Rings'.
features = data_df.columns.drop(['Sex', 'Rings'])
fig, axes = pylab.subplots(2, 4, figsize=(16, 10))
flat_axes = axes.ravel()

# One scatter plot with a fitted regression line per feature, against 'Rings'.
for ax, fname in zip(flat_axes, features):
    sns.regplot(data=data_df, x=fname, y='Rings', ax=ax)

# The 2x4 grid can hold more panels than there are features; hide any panel
# that was not drawn on so no empty axes are shown.
for ax in flat_axes[len(features):]:
    ax.set_visible(False)
In [7]:
# Correlation of each feature column with 'Rings', drawn as horizontal bars.
f_corrs = data_df[features].corrwith(data_df['Rings'])
f_corrs.plot.barh()
Out[7]:
In [8]:
# Pairwise correlations among the feature columns, annotated with their values.
f_corrs = data_df[features].corr()
sns.heatmap(data=f_corrs, annot=True)
Out[8]:
In [9]:
import statsmodels.formula.api as sm

# Ordinary least squares with 'Shell_Weight' as the only regressor for 'Rings'.
model = sm.ols('Rings ~ Shell_Weight', data_df)
result = model.fit()
result.summary()
Out[9]:
In [10]:
# Build a formula with 'Rings' on the left and every entry of `features`,
# joined by ' + ', on the right.
all_features = ' + '.join(features)
formula = '{} ~ {}'.format('Rings', all_features)
print(formula)

# Fit ordinary least squares on that formula and show the summary table.
model = sm.ols(formula, data_df)
result = model.fit()
result.summary()
Out[10]:
In [18]:
from sklearn.preprocessing import LabelBinarizer

# 'Sex' carries exactly one label per row, so LabelBinarizer is the matching
# encoder. MultiLabelBinarizer expects a collection of labels per row; handing
# it plain strings makes it iterate each string character by character, which
# only produces the right answer while every label is a single character.
sorted_labels = sorted(pd.unique(data_df.Sex))
encoder = LabelBinarizer()
encoded = encoder.fit_transform(data_df.Sex)

# Name the indicator columns from the encoder's own class order so that names
# and columns cannot drift apart.
# NOTE(review): with only two distinct labels LabelBinarizer emits a single
# column, and the DataFrame construction below would then fail loudly.
encoded_sex = pd.DataFrame(
    index=data_df.index,
    data=encoded,
    columns=['sex_{}'.format(l) for l in encoder.classes_],
)

# Swap the categorical column for its indicator columns, aligned on the index.
encoded_df = data_df.drop('Sex', axis=1).join(encoded_sex)
In [19]:
# Peek at the first five rows of the encoded frame.
encoded_df.head(n=5)
Out[19]:
In [28]:
from sklearn.svm import SVR
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
In [30]:
# Split the encoded frame into the target ('Rings') and everything else.
target = encoded_df['Rings']
features = encoded_df.drop(labels='Rings', axis='columns')

# Scale the inputs, then fit an RBF-kernel support vector regressor.
prep = StandardScaler()
model = SVR(kernel='rbf', C=1000, gamma=0.001)
estimator = make_pipeline(prep, model)

# 10-fold cross-validated R^2, one bar per fold.
scores = pd.Series(
    cross_val_score(estimator=estimator, X=features, y=target, cv=10, scoring='r2')
)
scores.plot.bar()
Out[30]:
In [ ]: