In [1]:
! wget -N http://archive.ics.uci.edu/ml/machine-learning-databases/abalone/abalone.data


--2018-05-25 05:54:57--  http://archive.ics.uci.edu/ml/machine-learning-databases/abalone/abalone.data
Resolving archive.ics.uci.edu (archive.ics.uci.edu)... failed: Temporary failure in name resolution.
wget: unable to resolve host address ‘archive.ics.uci.edu’

In [72]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
%matplotlib inline

In [3]:
# Load the UCI Abalone dataset (fetched above via wget). The file has no
# header row, so column names are supplied explicitly; 'Rings' is the
# target variable (age proxy).
data = pd.read_csv('abalone.data', names=['Sex', 'Length', 'Diameter', 'Height', 'Whole weight', 'Shucked weight', 'Viscera weight', 'Shell weight', 'Rings'])
data.head()


Out[3]:
Sex Length Diameter Height Whole weight Shucked weight Viscera weight Shell weight Rings
0 M 0.455 0.365 0.095 0.5140 0.2245 0.1010 0.150 15
1 M 0.350 0.265 0.090 0.2255 0.0995 0.0485 0.070 7
2 F 0.530 0.420 0.135 0.6770 0.2565 0.1415 0.210 9
3 M 0.440 0.365 0.125 0.5160 0.2155 0.1140 0.155 10
4 I 0.330 0.255 0.080 0.2050 0.0895 0.0395 0.055 7

Now let's convert categorical feature 'Sex' to numerical via one-hot encoding


In [5]:
# One-hot encode the only categorical column ('Sex': M/F/I) into
# Sex_F/Sex_I/Sex_M indicator columns; numeric columns pass through
# unchanged (get_dummies only transforms object/categorical dtypes).
data = pd.get_dummies(data)
data.head()


Out[5]:
Length Diameter Height Whole weight Shucked weight Viscera weight Shell weight Rings Sex_F Sex_I Sex_M
0 0.455 0.365 0.095 0.5140 0.2245 0.1010 0.150 15 0 0 1
1 0.350 0.265 0.090 0.2255 0.0995 0.0485 0.070 7 0 0 1
2 0.530 0.420 0.135 0.6770 0.2565 0.1415 0.210 9 1 0 0
3 0.440 0.365 0.125 0.5160 0.2155 0.1140 0.155 10 0 0 1
4 0.330 0.255 0.080 0.2050 0.0895 0.0395 0.055 7 0 1 0

Analysis


In [6]:
# Summary statistics for all numeric columns.
# NOTE(review): min Height is 0.000 — physically impossible; likely a few
# bad rows worth inspecting/filtering before modeling.
data.describe()


Out[6]:
Length Diameter Height Whole weight Shucked weight Viscera weight Shell weight Rings Sex_F Sex_I Sex_M
count 4177.000000 4177.000000 4177.000000 4177.000000 4177.000000 4177.000000 4177.000000 4177.000000 4177.000000 4177.000000 4177.000000
mean 0.523992 0.407881 0.139516 0.828742 0.359367 0.180594 0.238831 9.933684 0.312904 0.321283 0.365813
std 0.120093 0.099240 0.041827 0.490389 0.221963 0.109614 0.139203 3.224169 0.463731 0.467025 0.481715
min 0.075000 0.055000 0.000000 0.002000 0.001000 0.000500 0.001500 1.000000 0.000000 0.000000 0.000000
25% 0.450000 0.350000 0.115000 0.441500 0.186000 0.093500 0.130000 8.000000 0.000000 0.000000 0.000000
50% 0.545000 0.425000 0.140000 0.799500 0.336000 0.171000 0.234000 9.000000 0.000000 0.000000 0.000000
75% 0.615000 0.480000 0.165000 1.153000 0.502000 0.253000 0.329000 11.000000 1.000000 1.000000 1.000000
max 0.815000 0.650000 1.130000 2.825500 1.488000 0.760000 1.005000 29.000000 1.000000 1.000000 1.000000

In [77]:
# Pairwise Pearson correlations. The physical measurements are strongly
# inter-correlated (>= ~0.77 among themselves); 'Shell weight' shows the
# highest correlation with the target 'Rings' (~0.63).
data.corr()


Out[77]:
Length Diameter Height Whole weight Shucked weight Viscera weight Shell weight Rings Sex_F Sex_I Sex_M
Length 1.000000 0.986812 0.827554 0.925261 0.897914 0.903018 0.897706 0.556720 0.309666 -0.551465 0.236543
Diameter 0.986812 1.000000 0.833684 0.925452 0.893162 0.899724 0.905330 0.574660 0.318626 -0.564315 0.240376
Height 0.827554 0.833684 1.000000 0.819221 0.774972 0.798319 0.817338 0.557467 0.298421 -0.518552 0.215459
Whole weight 0.925261 0.925452 0.819221 1.000000 0.969405 0.966375 0.955355 0.540390 0.299741 -0.557592 0.252038
Shucked weight 0.897914 0.893162 0.774972 0.969405 1.000000 0.931961 0.882617 0.420884 0.263991 -0.521842 0.251793
Viscera weight 0.903018 0.899724 0.798319 0.966375 0.931961 1.000000 0.907656 0.503819 0.308444 -0.556081 0.242194
Shell weight 0.897706 0.905330 0.817338 0.955355 0.882617 0.907656 1.000000 0.627574 0.306319 -0.546953 0.235391
Rings 0.556720 0.574660 0.557467 0.540390 0.420884 0.503819 0.627574 1.000000 0.250279 -0.436063 0.181831
Sex_F 0.309666 0.318626 0.298421 0.299741 0.263991 0.308444 0.306319 0.250279 1.000000 -0.464298 -0.512528
Sex_I -0.551465 -0.564315 -0.518552 -0.557592 -0.521842 -0.556081 -0.546953 -0.436063 -0.464298 1.000000 -0.522541
Sex_M 0.236543 0.240376 0.215459 0.252038 0.251793 0.242194 0.235391 0.181831 -0.512528 -0.522541 1.000000

In [7]:
# Build the feature matrix (everything except the target 'Rings') and
# standardize it to zero mean / unit variance.
# FIX(review): StandardScaler was only imported in a later cell (In[71]),
# so this cell raised NameError on a fresh Restart & Run All. Import it
# here so the notebook runs top-to-bottom.
from sklearn.preprocessing import StandardScaler

X = data.drop(columns=['Rings'])
X = StandardScaler().fit_transform(X)
y = data['Rings']

In [8]:
# FIX(review): train_test_split was only imported in a later cell (In[71]),
# breaking a fresh top-to-bottom run. Import it locally.
from sklearn.model_selection import train_test_split

# Hold out 33% for testing; fixed random_state for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.33, random_state=17)

Classification. Here the integer Rings value is treated as a class label, which makes this a multi-class problem with roughly 28 distinct classes — so accuracy scores around 0.25 are not surprising.


In [71]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier

In [79]:
def score(model, X_tr=None, y_tr=None, X_te=None, y_te=None):
    """Fit `model` and print its score on the train and test splits.

    Parameters
    ----------
    model : estimator following the sklearn API (``fit(X, y)`` and
        ``score(X, y)``).
    X_tr, y_tr, X_te, y_te : optional explicit data splits. When omitted,
        the notebook-global ``X_train``/``y_train``/``X_test``/``y_test``
        are used (the original behavior), but passing them explicitly
        avoids hidden-state dependence on the kernel namespace.

    Returns
    -------
    (train_score, test_score) : tuple of floats. The original version
        returned None; returning the scores is backward-compatible since
        all existing callers ignore the return value.

    Note: ``score`` is accuracy for classifiers but R^2 for regressors,
    so values are not comparable across the two model families.
    """
    if X_tr is None:
        # Fall back to the splits created earlier in the notebook.
        X_tr, y_tr, X_te, y_te = X_train, y_train, X_test, y_test
    model.fit(X_tr, y_tr)
    train_score = model.score(X_tr, y_tr)
    test_score = model.score(X_te, y_te)
    print('Train score: {}'.format(train_score))
    print('Test score: {}'.format(test_score))
    return train_score, test_score

K-Neighbors


In [80]:
# k-NN classifier with k=29 neighbors (ad-hoc choice; no CV shown for k).
score(KNeighborsClassifier(29))


Train score: 0.3273766976411723
Test score: 0.24945612762871647

SVM + linear kernel


In [81]:
# Linear-kernel support vector classifier.
score(SVC(kernel='linear'))


Train score: 0.27662616154395997
Test score: 0.25598259608411894

Decision tree


In [82]:
# Shallow decision tree (depth capped at 4 to limit overfitting).
score(DecisionTreeClassifier(max_depth=4))


Train score: 0.2948534667619728
Test score: 0.2574329224075417

Random forest


In [83]:
# Small random forest: 10 shallow trees, 2 candidate features per split.
score(RandomForestClassifier(max_depth=4, n_estimators=10, max_features=2))


Train score: 0.3012866333095068
Test score: 0.25670775924583034

Multi-layer perceptron


In [84]:
# Multi-layer perceptron with strong L2 regularization (alpha=1);
# all other hyperparameters left at sklearn defaults.
score(MLPClassifier(alpha=1))


Train score: 0.29592566118656183
Test score: 0.2625090645395214

AdaBoost


In [85]:
# AdaBoost with default base estimator and settings.
score(AdaBoostClassifier())


Train score: 0.21300929235167976
Test score: 0.2189992748368383

Regression. Rings is really a numeric (ordinal) target, so regression is the more natural framing. Note that for regressors `.score()` reports R², which is not comparable with the classifier accuracies above.


In [89]:
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR

Linear regression


In [86]:
# Ordinary least squares baseline.
# NOTE: for regressors .score() is R^2, not accuracy — not directly
# comparable with the classifier scores above.
score(LinearRegression())


Train score: 0.5346804750082439
Test score: 0.5423371485898663

SVM + RBF kernel


In [87]:
# RBF-kernel SVR with weak regularization (C=1e3). The large train/test
# gap in the output below (0.65 vs 0.47) suggests overfitting.
score(SVR(kernel='rbf', C=1e3, gamma=0.1))


Train score: 0.6513960441280235
Test score: 0.4730994087829582

SVM + polynomial kernel


In [88]:
# Quadratic polynomial-kernel SVR with C=1e3.
score(SVR(kernel='poly', C=1e3, degree=2))


Train score: 0.3880144594339444
Test score: 0.2779446024994301