In [1]:
# Fetch the UCI abalone dataset into the working directory.
# -N (timestamping) makes this idempotent: wget re-downloads only if the
# remote file is newer than the local copy.
! wget -N http://archive.ics.uci.edu/ml/machine-learning-databases/abalone/abalone.data


--2018-06-05 08:17:39--  http://archive.ics.uci.edu/ml/machine-learning-databases/abalone/abalone.data
Resolving archive.ics.uci.edu (archive.ics.uci.edu)... failed: Temporary failure in name resolution.
wget: unable to resolve host address ‘archive.ics.uci.edu’

Abalone

Abalone vary in size from 20 mm (0.79 in) (Haliotis pulcherrima) to 200 mm (7.9 in), while Haliotis rufescens, the largest of the genus, reaches 12 in (30 cm).

The shell of abalones is convex, rounded to oval in shape, and may be highly arched or very flattened. The shell of the majority of species has a small, flat spire and two to three whorls. The last whorl, known as the body whorl, is auriform, meaning that the shell resembles an ear, giving rise to the common name "ear shell". Haliotis asinina has a somewhat different shape, as it is more elongated and distended. The shell of Haliotis cracherodii cracherodii is also unusual as it has an ovate form, is imperforate, shows an exserted spire, and has prickly ribs.

A mantle cleft in the shell impresses a groove in the shell, in which are the row of holes characteristic of the genus. These holes are respiratory apertures for venting water from the gills and for releasing sperm and eggs into the water column. They make up what is known as the selenizone which forms as the shell grows. This series of eight to 38 holes is near the anterior margin. Only a small number is generally open. The older holes are gradually sealed up as the shell grows and new holes form. Each species has a typical number of open holes, between four and 10, in the selenizone. An abalone has no operculum. The aperture of the shell is very wide and nacreous.


In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
%matplotlib inline

from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.decomposition import PCA

In [3]:
# Load the raw data; column names follow the UCI repository's
# abalone.names description (the file itself has no header row).
COLUMN_NAMES = [
    'Sex', 'Length', 'Diameter', 'Height',
    'Whole weight', 'Shucked weight', 'Viscera weight', 'Shell weight',
    'Rings',
]
data = pd.read_csv('abalone.data', names=COLUMN_NAMES)
data.head()


Out[3]:
Sex Length Diameter Height Whole weight Shucked weight Viscera weight Shell weight Rings
0 M 0.455 0.365 0.095 0.5140 0.2245 0.1010 0.150 15
1 M 0.350 0.265 0.090 0.2255 0.0995 0.0485 0.070 7
2 F 0.530 0.420 0.135 0.6770 0.2565 0.1415 0.210 9
3 M 0.440 0.365 0.125 0.5160 0.2155 0.1140 0.155 10
4 I 0.330 0.255 0.080 0.2050 0.0895 0.0395 0.055 7

Now let's convert categorical feature 'Sex' to numerical via one-hot encoding


In [4]:
# One-hot encode 'Sex' (the only non-numeric column, values M / F / I) into
# Sex_F / Sex_I / Sex_M indicator columns; numeric columns pass through.
data = data.pipe(pd.get_dummies)
data.head()


Out[4]:
Length Diameter Height Whole weight Shucked weight Viscera weight Shell weight Rings Sex_F Sex_I Sex_M
0 0.455 0.365 0.095 0.5140 0.2245 0.1010 0.150 15 0 0 1
1 0.350 0.265 0.090 0.2255 0.0995 0.0485 0.070 7 0 0 1
2 0.530 0.420 0.135 0.6770 0.2565 0.1415 0.210 9 1 0 0
3 0.440 0.365 0.125 0.5160 0.2155 0.1140 0.155 10 0 0 1
4 0.330 0.255 0.080 0.2050 0.0895 0.0395 0.055 7 0 1 0

Analysis


In [5]:
# Summary statistics. NOTE(review): Height has min 0.0 (likely invalid rows)
# and max 1.13 vs a 75th percentile of 0.165 (outliers) — worth checking.
data.describe()


Out[5]:
Length Diameter Height Whole weight Shucked weight Viscera weight Shell weight Rings Sex_F Sex_I Sex_M
count 4177.000000 4177.000000 4177.000000 4177.000000 4177.000000 4177.000000 4177.000000 4177.000000 4177.000000 4177.000000 4177.000000
mean 0.523992 0.407881 0.139516 0.828742 0.359367 0.180594 0.238831 9.933684 0.312904 0.321283 0.365813
std 0.120093 0.099240 0.041827 0.490389 0.221963 0.109614 0.139203 3.224169 0.463731 0.467025 0.481715
min 0.075000 0.055000 0.000000 0.002000 0.001000 0.000500 0.001500 1.000000 0.000000 0.000000 0.000000
25% 0.450000 0.350000 0.115000 0.441500 0.186000 0.093500 0.130000 8.000000 0.000000 0.000000 0.000000
50% 0.545000 0.425000 0.140000 0.799500 0.336000 0.171000 0.234000 9.000000 0.000000 0.000000 0.000000
75% 0.615000 0.480000 0.165000 1.153000 0.502000 0.253000 0.329000 11.000000 1.000000 1.000000 1.000000
max 0.815000 0.650000 1.130000 2.825500 1.488000 0.760000 1.005000 29.000000 1.000000 1.000000 1.000000

In [6]:
# Pearson correlation matrix — shown as a heatmap and as the raw table.
corr = data.corr()
fig, ax = plt.subplots(figsize=(18, 10))
sns.heatmap(corr, ax=ax)
corr


Out[6]:
Length Diameter Height Whole weight Shucked weight Viscera weight Shell weight Rings Sex_F Sex_I Sex_M
Length 1.000000 0.986812 0.827554 0.925261 0.897914 0.903018 0.897706 0.556720 0.309666 -0.551465 0.236543
Diameter 0.986812 1.000000 0.833684 0.925452 0.893162 0.899724 0.905330 0.574660 0.318626 -0.564315 0.240376
Height 0.827554 0.833684 1.000000 0.819221 0.774972 0.798319 0.817338 0.557467 0.298421 -0.518552 0.215459
Whole weight 0.925261 0.925452 0.819221 1.000000 0.969405 0.966375 0.955355 0.540390 0.299741 -0.557592 0.252038
Shucked weight 0.897914 0.893162 0.774972 0.969405 1.000000 0.931961 0.882617 0.420884 0.263991 -0.521842 0.251793
Viscera weight 0.903018 0.899724 0.798319 0.966375 0.931961 1.000000 0.907656 0.503819 0.308444 -0.556081 0.242194
Shell weight 0.897706 0.905330 0.817338 0.955355 0.882617 0.907656 1.000000 0.627574 0.306319 -0.546953 0.235391
Rings 0.556720 0.574660 0.557467 0.540390 0.420884 0.503819 0.627574 1.000000 0.250279 -0.436063 0.181831
Sex_F 0.309666 0.318626 0.298421 0.299741 0.263991 0.308444 0.306319 0.250279 1.000000 -0.464298 -0.512528
Sex_I -0.551465 -0.564315 -0.518552 -0.557592 -0.521842 -0.556081 -0.546953 -0.436063 -0.464298 1.000000 -0.522541
Sex_M 0.236543 0.240376 0.215459 0.252038 0.251793 0.242194 0.235391 0.181831 -0.512528 -0.522541 1.000000

In [7]:
# One histogram per numeric feature (first 8 columns; the Sex_* dummies are
# excluded). Iterating over the flattened axes array replaces the original
# manual 8-way tuple unpacking and index-based loop.
fig, axs = plt.subplots(4, 2, figsize=(15, 10), sharex=False)
plt.tight_layout()
for ax, col in zip(axs.ravel(), data.columns[:8]):
    ax.hist(data[col], bins=30)
    ax.set_title(col, fontsize=10)



In [8]:
# Distribution of the target variable (ring count).
fig, ax = plt.subplots(figsize=(18, 10))
ax.hist(data['Rings'], bins=30)
ax.set_title("Rings", fontsize=16)
plt.show()



In [9]:
# Hold out 20% for testing (fixed seed for reproducibility), then standardize
# features using statistics computed on the training set only (no leakage).
features = data.drop(columns=['Rings'])
target = data['Rings']
X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size=.2, random_state=17)
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

Classification


In [29]:
def approx(y_pred, y_true):
    """Fraction of predictions within ±0.5, ±1 and ±2 of the true value.

    Parameters
    ----------
    y_pred, y_true : equal-length sequences of numbers
        Predicted and actual ring counts (zip truncates to the shorter one,
        matching the original behavior).

    Returns
    -------
    list of three floats
        Accuracy at tolerance 0.5, 1 and 2 rings. Empty input yields
        [0.0, 0.0, 0.0] instead of raising ZeroDivisionError.
    """
    pairs = list(zip(y_pred, y_true))
    if not pairs:
        return [0.0, 0.0, 0.0]
    return [sum(abs(p - t) <= tol for p, t in pairs) / len(pairs)
            for tol in (0.5, 1, 2)]

def score(model):
    """Fit `model` on the global train split and print the tolerance-based
    accuracies (within 0.5 / 1 / 2 rings) for both train and test sets."""
    model.fit(X_train, y_train)
    for label, X, y in (('Train', X_train, y_train), ('Test', X_test, y_test)):
        print('{} score: {}'.format(label, approx(model.predict(X), y)))
    
def grid_search(model, params):
    """Run an exhaustive cross-validated search over `params` for `model`.

    Returns the GridSearchCV object fitted on the global X_train / y_train.
    """
    # Imported locally: on a fresh Restart-&-Run-All, GridSearchCV is not yet
    # in scope at this cell — its top-level import only appears in a later
    # cell of the notebook — so calling this function would raise NameError.
    from sklearn.model_selection import GridSearchCV
    gs = GridSearchCV(model, params)
    return gs.fit(X_train, y_train)

K-Neighbors


In [11]:
# k-nearest-neighbours classifier; the keyword makes the hand-tuned k explicit.
knn = KNeighborsClassifier(n_neighbors=29)
score(knn)


Train score: [0.3214606405267884, 0.6596827297216402, 0.7955701885662975]
Test score: [0.2619617224880383, 0.6363636363636364, 0.7990430622009569]

SVM + linear kernel


In [12]:
# Support-vector classifier with a linear kernel.
linear_svm = SVC(kernel='linear')
score(linear_svm)


Train score: [0.27357078718946426, 0.6381322957198443, 0.7898832684824902]
Test score: [0.25478468899521534, 0.6411483253588517, 0.7858851674641149]

Decision tree


In [13]:
# Render the fitted depth-5 tree with graphviz to inspect the learned splits.
import graphviz
from sklearn.tree import export_graphviz
dt = DecisionTreeClassifier(max_depth=5)
score(dt)  # fits `dt` in place as a side effect, so it can be exported below
dot_data = export_graphviz(dt, out_file=None, 
                         feature_names=data.drop(columns=['Rings']).columns, 
                         # NOTE(review): assumes every ring class 1..29 is present in
                         # y_train; if any value is absent these labels shift — verify.
                         class_names=[str(i + 1) for i in range(29)],
                         filled=True, rounded=True,  
                         special_characters=True)
graph = graphviz.Source(dot_data)
graph


Train score: [0.31188266985932356, 0.6390302304699191, 0.8054474708171206]
Test score: [0.24162679425837322, 0.6267942583732058, 0.7966507177033493]
Out[13]:
Tree 0 Shell weight ≤ -0.674 gini = 0.895 samples = 3341 value = [1, 13, 50, 98, 200, 307, 451, 549, 516, 392, 212 162, 105, 89, 53, 44, 30, 24, 18, 9, 5, 8, 1, 1 2, 1] class = 8 1 Diameter ≤ -1.858 gini = 0.844 samples = 944 value = [1, 13, 50, 98, 187, 234, 158, 90, 55, 24, 17, 10 3, 3, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] class = 6 0->1 True 32 Shell weight ≤ 0.115 gini = 0.872 samples = 2397 value = [0, 0, 0, 0, 13, 73, 293, 459, 461, 368, 195, 152 102, 86, 52, 44, 30, 24, 18, 9, 5, 8, 1, 1, 2 1] class = 9 0->32 False 2 Viscera weight ≤ -1.512 gini = 0.75 samples = 180 value = [1, 13, 48, 64, 37, 11, 5, 1, 0, 0, 0, 0, 0, 0 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] class = 4 1->2 17 Sex_I ≤ 0.39 gini = 0.814 samples = 764 value = [0, 0, 2, 34, 150, 223, 153, 89, 55, 24, 17, 10 3, 3, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] class = 6 1->17 3 Length ≤ -2.249 gini = 0.665 samples = 73 value = [1, 13, 37, 15, 4, 2, 1, 0, 0, 0, 0, 0, 0, 0 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] class = 3 2->3 10 Whole weight ≤ -1.463 gini = 0.676 samples = 107 value = [0, 0, 11, 49, 33, 9, 4, 1, 0, 0, 0, 0, 0, 0 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] class = 4 2->10 4 Shucked weight ≤ -1.596 gini = 0.627 samples = 68 value = [1, 13, 37, 13, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] class = 3 3->4 7 Diameter ≤ -2.11 gini = 0.64 samples = 5 value = [0, 0, 0, 2, 0, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] class = 4 3->7 5 gini = 0.444 samples = 3 value = [1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] class = 2 4->5 6 gini = 0.604 samples = 65 value = [0, 11, 37, 13, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] class = 3 4->6 8 gini = 0.5 samples = 2 value = [0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] class = 4 7->8 9 gini = 0.444 samples = 3 value = [0, 0, 0, 1, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] class = 6 7->9 11 Shell weight ≤ -1.489 gini = 0.653 
samples = 71 value = [0, 0, 9, 37, 16, 6, 3, 0, 0, 0, 0, 0, 0, 0 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] class = 4 10->11 14 Shucked weight ≤ -1.356 gini = 0.654 samples = 36 value = [0, 0, 2, 12, 17, 3, 1, 1, 0, 0, 0, 0, 0, 0 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] class = 5 10->14 12 gini = 0.686 samples = 53 value = [0, 0, 9, 23, 16, 4, 1, 0, 0, 0, 0, 0, 0, 0 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] class = 4 11->12 13 gini = 0.37 samples = 18 value = [0, 0, 0, 14, 0, 2, 2, 0, 0, 0, 0, 0, 0, 0 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] class = 4 11->13 15 gini = 0.626 samples = 21 value = [0, 0, 1, 3, 12, 3, 1, 1, 0, 0, 0, 0, 0, 0 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] class = 5 14->15 16 gini = 0.524 samples = 15 value = [0, 0, 1, 9, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] class = 4 14->16 18 Shucked weight ≤ -0.613 gini = 0.843 samples = 250 value = [0, 0, 1, 4, 18, 44, 51, 51, 43, 16, 12, 7, 1 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] class = 7 17->18 25 Shell weight ≤ -1.002 gini = 0.764 samples = 514 value = [0, 0, 1, 30, 132, 179, 102, 38, 12, 8, 5, 3, 2 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] class = 6 17->25 19 Viscera weight ≤ -1.178 gini = 0.845 samples = 218 value = [0, 0, 1, 4, 13, 34, 38, 49, 42, 15, 12, 7, 1 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] class = 8 18->19 22 Length ≤ -0.544 gini = 0.707 samples = 32 value = [0, 0, 0, 0, 5, 10, 13, 2, 1, 1, 0, 0, 0, 0 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] class = 7 18->22 20 gini = 0.829 samples = 53 value = [0, 0, 1, 2, 7, 10, 3, 8, 15, 5, 0, 2, 0, 0 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] class = 9 19->20 21 gini = 0.834 samples = 165 value = [0, 0, 0, 2, 6, 24, 35, 41, 27, 10, 12, 5, 1 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] class = 8 19->21 23 gini = 0.0 samples = 5 value = [0, 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] class = 7 22->23 24 gini = 0.733 samples = 27 value = [0, 0, 0, 0, 5, 10, 8, 2, 1, 1, 0, 0, 0, 0 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] class = 6 22->24 26 Viscera 
weight ≤ -1.333 gini = 0.733 samples = 308 value = [0, 0, 1, 28, 106, 107, 41, 11, 7, 3, 2, 1, 1 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] class = 6 25->26 29 Shell weight ≤ -0.847 gini = 0.755 samples = 206 value = [0, 0, 0, 2, 26, 72, 61, 27, 5, 5, 3, 2, 1, 1 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] class = 6 25->29 27 gini = 0.689 samples = 50 value = [0, 0, 0, 8, 25, 7, 6, 1, 1, 1, 0, 1, 0, 0 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] class = 5 26->27 28 gini = 0.725 samples = 258 value = [0, 0, 1, 20, 81, 100, 35, 10, 6, 2, 2, 0, 1 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] class = 6 26->28 30 gini = 0.743 samples = 109 value = [0, 0, 0, 2, 19, 44, 25, 10, 2, 4, 1, 1, 1, 0 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] class = 6 29->30 31 gini = 0.741 samples = 97 value = [0, 0, 0, 0, 7, 28, 36, 17, 3, 1, 2, 1, 0, 1 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] class = 7 29->31 33 Sex_I ≤ 0.39 gini = 0.836 samples = 945 value = [0, 0, 0, 0, 10, 62, 224, 242, 142, 86, 59, 37 30, 25, 9, 6, 4, 3, 2, 2, 1, 1, 0, 0, 0, 0] class = 8 32->33 48 Shell weight ≤ 1.131 gini = 0.868 samples = 1452 value = [0, 0, 0, 0, 3, 11, 69, 217, 319, 282, 136, 115 72, 61, 43, 38, 26, 21, 16, 7, 4, 7, 1, 1, 2 1] class = 9 32->48 34 Shucked weight ≤ 0.04 gini = 0.857 samples = 628 value = [0, 0, 0, 0, 6, 43, 119, 145, 104, 62, 52, 30 24, 22, 8, 5, 1, 2, 1, 2, 1, 1, 0, 0, 0, 0] class = 8 33->34 41 Shell weight ≤ -0.174 gini = 0.771 samples = 317 value = [0, 0, 0, 0, 4, 19, 105, 97, 38, 24, 7, 7, 6 3, 1, 1, 3, 1, 1, 0, 0, 0, 0, 0, 0, 0] class = 7 33->41 35 Shell weight ≤ -0.31 gini = 0.88 samples = 438 value = [0, 0, 0, 0, 5, 28, 57, 79, 76, 52, 49, 29, 23 21, 7, 4, 1, 2, 1, 2, 1, 1, 0, 0, 0, 0] class = 8 34->35 38 Viscera weight ≤ 0.479 gini = 0.742 samples = 190 value = [0, 0, 0, 0, 1, 15, 62, 66, 28, 10, 3, 1, 1, 1 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] class = 8 34->38 36 gini = 0.864 samples = 230 value = [0, 0, 0, 0, 5, 27, 39, 45, 41, 26, 18, 8, 7 8, 3, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0] class = 8 35->36 37 gini = 0.879 
samples = 208 value = [0, 0, 0, 0, 0, 1, 18, 34, 35, 26, 31, 21, 16 13, 4, 3, 0, 2, 1, 1, 1, 1, 0, 0, 0, 0] class = 9 35->37 39 gini = 0.745 samples = 165 value = [0, 0, 0, 0, 1, 15, 57, 52, 27, 6, 2, 1, 1, 1 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] class = 7 38->39 40 gini = 0.618 samples = 25 value = [0, 0, 0, 0, 0, 0, 5, 14, 1, 4, 1, 0, 0, 0 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] class = 8 38->40 42 Shucked weight ≤ -0.708 gini = 0.756 samples = 232 value = [0, 0, 0, 0, 2, 19, 91, 62, 19, 14, 5, 6, 5, 3 1, 0, 3, 1, 1, 0, 0, 0, 0, 0, 0, 0] class = 7 41->42 45 Height ≤ 0.429 gini = 0.738 samples = 85 value = [0, 0, 0, 0, 2, 0, 14, 35, 19, 10, 2, 1, 1, 0 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] class = 8 41->45 43 gini = 0.844 samples = 39 value = [0, 0, 0, 0, 2, 1, 7, 11, 2, 6, 1, 3, 3, 0 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0] class = 8 42->43 44 gini = 0.721 samples = 193 value = [0, 0, 0, 0, 0, 18, 84, 51, 17, 8, 4, 3, 2, 3 0, 0, 2, 0, 1, 0, 0, 0, 0, 0, 0, 0] class = 7 42->44 46 gini = 0.756 samples = 78 value = [0, 0, 0, 0, 2, 0, 13, 29, 19, 10, 2, 1, 1, 0 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] class = 8 45->46 47 gini = 0.245 samples = 7 value = [0, 0, 0, 0, 0, 0, 1, 6, 0, 0, 0, 0, 0, 0, 0 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] class = 8 45->47 49 Shucked weight ≤ 0.393 gini = 0.852 samples = 1024 value = [0, 0, 0, 0, 3, 10, 64, 194, 258, 173, 80, 72 47, 39, 21, 21, 14, 12, 6, 3, 2, 4, 0, 1, 0 0] class = 9 48->49 56 Shucked weight ≤ 1.178 gini = 0.872 samples = 428 value = [0, 0, 0, 0, 0, 1, 5, 23, 61, 109, 56, 43, 25 22, 22, 17, 12, 9, 10, 4, 2, 3, 1, 0, 2, 1] class = 10 48->56 50 Shell weight ≤ 0.376 gini = 0.897 samples = 398 value = [0, 0, 0, 0, 2, 3, 19, 43, 79, 50, 31, 43, 30 27, 20, 17, 8, 11, 6, 2, 2, 4, 0, 1, 0, 0] class = 9 49->50 53 Diameter ≤ 0.759 gini = 0.807 samples = 626 value = [0, 0, 0, 0, 1, 7, 45, 151, 179, 123, 49, 29, 17 12, 1, 4, 6, 1, 0, 1, 0, 0, 0, 0, 0, 0] class = 9 49->53 51 gini = 0.872 samples = 198 value = [0, 0, 0, 0, 0, 3, 16, 31, 47, 27, 16, 17, 12 8, 
5, 7, 0, 4, 0, 1, 2, 2, 0, 0, 0, 0] class = 9 50->51 52 gini = 0.906 samples = 200 value = [0, 0, 0, 0, 2, 0, 3, 12, 32, 23, 15, 26, 18 19, 15, 10, 8, 7, 6, 1, 0, 2, 0, 1, 0, 0] class = 9 50->52 54 gini = 0.788 samples = 272 value = [0, 0, 0, 0, 1, 5, 32, 65, 93, 35, 20, 9, 5, 3 1, 0, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0] class = 9 53->54 55 gini = 0.807 samples = 354 value = [0, 0, 0, 0, 0, 2, 13, 86, 86, 88, 29, 20, 12 9, 0, 4, 4, 0, 0, 1, 0, 0, 0, 0, 0, 0] class = 10 53->55 57 Shell weight ≤ 2.811 gini = 0.911 samples = 153 value = [0, 0, 0, 0, 0, 0, 3, 2, 21, 14, 15, 19, 12 16, 16, 8, 7, 5, 7, 3, 1, 2, 1, 0, 1, 0] class = 9 56->57 60 Shell weight ≤ 2.599 gini = 0.819 samples = 275 value = [0, 0, 0, 0, 0, 1, 2, 21, 40, 95, 41, 24, 13 6, 6, 9, 5, 4, 3, 1, 1, 1, 0, 0, 1, 1] class = 10 56->60 58 gini = 0.909 samples = 147 value = [0, 0, 0, 0, 0, 0, 3, 2, 21, 14, 15, 19, 12 16, 12, 8, 6, 5, 7, 2, 1, 2, 1, 0, 1, 0] class = 9 57->58 59 gini = 0.5 samples = 6 value = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0] class = 15 57->59 61 gini = 0.793 samples = 243 value = [0, 0, 0, 0, 0, 1, 2, 21, 39, 92, 33, 24, 8, 5 4, 3, 3, 4, 1, 1, 1, 0, 0, 0, 0, 1] class = 10 60->61 62 gini = 0.854 samples = 32 value = [0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 8, 0, 5, 1, 2 6, 2, 0, 2, 0, 0, 1, 0, 0, 1, 0] class = 11 60->62

Random forest


In [14]:
# Random forest: 83 shallow trees, one candidate feature considered per split.
forest = RandomForestClassifier(max_depth=4, n_estimators=83, max_features=1)
score(forest)


Train score: [0.29841364860820113, 0.6438192158036516, 0.7832984136486082]
Test score: [0.27751196172248804, 0.6435406698564593, 0.7834928229665071]

Multi-layer perceptron


In [15]:
# Multi-layer perceptron classifier with L2 penalty alpha=2.
mlp_clf = MLPClassifier(alpha=2)
score(mlp_clf)


Train score: [0.2837473810236456, 0.6569889254714157, 0.8021550434001796]
Test score: [0.26674641148325356, 0.6686602870813397, 0.8086124401913876]

AdaBoost


In [16]:
# AdaBoost ensemble with default settings.
ada = AdaBoostClassifier()
score(ada)


Train score: [0.21430709368452558, 0.5501346902125113, 0.7306195749775516]
Test score: [0.23205741626794257, 0.569377990430622, 0.7296650717703349]

Regression


In [17]:
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.tree import DecisionTreeRegressor
from sklearn.neural_network import MLPRegressor

Linear regression


In [18]:
# Ordinary least-squares baseline.
linreg = LinearRegression()
score(linreg)


Train score: [0.23585752768632146, 0.43998802753666566, 0.7357078718946424]
Test score: [0.23205741626794257, 0.4258373205741627, 0.7165071770334929]

SVM + RBF kernel


In [19]:
# Support-vector regression with an RBF kernel (C, gamma presumably
# hand-tuned — TODO confirm).
rbf_svr = SVR(C=250, gamma=0.01)
score(rbf_svr)


Train score: [0.2927267285243939, 0.5175097276264592, 0.7803052978150254]
Test score: [0.27392344497607657, 0.49401913875598086, 0.7763157894736842]

SVM + polynomial kernel


In [20]:
# Support-vector regression with a degree-4 polynomial kernel.
poly_svr = SVR(kernel='poly', C=100, degree=4)
score(poly_svr)


Train score: [0.3163723436096977, 0.5474408859622868, 0.7880873989823406]
Test score: [0.25239234449760767, 0.4880382775119617, 0.757177033492823]

Decision tree


In [21]:
# Regression tree. The explicit criterion="mse" is dropped: squared error is
# already the default criterion, and the "mse" spelling was removed in
# scikit-learn 1.2 (renamed "squared_error"), so omitting it fits the
# identical model while staying compatible with old and new versions.
reg_tree = DecisionTreeRegressor(max_depth=6, min_samples_leaf=20)
score(reg_tree)


Train score: [0.26578868602214906, 0.4890751272074229, 0.7692307692307693]
Test score: [0.23205741626794257, 0.45454545454545453, 0.7332535885167464]

Multi-layer perceptron


In [22]:
# Multi-layer perceptron regressor with L2 penalty alpha=0.01.
mlp_reg = MLPRegressor(alpha=1e-2)
score(mlp_reg)


Train score: [0.2529182879377432, 0.4681233163723436, 0.7482789583956899]
Test score: [0.2583732057416268, 0.465311004784689, 0.7332535885167464]

TensorFlow


In [23]:
import tempfile
import urllib
import urllib.request  # `import urllib` alone does not reliably expose urllib.request

import tensorflow as tf

In [24]:
# FLAGS placeholder kept from the TensorFlow tutorial boilerplate; unused below.
FLAGS = None
# Learning rate passed to model_fn via the estimator's params dict.
LEARNING_RATE = 0.001

# INFO verbosity so estimator training progress is logged into the notebook.
tf.logging.set_verbosity(tf.logging.INFO)

In [25]:
def maybe_download(train_data=None, test_data=None, predict_data=None):
    """Return (train, test, predict) CSV file names, downloading any missing one.

    Each parameter is an optional local path. For every argument that is
    falsy, the corresponding abalone CSV is fetched from
    download.tensorflow.org into a NamedTemporaryFile and that temp path is
    returned instead.

    Returns:
        Tuple of three file-name strings: (train, test, predict).
    """
    def _fetch(local_path, url, label):
        # Reuse the caller-supplied path when given; otherwise download.
        if local_path:
            return local_path
        tmp = tempfile.NamedTemporaryFile(delete=False)
        try:
            urllib.request.urlretrieve(url, tmp.name)
        finally:
            # Close even if the download fails, so the handle is not leaked.
            tmp.close()
        print("%s data is downloaded to %s" % (label, tmp.name))
        return tmp.name

    train_file_name = _fetch(
        train_data,
        "http://download.tensorflow.org/data/abalone_train.csv", "Training")
    test_file_name = _fetch(
        test_data,
        "http://download.tensorflow.org/data/abalone_test.csv", "Test")
    predict_file_name = _fetch(
        predict_data,
        "http://download.tensorflow.org/data/abalone_predict.csv", "Prediction")

    return train_file_name, test_file_name, predict_file_name

In [26]:
def model_fn(features, labels, mode, params):
    """TF Estimator model function: a two-hidden-layer regression network.

    Args:
        features: dict with key "x" holding the feature batch.
        labels: ground-truth ring counts (absent/unused in PREDICT mode).
        mode: a tf.estimator.ModeKeys value (TRAIN / EVAL / PREDICT).
        params: dict; "learning_rate" is read for the optimizer.

    Returns:
        tf.estimator.EstimatorSpec appropriate for `mode`.
    """

    # Two ReLU hidden layers of 10 units each, then one linear output unit.
    first_hidden_layer = tf.layers.dense(features["x"], 10, activation=tf.nn.relu)

    second_hidden_layer = tf.layers.dense(
      first_hidden_layer, 10, activation=tf.nn.relu)

    output_layer = tf.layers.dense(second_hidden_layer, 1)

    # Flatten the (batch, 1) output into a (batch,) vector of predicted ages.
    predictions = tf.reshape(output_layer, [-1])

    # PREDICT mode has no labels: return the predictions only, under the
    # "ages" key that downstream cells read.
    if mode == tf.estimator.ModeKeys.PREDICT:
        return tf.estimator.EstimatorSpec(
            mode=mode,
            predictions={"ages": predictions})

    loss = tf.losses.mean_squared_error(labels, predictions)

    optimizer = tf.train.GradientDescentOptimizer(
      learning_rate=params["learning_rate"])
    train_op = optimizer.minimize(
      loss=loss, global_step=tf.train.get_global_step())

    # RMSE is reported during EVAL alongside the loss.
    eval_metric_ops = {
      "rmse": tf.metrics.root_mean_squared_error(
          tf.cast(labels, tf.float64), predictions)
    }

    return tf.estimator.EstimatorSpec(
      mode=mode,
      loss=loss,
      train_op=train_op,
      eval_metric_ops=eval_metric_ops)

In [27]:
# Fetch (or reuse) the three abalone CSVs and load them into TF datasets.
# NOTE(review): tf.contrib was removed in TensorFlow 2.x — this cell assumes
# a TF 1.x kernel, matching the rest of the notebook.
abalone_train, abalone_test, abalone_predict = maybe_download()

# `int` replaces the deprecated `np.int` alias (removed in NumPy 1.24);
# they were the same type, so the loaded dtypes are unchanged.
training_set = tf.contrib.learn.datasets.base.load_csv_without_header(
  filename=abalone_train, target_dtype=int, features_dtype=np.float64)

test_set = tf.contrib.learn.datasets.base.load_csv_without_header(
  filename=abalone_test, target_dtype=int, features_dtype=np.float64)

prediction_set = tf.contrib.learn.datasets.base.load_csv_without_header(
  filename=abalone_predict, target_dtype=int, features_dtype=np.float64)


---------------------------------------------------------------------------
gaierror                                  Traceback (most recent call last)
/opt/conda/lib/python3.6/urllib/request.py in do_open(self, http_class, req, **http_conn_args)
   1317                 h.request(req.get_method(), req.selector, req.data, headers,
-> 1318                           encode_chunked=req.has_header('Transfer-encoding'))
   1319             except OSError as err: # timeout error

/opt/conda/lib/python3.6/http/client.py in request(self, method, url, body, headers, encode_chunked)
   1238         """Send a complete request to the server."""
-> 1239         self._send_request(method, url, body, headers, encode_chunked)
   1240 

/opt/conda/lib/python3.6/http/client.py in _send_request(self, method, url, body, headers, encode_chunked)
   1284             body = _encode(body, 'body')
-> 1285         self.endheaders(body, encode_chunked=encode_chunked)
   1286 

/opt/conda/lib/python3.6/http/client.py in endheaders(self, message_body, encode_chunked)
   1233             raise CannotSendHeader()
-> 1234         self._send_output(message_body, encode_chunked=encode_chunked)
   1235 

/opt/conda/lib/python3.6/http/client.py in _send_output(self, message_body, encode_chunked)
   1025         del self._buffer[:]
-> 1026         self.send(msg)
   1027 

/opt/conda/lib/python3.6/http/client.py in send(self, data)
    963             if self.auto_open:
--> 964                 self.connect()
    965             else:

/opt/conda/lib/python3.6/http/client.py in connect(self)
    935         self.sock = self._create_connection(
--> 936             (self.host,self.port), self.timeout, self.source_address)
    937         self.sock.setsockopt(socket.IPPROTO_TCP, socket.TCP_NODELAY, 1)

/opt/conda/lib/python3.6/socket.py in create_connection(address, timeout, source_address)
    703     err = None
--> 704     for res in getaddrinfo(host, port, 0, SOCK_STREAM):
    705         af, socktype, proto, canonname, sa = res

/opt/conda/lib/python3.6/socket.py in getaddrinfo(host, port, family, type, proto, flags)
    744     addrlist = []
--> 745     for res in _socket.getaddrinfo(host, port, family, type, proto, flags):
    746         af, socktype, proto, canonname, sa = res

gaierror: [Errno -3] Temporary failure in name resolution

During handling of the above exception, another exception occurred:

URLError                                  Traceback (most recent call last)
<ipython-input-27-5fb2968259b2> in <module>()
----> 1 abalone_train, abalone_test, abalone_predict = maybe_download()
      2 
      3 training_set = tf.contrib.learn.datasets.base.load_csv_without_header(
      4   filename=abalone_train, target_dtype=np.int, features_dtype=np.float64)
      5 

<ipython-input-25-c525ea9a9b4f> in maybe_download(train_data, test_data, predict_data)
      7         urllib.request.urlretrieve(
      8             "http://download.tensorflow.org/data/abalone_train.csv",
----> 9             train_file.name)
     10         train_file_name = train_file.name
     11         train_file.close()

/opt/conda/lib/python3.6/urllib/request.py in urlretrieve(url, filename, reporthook, data)
    246     url_type, path = splittype(url)
    247 
--> 248     with contextlib.closing(urlopen(url, data)) as fp:
    249         headers = fp.info()
    250 

/opt/conda/lib/python3.6/urllib/request.py in urlopen(url, data, timeout, cafile, capath, cadefault, context)
    221     else:
    222         opener = _opener
--> 223     return opener.open(url, data, timeout)
    224 
    225 def install_opener(opener):

/opt/conda/lib/python3.6/urllib/request.py in open(self, fullurl, data, timeout)
    524             req = meth(req)
    525 
--> 526         response = self._open(req, data)
    527 
    528         # post-process response

/opt/conda/lib/python3.6/urllib/request.py in _open(self, req, data)
    542         protocol = req.type
    543         result = self._call_chain(self.handle_open, protocol, protocol +
--> 544                                   '_open', req)
    545         if result:
    546             return result

/opt/conda/lib/python3.6/urllib/request.py in _call_chain(self, chain, kind, meth_name, *args)
    502         for handler in handlers:
    503             func = getattr(handler, meth_name)
--> 504             result = func(*args)
    505             if result is not None:
    506                 return result

/opt/conda/lib/python3.6/urllib/request.py in http_open(self, req)
   1344 
   1345     def http_open(self, req):
-> 1346         return self.do_open(http.client.HTTPConnection, req)
   1347 
   1348     http_request = AbstractHTTPHandler.do_request_

/opt/conda/lib/python3.6/urllib/request.py in do_open(self, http_class, req, **http_conn_args)
   1318                           encode_chunked=req.has_header('Transfer-encoding'))
   1319             except OSError as err: # timeout error
-> 1320                 raise URLError(err)
   1321             r = h.getresponse()
   1322         except:

URLError: <urlopen error [Errno -3] Temporary failure in name resolution>

In [ ]:
# Train the estimator defined by model_fn, evaluate RMSE on the held-out
# test set, then print predictions for the unlabeled prediction set.
model_params = {"learning_rate": LEARNING_RATE}

nn = tf.estimator.Estimator(model_fn=model_fn, params=model_params)

# num_epochs=None cycles the data indefinitely; training length is
# controlled by `steps` in nn.train below.
train_input_fn = tf.estimator.inputs.numpy_input_fn(
    x={"x": np.array(training_set.data)},
    y=np.array(training_set.target),
    num_epochs=None,
    shuffle=True)

nn.train(input_fn=train_input_fn, steps=5000)

# Single pass, no shuffling, for a deterministic evaluation.
test_input_fn = tf.estimator.inputs.numpy_input_fn(
    x={"x": np.array(test_set.data)},
    y=np.array(test_set.target),
    num_epochs=1,
    shuffle=False)

ev = nn.evaluate(input_fn=test_input_fn)
print("Loss: %s" % ev["loss"])
print("Root Mean Squared Error: %s" % ev["rmse"])

# "ages" is the prediction key set in model_fn's PREDICT EstimatorSpec.
predict_input_fn = tf.estimator.inputs.numpy_input_fn(
    x={"x": prediction_set.data},
    num_epochs=1,
    shuffle=False)
predictions = nn.predict(input_fn=predict_input_fn)
for i, p in enumerate(predictions):
    print("Prediction %s: %s" % (i + 1, p["ages"]))

In [ ]:
# Score the TF network with the same tolerance metric used for the sklearn
# models above: fraction of test abalone within 0.5 / 1 / 2 rings.
t_fn = tf.estimator.inputs.numpy_input_fn(
    x={"x": test_set.data},
    num_epochs=1,
    shuffle=False)
t_pred = [pred['ages'] for pred in nn.predict(input_fn=t_fn)]

approx(t_pred, test_set.target)