In [1]:
! wget -N http://archive.ics.uci.edu/ml/machine-learning-databases/abalone/abalone.data
Abalone vary in size from 20 mm (0.79 in) in Haliotis pulcherrima to 200 mm (7.9 in), while Haliotis rufescens, the largest of the genus, reaches 12 in (30 cm).
The shell of abalones is convex, rounded to oval in shape, and may be highly arched or very flattened. The shell of the majority of species has a small, flat spire and two to three whorls. The last whorl, known as the body whorl, is auriform, meaning that the shell resembles an ear, giving rise to the common name "ear shell". Haliotis asinina has a somewhat different shape, as it is more elongated and distended. The shell of Haliotis cracherodii cracherodii is also unusual as it has an ovate form, is imperforate, shows an exserted spire, and has prickly ribs.
A cleft in the mantle impresses a groove in the shell, in which lies the row of holes characteristic of the genus. These holes are respiratory apertures for venting water from the gills and for releasing sperm and eggs into the water column. They make up what is known as the selenizone, which forms as the shell grows. This series of eight to 38 holes is near the anterior margin. Only a small number are generally open. The older holes are gradually sealed up as the shell grows and new holes form. Each species has a typical number of open holes, between four and 10, in the selenizone. An abalone has no operculum. The aperture of the shell is very wide and nacreous. Determining an abalone's age normally means cutting the shell through the cone, staining it, and counting rings under a microscope; the UCI dataset used below asks us to predict that ring count ('Rings', with age in years roughly Rings + 1.5) from easier physical measurements.
In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
%matplotlib inline
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.decomposition import PCA
In [3]:
data = pd.read_csv('abalone.data',
                   names=['Sex', 'Length', 'Diameter', 'Height', 'Whole weight',
                          'Shucked weight', 'Viscera weight', 'Shell weight', 'Rings'])
data.head()
Out[3]:
Now let's convert the categorical feature 'Sex' to numerical indicator columns via one-hot encoding:
In [4]:
data = pd.get_dummies(data)
data.head()
Out[4]:
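To make the encoding concrete, here is a minimal sketch on a toy frame (not the abalone data): pd.get_dummies replaces each of the three Sex values (F, I for infant, and M) with its own 0/1 indicator column.
In [ ]:
# Minimal illustration of one-hot encoding on a toy frame:
demo = pd.DataFrame({'Sex': ['M', 'F', 'I']})
pd.get_dummies(demo)
#    Sex_F  Sex_I  Sex_M
# 0      0      0      1
# 1      1      0      0
# 2      0      1      0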
In [5]:
data.describe()
Out[5]:
In [6]:
corr = data.corr()
fig, ax = plt.subplots(figsize=(18,10))
sns.heatmap(corr)
corr
Out[6]:
In [7]:
fig, axs = plt.subplots(4, 2, figsize=(15, 10), sharex=False)
axs = axs.flatten()
plt.tight_layout()
for n in range(8):
    axs[n].hist(data[data.columns[n]], bins=30)
    axs[n].set_title(data.columns[n], fontsize=10)
In [8]:
plt.figure(figsize=(18, 10))
plt.hist(data['Rings'], bins=30)
plt.title("Rings", fontsize=16)
plt.show()
In [9]:
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(columns=['Rings']), data['Rings'], test_size=.2, random_state=17)
# Fit the scaler on the training split only, so no test statistics leak in.
sc = StandardScaler().fit(X_train)
X_train, X_test = sc.transform(X_train), sc.transform(X_test)
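As a quick sanity check (illustrative only), the standardised training features should now have roughly zero mean and unit variance:
In [ ]:
# Sanity check: StandardScaler output has ~0 mean and ~1 std per column.
print(X_train.mean(axis=0).round(2))
print(X_train.std(axis=0).round(2))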
In [29]:
from sklearn.model_selection import GridSearchCV

def approx(y_pred, y_true):
    """Fraction of predictions within 0.5, 1 and 2 rings of the true value."""
    predictions = list(zip(y_pred, y_true))
    return [len(list(filter(lambda a: abs(a[0] - a[1]) <= d, predictions))) / len(predictions)
            for d in [0.5, 1, 2]]

def score(model):
    """Fit on the training split and print the approx metric for both splits."""
    model.fit(X_train, y_train)
    print('Train score: {}'.format(approx(model.predict(X_train), y_train)))
    print('Test score: {}'.format(approx(model.predict(X_test), y_test)))

def grid_search(model, params):
    gs = GridSearchCV(model, params)
    return gs.fit(X_train, y_train)
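The grid_search helper is not exercised in the cells below; a hypothetical usage, tuning the number of neighbours for k-NN, might look like this:
In [ ]:
# Hypothetical usage of the grid_search helper (illustration only):
gs = grid_search(KNeighborsClassifier(), {'n_neighbors': list(range(1, 50, 2))})
gs.best_params_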
In [11]:
score(KNeighborsClassifier(n_neighbors=29))
In [12]:
score(SVC(kernel='linear'))
In [13]:
import graphviz
from sklearn.tree import export_graphviz
dt = DecisionTreeClassifier(max_depth=5)
score(dt)
dot_data = export_graphviz(dt, out_file=None,
                           feature_names=data.drop(columns=['Rings']).columns,
                           class_names=[str(i + 1) for i in range(29)],
                           filled=True, rounded=True,
                           special_characters=True)
graph = graphviz.Source(dot_data)
graph
Out[13]:
In [14]:
score(RandomForestClassifier(max_depth=4, n_estimators=83, max_features=1))
In [15]:
score(MLPClassifier(alpha=2))
In [16]:
score(AdaBoostClassifier())
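For convenience, the same comparison can be written as one loop over the classifiers already scored above (equivalent to the individual cells, just more compact):
In [ ]:
# Equivalent loop over the classifiers compared above.
for clf in [KNeighborsClassifier(n_neighbors=29), SVC(kernel='linear'),
            DecisionTreeClassifier(max_depth=5),
            RandomForestClassifier(max_depth=4, n_estimators=83, max_features=1),
            MLPClassifier(alpha=2), AdaBoostClassifier()]:
    print(clf.__class__.__name__)
    score(clf)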
In [17]:
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.tree import DecisionTreeRegressor
from sklearn.neural_network import MLPRegressor
In [18]:
score(LinearRegression())
In [19]:
score(SVR(C=250, gamma=0.01))
In [20]:
score(SVR(kernel='poly', C=100, degree=4))
In [21]:
# Note: scikit-learn >= 1.0 spells this criterion "squared_error" instead of "mse".
score(DecisionTreeRegressor(max_depth=6, criterion="mse", min_samples_leaf=20))
In [22]:
score(MLPRegressor(alpha=1e-2))
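Since 'Rings' is an integer count, a natural variation (an assumption for illustration, not something the cells above do) is to round the regressor's output before scoring:
In [ ]:
# Sketch: round regression outputs to whole rings before applying approx.
reg = MLPRegressor(alpha=1e-2).fit(X_train, y_train)
approx(np.round(reg.predict(X_test)), y_test)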
In [23]:
import urllib
import tempfile
import tensorflow as tf
In [24]:
FLAGS = None
LEARNING_RATE = 0.001
tf.logging.set_verbosity(tf.logging.INFO)
In [25]:
def maybe_download(train_data=None, test_data=None, predict_data=None):
    """Maybe downloads training data and returns train and test file names."""
    if train_data:
        train_file_name = train_data
    else:
        train_file = tempfile.NamedTemporaryFile(delete=False)
        urllib.request.urlretrieve(
            "http://download.tensorflow.org/data/abalone_train.csv",
            train_file.name)
        train_file_name = train_file.name
        train_file.close()
        print("Training data is downloaded to %s" % train_file_name)

    if test_data:
        test_file_name = test_data
    else:
        test_file = tempfile.NamedTemporaryFile(delete=False)
        urllib.request.urlretrieve(
            "http://download.tensorflow.org/data/abalone_test.csv", test_file.name)
        test_file_name = test_file.name
        test_file.close()
        print("Test data is downloaded to %s" % test_file_name)

    if predict_data:
        predict_file_name = predict_data
    else:
        predict_file = tempfile.NamedTemporaryFile(delete=False)
        urllib.request.urlretrieve(
            "http://download.tensorflow.org/data/abalone_predict.csv",
            predict_file.name)
        predict_file_name = predict_file.name
        predict_file.close()
        print("Prediction data is downloaded to %s" % predict_file_name)

    return train_file_name, test_file_name, predict_file_name
In [26]:
def model_fn(features, labels, mode, params):
    """Model function for the Estimator: a two-hidden-layer regression net."""
    first_hidden_layer = tf.layers.dense(features["x"], 10, activation=tf.nn.relu)
    second_hidden_layer = tf.layers.dense(
        first_hidden_layer, 10, activation=tf.nn.relu)
    output_layer = tf.layers.dense(second_hidden_layer, 1)

    # Reshape the output layer to a 1-dim tensor of predicted ages.
    predictions = tf.reshape(output_layer, [-1])

    # In PREDICT mode, return the predictions only.
    if mode == tf.estimator.ModeKeys.PREDICT:
        return tf.estimator.EstimatorSpec(
            mode=mode,
            predictions={"ages": predictions})

    # Mean squared error loss, minimised by gradient descent (TRAIN / EVAL).
    loss = tf.losses.mean_squared_error(labels, predictions)
    optimizer = tf.train.GradientDescentOptimizer(
        learning_rate=params["learning_rate"])
    train_op = optimizer.minimize(
        loss=loss, global_step=tf.train.get_global_step())

    eval_metric_ops = {
        "rmse": tf.metrics.root_mean_squared_error(
            tf.cast(labels, tf.float64), predictions)
    }
    return tf.estimator.EstimatorSpec(
        mode=mode,
        loss=loss,
        train_op=train_op,
        eval_metric_ops=eval_metric_ops)
In [27]:
abalone_train, abalone_test, abalone_predict = maybe_download()
training_set = tf.contrib.learn.datasets.base.load_csv_without_header(
    filename=abalone_train, target_dtype=np.int, features_dtype=np.float64)
test_set = tf.contrib.learn.datasets.base.load_csv_without_header(
    filename=abalone_test, target_dtype=np.int, features_dtype=np.float64)
prediction_set = tf.contrib.learn.datasets.base.load_csv_without_header(
    filename=abalone_predict, target_dtype=np.int, features_dtype=np.float64)
In [ ]:
model_params = {"learning_rate": LEARNING_RATE}
nn = tf.estimator.Estimator(model_fn=model_fn, params=model_params)

train_input_fn = tf.estimator.inputs.numpy_input_fn(
    x={"x": np.array(training_set.data)},
    y=np.array(training_set.target),
    num_epochs=None,
    shuffle=True)
nn.train(input_fn=train_input_fn, steps=5000)

test_input_fn = tf.estimator.inputs.numpy_input_fn(
    x={"x": np.array(test_set.data)},
    y=np.array(test_set.target),
    num_epochs=1,
    shuffle=False)
ev = nn.evaluate(input_fn=test_input_fn)
print("Loss: %s" % ev["loss"])
print("Root Mean Squared Error: %s" % ev["rmse"])

predict_input_fn = tf.estimator.inputs.numpy_input_fn(
    x={"x": prediction_set.data},
    num_epochs=1,
    shuffle=False)
predictions = nn.predict(input_fn=predict_input_fn)
for i, p in enumerate(predictions):
    print("Prediction %s: %s" % (i + 1, p["ages"]))
In [ ]:
t_fn = tf.estimator.inputs.numpy_input_fn(
    x={"x": test_set.data},
    num_epochs=1,
    shuffle=False)
t_pred = nn.predict(input_fn=t_fn)
t_pred = [p['ages'] for p in t_pred]
approx(t_pred, test_set.target)