In [1]:
# -*- coding: utf-8 -*-
In [2]:
import os
import sys
import numpy as np
import pandas as pd
import sklearn as sk
import pickle as pkl
Predicting the age of abalone from physical measurements. The age of abalone is determined by cutting the shell through the cone, staining it, and counting the number of rings through a microscope -- a boring and time-consuming task. Other measurements, which are easier to obtain, are used to predict the age. Further information, such as weather patterns and location (hence food availability) may be required to solve the problem.
In [3]:
# variable names
names = [
'sex',
'length',
'diameter',
'height',
'whole_weight',
'shucked_weight',
'viscera_weight',
'shell_weight',
'rings'
]
# reading dataset
df = pd.read_csv('data/abalone.data', header=None, names=names)
# building prediction target
df['target'] = (df['rings'] >= 10).astype(int)
df = df.drop('rings', axis=1)
In [4]:
df.head()
Out[4]:
In [5]:
# seperating target from features
y = np.array(df['target'])
X = df.drop('target', axis=1)
In [6]:
# shuffling and splitting data into training and test sets
from sklearn.model_selection import train_test_split
SPLIT = 0.33
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=SPLIT, random_state=42)
In [7]:
# one-hot encoding categorical features
CATEGORICALS = ['sex']
DUMMIES = {
'sex':['M','F','I']
}
def dummy_encode(in_df, dummies):
out_df = in_df.copy()
for feature, values in dummies.items():
for value in values:
dummy_name = '{}__{}'.format(feature, value)
out_df[dummy_name] = (out_df[feature] == value).astype(int)
del out_df[feature]
# print('Dummy-encoded feature\t\t{}'.format(feature))
return out_df
X_train = dummy_encode(in_df=X_train, dummies=DUMMIES)
X_test = dummy_encode(in_df=X_test, dummies=DUMMIES)
X_train.head()
Out[7]:
In [8]:
# rescaling numerical features
NUMERICS = ['length','diameter','height','whole_weight','shucked_weight','viscera_weight','shell_weight']
BOUNDARIES = {
'length': (0.075000, 0.815000),
'diameter': (0.055000, 0.650000),
'height': (0.000000, 1.130000),
'whole_weight': (0.002000, 2.825500),
'shucked_weight': (0.001000, 1.488000),
'viscera_weight': (0.000500, 0.760000),
'shell_weight': (0.001500, 1.005000)
}
def minmax_scale(in_df, boundaries):
out_df = in_df.copy()
for feature, (min_val, max_val) in boundaries.items():
col_name = '{}__norm'.format(feature)
out_df[col_name] = round((out_df[feature] - min_val)/(max_val - min_val),3)
out_df.loc[out_df[col_name] < 0, col_name] = 0
out_df.loc[out_df[col_name] > 1, col_name] = 1
del out_df[feature]
# print('MinMax Scaled feature\t\t{}'.format(feature))
return out_df
X_train = minmax_scale(in_df=X_train, boundaries=BOUNDARIES)
X_test = minmax_scale(in_df=X_test, boundaries=BOUNDARIES)
X_train.head()
In [10]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(
n_estimators=100, # number of trees
n_jobs=-1, # parallelization
random_state=1337, # random seed
max_depth=10, # maximum tree depth
min_samples_leaf=10
)
In [11]:
%time model = clf.fit(X_train, y_train)
In [12]:
# computing ROC AUC over training set
train_auc = sk.metrics.roc_auc_score(y_train, model.predict(X_train))
print('Training ROC AUC:\t', round(train_auc, 3))
In [13]:
# computing ROC AUC over test set
test_auc = sk.metrics.roc_auc_score(y_test, model.predict(X_test))
print('Test ROC AUC:\t\t', round(test_auc, 3))
In [14]:
pkl.dump(model, open('pickles/model_v1.pkl','wb'))
In [15]:
m = pkl.load(open('pickles/model_v1.pkl','rb'))
m
Out[15]: