In [142]:
import sklearn
import sklearn.tree
import sklearn.datasets
import sklearn.preprocessing
import sklearn.decomposition
import sklearn.ensemble

import urllib2
import bs4
import itertools

import pandas as pd
import pandas.tools.plotting 

import numpy as np

import matplotlib
from matplotlib.colors import ListedColormap
%matplotlib inline
import matplotlib.pyplot as plt

import seaborn as sns
sns.set_style("white")

cmap_bold = ListedColormap(['#FF0000', '#00FF00', '#0000FF', '#000000'])

In [95]:
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/abalone/abalone.data'
url = 'abalone.csv'
columns = ['sex','length','diameter','height','wholeWeight','shuckedWeight','visceraWeight','shellWeight','rings']

In [165]:
data_orig = pd.io.parsers.read_csv(url, header=None, names=columns)
data_orig.head(3)


Out[165]:
sex length diameter height wholeWeight shuckedWeight visceraWeight shellWeight rings
0 M 0.455 0.365 0.095 0.5140 0.2245 0.1010 0.15 15
1 M 0.350 0.265 0.090 0.2255 0.0995 0.0485 0.07 7
2 F 0.530 0.420 0.135 0.6770 0.2565 0.1415 0.21 9

In [166]:
## need to convert the label
#0:infant, 1:female, 2:male
def parse_label(label):
    options = {'I': 0, 'F': 1, 'M': 2}
    return options[label]

data['sex'] = data_orig['sex'].map(parse_label)
data.head()


Out[166]:
sex length diameter height wholeWeight shuckedWeight visceraWeight shellWeight rings
0 2 0.455 0.365 0.095 0.5140 0.2245 0.1010 0.150 3
1 2 0.350 0.265 0.090 0.2255 0.0995 0.0485 0.070 0
2 1 0.530 0.420 0.135 0.6770 0.2565 0.1415 0.210 1
3 2 0.440 0.365 0.125 0.5160 0.2155 0.1140 0.155 2
4 0 0.330 0.255 0.080 0.2050 0.0895 0.0395 0.055 0

In [98]:
sns.pairplot(data, hue='sex')


Out[98]:
<seaborn.axisgrid.PairGrid at 0x40a13898>

In [99]:
def group_rings(ring):
    if ring<9: 
        return 0
    if ring == 9: 
        return 1
    if ring == 10:
        return 2
    return 3

data['rings'] = data['rings'].map(group_rings)
data.head()


Out[99]:
sex length diameter height wholeWeight shuckedWeight visceraWeight shellWeight rings
0 2 0.455 0.365 0.095 0.5140 0.2245 0.1010 0.150 3
1 2 0.350 0.265 0.090 0.2255 0.0995 0.0485 0.070 0
2 1 0.530 0.420 0.135 0.6770 0.2565 0.1415 0.210 1
3 2 0.440 0.365 0.125 0.5160 0.2155 0.1140 0.155 2
4 0 0.330 0.255 0.080 0.2050 0.0895 0.0395 0.055 0

In [172]:
data_orig.head()


Out[172]:
sex length diameter height wholeWeight shuckedWeight visceraWeight shellWeight rings
0 M 0.455 0.365 0.095 0.5140 0.2245 0.1010 0.150 15
1 M 0.350 0.265 0.090 0.2255 0.0995 0.0485 0.070 7
2 F 0.530 0.420 0.135 0.6770 0.2565 0.1415 0.210 9
3 M 0.440 0.365 0.125 0.5160 0.2155 0.1140 0.155 10
4 I 0.330 0.255 0.080 0.2050 0.0895 0.0395 0.055 7

In [174]:
one_hot_encoding = pd.core.reshape.get_dummies(data_orig.sex)
one_hot_encoding.head()


Out[174]:
F I M
0 0 0 1
1 0 0 1
2 1 0 0
3 0 0 1
4 0 1 0

In [100]:
sns.pairplot(data, hue='rings', palette='Set2')


Out[100]:
<seaborn.axisgrid.PairGrid at 0x41c058d0>

In [129]:
# split the data into data and label
labels_y = data.rings.copy()
data_x = data.drop(labels=['rings','sex'], axis=1)

In [130]:
## plot labels against each other projected with svd
## http://peekaboo-vision.blogspot.com/2012/12/another-look-at-mnist.html

fig, plots = plt.subplots(4, 4)
fig.set_size_inches(10, 10)

x_train_centered = data_x - np.mean(data_x,axis=0)
X_train = x_train_centered.as_matrix()
Y_train = labels_y.as_matrix()

svd = sklearn.decomposition.TruncatedSVD(n_components=2)

for i, j in itertools.product(xrange(4), repeat=2):
    X_ = X_train[(Y_train == i) + (Y_train == j)]
    y_ = Y_train[(Y_train == i) + (Y_train == j)]
    
    X_t = svd.fit_transform(X_)

    X_t_i = X_t[(y_ == i)]
    X_t_j = X_t[(y_ == j)]

    if i < j:
        plots[i, j].scatter(X_t_i[:, 0], X_t_i[:, 1], c='r', s=4.0)
        plots[i, j].scatter(X_t_j[:, 0], X_t_j[:, 1], c='g', s=4.0)
        plots[i, j].set_xticks(())
        plots[i, j].set_yticks(())
        
        plots[j, i].scatter(X_t_j[:, 0], X_t_j[:, 1], c='g', s=4.0)
        plots[j, i].scatter(X_t_i[:, 0], X_t_i[:, 1], c='r', s=4.0)
        plots[j, i].set_xticks(())
        plots[j, i].set_yticks(())    
    elif i==j:
        plots[i, j].scatter(X_t_i[:, 0], X_t_i[:, 1], c='b', s=4.0)
        plots[i, j].set_xticks(())
        plots[i, j].set_yticks(())    

    if i == 0:
        plots[i, j].set_title(j)
        plots[j, i].set_ylabel(j)



In [131]:
decisionTree = sklearn.tree.DecisionTreeClassifier()
decisionTree.fit(data_x, labels_y)


Out[131]:
DecisionTreeClassifier(compute_importances=None, criterion='gini',
            max_depth=None, max_features=None, max_leaf_nodes=None,
            min_density=None, min_samples_leaf=1, min_samples_split=2,
            random_state=None, splitter='best')

In [132]:
print decisionTree.feature_importances_
print data_x.columns


[ 0.08858798  0.07813831  0.10239257  0.1072299   0.1497993   0.13731056
  0.33654138]
Index([u'length', u'diameter', u'height', u'wholeWeight', u'shuckedWeight', u'visceraWeight', u'shellWeight'], dtype='object')

In [133]:
with open("tree.dot", "w") as output_file:
    sklearn.tree.export_graphviz(decisionTree, feature_names=data_x.columns,
                         out_file=output_file)

In [134]:
## execute dot to convert .dot file to .png on the command line

In [135]:
from IPython.core.display import Image 
Image(filename='tree.png')


Out[135]:

In [140]:
scores = sklearn.cross_validation.cross_val_score(decisionTree, data_x, labels_y, cv=10)
print np.mean(scores), np.std(scores)


0.492175261324 0.0313737554848

In [144]:
bagging = sklearn.ensemble.BaggingClassifier(n_estimators=10)
scores = sklearn.cross_validation.cross_val_score(bagging, data_x, labels_y, cv=10)
print np.mean(scores), np.std(scores)


0.547266812285 0.0242239012691

In [146]:
bagging = sklearn.ensemble.BaggingClassifier(n_estimators=50)
scores_bagging = sklearn.cross_validation.cross_val_score(bagging, data_x, labels_y, cv=10)
print np.mean(scores_bagging), np.std(scores_bagging)


0.566897342224 0.0282901274665

In [155]:
forest = sklearn.ensemble.RandomForestClassifier(n_estimators=10)
scores = sklearn.cross_validation.cross_val_score(forest, data_x, labels_y, cv=10)
print np.mean(scores), np.std(scores)


0.554471987619 0.0160168135819

In [156]:
forest = sklearn.ensemble.RandomForestClassifier(n_estimators=50)
scores = sklearn.cross_validation.cross_val_score(forest, data_x, labels_y, cv=10)
print np.mean(scores), np.std(scores)


0.579625240593 0.0306497400456

In [157]:
forest = sklearn.ensemble.RandomForestClassifier(n_estimators=10)
scores = sklearn.cross_validation.cross_val_score(forest, data_x, labels_y, cv=10)
print np.mean(scores), np.std(scores)


0.584401340284 0.0275324233063

In [159]:
scores_tree = sklearn.cross_validation.cross_val_score(decisionTree, data_x, labels_y, cv=10)

bagging = sklearn.ensemble.BaggingClassifier(n_estimators=100)
scores_bagging = sklearn.cross_validation.cross_val_score(bagging, data_x, labels_y, cv=10)

forest = sklearn.ensemble.RandomForestClassifier(n_estimators=100)
scores_forest = sklearn.cross_validation.cross_val_score(forest, data_x, labels_y, cv=10)

In [164]:
sns.boxplot([scores_tree, scores_bagging, scores_forest], names=['tree','bagging','forest'])
sns.set_context('poster')
plt.ylabel("score")


Out[164]:
<matplotlib.text.Text at 0x3b2ecb70>

In [ ]: