notebook.community

Edit and run



In [142]:

    
import sklearn
import sklearn.tree
import sklearn.datasets
import sklearn.preprocessing
import sklearn.decomposition
import sklearn.ensemble

import urllib2
import bs4
import itertools

import pandas as pd
import pandas.tools.plotting 

import numpy as np

import matplotlib
from matplotlib.colors import ListedColormap
%matplotlib inline
import matplotlib.pyplot as plt

import seaborn as sns
sns.set_style("white")

cmap_bold = ListedColormap(['#FF0000', '#00FF00', '#0000FF', '#000000'])



In [95]:

    
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/abalone/abalone.data'
url = 'abalone.csv'
columns = ['sex','length','diameter','height','wholeWeight','shuckedWeight','visceraWeight','shellWeight','rings']



In [165]:

    
data_orig = pd.io.parsers.read_csv(url, header=None, names=columns)
data_orig.head(3)









    Out[165]:






  
    
      
      sex
      length
      diameter
      height
      wholeWeight
      shuckedWeight
      visceraWeight
      shellWeight
      rings
    
  
  
    
      0
       M
       0.455
       0.365
       0.095
       0.5140
       0.2245
       0.1010
       0.15
       15
    
    
      1
       M
       0.350
       0.265
       0.090
       0.2255
       0.0995
       0.0485
       0.07
        7
    
    
      2
       F
       0.530
       0.420
       0.135
       0.6770
       0.2565
       0.1415
       0.21
        9



In [166]:

    
## need to convert the label
#0:infant, 1:female, 2:male
def parse_label(label):
    options = {'I': 0, 'F': 1, 'M': 2}
    return options[label]

data['sex'] = data_orig['sex'].map(parse_label)
data.head()









    Out[166]:






  
    
      
      sex
      length
      diameter
      height
      wholeWeight
      shuckedWeight
      visceraWeight
      shellWeight
      rings
    
  
  
    
      0
       2
       0.455
       0.365
       0.095
       0.5140
       0.2245
       0.1010
       0.150
       3
    
    
      1
       2
       0.350
       0.265
       0.090
       0.2255
       0.0995
       0.0485
       0.070
       0
    
    
      2
       1
       0.530
       0.420
       0.135
       0.6770
       0.2565
       0.1415
       0.210
       1
    
    
      3
       2
       0.440
       0.365
       0.125
       0.5160
       0.2155
       0.1140
       0.155
       2
    
    
      4
       0
       0.330
       0.255
       0.080
       0.2050
       0.0895
       0.0395
       0.055
       0



In [98]:

    
sns.pairplot(data, hue='sex')









    Out[98]:





<seaborn.axisgrid.PairGrid at 0x40a13898>



In [99]:

    
def group_rings(ring):
    if ring<9: 
        return 0
    if ring == 9: 
        return 1
    if ring == 10:
        return 2
    return 3

data['rings'] = data['rings'].map(group_rings)
data.head()









    Out[99]:






  
    
      
      sex
      length
      diameter
      height
      wholeWeight
      shuckedWeight
      visceraWeight
      shellWeight
      rings
    
  
  
    
      0
       2
       0.455
       0.365
       0.095
       0.5140
       0.2245
       0.1010
       0.150
       3
    
    
      1
       2
       0.350
       0.265
       0.090
       0.2255
       0.0995
       0.0485
       0.070
       0
    
    
      2
       1
       0.530
       0.420
       0.135
       0.6770
       0.2565
       0.1415
       0.210
       1
    
    
      3
       2
       0.440
       0.365
       0.125
       0.5160
       0.2155
       0.1140
       0.155
       2
    
    
      4
       0
       0.330
       0.255
       0.080
       0.2050
       0.0895
       0.0395
       0.055
       0



In [172]:

    
data_orig.head()









    Out[172]:






  
    
      
      sex
      length
      diameter
      height
      wholeWeight
      shuckedWeight
      visceraWeight
      shellWeight
      rings
    
  
  
    
      0
       M
       0.455
       0.365
       0.095
       0.5140
       0.2245
       0.1010
       0.150
       15
    
    
      1
       M
       0.350
       0.265
       0.090
       0.2255
       0.0995
       0.0485
       0.070
        7
    
    
      2
       F
       0.530
       0.420
       0.135
       0.6770
       0.2565
       0.1415
       0.210
        9
    
    
      3
       M
       0.440
       0.365
       0.125
       0.5160
       0.2155
       0.1140
       0.155
       10
    
    
      4
       I
       0.330
       0.255
       0.080
       0.2050
       0.0895
       0.0395
       0.055
        7



In [174]:

    
one_hot_encoding = pd.core.reshape.get_dummies(data_orig.sex)
one_hot_encoding.head()



In [100]:

    
sns.pairplot(data, hue='rings', palette='Set2')









    Out[100]:





<seaborn.axisgrid.PairGrid at 0x41c058d0>



In [129]:

    
# split the data into data and label
labels_y = data.rings.copy()
data_x = data.drop(labels=['rings','sex'], axis=1)



In [130]:

    
## plot labels against each other projected with svd
## http://peekaboo-vision.blogspot.com/2012/12/another-look-at-mnist.html

fig, plots = plt.subplots(4, 4)
fig.set_size_inches(10, 10)

x_train_centered = data_x - np.mean(data_x,axis=0)
X_train = x_train_centered.as_matrix()
Y_train = labels_y.as_matrix()

svd = sklearn.decomposition.TruncatedSVD(n_components=2)

for i, j in itertools.product(xrange(4), repeat=2):
    X_ = X_train[(Y_train == i) + (Y_train == j)]
    y_ = Y_train[(Y_train == i) + (Y_train == j)]
    
    X_t = svd.fit_transform(X_)

    X_t_i = X_t[(y_ == i)]
    X_t_j = X_t[(y_ == j)]

    if i < j:
        plots[i, j].scatter(X_t_i[:, 0], X_t_i[:, 1], c='r', s=4.0)
        plots[i, j].scatter(X_t_j[:, 0], X_t_j[:, 1], c='g', s=4.0)
        plots[i, j].set_xticks(())
        plots[i, j].set_yticks(())
        
        plots[j, i].scatter(X_t_j[:, 0], X_t_j[:, 1], c='g', s=4.0)
        plots[j, i].scatter(X_t_i[:, 0], X_t_i[:, 1], c='r', s=4.0)
        plots[j, i].set_xticks(())
        plots[j, i].set_yticks(())    
    elif i==j:
        plots[i, j].scatter(X_t_i[:, 0], X_t_i[:, 1], c='b', s=4.0)
        plots[i, j].set_xticks(())
        plots[i, j].set_yticks(())    

    if i == 0:
        plots[i, j].set_title(j)
        plots[j, i].set_ylabel(j)



In [131]:

    
decisionTree = sklearn.tree.DecisionTreeClassifier()
decisionTree.fit(data_x, labels_y)









    Out[131]:





DecisionTreeClassifier(compute_importances=None, criterion='gini',
            max_depth=None, max_features=None, max_leaf_nodes=None,
            min_density=None, min_samples_leaf=1, min_samples_split=2,
            random_state=None, splitter='best')



In [132]:

    
print decisionTree.feature_importances_
print data_x.columns









    



[ 0.08858798  0.07813831  0.10239257  0.1072299   0.1497993   0.13731056
  0.33654138]
Index([u'length', u'diameter', u'height', u'wholeWeight', u'shuckedWeight', u'visceraWeight', u'shellWeight'], dtype='object')



In [133]:

    
with open("tree.dot", "w") as output_file:
    sklearn.tree.export_graphviz(decisionTree, feature_names=data_x.columns,
                         out_file=output_file)



In [134]:

    
## execute dot to convert .dot file to .png on the command line



In [135]:

    
from IPython.core.display import Image 
Image(filename='tree.png')









    Out[135]:



In [140]:

    
scores = sklearn.cross_validation.cross_val_score(decisionTree, data_x, labels_y, cv=10)
print np.mean(scores), np.std(scores)









    



0.492175261324 0.0313737554848



In [144]:

    
bagging = sklearn.ensemble.BaggingClassifier(n_estimators=10)
scores = sklearn.cross_validation.cross_val_score(bagging, data_x, labels_y, cv=10)
print np.mean(scores), np.std(scores)









    



0.547266812285 0.0242239012691



In [146]:

    
bagging = sklearn.ensemble.BaggingClassifier(n_estimators=50)
scores_bagging = sklearn.cross_validation.cross_val_score(bagging, data_x, labels_y, cv=10)
print np.mean(scores_bagging), np.std(scores_bagging)









    



0.566897342224 0.0282901274665



In [155]:

    
forest = sklearn.ensemble.RandomForestClassifier(n_estimators=10)
scores = sklearn.cross_validation.cross_val_score(forest, data_x, labels_y, cv=10)
print np.mean(scores), np.std(scores)









    



0.554471987619 0.0160168135819



In [156]:

    
forest = sklearn.ensemble.RandomForestClassifier(n_estimators=50)
scores = sklearn.cross_validation.cross_val_score(forest, data_x, labels_y, cv=10)
print np.mean(scores), np.std(scores)









    



0.579625240593 0.0306497400456



In [157]:

    
forest = sklearn.ensemble.RandomForestClassifier(n_estimators=10)
scores = sklearn.cross_validation.cross_val_score(forest, data_x, labels_y, cv=10)
print np.mean(scores), np.std(scores)









    



0.584401340284 0.0275324233063



In [159]:

    
scores_tree = sklearn.cross_validation.cross_val_score(decisionTree, data_x, labels_y, cv=10)

bagging = sklearn.ensemble.BaggingClassifier(n_estimators=100)
scores_bagging = sklearn.cross_validation.cross_val_score(bagging, data_x, labels_y, cv=10)

forest = sklearn.ensemble.RandomForestClassifier(n_estimators=100)
scores_forest = sklearn.cross_validation.cross_val_score(forest, data_x, labels_y, cv=10)



In [164]:

    
sns.boxplot([scores_tree, scores_bagging, scores_forest], names=['tree','bagging','forest'])
sns.set_context('poster')
plt.ylabel("score")









    Out[164]:





<matplotlib.text.Text at 0x3b2ecb70>



In [ ]:

	sex	length	diameter	height	wholeWeight	shuckedWeight	visceraWeight	shellWeight	rings
0	M	0.455	0.365	0.095	0.5140	0.2245	0.1010	0.15	15
1	M	0.350	0.265	0.090	0.2255	0.0995	0.0485	0.07	7
2	F	0.530	0.420	0.135	0.6770	0.2565	0.1415	0.21	9