In [142]:
import itertools
import urllib2

import bs4
import matplotlib
from matplotlib.colors import ListedColormap
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pandas.tools.plotting
import seaborn as sns
import sklearn
import sklearn.datasets
import sklearn.decomposition
import sklearn.ensemble
import sklearn.preprocessing
import sklearn.tree

# cross_val_score lives in sklearn.model_selection in current scikit-learn;
# fall back to the older sklearn.cross_validation location when that is missing.
try:
    from sklearn.model_selection import cross_val_score
except ImportError:
    from sklearn.cross_validation import cross_val_score
# Use seaborn's "white" style for the figures drawn after this point.
sns.set_style("white")

# Discrete four-entry colormap built from the hex colours below.
_BOLD_HEX = ['#FF0000', '#00FF00', '#0000FF', '#000000']
cmap_bold = ListedColormap(_BOLD_HEX)
In [95]:
# Remote location of the abalone data file; not read by this cell's final `url`.
remote_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/abalone/abalone.data'

# Path that is actually loaded -- presumably a local copy of the file above (verify).
url = 'abalone.csv'

# Field names, in file order, for the header-less CSV.
columns = [
    'sex',
    'length',
    'diameter',
    'height',
    'wholeWeight',
    'shuckedWeight',
    'visceraWeight',
    'shellWeight',
    'rings',
]
In [165]:
# Read the header-less CSV at `url`, labelling its fields with the names in `columns`.
data_orig = pd.read_csv(url, names=columns, header=None)
# Display the first three rows as a quick sanity check.
data_orig.head(n=3)
Out[165]:
In [166]:
## need to convert the label
#0:infant, 1:female, 2:male
def parse_label(label):
    """Translate a one-letter sex code into its integer class.

    'I' (infant) -> 0, 'F' (female) -> 1, 'M' (male) -> 2.
    Any other value raises KeyError.
    """
    return dict(I=0, F=1, M=2)[label]
# `data` is not created by any earlier cell, so build it here as an independent
# copy of the loaded frame; data_orig keeps the original string codes.
data = data_orig.copy()
# Replace the one-letter sex codes with the integer classes from parse_label.
data['sex'] = data_orig['sex'].map(parse_label)
data.head()
Out[166]:
In [98]:
# Scatter-matrix of the columns in `data`, coloured by the 'sex' column.
sns.pairplot(data=data, hue='sex')
Out[98]:
In [99]:
def group_rings(ring):
    """Bucket a ring count into one of four classes.

    Fewer than 9 -> 0, exactly 9 -> 1, exactly 10 -> 2, anything else -> 3.
    """
    if ring < 9:
        bucket = 0
    elif ring == 9:
        bucket = 1
    elif ring == 10:
        bucket = 2
    else:
        bucket = 3
    return bucket
# Group the ring counts from the untouched data_orig column rather than from
# data['rings'] itself: re-running this cell then gives the same result instead
# of re-bucketing already-grouped values (0..3 are all < 9 and would become 0).
# Assumes `data` shares data_orig's row index -- confirm if `data` is ever filtered.
data['rings'] = data_orig['rings'].map(group_rings)
data.head()
Out[99]:
In [172]:
# Peek at the first five rows of the frame loaded from the CSV.
data_orig.head(n=5)
Out[172]:
In [174]:
# One indicator column per distinct value of the 'sex' column.
# pd.get_dummies is the public entry point; pd.core.reshape is an internal
# module path that is not stable across pandas releases.
one_hot_encoding = pd.get_dummies(data_orig['sex'])
one_hot_encoding.head()
Out[174]:
In [100]:
# Scatter-matrix of the columns in `data`, coloured by the grouped 'rings' label
# using the 'Set2' palette.
sns.pairplot(data=data, hue='rings', palette='Set2')
Out[100]:
In [129]:
# split the data into data and label
labels_y = data.rings.copy()
data_x = data.drop(labels=['rings','sex'], axis=1)
In [130]:
## plot labels against each other projected with svd
## http://peekaboo-vision.blogspot.com/2012/12/another-look-at-mnist.html
fig, plots = plt.subplots(4, 4)
fig.set_size_inches(10, 10)
x_train_centered = data_x - np.mean(data_x,axis=0)
X_train = x_train_centered.as_matrix()
Y_train = labels_y.as_matrix()
svd = sklearn.decomposition.TruncatedSVD(n_components=2)
for i, j in itertools.product(xrange(4), repeat=2):
X_ = X_train[(Y_train == i) + (Y_train == j)]
y_ = Y_train[(Y_train == i) + (Y_train == j)]
X_t = svd.fit_transform(X_)
X_t_i = X_t[(y_ == i)]
X_t_j = X_t[(y_ == j)]
if i < j:
plots[i, j].scatter(X_t_i[:, 0], X_t_i[:, 1], c='r', s=4.0)
plots[i, j].scatter(X_t_j[:, 0], X_t_j[:, 1], c='g', s=4.0)
plots[i, j].set_xticks(())
plots[i, j].set_yticks(())
plots[j, i].scatter(X_t_j[:, 0], X_t_j[:, 1], c='g', s=4.0)
plots[j, i].scatter(X_t_i[:, 0], X_t_i[:, 1], c='r', s=4.0)
plots[j, i].set_xticks(())
plots[j, i].set_yticks(())
elif i==j:
plots[i, j].scatter(X_t_i[:, 0], X_t_i[:, 1], c='b', s=4.0)
plots[i, j].set_xticks(())
plots[i, j].set_yticks(())
if i == 0:
plots[i, j].set_title(j)
plots[j, i].set_ylabel(j)
In [131]:
# Fit a decision tree with default settings on the full feature frame.
# fit() returns the estimator itself, so the fitted tree is both stored and displayed.
decisionTree = sklearn.tree.DecisionTreeClassifier().fit(data_x, labels_y)
decisionTree
Out[131]:
In [132]:
# Print the fitted tree's feature importances, then the column names of data_x for reference.
print(decisionTree.feature_importances_)
print(data_x.columns)
In [133]:
# Write the fitted tree to tree.dot in Graphviz format, labelling the features
# with the column names of data_x.
feature_labels = list(data_x.columns)
with open("tree.dot", "w") as output_file:
    sklearn.tree.export_graphviz(
        decisionTree,
        out_file=output_file,
        feature_names=feature_labels)
In [134]:
## Run Graphviz on the command line to convert tree.dot to tree.png, e.g.: dot -Tpng tree.dot -o tree.png
In [135]:
from IPython.display import Image

# Embed the rendered tree picture from tree.png in the notebook output.
tree_png = Image(filename='tree.png')
tree_png
Out[135]:
In [140]:
# 10-fold cross-validated score of the single decision tree.
# cross_val_score comes from the import cell, which resolves its module location.
scores = cross_val_score(decisionTree, data_x, labels_y, cv=10)
# Mean and standard deviation across folds; this form prints the same text on Python 2 and 3.
print("%s %s" % (np.mean(scores), np.std(scores)))
In [144]:
# Bagging ensemble of 10 base estimators, scored with 10-fold cross-validation.
# NOTE(review): no random_state is set, so the scores vary from run to run.
bagging = sklearn.ensemble.BaggingClassifier(n_estimators=10)
scores = cross_val_score(bagging, data_x, labels_y, cv=10)
# Mean and standard deviation across folds; this form prints the same text on Python 2 and 3.
print("%s %s" % (np.mean(scores), np.std(scores)))
In [146]:
# Bagging ensemble of 50 base estimators, scored with 10-fold cross-validation.
# NOTE(review): no random_state is set, so the scores vary from run to run.
bagging = sklearn.ensemble.BaggingClassifier(n_estimators=50)
scores_bagging = cross_val_score(bagging, data_x, labels_y, cv=10)
# Mean and standard deviation across folds; this form prints the same text on Python 2 and 3.
print("%s %s" % (np.mean(scores_bagging), np.std(scores_bagging)))
In [155]:
# Random forest of 10 trees, scored with 10-fold cross-validation.
# NOTE(review): no random_state is set, so the scores vary from run to run.
forest = sklearn.ensemble.RandomForestClassifier(n_estimators=10)
scores = cross_val_score(forest, data_x, labels_y, cv=10)
# Mean and standard deviation across folds; this form prints the same text on Python 2 and 3.
print("%s %s" % (np.mean(scores), np.std(scores)))
In [156]:
# Random forest of 50 trees, scored with 10-fold cross-validation.
# NOTE(review): no random_state is set, so the scores vary from run to run.
forest = sklearn.ensemble.RandomForestClassifier(n_estimators=50)
scores = cross_val_score(forest, data_x, labels_y, cv=10)
# Mean and standard deviation across folds; this form prints the same text on Python 2 and 3.
print("%s %s" % (np.mean(scores), np.std(scores)))
In [157]:
# Second run of the 10-tree random forest with the same settings as the earlier
# 10-tree cell. NOTE(review): with no random_state the two runs can differ;
# confirm whether this repeat is intentional.
forest = sklearn.ensemble.RandomForestClassifier(n_estimators=10)
scores = cross_val_score(forest, data_x, labels_y, cv=10)
# Mean and standard deviation across folds; this form prints the same text on Python 2 and 3.
print("%s %s" % (np.mean(scores), np.std(scores)))
In [159]:
# Collect 10-fold cross-validation scores for the three models compared below.
# cross_val_score comes from the import cell, which resolves its module location.

# Single decision tree (the estimator created earlier in the notebook).
scores_tree = cross_val_score(decisionTree, data_x, labels_y, cv=10)

# Bagging ensemble of 100 base estimators.
bagging = sklearn.ensemble.BaggingClassifier(n_estimators=100)
scores_bagging = cross_val_score(bagging, data_x, labels_y, cv=10)

# Random forest of 100 trees.
forest = sklearn.ensemble.RandomForestClassifier(n_estimators=100)
scores_forest = cross_val_score(forest, data_x, labels_y, cv=10)
In [164]:
# Set the 'poster' context before drawing, so the scaling applies to this figure
# on the first run rather than only to figures created afterwards.
sns.set_context('poster')

# One column of fold scores per model; `columns` fixes the left-to-right box order.
score_table = pd.DataFrame(
    {'tree': scores_tree, 'bagging': scores_bagging, 'forest': scores_forest},
    columns=['tree', 'bagging', 'forest'])

# Wide-form input: each DataFrame column becomes one labelled box
# (replaces the `names=` keyword, which current seaborn's boxplot does not accept).
sns.boxplot(data=score_table)
plt.ylabel("score")
Out[164]:
In [ ]: