Dataset : https://archive.ics.uci.edu/ml/datasets/Gesture+Phase+Segmentation
The dataset is composed of features extracted from seven videos of people gesticulating, recorded to study gesture phase segmentation. Each video is represented by two files: a raw file, which contains the positions of the hands, wrists, head and spine of the user in each frame, and a processed file, which contains the velocity and acceleration of the hands and wrists. See the dataset description for more information.
Raw files have 18 numeric attributes (double), a timestamp and a class attribute (nominal); processed files have 32 numeric attributes (double) and a class attribute (nominal). Combining the two files therefore yields a feature vector with up to 50 numeric attributes per frame.
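For orientation, a minimal sketch of how the two files relate (it simply mirrors the read calls used below; the 18 + 32 numeric columns are what yield the up-to-50-attribute feature vector):
import pandas as pd
raw = pd.read_csv("a1_raw.csv", skiprows=[1, 2, 3, 4])   # positions: 18 numeric columns + timestamp + phase
processed = pd.read_csv("a1_va3.csv")                    # velocity/acceleration: 32 numeric columns + phase
print(raw.shape, processed.shape)                        # the two frames are combined column-wise later on (In [51])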
In [5]:
%matplotlib inline
import pandas as pd
import numpy as np
from sklearn.cross_validation import train_test_split
from sklearn import cross_validation, metrics
from sklearn import preprocessing
import matplotlib
import matplotlib.pyplot as plt
In [6]:
# read .csv from provided dataset
csv_filename1="a1_raw.csv"
csv_filename2="a1_va3.csv"
# df=pd.read_csv(csv_filename,index_col=0)
df1 = pd.read_csv(csv_filename1, skiprows=[1, 2, 3, 4])  # skip the extra non-data rows below the header
df2=pd.read_csv(csv_filename2)
In [7]:
df1.head()
Out[7]:
In [8]:
df1.shape
Out[8]:
In [9]:
df1['phase'].unique()
Out[9]:
In [10]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df1['phase'] = le.fit_transform(df1['phase'])
In [11]:
df1['phase'].unique()
Out[11]:
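(An optional aside, not in the original run: LabelEncoder keeps the original phase names, so the numeric codes it produces can be mapped back at any point, which helps when reading confusion matrices later.)
# le was just fit on df1['phase']; class code i corresponds to le.classes_[i]
print(dict(enumerate(le.classes_)))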
In [12]:
df2.head()
Out[12]:
In [13]:
df2.shape
Out[13]:
In [14]:
df2['Phase'].unique()
Out[14]:
In [15]:
df1.columns
Out[15]:
In [16]:
df2.columns
Out[16]:
In [17]:
df2.rename(columns={'Phase': 'phase'}, inplace=True)
In [18]:
df1.phase.unique()
Out[18]:
In [19]:
df2.phase.unique()
Out[19]:
In [20]:
# Expand the single-letter phase codes in the processed file to the full names used in the raw file
phase_names = {'D': 'Rest', 'P': 'Preparation', 'S': 'Stroke', 'H': 'Hold', 'R': 'Retraction'}
df2['phase'] = df2['phase'].replace(phase_names)
In [21]:
df2.head(3)
Out[21]:
In [22]:
df2.phase.unique()
Out[22]:
In [23]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df2['phase'] = le.fit_transform(df2['phase'])
In [24]:
df2.phase.unique()
Out[24]:
In [25]:
df1.groupby('phase').count()
Out[25]:
In [26]:
df2.groupby('phase').count()
Out[26]:
In [27]:
df1.sort_values('phase', inplace=True)
In [28]:
df2.sort_values('phase', inplace=True)
In [29]:
df2.tail()
Out[29]:
In [30]:
left = pd.DataFrame({
    'key2': ['0', '2', '1', '3', '0', '1'],
    'A': ['A0', 'A1', 'A2', 'A3', 'A4', 'A5'],
    'B': ['B0', 'B1', 'B2', 'B3', 'B4', 'B5']})

right = pd.DataFrame({
    'key2': ['0', '1', '2', '0', '1', '3'],
    'C': ['C0', 'C1', 'C2', 'C3', 'C4', 'C5'],
    'D': ['D0', 'D1', 'D2', 'D3', 'D4', 'D5']})
In [31]:
left
Out[31]:
In [32]:
right
Out[32]:
In [33]:
left.sort_values('key2', inplace=True)
In [34]:
left
Out[34]:
In [35]:
right.sort_values('key2', inplace=True)
In [36]:
right
Out[36]:
In [37]:
result = pd.merge(left, right, on=['key2'])
In [38]:
result
Out[38]:
In [39]:
result2 = pd.merge(left, right, on=['key2'], how='right')
In [40]:
result2
Out[40]:
In [41]:
# Note: merging on the class label is a many-to-many join; every df1 row of a phase is
# paired with every df2 row of the same phase, so the result is much larger than either input
df = pd.merge(df1, df2, on='phase')
In [42]:
df.head()
Out[42]:
In [43]:
df.columns
Out[43]:
In [44]:
df[:1]
Out[44]:
In [45]:
df1.shape,df2.shape,df.shape
Out[45]:
In [46]:
df.drop('timestamp', axis=1, inplace=True)
In [47]:
cols = list(df.columns)
features = cols
features.remove('phase')
In [48]:
len(features)
Out[48]:
In [49]:
df1.shape,df2.shape,df.shape
Out[49]:
In [50]:
df1.drop('phase',axis=1,inplace=True)
In [51]:
df_1 = pd.concat([df1,df2],axis=1)
In [52]:
df_1.drop('timestamp' , axis=1, inplace=True )
In [53]:
df_1.shape
Out[53]:
In [54]:
cols = list(df_1.columns)
features = cols
features.remove('phase')
In [55]:
X = df_1[features]
y = df_1['phase']
In [56]:
# split dataset to 60% training and 40% testing
X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=0.4, random_state=0)
In [58]:
print (X_train.shape, y_train.shape)
In [59]:
len(features)
Out[59]:
In [60]:
# Apply PCA with the same number of dimensions as variables in the dataset
from sklearn.decomposition import PCA
pca = PCA(n_components=50)
pca.fit(X)
# Print the components and the amount of variance in the data contained in each dimension
print(pca.components_)
print(pca.explained_variance_ratio_)
In [61]:
%matplotlib inline
import matplotlib.pyplot as plt
plt.plot(list(pca.explained_variance_ratio_),'-o')
plt.title('Explained variance ratio as function of PCA components')
plt.ylabel('Explained variance ratio')
plt.xlabel('Component')
plt.show()
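A quick optional check (not part of the original run) is to look at the cumulative explained variance to decide how many components are worth keeping; pca here is still the 50-component fit from the previous cell:
cum_var = np.cumsum(pca.explained_variance_ratio_)
n_keep = np.argmax(cum_var >= 0.95) + 1   # smallest number of components explaining >= 95% of the variance
print(n_keep, cum_var[n_keep - 1])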
In [62]:
# First we reduce the data to two dimensions using PCA to capture variation
pca = PCA(n_components=2)
reduced_data = pca.fit_transform(X)
print(reduced_data[:10])  # print the first 10 rows
In [63]:
# Import clustering modules
from sklearn.cluster import KMeans
from sklearn.mixture import GMM
In [64]:
kmeans = KMeans(n_clusters=5)
clusters = kmeans.fit(reduced_data)
print(clusters)
In [65]:
# Plot the decision boundary by building a mesh grid to populate a graph.
x_min, x_max = reduced_data[:, 0].min() - 1, reduced_data[:, 0].max() + 1
y_min, y_max = reduced_data[:, 1].min() - 1, reduced_data[:, 1].max() + 1
hx = (x_max-x_min)/1000.
hy = (y_max-y_min)/1000.
xx, yy = np.meshgrid(np.arange(x_min, x_max, hx), np.arange(y_min, y_max, hy))
# Obtain labels for each point in mesh. Use last trained model.
Z = clusters.predict(np.c_[xx.ravel(), yy.ravel()])
In [66]:
# Find the centroids for KMeans or the cluster means for GMM
centroids = kmeans.cluster_centers_
print('*** K MEANS CENTROIDS ***')
print(centroids)
# Transform the centroids back into the original feature space
print('*** CENTROIDS TRANSFORMED BACK TO ORIGINAL SPACE ***')
print(pca.inverse_transform(centroids))
In [67]:
# Put the result into a color plot
Z = Z.reshape(xx.shape)
plt.figure(1)
plt.clf()
plt.imshow(Z, interpolation='nearest',
           extent=(xx.min(), xx.max(), yy.min(), yy.max()),
           cmap=plt.cm.Paired,
           aspect='auto', origin='lower')
plt.plot(reduced_data[:, 0], reduced_data[:, 1], 'k.', markersize=2)
plt.scatter(centroids[:, 0], centroids[:, 1],
            marker='x', s=169, linewidths=3,
            color='w', zorder=10)
plt.title('K-means clustering on the gesture phase data (PCA-reduced)\n'
          'Centroids are marked with a white cross')
plt.xlim(x_min, x_max)
plt.ylim(y_min, y_max)
plt.xticks(())
plt.yticks(())
plt.show()
In [68]:
from sklearn.cluster import AgglomerativeClustering
ac = AgglomerativeClustering(n_clusters=5, affinity='euclidean', linkage='complete')
labels = ac.fit_predict(X)
print('Cluster labels: %s' % labels)
In [69]:
X = df_1[features]
y = df_1['phase']
# split dataset to 60% training and 40% testing
X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=0.4, random_state=0)
In [70]:
from sklearn import cluster
clf = cluster.KMeans(init='k-means++', n_clusters=5, random_state=5)
clf.fit(X_train)
print(clf.labels_.shape)
print(clf.labels_)
In [ ]:
# Predict clusters on testing data
y_pred = clf.predict(X_test)
In [ ]:
from sklearn import metrics
print("Adjusted Rand score: {:.2}".format(metrics.adjusted_rand_score(y_test, y_pred)))
print("Homogeneity score: {:.2}".format(metrics.homogeneity_score(y_test, y_pred)))
print("Completeness score: {:.2}".format(metrics.completeness_score(y_test, y_pred)))
print("Confusion matrix")
print(metrics.confusion_matrix(y_test, y_pred))
In [ ]:
# Affinity propagation
aff = cluster.AffinityPropagation()
aff.fit(X_train)
print(aff.cluster_centers_indices_.shape)
In [ ]:
y_pred = aff.predict(X_test)
In [ ]:
from sklearn import metrics
print("Adjusted Rand score: {:.2}".format(metrics.adjusted_rand_score(y_test, y_pred)))
print("Homogeneity score: {:.2}".format(metrics.homogeneity_score(y_test, y_pred)))
print("Completeness score: {:.2}".format(metrics.completeness_score(y_test, y_pred)))
print("Confusion matrix")
print(metrics.confusion_matrix(y_test, y_pred))
In [ ]:
ms = cluster.MeanShift()
ms.fit(X_train)
In [ ]:
y_pred = ms.predict(X_test)
In [ ]:
from sklearn import metrics
print("Adjusted Rand score: {:.2}".format(metrics.adjusted_rand_score(y_test, y_pred)))
print("Homogeneity score: {:.2}".format(metrics.homogeneity_score(y_test, y_pred)))
print("Completeness score: {:.2}".format(metrics.completeness_score(y_test, y_pred)))
print("Confusion matrix")
print(metrics.confusion_matrix(y_test, y_pred))
In [ ]:
from sklearn import mixture
# Hold out part of the training data to choose the covariance type
X_train_heldout, X_test_heldout, y_train_heldout, y_test_heldout = train_test_split(
    X_train, y_train, test_size=0.25, random_state=42)
for covariance_type in ['spherical', 'tied', 'diag', 'full']:
    gm = mixture.GMM(n_components=100, covariance_type=covariance_type, random_state=42, n_init=5)
    gm.fit(X_train_heldout)
    y_pred = gm.predict(X_test_heldout)
    print("Adjusted Rand score for covariance={}: {:.2}".format(
        covariance_type, metrics.adjusted_rand_score(y_test_heldout, y_pred)))
In [71]:
pca = PCA(n_components=2)
X = pca.fit_transform(X)
In [72]:
c = []
from matplotlib.pyplot import cm
n = 6
color = iter(cm.rainbow(np.linspace(0, 1, n)))
for i in range(n):
    c.append(next(color))
In [73]:
n = 5
f, (ax1, ax2) = plt.subplots(1, 2, figsize=(8, 6))
km = KMeans(n_clusters=n, random_state=0)
y_km = km.fit_predict(X)
for i in range(n):
    ax1.scatter(X[y_km==i, 0], X[y_km==i, 1], c=c[i], marker='o', s=40, label='cluster{}'.format(i))
ax1.set_title('K-means clustering')
ac = AgglomerativeClustering(n_clusters=n, affinity='euclidean', linkage='complete')
y_ac = ac.fit_predict(X)
for i in range(n):
    ax2.scatter(X[y_ac==i, 0], X[y_ac==i, 1], c=c[i], marker='o', s=40, label='cluster{}'.format(i))
ax2.set_title('Agglomerative clustering')
# Put a legend below current axis
plt.legend()
plt.tight_layout()
#plt.savefig('./figures/kmeans_and_ac.png', dpi=300)
plt.show()
In [74]:
import os
from sklearn.tree import DecisionTreeClassifier, export_graphviz
import pandas as pd
import numpy as np
from sklearn.cross_validation import train_test_split
from sklearn import cross_validation, metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from time import time
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score , classification_report
from sklearn.grid_search import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import precision_score, recall_score, accuracy_score, classification_report
In [75]:
X = df_1[features]
y = df_1['phase']
# split dataset to 60% training and 40% testing
X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=0.4, random_state=0)
In [77]:
print (X_train.shape, y_train.shape,X_test.shape, y_test.shape)
In [79]:
t0=time()
print ("DecisionTree")
dt = DecisionTreeClassifier(min_samples_split=20,random_state=99)
# dt = DecisionTreeClassifier(min_samples_split=20,max_depth=5,random_state=99)
clf_dt=dt.fit(X_train,y_train)
print ("Acurracy: ", clf_dt.score(X_test,y_test))
t1=time()
print ("time elapsed: ", t1-t0)
In [80]:
tt0=time()
print ("cross result========")
scores = cross_validation.cross_val_score(dt, X,y, cv=5)
print (scores)
print (scores.mean())
tt1=time()
print ("time elapsed: ", tt1-tt0)
In [83]:
t2=time()
print ("RandomForest")
rf = RandomForestClassifier(n_estimators=100,n_jobs=-1)
clf_rf = rf.fit(X_train,y_train)
print ("Acurracy: ", clf_rf.score(X_test,y_test))
t3=time()
print ("time elapsed: ", t3-t2)
In [84]:
tt2=time()
print ("cross result========")
scores = cross_validation.cross_val_score(rf, X,y, cv=5)
print (scores)
print (scores.mean())
tt3=time()
print ("time elapsed: ", tt3-tt2)
In [85]:
t4=time()
print ("NaiveBayes")
nb = BernoulliNB()
clf_nb=nb.fit(X_train,y_train)
print ("Acurracy: ", clf_nb.score(X_test,y_test))
t5=time()
print ("time elapsed: ", t5-t4)
In [89]:
tt4=time()
print ("cross result========")
scores = cross_validation.cross_val_score(nb, X,y, cv=5)
print (scores)
print (scores.mean())
tt5=time()
print ("time elapsed: ", tt5-tt4)
In [91]:
t6=time()
print ("KNN")
# knn = KNeighborsClassifier(n_neighbors=3)
knn = KNeighborsClassifier()
clf_knn=knn.fit(X_train, y_train)
print ("Acurracy: ", clf_knn.score(X_test,y_test) )
t7=time()
print ("time elapsed: ", t7-t6)
In [92]:
tt6=time()
print ("cross result========")
scores = cross_validation.cross_val_score(knn, X,y, cv=5)
print (scores)
print (scores.mean())
tt7=time()
print ("time elapsed: ", tt7-tt6)
In [93]:
t7=time()
print ("SVM")
svc = SVC()
clf_svc=svc.fit(X_train, y_train)
print ("Acurracy: ", clf_svc.score(X_test,y_test) )
t8=time()
print ("time elapsed: ", t8-t7)
In [94]:
tt7=time()
print ("cross result========")
scores = cross_validation.cross_val_score(svc, X,y, cv=5)
print (scores)
print (scores.mean())
tt8=time()
print ("time elapsed: ", tt7-tt6)
In [95]:
from sklearn.svm import SVC
from sklearn.cross_validation import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn import grid_search
svc = SVC()
parameters = {'kernel':('linear', 'rbf'), 'C':[1, 10]}
grid = grid_search.GridSearchCV(svc, parameters, n_jobs=-1, verbose=1, scoring='accuracy')
grid.fit(X_train, y_train)
print ('Best score: %0.3f' % grid.best_score_)
print ('Best parameters set:')
best_parameters = grid.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print ('\t%s: %r' % (param_name, best_parameters[param_name]))
predictions = grid.predict(X_test)
print (classification_report(y_test, predictions))
In [96]:
pipeline = Pipeline([
    ('clf', SVC(kernel='rbf', gamma=0.01, C=100))
])
parameters = {
    'clf__gamma': (0.01, 0.03, 0.1, 0.3, 1),
    'clf__C': (0.1, 0.3, 1, 3, 10, 30),
}
grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1, scoring='accuracy')
grid_search.fit(X_train, y_train)
print ('Best score: %0.3f' % grid_search.best_score_)
print ('Best parameters set:')
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print ('\t%s: %r' % (param_name, best_parameters[param_name]))
predictions = grid_search.predict(X_test)
print (classification_report(y_test, predictions))
In [97]:
from sklearn.base import BaseEstimator
from sklearn.base import ClassifierMixin
from sklearn.preprocessing import LabelEncoder
from sklearn.externals import six
from sklearn.base import clone
from sklearn.pipeline import _name_estimators
import numpy as np
import operator
class MajorityVoteClassifier(BaseEstimator, ClassifierMixin):
    """ A majority vote ensemble classifier.

    Parameters
    ----------
    classifiers : array-like, shape = [n_classifiers]
        Different classifiers for the ensemble.
    vote : str, {'classlabel', 'probability'} (default='classlabel')
        If 'classlabel', the prediction is based on the argmax of
        class labels. Else if 'probability', the argmax of
        the sum of probabilities is used to predict the class label
        (recommended for calibrated classifiers).
    weights : array-like, shape = [n_classifiers], optional (default=None)
        If a list of `int` or `float` values is provided, the classifiers
        are weighted by importance; uses uniform weights if `weights=None`.
    """
    def __init__(self, classifiers, vote='classlabel', weights=None):
        self.classifiers = classifiers
        self.named_classifiers = {key: value for key, value
                                  in _name_estimators(classifiers)}
        self.vote = vote
        self.weights = weights

    def fit(self, X, y):
        """ Fit classifiers.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape = [n_samples, n_features]
            Matrix of training samples.
        y : array-like, shape = [n_samples]
            Vector of target class labels.

        Returns
        -------
        self : object
        """
        if self.vote not in ('probability', 'classlabel'):
            raise ValueError("vote must be 'probability' or 'classlabel'"
                             "; got (vote=%r)" % self.vote)
        if self.weights and len(self.weights) != len(self.classifiers):
            raise ValueError('Number of classifiers and weights must be equal'
                             '; got %d weights, %d classifiers'
                             % (len(self.weights), len(self.classifiers)))
        # Use LabelEncoder to ensure class labels start with 0, which
        # is important for the np.argmax call in self.predict
        self.lablenc_ = LabelEncoder()
        self.lablenc_.fit(y)
        self.classes_ = self.lablenc_.classes_
        self.classifiers_ = []
        for clf in self.classifiers:
            fitted_clf = clone(clf).fit(X, self.lablenc_.transform(y))
            self.classifiers_.append(fitted_clf)
        return self

    def predict(self, X):
        """ Predict class labels for X.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape = [n_samples, n_features]
            Matrix of training samples.

        Returns
        -------
        maj_vote : array-like, shape = [n_samples]
            Predicted class labels.
        """
        if self.vote == 'probability':
            maj_vote = np.argmax(self.predict_proba(X), axis=1)
        else:  # 'classlabel' vote
            # Collect results from clf.predict calls
            predictions = np.asarray([clf.predict(X)
                                      for clf in self.classifiers_]).T
            maj_vote = np.apply_along_axis(
                lambda x: np.argmax(np.bincount(x, weights=self.weights)),
                axis=1, arr=predictions)
        maj_vote = self.lablenc_.inverse_transform(maj_vote)
        return maj_vote

    def predict_proba(self, X):
        """ Predict class probabilities for X.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape = [n_samples, n_features]
            Training vectors, where n_samples is the number of samples and
            n_features is the number of features.

        Returns
        -------
        avg_proba : array-like, shape = [n_samples, n_classes]
            Weighted average probability for each class per sample.
        """
        probas = np.asarray([clf.predict_proba(X)
                             for clf in self.classifiers_])
        avg_proba = np.average(probas, axis=0, weights=self.weights)
        return avg_proba

    def get_params(self, deep=True):
        """ Get classifier parameter names for GridSearch."""
        if not deep:
            return super(MajorityVoteClassifier, self).get_params(deep=False)
        else:
            out = self.named_classifiers.copy()
            for name, step in six.iteritems(self.named_classifiers):
                for key, value in six.iteritems(step.get_params(deep=True)):
                    out['%s__%s' % (name, key)] = value
            return out
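As a toy aside (not part of the original notebook), the weighted bincount/argmax trick used in predict can be seen on a single sample:
# three classifiers voting on one sample, labels already encoded as 0/1
votes = np.array([0, 1, 1])
print(np.argmax(np.bincount(votes, weights=[0.6, 0.2, 0.2])))  # -> 0: the 0.6-weight vote wins
print(np.argmax(np.bincount(votes)))                           # -> 1: plain unweighted majority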
In [98]:
from sklearn.cross_validation import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
import numpy as np
from sklearn.preprocessing import StandardScaler
clf1 = LogisticRegression(penalty='l2',
                          C=0.001,
                          random_state=0)
clf2 = DecisionTreeClassifier(max_depth=1,
                              criterion='entropy',
                              random_state=0)
clf3 = KNeighborsClassifier(n_neighbors=1,
                            p=2,
                            metric='minkowski')
pipe1 = Pipeline([['sc', StandardScaler()],
                  ['clf', clf1]])
pipe3 = Pipeline([['sc', StandardScaler()],
                  ['clf', clf3]])
clf_labels = ['Logistic Regression', 'Decision Tree', 'KNN']
print('10-fold cross validation:\n')
for clf, label in zip([pipe1, clf2, pipe3], clf_labels):
    scores = cross_val_score(estimator=clf,
                             X=X_train,
                             y=y_train,
                             cv=10,
                             scoring='accuracy')
    print("Accuracy: %0.2f (+/- %0.2f) [%s]"
          % (scores.mean(), scores.std(), label))
In [99]:
# Majority Rule (hard) Voting
mv_clf = MajorityVoteClassifier(classifiers=[pipe1, clf2, pipe3])
clf_labels += ['Majority Voting']
all_clf = [pipe1, clf2, pipe3, mv_clf]
for clf, label in zip(all_clf, clf_labels):
    scores = cross_val_score(estimator=clf,
                             X=X_train,
                             y=y_train,
                             cv=10,
                             scoring='accuracy')
    print("Accuracy: %0.2f (+/- %0.2f) [%s]"
          % (scores.mean(), scores.std(), label))
In [100]:
mv_clf.get_params()
Out[100]:
In [101]:
from sklearn.grid_search import GridSearchCV
params = {'decisiontreeclassifier__max_depth': [1, 2],
          'pipeline-1__clf__C': [0.001, 0.1, 100.0]}
grid = GridSearchCV(estimator=mv_clf,
                    param_grid=params,
                    cv=10,
                    scoring='accuracy')
grid.fit(X_train, y_train)
for params, mean_score, scores in grid.grid_scores_:
    print("%0.3f+/-%0.2f %r" % (mean_score, scores.std() / 2, params))
In [102]:
print('Best parameters: %s' % grid.best_params_)
print('Accuracy: %.2f' % grid.best_score_)
In [103]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
tree = DecisionTreeClassifier(criterion='entropy',
                              max_depth=None)
bag = BaggingClassifier(base_estimator=tree,
                        n_estimators=500,
                        max_samples=1.0,
                        max_features=1.0,
                        bootstrap=True,
                        bootstrap_features=False,
                        n_jobs=1,
                        random_state=1)
from sklearn.metrics import accuracy_score
tree = tree.fit(X_train, y_train)
y_train_pred = tree.predict(X_train)
y_test_pred = tree.predict(X_test)
tree_train = accuracy_score(y_train, y_train_pred)
tree_test = accuracy_score(y_test, y_test_pred)
print('Decision tree train/test accuracies %.3f/%.3f'
      % (tree_train, tree_test))
bag = bag.fit(X_train, y_train)
y_train_pred = bag.predict(X_train)
y_test_pred = bag.predict(X_test)
bag_train = accuracy_score(y_train, y_train_pred)
bag_test = accuracy_score(y_test, y_test_pred)
print('Bagging train/test accuracies %.3f/%.3f'
      % (bag_train, bag_test))
In [104]:
from sklearn.ensemble import AdaBoostClassifier
tree = DecisionTreeClassifier(criterion='entropy',
                              max_depth=1)
ada = AdaBoostClassifier(base_estimator=tree,
                         n_estimators=500,
                         learning_rate=0.1,
                         random_state=0)
tree = tree.fit(X_train, y_train)
y_train_pred = tree.predict(X_train)
y_test_pred = tree.predict(X_test)
tree_train = accuracy_score(y_train, y_train_pred)
tree_test = accuracy_score(y_test, y_test_pred)
print('Decision tree train/test accuracies %.3f/%.3f'
      % (tree_train, tree_test))
ada = ada.fit(X_train, y_train)
y_train_pred = ada.predict(X_train)
y_test_pred = ada.predict(X_test)
ada_train = accuracy_score(y_train, y_train_pred)
ada_test = accuracy_score(y_test, y_test_pred)
print('AdaBoost train/test accuracies %.3f/%.3f'
      % (ada_train, ada_test))