In [16]:
%matplotlib inline
from sklearn.cross_validation import KFold
from sklearn import tree
from sklearn import datasets
from sklearn.cross_validation import train_test_split
from sklearn.metrics import classification_report
from sklearn.cross_validation import cross_val_predict, cross_val_score
import sklearn_pandas as skpd
from sklearn import cross_validation
import pandas as pd
import matplotlib.pyplot as plt
from time import time
print('reading the data')
X_train = pd.read_csv('../data/UCI_HAR_Dataset/train/X_train.txt', header=None, delim_whitespace=True)
y_train = pd.read_csv('../data/UCI_HAR_Dataset/train/y_train.txt', header=None, delim_whitespace=True)
X_test = pd.read_csv('../data/UCI_HAR_Dataset/test/X_test.txt', header=None, delim_whitespace=True)
y_test = pd.read_csv('../data/UCI_HAR_Dataset/test/y_test.txt', header=None, delim_whitespace=True)
print('finished reading data')
In [2]:
clf = tree.DecisionTreeClassifier()
start = time()
clf = clf.fit(X_train, y_train)
print('Random forest took %.2f seconds' % ((time() - start)))
y_predict = clf.predict(X_test)
print(classification_report(y_test, y_predict))
In [3]:
print('------------------- Cross Validation ---------------')
# cross_val_predict returns an array of the same size as target where each entry
# is a prediction obtained by cross validated:
X_train_np = X_train.as_matrix()
y_train_np = y_train.as_matrix()
y_train_np = y_train_np.flatten() # This converts y from dim (n,1) to (n,) i.e. 2D to 1D.
assert X_train_np.shape == X_train.shape
#assert y_train_np.shape == y_train[:].shape # this will fail.
(X_train_np.shape, y_train_np.shape, y_train[:].shape)
Out[3]:
In [4]:
clf_cv = tree.DecisionTreeClassifier()
predicted = cross_val_predict(clf_cv, X_train_np, y_train_np, cv=10)
In [5]:
y_test_np = y_test.as_matrix().flatten()
y_train_np.shape == y_test_np.shape
predicted.shape
Out[5]:
In [6]:
scores = cross_validation.cross_val_score(
clf_cv, X_train_np, y_train_np, cv=10)
print('Accuracy: %0.2f (+/- %0.2f)' % (scores.mean(), scores.std() * 2))
In [17]:
fig,ax = plt.subplots()
ax.scatter(y_train_np, predicted)
ax.plot([y_train_np.min(), y_train_np.max()], [y_train_np.min(), y_train_np.max()], 'k--', lw=4)
ax.set_xlabel('Measured')
ax.set_ylabel('Predicted')
Out[17]:
In [37]:
from sklearn import decomposition
from sklearn.pipeline import Pipeline
from sklearn.grid_search import GridSearchCV
from sklearn.preprocessing import StandardScaler
trees = tree.DecisionTreeClassifier()
pca = decomposition.RandomizedPCA(n_components=500)
pipe = Pipeline(steps=[('pca', pca), ('tree', trees)])
X_train = StandardScaler().fit_transform(X_train_np) # Normalize data
pca.fit(X_train)
Out[37]:
In [38]:
plt.figure(1, figsize=(4, 3))
plt.clf()
plt.axes([.2, .2, .7, .7])
plt.plot(pca.explained_variance_, linewidth=2)
plt.axis('tight')
plt.xlabel('n_components')
plt.ylabel('explained_variance_')
Out[38]:
In [39]:
X = pca.transform(X_train_np)
trees.fit(X, y_train_np)
trees.score(pca.transform(X_test), y_test)
Out[39]:
In [40]:
scores = cross_validation.cross_val_score(
estimator, X, y_train_np, cv=10)
print('Accuracy: %0.2f (+/- %0.2f)' % (scores.mean(), scores.std() * 2))
In [13]:
In [ ]: