In [89]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
X, y = make_classification(n_samples=500, n_features=3, n_redundant=0, n_informative=2,
                           n_clusters_per_class=1, hypercube=True, random_state=0)
# Make the dataset slightly unbalanced
num_pos = y.sum()
pos_subsample_factor = 0.7
ind = (np.where(y == 0)[0].tolist() +
       np.where(y == 1)[0].tolist()[:int(num_pos * pos_subsample_factor)])
np.random.seed(0)
np.random.shuffle(ind)
X = X[ind]
y = y[ind]
df = pd.DataFrame(X)
pd.plotting.scatter_matrix(df, figsize=(9, 9), diagonal='kde', marker='o', s=40, alpha=.4, c=y)
from mpl_toolkits.mplot3d import Axes3D
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
ax.scatter(X[:, 0], X[:, 1], X[:, 2], marker='o', c=y, s=40)
Out[89]: [scatter matrix of the three features and a 3-D scatter of the points, colored by class]
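To confirm the subsampling produced the intended imbalance, a quick check of the class counts (a sanity check, not part of the original run):
In [ ]:
# Class 1 should be roughly 0.7 times the size of class 0.
print(np.bincount(y))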
In [98]:
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import average_precision_score
param_grid = {
    'alpha': [1e-1, 1e-2, 1e-3, 1e-4, 1e-5, 1e-6, 1e-7, 1e-8, 1e-9],
    'l1_ratio': [0.15, 0.3, 0.45, 0.6]
}
clf = SGDClassifier(loss='log_loss', max_iter=10, penalty='elasticnet', class_weight='balanced')
# Fit on each single feature, the first two features, and all three,
# reporting training-set average precision for each feature subset.
for x in [X[:, 0][:, np.newaxis], X[:, 1][:, np.newaxis], X[:, 2][:, np.newaxis], X[:, :2], X]:
    grid_clf = GridSearchCV(clf, param_grid).fit(x, y)
    y_pred = grid_clf.predict_proba(x)[:, 1]
    ap = average_precision_score(y, y_pred)
    print(ap)
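For context, the average precision of a chance ranker equals the positive-class prevalence, so these scores should be read against that floor (a quick baseline, not in the original run):
In [ ]:
# AP of a random ranking equals the fraction of positives.
print('chance AP:', y.mean())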
In [97]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import average_precision_score
param_grid = {
    'C': [1e-1, 1e-2, 1e-3, 1e-4, 1e-5, 1e-6, 1e-7, 1e-8, 1e-9],
}
clf = SVC(kernel='linear', class_weight='balanced', probability=True)
for x in [X[:, 0][:, np.newaxis], X[:, 1][:, np.newaxis], X[:, 2][:, np.newaxis], X[:, :2], X]:
    grid_clf = GridSearchCV(clf, param_grid).fit(x, y)
    y_pred = grid_clf.predict_proba(x)[:, 1]
    ap = average_precision_score(y, y_pred)
    print(ap)
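SVC's predict_proba is fit by Platt scaling with internal cross-validation, so its ranking can differ slightly from the raw margin; comparing AP computed from decision_function is a reasonable side check (a sketch on the last fitted model, not part of the original run):
In [ ]:
# Rank by the SVM margin instead of the calibrated probability.
y_margin = grid_clf.decision_function(X)
print(average_precision_score(y, y_margin))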
In [92]:
# Save the features and labels to test support directory
%load_ext autoreload
%autoreload 2
import vislab.util
import vislab.vw3
import vislab.tests.test_context
dirname = vislab.util.makedirs(
    vislab.tests.test_context.support_dirname + '/simple')
vislab.vw3.write_data_in_vw_format(df[[0, 1]], 'first', dirname + '/first.txt')
vislab.vw3.write_data_in_vw_format(df[[2]], 'second', dirname + '/second.txt.gz')
# VW's binary classifiers expect labels in {-1, +1}.
y[y == 0] = -1
label_df = pd.DataFrame({'label': y, 'importance': np.ones_like(y)})
label_df.index = label_df.index.astype(str)
label_df.to_hdf(dirname + '/label_df.h5', 'df', mode='w')
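write_data_in_vw_format presumably emits standard VW input lines, i.e. "label importance 'tag |namespace index:value ...". A quick peek at the file just written (the exact namespace and feature names are vislab's choice, so this is only a sanity check):
In [ ]:
# First line of the VW-format file; expect something like
# "1 1.0 '0 |first 0:0.52 1:-1.31" (exact names depend on vislab.vw3).
with open(dirname + '/first.txt') as f:
    print(f.readline().strip())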
In [119]:
import sklearn.datasets
import sklearn.model_selection
iris = sklearn.datasets.load_iris()
X, Xt, y, yt = sklearn.model_selection.train_test_split(iris.data, iris.target)
df = pd.DataFrame(X)
pd.plotting.scatter_matrix(df, figsize=(9, 9), diagonal='kde', marker='o', s=40, alpha=.4, c=y)
Out[119]: [scatter matrix of the four iris features, colored by class]
In [133]:
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import average_precision_score, accuracy_score
param_grid = {
    'alpha': [1e-1, 1e-2, 1e-3, 1e-4, 1e-5, 1e-6, 1e-7, 1e-8, 1e-9],
    'l1_ratio': [0.15, 0.3, 0.45, 0.6]
}
clf = SGDClassifier(loss='log_loss', max_iter=10, penalty='elasticnet', class_weight='balanced')
for x, xt in [
    (X[:, 0][:, np.newaxis], Xt[:, 0][:, np.newaxis]),
    (X[:, 1][:, np.newaxis], Xt[:, 1][:, np.newaxis]),
    (X[:, 2][:, np.newaxis], Xt[:, 2][:, np.newaxis]),
    (X[:, :2], Xt[:, :2]),
    (X, Xt)
]:
    grid_clf = GridSearchCV(clf, param_grid).fit(x, y)
    yt_pred = grid_clf.predict_proba(xt)
    # One-vs-rest average precision per class, plus the macro average.
    aps = [average_precision_score(yt == i, yt_pred[:, i]) for i in range(3)]
    aps.append(np.mean(aps))
    print('aps', aps)
    yt_pred_max = yt_pred.argmax(1)
    print('accuracy', accuracy_score(yt, yt_pred_max))
    print()
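The per-class loop above computes one-vs-rest AP by hand; with binarized labels, scikit-learn's macro average gives the same mean (an equivalent formulation, shown for reference on the last fitted model):
In [ ]:
from sklearn.preprocessing import label_binarize
# Macro-averaged AP over the three one-vs-rest problems.
yt_bin = label_binarize(yt, classes=[0, 1, 2])
print(average_precision_score(yt_bin, yt_pred, average='macro'))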
In [127]:
# Save the features and labels to test support directory
%load_ext autoreload
%autoreload 2
import vislab.util
import vislab.vw3
import vislab.dataset
import vislab.tests.test_context
dirname = vislab.util.makedirs(
    vislab.tests.test_context.support_dirname + '/iris')
vislab.vw3.write_data_in_vw_format(df, 'all', dirname + '/all.txt.gz')
label_df = pd.DataFrame({'label': [str(_) for _ in y]})
label_df.index = label_df.index.astype(str)
label_df = vislab.dataset.get_bool_df(label_df, 'label')
label_df.to_hdf(dirname + '/label_df.h5', 'df', mode='w')
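vislab.dataset.get_bool_df presumably expands the string label column into one boolean column per class; a rough pandas-only equivalent, for intuition (column names here are hypothetical; vislab's actual output may differ):
In [ ]:
# Applied to the pre-conversion frame with a string 'label' column:
str_label_df = pd.DataFrame({'label': [str(_) for _ in y]})
bool_df = pd.get_dummies(str_label_df['label'], prefix='label').astype(bool)
print(bool_df.head())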
In [ ]:
# VW is a scikit-learn-style wrapper around Vowpal Wabbit (presumably
# vislab's); 'temp_vw' names its working directory.
vw = VW('temp_vw')
vw.fit(X, Y)
y_pred = vw.predict(X)
print(y_pred)
print(sklearn.metrics.accuracy_score(Y, y_pred))
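A wrapper like VW presumably shells out to the vowpal wabbit binary; in terms of the standard VW CLI, fit and predict correspond roughly to calls like these (paths are illustrative; vislab's actual flags may differ):
In [ ]:
import subprocess
# Train: -d data file, -f where to save the model, hinge loss for
# classification; the quadratic='::' option used below maps to VW's
# '-q ::', which crosses all pairs of namespaces.
subprocess.check_call(['vw', '-d', 'train.vw', '-f', 'model.vw',
                       '--loss_function', 'hinge'])
# Predict: -t test-only, -i initial model, -p predictions file.
subprocess.check_call(['vw', '-t', '-i', 'model.vw', '-d', 'test.vw',
                       '-p', 'preds.txt'])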
In [118]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn import svm, datasets
# import some data to play with
iris = datasets.load_iris()
X = iris.data[:, :2]  # take only the first two features so we can plot in 2-D
Y = iris.target
h = .02  # step size in the mesh
# Create an instance of SVM and fit our data. We do not scale the
# data because we want to plot the support vectors.
C = 1.0  # SVM regularization parameter
classifiers = {
    'LinearSVC (linear kernel)': svm.LinearSVC(C=C).fit(X, Y),
    'SVC with 2-poly kernel': svm.SVC(kernel='poly', degree=2, C=C).fit(X, Y),
    'VW with hinge loss': VW('temp_vw').fit(X, Y),
    'VW with hinge loss and 2-poly expansion': VW('temp_vw_q').fit(X, Y, quadratic='::')
}
# create a mesh to plot in
x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                     np.arange(y_min, y_max, h))
fig = plt.figure(figsize=(12,12))
i = 0
for title, clf in classifiers.items():
    i += 1
    # Plot the decision boundary by assigning a color to each point
    # in the mesh [x_min, x_max] x [y_min, y_max].
    ax = fig.add_subplot(2, 2, i)
    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
    # Put the result into a color plot
    Z = Z.reshape(xx.shape)
    ax.contourf(xx, yy, Z, cmap=plt.cm.Paired)
    ax.axis('off')
    # Plot also the training points
    ax.scatter(X[:, 0], X[:, 1], c=Y, cmap=plt.cm.Paired, edgecolor='k')
    ax.set_title(title)