In [ ]:
from sklearn.datasets import make_classification, make_moons, load_iris, make_circles
from sklearn.decomposition import PCA, KernelPCA
from sklearn.metrics import f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import numpy as np
import matplotlib.pyplot as plt
plt.style.use('ggplot')
%matplotlib inline

Example: an arbitrary classification dataset


In [ ]:
# 100 samples with 20 features by default; only 2 of them carry class information
X, y = make_classification(n_informative=2, random_state=11)
print(X.shape)

Training a simple logistic regression model


In [ ]:
xtrain, xtest, ytrain, ytest = train_test_split(X, y, random_state=2)
lr = LogisticRegression()
lr.fit(xtrain, ytrain)
prediction = lr.predict(xtest)
print("F1 score: %f" % f1_score(ytest, prediction))

Reducing dimensionality with PCA


In [ ]:
pca = PCA(n_components=2)
x_red = pca.fit_transform(X)
print(x_red.shape)
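
As a quick check, we can look at how much of the total variance the two retained components explain; `explained_variance_ratio_` is available on any fitted PCA instance:


In [ ]:
# fraction of the total variance captured by each retained component
print(pca.explained_variance_ratio_)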

Training on reduced dimensions


In [ ]:
xtrain, xtest, ytrain, ytest = train_test_split(x_red, y, random_state=2)
lr = LogisticRegression()
lr.fit(xtrain, ytrain)
prediction = lr.predict(xtest)
print("F1 score: %f" % f1_score(ytest, prediction))

In [ ]:
plt.scatter(x_red[:, 0], x_red[:, 1], c=y)

Exercise:

1. Plot the top 2 principal components of the iris dataset (loaded in the cell below).

2. Train a LinearSVC on the full iris dataset and on its PCA-reduced version, and compare the F1 scores.


In [ ]:
from sklearn.svm import LinearSVC
from sklearn.datasets import load_iris
iris = load_iris()
X = iris.data
y = iris.target

In [ ]:
# enter code here

Q: What if there are only two features?


In [ ]:
# two concentric circles; `factor` is the ratio of the inner to the outer radius
X, y = make_circles(factor=0.3, noise=0.05)
print(X.shape)
plt.scatter(X[:, 0], X[:, 1], c=y, marker="o", s=50)

Fitting logistic regression to data that is not linearly separable


In [ ]:
lr = LogisticRegression()
lr.fit(X, y)
# draw boundary
xmin, xmax = X[:, 0].min() - 1, X[:, 0].max() + 1
ymin, ymax = X[:, 1].min() - 1, X[:, 1].max() + 1
h = 0.02
xx, yy = np.meshgrid(np.arange(xmin, xmax, h), np.arange(ymin, ymax, h))
Z = lr.predict(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)
plt.contourf(xx, yy, Z, cmap=plt.cm.Paired, alpha=0.8)
plt.scatter(X[:, 0], X[:, 1], c=y, cmap=plt.cm.Paired, s=50)
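
One way to quantify how poorly a linear boundary fits the concentric circles is to score the model on the data it was fit on, using the f1_score imported earlier:


In [ ]:
# F1 on the training data itself; a straight-line boundary cannot separate
# concentric circles, so expect a low score
print("F1 score: %f" % f1_score(y, lr.predict(X)))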

Increasing, then reducing dimensionality

Kernel PCA with an RBF kernel implicitly maps the points into a higher-dimensional feature space, where the two circles become linearly separable, and then projects them onto the leading principal components of that space.


In [ ]:
# kernel PCA with an RBF kernel; gamma controls the width (inverse length scale) of the kernel
kpca = KernelPCA(n_components=2, kernel="rbf", gamma=5)
x_kpca = kpca.fit_transform(X)
print(x_kpca.shape)

In [ ]:
plt.scatter(x_kpca[:, 0], x_kpca[:, 1], c=y, s=50)

In [ ]:
lr = LogisticRegression()
lr.fit(x_kpca, y)
# draw boundary
xmin, xmax = x_kpca[:, 0].min() - 1, x_kpca[:, 0].max() + 1
ymin, ymax = x_kpca[:, 1].min() - 1, x_kpca[:, 1].max() + 1
h = 0.02
xx, yy = np.meshgrid(np.arange(xmin, xmax, h), np.arange(ymin, ymax, h))
Z = lr.predict(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)
plt.contourf(xx, yy, Z, cmap=plt.cm.Paired, alpha=0.8)
plt.scatter(x_kpca[:, 0], x_kpca[:, 1], c=y, cmap=plt.cm.Paired, s=50)
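
The same check on the kernel PCA features shows how much the transformation helps:


In [ ]:
# F1 on the transformed data; the classes should now be (almost) linearly separable
print("F1 score: %f" % f1_score(y, lr.predict(x_kpca)))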

Exercise:

Use KernelPCA to make the following dataset linearly separable

Hint: Use kernel="rbf" and find the right value of gamma


In [ ]:
X, y = make_moons()
plt.scatter(X[:, 0], X[:, 1], c=y)

In [ ]:
# enter code here