In [ ]:
from sklearn.datasets import make_classification, make_moons, load_iris, make_circles
from sklearn.decomposition import PCA, KernelPCA
from sklearn.metrics import f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import numpy as np
import matplotlib.pyplot as plt
plt.style.use('ggplot')
%matplotlib inline

Example: an arbitrary classification dataset


In [ ]:
# 100 samples with 20 features by default; only 2 of them carry class information
X, y = make_classification(n_informative=2, random_state=11)
print(X.shape)

Training a simple logistic regression model


In [ ]:
xtrain, xtest, ytrain, ytest = train_test_split(X, y, random_state=2)
lr = LogisticRegression()
lr.fit(xtrain, ytrain)
prediction = lr.predict(xtest)
print("F1 score: %f" % f1_score(ytest, prediction))

Reducing dimensionality with PCA


In [ ]:
pca = PCA(n_components=2)
x_red = pca.fit_transform(X)
print(x_red.shape)
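
As a quick check, we can look at how much of the total variance the two retained components explain; `explained_variance_ratio_` is available on any fitted PCA instance:


In [ ]:
# fraction of the total variance captured by each retained component
print(pca.explained_variance_ratio_)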

Training on reduced dimensions


In [ ]:
xtrain, xtest, ytrain, ytest = train_test_split(x_red, y, random_state=2)
lr = LogisticRegression()
lr.fit(xtrain, ytrain)
prediction = lr.predict(xtest)
print("F1 score: %f" % f1_score(ytest, prediction))

In [ ]:
plt.scatter(x_red[:, 0], x_red[:, 1], c=y)

Exercise:

1. Plot the top 2 principal components of the iris dataset (loaded in the cell below).

2. Train a LinearSVC on the full iris dataset and on its PCA-reduced version, and compare the F1 scores.


In [ ]:
from sklearn.svm import LinearSVC
from sklearn.datasets import load_iris
iris = load_iris()
X = iris.data
y = iris.target

In [ ]:
# enter code here

Q: What if there are only two features?


In [ ]:
# two concentric circles; `factor` is the ratio of the inner to the outer radius
X, y = make_circles(factor=0.3, noise=0.05)
print(X.shape)
plt.scatter(X[:, 0], X[:, 1], c=y, marker="o", s=50)

Fitting logistic regression to data that is not linearly separable


In [ ]:
lr = LogisticRegression()
lr.fit(X, y)
# draw boundary
xmin, xmax = X[:, 0].min() - 1, X[:, 0].max() + 1
ymin, ymax = X[:, 1].min() - 1, X[:, 1].max() + 1
h = 0.02
xx, yy = np.meshgrid(np.arange(xmin, xmax, h), np.arange(ymin, ymax, h))
Z = lr.predict(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)
plt.contourf(xx, yy, Z, cmap=plt.cm.Paired, alpha=0.8)
plt.scatter(X[:, 0], X[:, 1], c=y, cmap=plt.cm.Paired, s=50)
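
One way to quantify how poorly a linear boundary fits the concentric circles is to score the model on the data it was fit on, using the f1_score imported earlier:


In [ ]:
# F1 on the training data itself; a straight-line boundary cannot separate
# concentric circles, so expect a low score
print("F1 score: %f" % f1_score(y, lr.predict(X)))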

Increasing, then reducing dimensionality

Kernel PCA with an RBF kernel implicitly maps the points into a higher-dimensional feature space, where the two circles become linearly separable, and then projects them onto the leading principal components of that space.


In [ ]:
# kernel PCA with an RBF kernel; gamma controls the width (inverse length scale) of the kernel
kpca = KernelPCA(n_components=2, kernel="rbf", gamma=5)
x_kpca = kpca.fit_transform(X)
print(x_kpca.shape)

In [ ]:
plt.scatter(x_kpca[:, 0], x_kpca[:, 1], c=y, s=50)

In [ ]:
lr = LogisticRegression()
lr.fit(x_kpca, y)
# draw boundary
xmin, xmax = x_kpca[:, 0].min() - 1, x_kpca[:, 0].max() + 1
ymin, ymax = x_kpca[:, 1].min() - 1, x_kpca[:, 1].max() + 1
h = 0.02
xx, yy = np.meshgrid(np.arange(xmin, xmax, h), np.arange(ymin, ymax, h))
Z = lr.predict(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)
plt.contourf(xx, yy, Z, cmap=plt.cm.Paired, alpha=0.8)
plt.scatter(x_kpca[:, 0], x_kpca[:, 1], c=y, cmap=plt.cm.Paired, s=50)
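
The same check on the kernel PCA features shows how much the transformation helps:


In [ ]:
# F1 on the transformed data; the classes should now be (almost) linearly separable
print("F1 score: %f" % f1_score(y, lr.predict(x_kpca)))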

Exercise:

Use KernelPCA to make the following dataset linearly separable

Hint: Use kernel="rbf" and find the right value of gamma


In [ ]:
X, y = make_moons()
plt.scatter(X[:, 0], X[:, 1], c=y)

In [ ]:
# enter code here