In [ ]:
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline

Transformers

Unsupervised transformations for preprocessing


In [ ]:
from sklearn.datasets import load_boston
boston = load_boston()
boston.keys()

In [ ]:
from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(boston.data, boston.target)

In [ ]:
# Suppress scientific notation globally so the raw feature scales are easy to
# compare by eye (this printoption also affects the array prints in later cells).
np.set_printoptions(suppress=True)
print(X_train)

In [ ]:
# Per-feature statistics of the raw training data: the features live on very
# different scales, which motivates the standardization below.
mean_per_feature = X_train.mean(axis=0)
std_per_feature = X_train.std(axis=0)
print("mean : %s " % mean_per_feature)
print("standard deviation : %s " % std_per_feature)

In [ ]:
# StandardScaler standardizes each feature to zero mean and unit variance.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

In [ ]:
# Learn the per-feature mean and standard deviation from the training set only.
scaler.fit(X_train)

In [ ]:
# Apply the learned scaling to the training data (does not modify X_train).
X_scaled = scaler.transform(X_train)

In [ ]:
# BUG FIX: `X` is not defined at this point in the notebook (it is only created
# later, in the digits section), so this cell raised a NameError on a fresh
# kernel. The intent is to show that scaling preserves the data's shape.
print(X_train.shape)
print(X_scaled.shape)

In [ ]:
# After standardization each feature has (numerically) zero mean and unit
# variance on the training set.
scaled_mean = X_scaled.mean(axis=0)
scaled_std = X_scaled.std(axis=0)
print("mean : %s " % scaled_mean)
print("standard deviation : %s " % scaled_std)

In [ ]:
# Reuse the scaler fitted on the training data; never refit on the test set,
# otherwise information from the test set would leak into preprocessing.
X_scaled_test = scaler.transform(X_test)

Principal Component Analysis


In [ ]:
# Build a correlated 2-d Gaussian blob: isotropic noise pushed through a random
# linear map, plus a random offset. The three draws happen in the same order as
# before, so the seeded RandomState yields identical data.
rng = np.random.RandomState(42)
isotropic_noise = rng.normal(size=(100, 2))
mixing_matrix = rng.normal(size=(2, 2))
offset = rng.normal(size=2)
X_blob = np.dot(isotropic_noise, mixing_matrix) + offset
plt.scatter(X_blob[:, 0], X_blob[:, 1])
plt.xlabel("feature 1")
plt.ylabel("feature 2")

In [ ]:
# Full PCA: with no n_components given, every principal component is kept.
from sklearn.decomposition import PCA
pca = PCA()

In [ ]:
# Learn the principal axes (directions of maximal variance) of the blob.
pca.fit(X_blob)

In [ ]:
# Project the blob onto its principal axes; in this rotated and centered
# coordinate system the two components are uncorrelated.
X_pca = pca.transform(X_blob)

first_component = X_pca[:, 0]
second_component = X_pca[:, 1]
plt.scatter(first_component, second_component)
plt.xlabel("first principal component")
plt.ylabel("second principal component")

PCA for Dimensionality Reduction


In [ ]:
from sklearn.datasets import load_digits

# Restrict to the digits 0-4 to keep the 2-d projection below readable.
digits = load_digits(n_class=5)
X = digits.data
y = digits.target
print(X.shape)

In [ ]:
# Refit PCA, this time keeping only the two leading components for a 2-d plot.
# Note: this rebinds the name `pca` used in the blob example above.
pca = PCA(n_components=2)
pca.fit(X)

In [ ]:
# Project the 64-dimensional digit images down to the two leading components.
X_reduced = pca.transform(X)
reduced_shape = X_reduced.shape
print("Reduced dataset shape: %s" % (reduced_shape, ))

In [ ]:
# Scatter of the two leading components, colored by digit class. Axis labels
# and a colorbar are added so the figure stands alone when the notebook is
# skimmed; the trailing semicolon suppresses the repr of the last call.
plt.scatter(X_reduced[:, 0], X_reduced[:, 1], c=y)
plt.xlabel("first principal component")
plt.ylabel("second principal component")
plt.colorbar(label="digit class");