In [1]:
from sklearn import datasets
import numpy as np
In [2]:
datasets.*?
In [3]:
boston = datasets.load_boston()
print(boston.DESCR)
In [4]:
X, y = boston.data, boston.target
In [5]:
datasets.make_*?
In [6]:
X, y = datasets.make_regression(n_samples=1000, n_features=1,
                                n_informative=1, noise=15,
                                bias=1000, random_state=0)
In [7]:
import matplotlib.pyplot as plt
%matplotlib inline
plt.scatter(X, y);
In [8]:
X, y = datasets.make_blobs(n_samples=300, centers=4,
                           cluster_std=0.6, random_state=0)
In [9]:
plt.scatter(X[:, 0], X[:, 1], s=50);
In [10]:
from sklearn import preprocessing
X, y = boston.data, boston.target
X[:, :3].mean(axis=0)
Out[10]:
In [11]:
X[:, :3].std(axis=0)
Out[11]:
In [12]:
plt.plot(X[:, :3]);
preprocessing.scale centers and scales each column to zero mean and unit variance using the formula x_scaled = (x - mean(x)) / std(x):
In [13]:
X_2 = preprocessing.scale(X[:, :3])
In [14]:
X_2.mean(axis=0)
Out[14]:
In [15]:
X_2.std(axis=0)
Out[15]:
In [16]:
plt.plot(X_2);
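As a quick sanity check of that formula (a sketch, not one of the original cells), the manual computation should reproduce preprocessing.scale:
# Center and scale the first three columns by hand; this matches X_2 above.
X_manual = (X[:, :3] - X[:, :3].mean(axis=0)) / X[:, :3].std(axis=0)
np.allclose(X_manual, X_2)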
StandardScaler does the same thing as preprocessing.scale, but it stores the fitted mean and standard deviation so the identical transformation can be reapplied later.
In [17]:
scaler = preprocessing.StandardScaler()
scaler.fit(X[:, :3])
X_3 = scaler.transform(X[:, :3])
X_3.mean(axis=0)
Out[17]:
In [18]:
X_3.std(axis=0)
Out[18]:
In [19]:
plt.plot(X_3);
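Because the fitted statistics persist on the scaler object, the same transformation can be applied to data the scaler has never seen. A small sketch with a made-up observation (new_row is illustrative, not from the recipe):
scaler.mean_                               # per-column means learned during fit
new_row = np.array([[10.0, 5.0, 4.0]])     # hypothetical new observation with 3 features
scaler.transform(new_row)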
MinMaxScaler rescales each feature to a specified range (the default is [0, 1]) via x_scaled = (x - min(x)) / (max(x) - min(x)).
In [20]:
scaler = preprocessing.MinMaxScaler()
scaler.fit(X[:, :3])
X_4 = scaler.transform(X[:, :3])
X_4.max(axis=0)
Out[20]:
In [21]:
X_4.std(axis=0)
Out[21]:
In [22]:
plt.plot(X_4);
In [23]:
scaler = preprocessing.MinMaxScaler(feature_range=(-4, 4))
scaler.fit(X[:, :3])
X_5 = scaler.transform(X[:, :3])
In [24]:
plt.plot(X_5);
In [25]:
new_target = preprocessing.binarize(boston.target.reshape(-1, 1),
                                    threshold=boston.target.mean())
new_target[:5].ravel()
Out[25]:
In [26]:
(boston.target[:5] > boston.target.mean()).astype(int)
Out[26]:
In [27]:
binarizer = preprocessing.Binarizer(threshold=boston.target.mean())
new_target = binarizer.fit_transform(boston.target.reshape(-1, 1))
new_target[:5].ravel()
Out[27]:
In [28]:
iris = datasets.load_iris()
X = iris.data
y = iris.target
In [29]:
d = np.column_stack((X, y))
In [30]:
encoder = preprocessing.OneHotEncoder()
encoder.fit_transform(d[:, -1:]).toarray()[:5]
Out[30]:
In [31]:
from sklearn.feature_extraction import DictVectorizer
dv = DictVectorizer()
species_dicts = [{'species': iris.target_names[i]} for i in y]
dv.fit_transform(species_dicts).toarray()[:5]
Out[31]:
In [32]:
import patsy
patsy.dmatrix('0 + C(species)', {'species': iris.target})
Out[32]:
In [33]:
from sklearn.preprocessing import LabelBinarizer
binarizer = LabelBinarizer()
new_target = binarizer.fit_transform(y)
y.shape, new_target.shape
Out[33]:
In [34]:
new_target[:5]
Out[34]:
In [35]:
new_target[-5:]
Out[35]:
In [36]:
binarizer.classes_
Out[36]:
In [37]:
binarizer = LabelBinarizer(neg_label=-1000, pos_label=1000)
binarizer.fit_transform(y)[:5]
Out[37]:
In [38]:
iris = datasets.load_iris()
iris_X = iris.data
masking_array = np.random.binomial(1, .25, iris_X.shape).astype(bool)
iris_X[masking_array] = np.nan
In [39]:
masking_array[:5]
Out[39]:
In [40]:
iris_X[:5]
Out[40]:
By default, Imputer fills in missing values with the column mean.
In [41]:
impute = preprocessing.Imputer()
iris_X_prime = impute.fit_transform(iris_X)
iris_X_prime[:5]
Out[41]:
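As a check on that default (a sketch, not an original cell), the values filled in for a column's missing entries should equal the mean of that column's observed values:
# Find a column that actually contains NaN, then compare the imputed entries
# with the mean of the observed (non-missing) values in that column.
col = np.where(np.isnan(iris_X).any(axis=0))[0][0]
np.allclose(iris_X_prime[np.isnan(iris_X[:, col]), col], np.nanmean(iris_X[:, col]))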
In [42]:
impute = preprocessing.Imputer(strategy='median')
iris_X_prime = impute.fit_transform(iris_X)
iris_X_prime[:5]
Out[42]:
In [43]:
iris_X[np.isnan(iris_X)] = -1
iris_X[:5]
Out[43]:
In [44]:
impute = preprocessing.Imputer(missing_values=-1)
iris_X_prime = impute.fit_transform(iris_X)
iris_X_prime[:5]
Out[44]:
In [45]:
mat = datasets.make_spd_matrix(10)
masking_array = np.random.binomial(1, .1, mat.shape).astype(bool)
mat[masking_array] = np.nan
mat[:4, :4]
Out[45]:
How to create a pipeline:
In [46]:
from sklearn import pipeline
impute = preprocessing.Imputer()           # fresh imputer: fill NaNs with column means
scaler = preprocessing.StandardScaler()    # then standardize each column
pipe = pipeline.Pipeline([('impute', impute), ('scaler', scaler)])
pipe
Out[46]:
In [47]:
new_mat = pipe.fit_transform(mat)
new_mat[:4, :4]
Out[47]:
To be included in a Pipeline, an intermediate object must implement fit and transform (fit_transform then comes for free); only the final step can get away with just fit.
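As an illustration of that contract, here is a minimal custom transformer sketch (ShiftByMin is made up for this example, not part of scikit-learn); anything exposing fit and transform can be dropped into a Pipeline:
from sklearn.base import BaseEstimator, TransformerMixin

class ShiftByMin(BaseEstimator, TransformerMixin):
    """Shift every column so that its minimum becomes zero."""
    def fit(self, X, y=None):
        self.min_ = X.min(axis=0)   # learned state, reused by transform
        return self
    def transform(self, X):
        return X - self.min_

# fit_transform is inherited from TransformerMixin, so the class is Pipeline-ready.
shift_pipe = pipeline.Pipeline([('impute', preprocessing.Imputer()),
                                ('shift', ShiftByMin())])
shift_pipe.fit_transform(mat)[:4, :4]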
In [48]:
iris = datasets.load_iris()
iris_X = iris.data
In [49]:
from sklearn import decomposition
pca = decomposition.PCA()
pca
Out[49]:
In [50]:
iris_pca = pca.fit_transform(iris_X)
iris_pca[:5]
Out[50]:
PCA projects the data onto orthogonal components (the eigenvectors of the covariance matrix); each component explains a stated fraction of the total variance:
In [51]:
pca.explained_variance_ratio_
Out[51]:
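These ratios are simply the eigenvalues of the sample covariance matrix, sorted in decreasing order and normalized to sum to one; a quick cross-check:
# Eigenvalues of the covariance matrix (largest first), normalized.
eigenvalues = np.linalg.eigvalsh(np.cov(iris_X.T))[::-1]
eigenvalues / eigenvalues.sum()   # should match explained_variance_ratio_ above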
High dimensionality is problematic in data analysis; when models overfit on high-dimensional datasets, consider projecting the data into fewer dimensions.
In [52]:
pca = decomposition.PCA(n_components=2)
iris_X_prime = pca.fit_transform(iris_X)
iris_X.shape, iris_X_prime.shape
Out[52]:
In [53]:
plt.scatter(iris_X_prime[:50, 0], iris_X_prime[:50, 1]);
plt.scatter(iris_X_prime[50:100, 0], iris_X_prime[50:100, 1]);
plt.scatter(iris_X_prime[100:150, 0], iris_X_prime[100:150, 1]);
In [54]:
pca.explained_variance_ratio_.sum()
Out[54]:
You can also specify the fraction of variance to be explained: passing a float between 0 and 1 as n_components keeps just enough components to reach it.
In [55]:
pca = decomposition.PCA(n_components=.98)
iris_X_prime = pca.fit_transform(iris_X)
pca.explained_variance_ratio_.sum()
Out[55]:
Factor analysis differs from PCA in that it assumes a small set of implicit (latent) factors, plus per-feature noise, generates the explicit features of the dataset.
In [56]:
from sklearn.decomposition import FactorAnalysis
In [57]:
fa = FactorAnalysis(n_components=2)
iris_two_dim = fa.fit_transform(iris.data)
iris_two_dim[:5]
Out[57]:
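That latent-factor assumption shows up in the fitted attributes: a loading matrix of shape (n_components, n_features) plus a per-feature noise variance:
fa.components_.shape, fa.noise_variance_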
When the data is not linearly separable, kernel PCA can help: the data is implicitly mapped by the kernel function and PCA is then performed in that feature space.
In [58]:
A1_mean = [1, 1]
A1_cov = [[2, .99], [1, 1]]
A1 = np.random.multivariate_normal(A1_mean, A1_cov, 50)
A2_mean = [5, 5]
A2_cov = [[2, .99], [1, 1]]
A2 = np.random.multivariate_normal(A2_mean, A2_cov, 50)
A = np.vstack((A1, A2))
B_mean = [5, 0]
B_cov = [[.5, -1], [-.9, .5]]
B = np.random.multivariate_normal(B_mean, B_cov, 100)
In [59]:
plt.scatter(A[:, 0], A[:, 1]);
plt.scatter(B[:, 0], B[:, 1]);
In [60]:
kpca = decomposition.KernelPCA(kernel='cosine', n_components=1)
AB = np.vstack((A, B))
AB_transformed = kpca.fit_transform(AB)
In [61]:
plt.scatter(AB_transformed[:50], np.zeros(AB_transformed[:50].shape), alpha=0.5);
plt.scatter(AB_transformed[50:], np.zeros(AB_transformed[50:].shape)+0.001, alpha=0.5);
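Roughly, KernelPCA builds the kernel matrix, double-centers it, and uses the leading eigenvectors scaled by the square roots of their eigenvalues; a sketch reproducing the single cosine-kernel component by hand (up to sign):
from sklearn.metrics.pairwise import cosine_similarity

K = cosine_similarity(AB)                      # cosine kernel matrix
n = K.shape[0]
one_n = np.ones((n, n)) / n
K_centered = K - one_n.dot(K) - K.dot(one_n) + one_n.dot(K).dot(one_n)
eigvals, eigvecs = np.linalg.eigh(K_centered)  # eigenvalues in ascending order
manual_first = eigvecs[:, -1] * np.sqrt(eigvals[-1])
# Up to sign and small numerical differences this should match AB_transformed.
np.allclose(np.abs(manual_first), np.abs(AB_transformed.ravel()), atol=1e-6)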
In [62]:
pca = decomposition.PCA(n_components=2)
AB_prime = pca.fit_transform(AB)
plt.scatter(AB_prime[:, 0], np.zeros(AB_prime[:, 0].shape), alpha=0.5);
plt.scatter(AB_prime[:, 1], np.zeros(AB_prime[:, 1].shape)+0.001, alpha=0.5);
Singular value decomposition (SVD) factors a matrix M into three matrices: U, Σ, and Vᵀ. Whereas PCA factors the covariance matrix, SVD factors the data matrix itself.
Given an n × p matrix, full SVD yields min(n, p) singular values and components; TruncatedSVD keeps only the number of components you specify.
In [63]:
iris = datasets.load_iris()
iris_data = iris.data
iris_target = iris.target
In [64]:
from sklearn.decomposition import TruncatedSVD
svd = TruncatedSVD(2)
iris_transformed = svd.fit_transform(iris_data)
iris_data[:5]
Out[64]:
In [65]:
iris_transformed[:5]
Out[65]:
In [66]:
plt.scatter(iris_data[:50, 0], iris_data[:50, 2]);
plt.scatter(iris_data[50:100, 0], iris_data[50:100, 2]);
plt.scatter(iris_data[100:150, 0], iris_data[100:150, 2]);
In [67]:
plt.scatter(iris_transformed[:50, 0], -iris_transformed[:50, 1]);
plt.scatter(iris_transformed[50:100, 0], -iris_transformed[50:100, 1]);
plt.scatter(iris_transformed[100:150, 0], -iris_transformed[100:150, 1]);
In [68]:
from scipy.linalg import svd
D = np.array([[1, 2], [1, 3], [1, 4]])
D
Out[68]:
In [69]:
U, S, V = svd(D, full_matrices=False)
U.shape, S.shape, V.shape
Out[69]:
In [70]:
np.dot(U.dot(np.diag(S)), V)
Out[70]:
In [71]:
new_S = S[0]
new_U = U[:, 0]
new_U.dot(new_S)
Out[71]:
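Tying the two together: TruncatedSVD's output is, up to sign, the leading columns of U scaled by the corresponding singular values (a sketch assuming its randomized solver is effectively exact on this small matrix):
U_i, S_i, Vt_i = svd(iris_data, full_matrices=False)
manual_2d = U_i[:, :2] * S_i[:2]
# Columns may differ in sign, so compare absolute values.
np.allclose(np.abs(manual_2d), np.abs(iris_transformed), atol=1e-6)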
DictionaryLearning learns a dictionary of basis atoms and represents each sample as a sparse combination of those atoms.
In [72]:
from sklearn.decomposition import DictionaryLearning
dl = DictionaryLearning(n_components=3)  # 3 species of iris
transformed = dl.fit_transform(iris_data[::2])
transformed[:5]
Out[72]:
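Multiplying the sparse codes by the learned dictionary (dl.components_) should roughly reconstruct the original samples; a sketch showing that, and how sparse the codes are:
reconstruction = transformed.dot(dl.components_)
np.abs(reconstruction - iris_data[::2]).mean(), (transformed == 0).mean()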
In [73]:
from mpl_toolkits.mplot3d import Axes3D
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
ax.scatter(transformed[0:25, 0], transformed[0:25, 1], transformed[0:25, 2]);
ax.scatter(transformed[25:50, 0], transformed[25:50, 1], transformed[25:50, 2]);
ax.scatter(transformed[50:75, 0], transformed[50:75, 1], transformed[50:75, 2]);
In [74]:
transformed = dl.transform(iris_data[1::2])
In [75]:
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
ax.scatter(transformed[0:25, 0], transformed[0:25, 1], transformed[0:25, 2]);
ax.scatter(transformed[25:50, 0], transformed[25:50, 1], transformed[25:50, 2]);
ax.scatter(transformed[50:75, 0], transformed[50:75, 1], transformed[50:75, 2]);
In [76]:
iris = datasets.load_iris()
iris_data = iris.data
mask = np.random.binomial(1, .25, iris_data.shape).astype(bool)
iris_data[mask] = np.nan
iris_data[:5]
Out[76]:
In [77]:
pca = decomposition.PCA()
imputer = preprocessing.Imputer()
pipe = pipeline.Pipeline([('imputer', imputer), ('pca', pca)])
iris_data_transformed = pipe.fit_transform(iris_data)
iris_data_transformed[:5]
Out[77]:
In [78]:
pipe2 = pipeline.make_pipeline(imputer, pca)
pipe2.steps
Out[78]:
In [79]:
iris_data_transformed2 = pipe2.fit_transform(iris_data)
iris_data_transformed2[:5]
Out[79]:
In [108]:
boston = datasets.load_boston()
boston_X = boston.data
boston_y = boston.target
train_set = np.random.choice([True, False], len(boston_y), p=[.75, .25])
In [109]:
from sklearn.gaussian_process import GaussianProcess
gp = GaussianProcess()
gp.fit(boston_X[train_set], boston_y[train_set])
Out[109]:
In [110]:
test_preds = gp.predict(boston_X[~train_set])
In [111]:
f, ax = plt.subplots(figsize=(10, 7), nrows=3)
f.tight_layout()
ax[0].plot(range(len(test_preds)), test_preds, label='Predicted Values');
ax[0].plot(range(len(test_preds)), boston_y[~train_set], label='Actual Values');
ax[0].set_title('Predicted vs Actual');
ax[0].legend(loc='best');
ax[1].plot(range(len(test_preds)), test_preds - boston_y[~train_set]);
ax[1].set_title('Plotted Residuals');
ax[2].hist(test_preds - boston_y[~train_set]);
ax[2].set_title('Histogram of Residuals');
You can tune regr and theta0 to get different predictions:
In [112]:
gp = GaussianProcess(regr='linear', theta0=5e-1)
gp.fit(boston_X[train_set], boston_y[train_set]);
linear_preds = gp.predict(boston_X[~train_set])
f, ax = plt.subplots(figsize=(7, 5))
f.tight_layout()
ax.hist(test_preds - boston_y[~train_set], label='Residuals Original', color='b', alpha=.5);
ax.hist(linear_preds - boston_y[~train_set], label='Residuals Linear', color='r', alpha=.5);
ax.set_title('Residuals');
ax.legend(loc='best');
In [114]:
f, ax = plt.subplots(figsize=(10, 7), nrows=3)
f.tight_layout()
ax[0].plot(range(len(linear_preds)), linear_preds, label='Predicted Linear Values');
ax[0].plot(range(len(linear_preds)), boston_y[~train_set], label='Actual Values');
ax[0].set_title('Predicted Linear vs Actual');
ax[0].legend(loc='best');
ax[1].plot(range(len(linear_preds)), linear_preds - boston_y[~train_set]);
ax[1].set_title('Plotted Residuals');
ax[2].hist(linear_preds - boston_y[~train_set]);
ax[2].set_title('Histogram of Residuals');
In [113]:
np.power(test_preds - boston_y[~train_set], 2).mean(), np.power(linear_preds - boston_y[~train_set], 2).mean()
Out[113]:
In [115]:
test_preds, MSE = gp.predict(boston_X[~train_set], eval_MSE=True)
MSE[:5]
Out[115]:
In [119]:
f, ax = plt.subplots(figsize=(7, 5))
n = 20
rng = range(n)
ax.scatter(rng, test_preds[:n]);
ax.errorbar(rng, test_preds[:n], yerr=1.96 * np.sqrt(MSE[:n]));  # 95% bars use the predictive std dev (sqrt of the returned MSE)
ax.set_title('Predictions with Error Bars');
ax.set_xlim((-1, 21));
In [125]:
from sklearn.gaussian_process import regression_models
X, y = datasets.make_regression(1000, 1, 1)
In [126]:
regression_models.constant(X)[:5]
Out[126]:
In [128]:
regression_models.linear(X)[:5]
Out[128]:
In [129]:
regression_models.quadratic(X)[:5]
Out[129]:
In [130]:
X, y = datasets.make_regression(int(1e6))
Size of the feature matrix X in megabytes (1e6 samples × 100 default features × 8 bytes ≈ 800 MB):
In [132]:
X.nbytes / 1e6
Out[132]:
In [133]:
from sklearn import linear_model
sgd = linear_model.SGDRegressor()
train = np.random.choice([True, False], size=len(y), p=[.75, .25])
sgd.fit(X[train], y[train])
Out[133]: