In [1]:
# import numpy, pandas, matplotlib, and the random module
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import random
%matplotlib inline
In [2]:
# Load Iris dataset
from sklearn.datasets import load_iris
iris = load_iris()
In [3]:
iris.feature_names
Out[3]:
['sepal length (cm)',
 'sepal width (cm)',
 'petal length (cm)',
 'petal width (cm)']
In [4]:
print(iris.DESCR)
The features are stored in the matrix X and the labels in the vector y.
In [5]:
X = pd.DataFrame(iris.data, columns=iris.feature_names)
y = pd.Series(iris.target)
In [6]:
print('X.shape =', X.shape)
print('y.shape =', y.shape)
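The three species are encoded as the labels 0, 1, and 2. A quick sketch (using pandas' value_counts, not part of the original notebook) confirms the classes are balanced:
In [ ]:
# sketch: class distribution -- iris has 50 samples per species
y.value_counts()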
In [7]:
X.head()
Out[7]:
In [8]:
y.head()
Out[8]:
In [9]:
plt.figure(figsize=(6, 6));
plt.scatter(X.values[:, 0], X.values[:, 1], c=y, cmap=plt.cm.rainbow);
In [10]:
# A scatter matrix of pairwise feature relationships is a good way to explore the dataset
pd.plotting.scatter_matrix(X, figsize=(10, 10));
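For a numeric counterpart to the scatter matrix, a small sketch with DataFrame.corr (not in the original notebook) gives the pairwise Pearson correlations:
In [ ]:
# sketch: pairwise Pearson correlations between the four features
X.corr()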
In [11]:
# Note: load_boston was removed in scikit-learn 1.2; this cell needs an older
# version (see the alternative sketch below)
from sklearn.datasets import load_boston
boston = load_boston()
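If you are on scikit-learn >= 1.2, where load_boston no longer exists, a rough equivalent is to pull the same data from OpenML; this is a sketch that assumes the OpenML dataset named 'boston' and network access:
In [ ]:
# sketch for scikit-learn >= 1.2: fetch the Boston housing data from OpenML
# (assumes the OpenML dataset name 'boston' and network access)
from sklearn.datasets import fetch_openml
boston = fetch_openml(name='boston', version=1, as_frame=False)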
In [12]:
boston.feature_names
Out[12]:
array(['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD',
       'TAX', 'PTRATIO', 'B', 'LSTAT'], dtype='<U7')
In [13]:
print(boston.DESCR)
In [14]:
X = pd.DataFrame(boston.data, columns=boston.feature_names)
y = pd.Series(boston.target)
In [15]:
print('X.shape =', X.shape)
print('y.shape =', y.shape)
In [16]:
X.head()
Out[16]:
In [17]:
y.head()
Out[17]:
In [18]:
plt.scatter(X.RM, y, marker='.');
plt.xlabel('# Rooms');
plt.ylabel('House Price');
In [19]:
plt.scatter(X.LSTAT, y, marker='.');
plt.xlabel('LSTAT');
plt.ylabel('House Price');
In [20]:
X = X.RM
In [21]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X.values.reshape(-1, 1),
                                                    y.values.reshape(-1, 1),
                                                    test_size=.3)
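train_test_split shuffles at random, so the exact split (and the scores below) vary between runs; passing random_state, as in this sketch, pins the split down:
In [ ]:
# sketch: a fixed random_state makes the split (and the scores) reproducible
X_train, X_test, y_train, y_test = train_test_split(X.values.reshape(-1, 1),
                                                    y.values.reshape(-1, 1),
                                                    test_size=.3, random_state=42)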
In [22]:
print('X_train.shape =', X_train.shape)
print('X_test.shape =', X_test.shape)
print('y_train.shape =', y_train.shape)
print('y_test.shape =', y_test.shape)
In [23]:
from sklearn.linear_model import LinearRegression
model = LinearRegression().fit(X_train, y_train)
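The fitted model is a line y = w*x + b; its slope and intercept can be read straight off the estimator (a quick inspection, not in the original notebook):
In [ ]:
# sketch: the fitted line is y = coef_ * x + intercept_
print('slope =', model.coef_[0][0])
print('intercept =', model.intercept_[0])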
In [24]:
y_pred = model.predict(X_test)
In [25]:
from sklearn.metrics import mean_squared_error
print('Mean squared error =', mean_squared_error(y_test, y_pred))
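The MSE is in squared units (squared thousands of dollars here); its square root is on the same scale as the house prices, which this sketch computes:
In [ ]:
# sketch: RMSE is in the same units as the target (thousands of dollars)
print('Root mean squared error =', np.sqrt(mean_squared_error(y_test, y_pred)))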
In [26]:
print('Linear Regression score = %.2f%%' % (model.score(X_test, y_test) * 100))
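For a regressor, score returns the coefficient of determination R^2, so the percentage above is 100 * R^2; this sketch double-checks it with sklearn's r2_score:
In [ ]:
# sketch: model.score for a regressor is the R^2 coefficient of determination
from sklearn.metrics import r2_score
print('R^2 =', r2_score(y_test, y_pred))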
In [27]:
plt.scatter(X_test, y_test, marker='.');
plt.plot(X_test, y_pred, color='red');
In [28]:
# fetch_mldata was removed from scikit-learn (mldata.org went offline);
# fetch_openml serves the same MNIST data
from sklearn.datasets import fetch_openml
mnist = fetch_openml('mnist_784', version=1, as_frame=False)
In [29]:
X = pd.DataFrame(mnist.data)
y = pd.Series(mnist.target).astype(int)  # fetch_openml returns the labels as strings
In [30]:
f, axes = plt.subplots(3, 4, figsize=(8, 8));
for i in range(3):
for j in range(4):
axes[i, j].axis('off')
if i == 2 and j >= 2:
continue
num = i * 4 + j
axes[i, j].set_title(num)
axes[i, j].matshow(X.values[y == num][0].reshape(28, 28))
In [31]:
from sklearn.model_selection import train_test_split
# shuffle=False keeps the canonical MNIST split: first 60,000 digits train, last 10,000 test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=10000, shuffle=False)
In [32]:
print('X_train.shape =', X_train.shape)
print('y_train.shape =', y_train.shape)
print('X_test.shape =', X_test.shape)
print('y_test.shape =', y_test.shape)
In [33]:
from sklearn.linear_model import SGDClassifier
model = SGDClassifier().fit(X_train, y_train)
In [34]:
print('SGD Classifier score = %.2f%%' % (model.score(X_test, y_test) * 100))
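Overall accuracy hides which digits get confused with one another; a confusion-matrix sketch (not part of the original notebook) breaks the errors down per class:
In [ ]:
# sketch: rows are true digits, columns are predicted digits
from sklearn.metrics import confusion_matrix
print(confusion_matrix(y_test, model.predict(X_test)))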
In [35]:
f, axes = plt.subplots(3, 4, figsize=(8, 8));
for i in range(3):
for j in range(4):
axes[i, j].axis('off')
if i == 2 and j >= 2:
continue
axes[i, j].set_title(i * 4 + j)
axes[i, j].matshow(model.coef_[i * 4 + j].reshape(28, 28))
In [36]:
y_pred = model.predict(X_test)
f, axes = plt.subplots(1, 5, figsize=(12, 4))
samples = random.sample(list(y_test[y_test == y_pred].items()), 5)  # .iteritems() was removed in pandas 2.0
for i_zero, (i, p) in enumerate(samples):
    axes[i_zero].axis('off')
    axes[i_zero].matshow(X_test.loc[i].values.reshape(28, 28))
    axes[i_zero].set_title('pred = %d' % y_pred[i - 60000], color='green')  # y_pred is positional; test indices start at 60000
In [37]:
y_pred = model.predict(X_test)
f, axes = plt.subplots(1, 5, figsize=(12, 4))
samples = random.sample(list(y_test[y_test != y_pred].items()), 5)
for i_zero, (i, p) in enumerate(samples):
    axes[i_zero].axis('off')
    axes[i_zero].matshow(X_test.loc[i].values.reshape(28, 28))
    axes[i_zero].set_title('pred = %d' % y_pred[i - 60000], color='red')
In [38]:
from sklearn.datasets import make_moons
X, y = make_moons(1000, noise=.05)
In [39]:
plt.scatter(X[:, 0], X[:, 1], s=10);
In [40]:
from sklearn.cluster import DBSCAN
model = DBSCAN(eps=.2).fit(X)
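DBSCAN picks the number of clusters itself and labels outliers -1; this sketch counts both:
In [ ]:
# sketch: DBSCAN assigns the label -1 to noise points
labels = model.labels_
print('clusters found =', len(set(labels)) - (1 if -1 in labels else 0))
print('noise points =', np.sum(labels == -1))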
In [41]:
plt.scatter(X[:, 0], X[:, 1], s=10, c=model.labels_, cmap=plt.cm.rainbow);
In [42]:
from sklearn.metrics import adjusted_rand_score
print('Adjusted rand index =', adjusted_rand_score(y, model.labels_))
In [43]:
from sklearn.datasets import fetch_olivetti_faces
faces = fetch_olivetti_faces()
In [44]:
plt.subplots(2, 3)
for i in range(6):
plt.subplot(2, 3, i + 1)
plt.axis('off')
    plt.imshow(faces.images[10 * i], cmap=plt.cm.gray)  # images are already 64x64, no reshape needed
In [45]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
In [46]:
# standardize each pixel so high-variance pixels don't dominate the components
faces.data = StandardScaler().fit_transform(faces.data)
pca = PCA().fit(faces.data)
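Each principal component explains a shrinking share of the pixel variance; explained_variance_ratio_ quantifies that share (a quick check, not in the original notebook):
In [ ]:
# sketch: share of total variance explained by the first six components
print(pca.explained_variance_ratio_[:6])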
In [47]:
plt.subplots(2, 3)
for i in range(6):
plt.subplot(2, 3, i + 1)
plt.axis('off')
plt.imshow(pca.components_[i].reshape(64, 64), cmap=plt.cm.gray_r)
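To decide how many components to keep, a common sketch is the cumulative explained variance curve:
In [ ]:
# sketch: cumulative explained variance as a function of the number of components
plt.plot(np.cumsum(pca.explained_variance_ratio_));
plt.xlabel('# components');
plt.ylabel('cumulative explained variance');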