Notes to self:
Follow the presentation at: http://bit.ly/ML-SpringCampus
We will use:
In [ ]:
import warnings
import numpy as np
import pandas as pd
from time import time
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from sklearn import datasets
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import Ridge
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
import Utils
from Utils import cmap_light
from Utils import cmap_bold
In [ ]:
# Note: load_boston was deprecated in scikit-learn 1.0 and removed in 1.2,
# so this cell assumes an older scikit-learn version.
boston_dataset = datasets.load_boston()
print(boston_dataset.DESCR)
In [ ]:
X = boston_dataset.data
Y = boston_dataset.target
names = list(boston_dataset.feature_names) + ['Price']
labels = np.reshape(Y, (Y.shape[0], 1))
df = pd.DataFrame(data=np.concatenate((X, labels), axis=1),
                  columns=names)
df.head(10)
In [ ]:
df_tmp = df[['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'Price']]
df_tmp.head(10)
In [ ]:
df_tmp.describe()
In [ ]:
from Utils import plot_boston_dataset
plot_boston_dataset(boston_dataset.data, boston_dataset.target)
In [ ]:
model = LinearRegression()
model.fit(X, Y)
r2 = model.score(X, Y)
print("R^2 value: {:0.3f}".format(r2))
In [ ]:
example_n = np.random.randint(0, Y.shape[0])
Utils.describe_example_boston_dataset(X[example_n])
print("\n\nPredicted price: {:2.2f} Real value: {:2.2f}".format(
model.predict(X[example_n].reshape(1, -1))[0], Y[example_n]))
In [ ]:
iris_dataset = datasets.load_iris()
print("Features: " + str(iris_dataset.feature_names))
print("Classes: " + str(iris_dataset.target_names))
X = iris_dataset.data
y = iris_dataset.target
In [ ]:
# Load it to a DF
idx = np.random.permutation(150)
y = y[idx]
X = X[idx]
labels = np.reshape(y, (y.shape[0], 1))
df = pd.DataFrame(data=np.concatenate((X, labels), axis=1),
                  columns=iris_dataset.feature_names + ['Class'])
df.head(10)
In [ ]:
df.describe()
In [ ]:
# Let's take a peek at the data:
plt.figure(figsize=(8, 8))
colors = "bry"
for i, color in zip([0, 1, 2], colors):
    idx = np.where(y == i)
    plt.scatter(X[idx, 0], X[idx, 1], c=color, cmap=plt.cm.Paired)
plt.text(5.25, 2.20, "Versicolor", fontsize=14)
plt.text(7, 3.5, "Virginica", fontsize=14)
plt.text(4.5, 3.75, "Setosa", fontsize=14)
plt.title("The 3 different Iris species", fontsize=18,
fontweight='bold')
plt.xlabel(iris_dataset.feature_names[0], fontsize=14)
plt.ylabel(iris_dataset.feature_names[1], fontsize=14)
plt.show()
In [ ]:
# We will focus on identifying only Iris Setosa
plt.figure(figsize=(8,8))
colors = "br"
idx = np.where(y == 0) # Give me the indices of the Iris Setosa examples
plt.scatter(X[idx, 0], X[idx, 1], c='b', cmap=plt.cm.Paired)
plt.text(4.5, 3.75, "Setosa", fontsize=14)
idx = np.where(y != 0) # where it's not Iris Setosa
plt.scatter(X[idx, 0], X[idx, 1], c='r', cmap=plt.cm.Paired)
plt.text(7.0, 2.5, "Others", fontsize=14)
plt.title("Scatter plot of Iris Setosa and the others Iris",
fontsize=18, fontweight='bold')
plt.xlabel(iris_dataset.feature_names[0], fontsize=14)
plt.ylabel(iris_dataset.feature_names[1], fontsize=14)
plt.show()
In [ ]:
# We only care about whether each flower is an
# Iris Setosa, and we look at only two of its features
X = iris_dataset.data
y = iris_dataset.target
new_y = y == 0
model = LogisticRegression(random_state=42, verbose=0)
model.fit(X[:,0:2], new_y)
accuracy = model.score(X[:,0:2], new_y)
print("Accuracy: {:0.3f}%".format(accuracy*100))
In [ ]:
from Utils import predict_mesh
# Let's take a look at what our model is doing
# First plot the examples
plt.figure(figsize=(8,8))
colors = "br"
idx = np.where(y == 0)
plt.scatter(X[idx, 0], X[idx, 1], c='b', cmap=plt.cm.Paired)
plt.text(4.5, 3.75, "Setosa", fontsize=14)
idx = np.where(y != 0)
plt.scatter(X[idx, 0], X[idx, 1], c='r', cmap=plt.cm.Paired)
plt.text(7.0, 2.5, "Others", fontsize=14)
(xx, yy, Z) = predict_mesh(X, model)
plt.contour(xx, yy, Z, cmap=plt.cm.Paired)
plt.title("Decision Boundary", fontsize=18, fontweight='bold')
plt.xlabel(iris_dataset.feature_names[0], fontsize=14)
plt.ylabel(iris_dataset.feature_names[1], fontsize=14)
plt.show()
Let's start with linear regression:
$$ \hat{y} = w_0 + w_1 x_1 + w_2 x_2 + w_3 x_3 $$
Adding an $x_0 = 1$ we get
$$ \hat{y} = w^T \cdot x $$
For each variable we have a weight, an "importance", and the linear combination of the weights and features gives our estimated value $\hat{y}$.
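A minimal sketch of this dot-product view (the weights and features below are made-up numbers, not fitted values):
In [ ]:
# Hypothetical weights, with w[0] acting as the bias via x[0] = 1
w = np.array([1.0, 0.5, -0.3, 2.0])
x = np.array([1.0, 3.0, 2.0, 0.5])
y_hat = np.dot(w, x)  # w^T . x
print(y_hat)          # 1.0 + 0.5*3.0 - 0.3*2.0 + 2.0*0.5 = 2.9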
In [ ]:
def sigmoid(x):
    return 1 / (1 + np.exp(-x))
x = np.arange(-20, 20, 0.001)
y = sigmoid(x)
plt.figure(figsize=(10,5))
plt.plot(x, y)
plt.title("Sigmoid Function", fontsize=14)
plt.show()
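Logistic regression passes the linear combination $w^T \cdot x$ through this sigmoid to turn it into a probability. As a sanity check, assuming model is still the LogisticRegression fitted above on the first two Iris features, we can reproduce its predict_proba from the fitted weights:
In [ ]:
# P(Setosa | x) = sigmoid(w^T x + b), reproduced by hand
z = X[:, 0:2] @ model.coef_.T + model.intercept_
p_manual = sigmoid(z).ravel()
p_sklearn = model.predict_proba(X[:, 0:2])[:, 1]
print(np.allclose(p_manual, p_sklearn))  # expect True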
In [ ]:
# Read the data file and drop the columns we don't care about:
whisky_dataframe = pd.read_csv(
filepath_or_buffer="whiskies.csv", header=0, sep=',',
index_col=1)
whisky_dataframe.drop(['RowID', 'Postcode', ' Latitude',
' Longitude'], inplace=True, axis=1)
whisky_dataframe.head(10)
In [ ]:
whisky_dataframe.describe()
In [ ]:
Utils.plot_whisky_histograms(whisky_dataframe)
In [ ]:
Utils.plot_whiky_body_correlation(whisky_dataframe)
In [ ]:
Utils.plot_1d_random_data(0.5, 30)
In [ ]:
Utils.plot_2d_random_data(0.5, 30)
In [ ]:
# The covariance matrix must be symmetric positive semi-definite;
# this one stretches the clusters along the y = x diagonal
cov = [[3.0, 2.8], [2.8, 3.0]]
random_data_1 = np.random.multivariate_normal(mean=[0, 0], cov=cov, size=100)
random_data_2 = np.random.multivariate_normal(mean=[6, 6], cov=cov, size=100)
random_data = np.concatenate([random_data_1, random_data_2], axis=0)
random_labels = np.concatenate([np.ones(100), np.zeros(100)])
fig = plt.figure(figsize=(8, 8))
plt.scatter(random_data[:, 0], random_data[:, 1], c=random_labels, cmap=cmap_light)
#plt.scatter(random_data_2[:, 0], random_data_2[:, 1], c='r')
plt.plot([-5, 10], [-5, 10], 'r--')
plt.plot([5, 0], [0, 5], 'g--')
plt.xlim((-7, 14))
plt.ylim((-7, 14))
plt.title('Random Data with Principal Components', fontsize=16)
plt.xlabel('Random Dimension 1', fontsize=14)
plt.ylabel('Random Dimension 2', fontsize=14)
plt.show()
In [ ]:
pca = PCA(n_components=2)
transformed_data = pca.fit_transform(random_data)
plt.figure(figsize=(8,6))
plt.scatter(transformed_data[:,0], transformed_data[:,1],
c=random_labels, cmap=cmap_light)
plt.plot([-10, 10], [0, 0], 'r--')
plt.xlim((-10, 10))
plt.ylim((-5, 5))
plt.title('Transformed Random Data', fontsize=16)
plt.xlabel('Random Dimension 1', fontsize=14)
plt.ylabel('Random Dimension 2', fontsize=14)
plt.show()
In [ ]:
pca = PCA(n_components=1)
transformed_data = pca.fit_transform(random_data)
plt.figure(figsize=(8,5))
plt.scatter(transformed_data[:,0], np.zeros((200,1)),
c=random_labels, cmap=cmap_light)
plt.plot([-10, 10], [0, 0], 'r--')
plt.xlim((-10, 10))
plt.ylim((-5, 5))
plt.title('Transformed Random Data', fontsize=16)
plt.xlabel('Random Dimension 1', fontsize=14)
plt.show()
print("% of variance explained by PCA: {:0.1f}% \
".format(
pca.explained_variance_ratio_[0]*100))
In [ ]:
# Adapted from: http://scikit-learn.org/stable/auto_examples/linear_model/plot_polynomial_interpolation.html
# Author: Mathieu Blondel
# Jake Vanderplas
# License: BSD 3 clause
def f(x, noise=False):
    """Function to approximate by polynomial interpolation."""
    if noise:
        return np.sin(x) + np.random.randn(x.shape[0]) / 4
    return np.sin(x)
space_size = 6
# generate points used to plot
x_plot = np.linspace(-space_size, space_size, 100)
# generate points and keep a subset of them
x = np.linspace(-space_size, space_size, 100)
rng = np.random.RandomState(42)
rng.shuffle(x)
x = np.sort(x[:10])
y = f(x, True)
# create matrix versions of these arrays
X = x[:, np.newaxis]
X_plot = x_plot[:, np.newaxis]
colors = ['teal', 'yellowgreen', 'gold', 'blue']
lw = 2
fig = plt.figure(figsize=(12,12))
for count, degree in enumerate([1, 3, 6, 10]):
    ax = fig.add_subplot(2, 2, count + 1)
    ax.plot(x_plot, f(x_plot), color='cornflowerblue', linewidth=lw,
            label="ground truth")
    ax.scatter(x, y, color='navy', s=30, marker='o', label="training points")
    model = make_pipeline(PolynomialFeatures(degree), Ridge())
    model.fit(X, y)
    y_plot = model.predict(X_plot)
    ax.plot(x_plot, y_plot, color=colors[count], linewidth=lw,
            label="degree %d" % degree)
    ax.legend(loc='lower left')
    ax.set_ylim((-5, 5))
plt.show()
In [ ]:
whisky_data = whisky_dataframe.values
pca = PCA(n_components=2, whiten=True)
# whiten=True rescales each component to unit variance;
# sklearn's PCA always centers the data before projecting
transformed_data = pca.fit_transform(whisky_data)
In [ ]:
print("% of variance explained by each component: \
\n 1st {:0.1f}% \
\n 2nd {:0.1f}% \
".format(
pca.explained_variance_ratio_[0]*100,
pca.explained_variance_ratio_[1]*100))
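As a sanity check (the variable names here are our own), each ratio is just that component's variance divided by the total variance of the data:
In [ ]:
# explained_variance_ratio_ = explained_variance_ / total variance
total_var = np.var(whisky_data, axis=0, ddof=1).sum()
print(pca.explained_variance_ / total_var)  # should match the ratios above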
In [ ]:
fig = plt.figure(figsize=(8,6))
plt.scatter(x = transformed_data[:,0], y=transformed_data[:,1])
plt.xlim((-3, 5))
plt.ylim((-3, 5))
plt.title('Transformed Whisky Data', fontsize=16)
plt.xlabel('Principal Component 1', fontsize=14)
plt.ylabel('Principal Component 2', fontsize=14)
plt.show()
In [ ]:
labels = whisky_dataframe['Tobacco']
whisky_data = whisky_dataframe.drop('Tobacco', axis=1).values
In [ ]:
print("Percentage of Positive Labels: {:.2f}%".format(
np.sum(labels)/len(labels)*100))
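With so few positive examples, plain accuracy is misleading: a classifier that always predicts "no Tobacco" already scores very high. A quick check:
In [ ]:
# Accuracy of always predicting the majority (negative) class
print("Majority-class baseline: {:.2f}%".format(
    (1 - np.sum(labels) / len(labels)) * 100))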
In [ ]:
pca = PCA(n_components=2, whiten=True)
# whiten=True rescales each component to unit variance;
# sklearn's PCA always centers the data before projecting
transformed_data = pca.fit_transform(whisky_data)
In [ ]:
train_data, test_data, train_labels, test_labels = train_test_split(
transformed_data, labels, test_size=0.30, random_state=0)
# Option 1: without class weights
classf = LogisticRegression()
# Option 2: with class weights (this overrides the line above;
# comment one of the two out to compare)
class_weight = {0: 1, 1: 12}
classf = LogisticRegression(class_weight=class_weight)
classf.fit(train_data, train_labels)
accuracy = classf.score(train_data, train_labels)
print("\n\nTraining Accuracy:\t {:0.3f}%\n\n".format(accuracy*100))
accuracy = classf.score(test_data, test_labels)
print("Test Accuracy:\t\t {:0.3f}%\n\n".format(accuracy*100))
In [ ]:
print("\tTraining \n")
predicted_labels = classf.predict(train_data)
cm = confusion_matrix(train_labels, predicted_labels)
Utils.print_cm(cm)
print("\n\tTesting \n")
predicted_labels = classf.predict(test_data)
cm = confusion_matrix(test_labels, predicted_labels)
Utils.print_cm(cm)
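For an imbalanced problem like this one, precision and recall on the positive class are more informative than accuracy. A minimal sketch computing them from the test confusion matrix above (sklearn puts true labels on the rows):
In [ ]:
# cm[i, j] = examples of true class i predicted as class j
tn, fp, fn, tp = cm.ravel()
precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0
print("Precision: {:0.3f}  Recall: {:0.3f}".format(precision, recall))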
In [ ]:
class_weight = {0: 1, 1: 12}
classf = LogisticRegression(random_state=42,
class_weight=class_weight)
#classf = LogisticRegression(random_state=42)
In [ ]:
# Select parameters to use in Cross-Validation
classf_cv = classf
data_cv = transformed_data
N_CV = 10
# Cross Validation
t0 = time()
scores = cross_val_score(classf_cv, data_cv, labels, cv=N_CV)
print("Scores: ")
for i, score in enumerate(scores):
    print('\t' + str(i) + ':\t' + str(score))
print("Accuracy: %0.3f (+/- %0.3f)" % (scores.mean(), scores.std() * 2))
print("\nCross val done in %0.3fs." % (time() - t0))
In [ ]:
random_data = np.random.randn(100, 2)
random_labels = np.random.randint(0,2,100)
fig = plt.figure(figsize=(8,8))
plt.scatter(random_data[:, 0], random_data[:, 1],
c=random_labels, cmap=cmap_bold)
plt.xlabel('Random Dimension 1', fontsize=14)
plt.ylabel('Random Dimension 2', fontsize=14)
plt.show()
In [ ]:
# Try k = 1 vs. k = 10 (the second line overrides the first)
clf = KNeighborsClassifier(n_neighbors=1)
clf = KNeighborsClassifier(n_neighbors=10)
clf.fit(random_data, random_labels)
print("Accuracy: {:0.3f}%".format(
clf.score(random_data, random_labels)*100))
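Scoring on the training data is optimistic: with n_neighbors=1 every point is its own nearest neighbor, so training accuracy is 100% even on pure noise like this. A hedged sketch using cross-validation instead:
In [ ]:
# Cross-validated accuracy is a fairer estimate than training accuracy
for k in [1, 10]:
    clf_k = KNeighborsClassifier(n_neighbors=k)
    scores = cross_val_score(clf_k, random_data, random_labels, cv=5)
    print("k = {}: {:0.3f} (+/- {:0.3f})".format(
        k, scores.mean(), scores.std() * 2))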
In [ ]:
(xx, yy, Z) = Utils.predict_mesh(random_data, clf, h=0.01)
fig = plt.figure(figsize=(8,8))
plt.xlabel('Random Dimension 1', fontsize=14)
plt.ylabel('Random Dimension 2', fontsize=14)
plt.pcolormesh(xx, yy, Z, cmap=cmap_light)
plt.scatter(random_data[:, 0], random_data[:, 1],
c=random_labels, cmap=cmap_bold)
plt.show()
In [ ]:
random_labels = np.concatenate([np.ones((50,)), np.zeros((50,))])
# Two clusters with different per-axis scales, one shifted to (3, 1)
random_data = np.concatenate([
    np.random.randn(50, 2) * np.array([0.7, 1.5]) + np.array([3, 1]),
    np.random.randn(50, 2) * np.array([0.5, 3])
])
fig = plt.figure(figsize=(8, 8))
plt.scatter(random_data[:, 0], random_data[:, 1],
c=random_labels, cmap=cmap_bold)
plt.xlim((-4, 8))
plt.ylim((-6, 6))
plt.xlabel('Random Dimension 1', fontsize=14)
plt.ylabel('Random Dimension 2', fontsize=14)
plt.show()
In [ ]:
whisky_data = pd.read_csv(
    filepath_or_buffer="Meta-Critic Whisky Database – Selfbuilts Whisky Analysis.csv")
whisky_data.describe()  # note: only the last expression in a cell is displayed
whisky_data.head()