In [42]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns; sns.set()
import sklearn
from scipy import stats
%matplotlib inline
In [3]:
iris = sns.load_dataset('iris')
iris.head()
Out[3]:
In [4]:
sns.pairplot(iris, hue='species')
plt.show()
In [5]:
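# Synthetic regression data: y = 2x - 1 plus unit-variance Gaussian noise.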
rng = np.random.RandomState(42)
x = 10 * rng.rand(50)
y = 2 * x - 1 + rng.randn(50)
plt.scatter(x, y)
plt.show()
In [6]:
from sklearn.linear_model import LinearRegression
model = LinearRegression(fit_intercept=True)
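# Scikit-Learn expects a 2D feature matrix of shape (n_samples, n_features),
# so promote the 1D x to a single-column matrix.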
X = x[:, np.newaxis]
model.fit(X, y)
Out[6]:
In [7]:
x_test = np.linspace(0, 10, 1000).reshape(1000,1)
y_test = model.predict(x_test)
plt.scatter(x, y)
plt.scatter(x_test, y_test, s=0.5)
plt.show()
In [8]:
iris = sns.load_dataset('iris')
X_iris = iris.drop('species', axis=1)
y_iris = iris['species']
from sklearn.model_selection import train_test_split
Xtrain, Xtest, ytrain, ytest = train_test_split(X_iris, y_iris, random_state=1)
from sklearn.naive_bayes import GaussianNB
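# Gaussian naive Bayes assumes each feature is normally distributed within each
# class; it is fast and has no hyperparameters to tune, a good first baseline.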
model = GaussianNB()
model.fit(Xtrain, ytrain)
y_model = model.predict(Xtest)
from sklearn.metrics import accuracy_score
print(ytest.shape)
print(y_model.shape)
accuracy_score(ytest, y_model)
Out[8]:
In [9]:
from sklearn.decomposition import PCA
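# Unsupervised dimensionality reduction: project the four measurements onto the
# two directions of largest variance so the data can be plotted in 2D.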
model = PCA(n_components=2)
model.fit(X_iris)
X_2d = model.transform(X_iris)
iris['PCA1'] = X_2d[:, 0]
iris['PCA2'] = X_2d[:, 1]
sns.lmplot(x='PCA1', y='PCA2', hue='species', data=iris, fit_reg=False)
plt.show()
In [10]:
from sklearn.mixture import GaussianMixture
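# Cluster without using the labels: fit a mixture of three Gaussians and
# assign each sample to its most probable component.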
model = GaussianMixture(n_components=3, covariance_type='full')
model.fit(X_iris)
y_gmm = model.predict(X_iris)
iris['cluster'] = y_gmm
sns.lmplot(x='PCA1', y='PCA2', data=iris, hue='species', col='cluster', fit_reg=False)
plt.show()
In [11]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline
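# A pipeline chains the transformer and estimator so fit/predict run both steps;
# step names default to the lowercased class names (e.g. 'polynomialfeatures'),
# which is how the degree parameter is addressed in validation_curve below.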
def PolynomialRegression(degree=2, **kwargs):
    return make_pipeline(PolynomialFeatures(degree), LinearRegression(**kwargs))

def make_data(N, err=1.0, rseed=1):
    rng = np.random.RandomState(rseed)
    X = rng.rand(N, 1) ** 2
    y = 10 - 1. / (X.ravel() + 0.1)
    if err > 0:
        y += err * rng.randn(N)
    return X, y
X, y = make_data(40)
X_test = np.linspace(-0.1, 1.1, 500)[:, None]
# print(np.linspace(-0.1,1.1,500).shape)
# print(np.linspace(-0.1,1.1,500)[:, None].shape)
# print(np.linspace(-0.1,1.1,500)[:, np.newaxis].shape)
plt.scatter(X.ravel(), y, color='black')
axis = plt.axis()
for degree in [1, 3, 5, 9]:
    y_test = PolynomialRegression(degree).fit(X, y).predict(X_test)
    plt.plot(X_test.ravel(), y_test, label='degree={0}'.format(degree))
plt.ylim(-2, 12)
plt.legend(loc='best')
# plt.savefig('xx.jpg', dpi=500)
Out[11]:
In [12]:
from sklearn.model_selection import validation_curve
degree = np.arange(0, 21)
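# validation_curve refits the pipeline for every degree and every CV fold,
# returning train/validation scores of shape (len(degree), cv).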
train_score, val_score = validation_curve(PolynomialRegression(), X, y,
                                          param_name='polynomialfeatures__degree',
                                          param_range=degree, cv=7)
plt.plot(degree, np.median(train_score, 1), color='blue', label='training score')
plt.plot(degree, np.median(val_score, 1), color='red', label='validation score')
plt.legend(loc='best')
plt.ylim(0, 1)
plt.xlabel('degree')
plt.ylabel('score')
Out[12]:
In [13]:
plt.scatter(X.ravel(), y)
lim = plt.axis()
y_test = PolynomialRegression(3).fit(X, y).predict(X_test)
plt.plot(X_test.ravel(), y_test, color='red')
# plt.axis(lim)
Out[13]:
In [14]:
X2, y2 = make_data(500)
plt.scatter(X2, y2, s=2)
# plt.savefig('xxxx.jpg', dpi=800)
Out[14]:
In [15]:
from sklearn.model_selection import validation_curve
degree = np.arange(0, 21)
train_score2, val_score2 = validation_curve(PolynomialRegression(), X2, y2,
                                            param_name='polynomialfeatures__degree',
                                            param_range=degree, cv=7)
plt.plot(degree, np.median(train_score2, 1), color='blue', label='training score (N=500)')
plt.plot(degree, np.median(val_score2, 1), color='red', label='validation score (N=500)')
plt.plot(degree, np.median(train_score, 1), color='blue', linestyle='dotted', label='training score (N=40)')
plt.plot(degree, np.median(val_score, 1), color='red', linestyle='dotted', label='validation score (N=40)')
plt.legend(loc='best')
plt.ylim(0, 1)
plt.xlabel('degree')
plt.ylabel('score')
# plt.savefig('xxx.jpg', dpi=800)
Out[15]:
In [16]:
data = [
    {'price': 850000, 'rooms': 4, 'neighborhood': 'Queen Anne'},
    {'price': 700000, 'rooms': 3, 'neighborhood': 'Fremont'},
    {'price': 650000, 'rooms': 3, 'neighborhood': 'Wallingford'},
    {'price': 600000, 'rooms': 2, 'neighborhood': 'Fremont'}
]
from sklearn.feature_extraction import DictVectorizer
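# DictVectorizer one-hot encodes the string-valued 'neighborhood' field and
# passes the numeric fields through unchanged.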
vec = DictVectorizer(sparse=False, dtype=int)
X = vec.fit_transform(data)
print(vec.get_feature_names_out())
y = vec.inverse_transform(X)
In [17]:
sample = ['problem of evil', 'evil queen', 'horizon problem']
from sklearn.feature_extraction.text import TfidfVectorizer
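# TF-IDF weights raw word counts by inverse document frequency,
# down-weighting words that appear in most documents.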
vec = TfidfVectorizer()
X = vec.fit_transform(sample)
X.toarray()
Out[17]:
In [18]:
x = np.array([1,2,3,4,5])
y = np.array([4,2,1,3,7])
from sklearn.preprocessing import PolynomialFeatures
poly = PolynomialFeatures(degree=3, include_bias=False)
x = x.reshape(5,1)
X = poly.fit_transform(x)
In [19]:
from sklearn.linear_model import LinearRegression
model = LinearRegression().fit(X, y)
plt.scatter(x, y)
plt.plot(x, model.predict(X))
Out[19]:
In [20]:
X = np.array([
    [1, 2, 3],
    [4, 5, np.nan]
])
from sklearn.impute import SimpleImputer  # Imputer was removed from sklearn.preprocessing
X = SimpleImputer(strategy='mean').fit_transform(X)  # replace NaN with the column mean
print(X)
In [21]:
from sklearn.datasets import make_blobs
X, y = make_blobs(100, 2, centers=2, random_state=2, cluster_std=1.5)
In [22]:
%matplotlib inline
plt.scatter(X[:,0], X[:,1], c=y, cmap='RdGy', s=50)
plt.show()
In [23]:
from sklearn.naive_bayes import GaussianNB
model = GaussianNB()
model.fit(X, y)
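# Sample random points spanning the plotted region and classify each one
# to visualize the learned decision boundary.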
rng = np.random.RandomState(42)
Xnew = rng.rand(2000, 2) * [14, 18] - [6, 14]
ynew = model.predict(Xnew)
plt.scatter(Xnew[:,0], Xnew[:,1], c=ynew, cmap='viridis', s=20, alpha=0.4)
plt.scatter(X[:,0], X[:,1], c=y, cmap='RdGy', s=50, alpha=1)
plt.show()
In [24]:
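# predict_proba returns the per-class posterior probabilities
# (one column per class; each row sums to 1).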
ynewnew = model.predict_proba(Xnew)
print(ynewnew)
In [25]:
from sklearn.datasets import fetch_20newsgroups
data = fetch_20newsgroups()
In [26]:
categories = ['talk.religion.misc', 'soc.religion.christian', 'sci.space', 'comp.graphics']
train = fetch_20newsgroups(subset='train', categories=categories)
test = fetch_20newsgroups(subset='test', categories=categories)
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
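# Chain the vectorizer and classifier: the pipeline maps raw strings to
# TF-IDF features, then fits a multinomial naive Bayes model on them.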
model = make_pipeline(TfidfVectorizer(), MultinomialNB())
model.fit(train.data, train.target)
labels = model.predict(test.data)
In [27]:
from sklearn.metrics import confusion_matrix
mat = confusion_matrix(test.target, labels)
sns.heatmap(mat.T, square=True, cbar=False, xticklabels=train.target_names,
            yticklabels=train.target_names, annot=True, fmt='d')
plt.xlabel('true label')
plt.ylabel('predicted label')
Out[27]:
In [28]:
rng = np.random.RandomState(42)
x = 10 * rng.rand(50)
y = x * 2 + rng.randn(50) - 5
plt.scatter(x, y)
Out[28]:
In [35]:
from sklearn.linear_model import LinearRegression
model = LinearRegression(fit_intercept=True)
model.fit(x[:, np.newaxis], y)
xfit = np.linspace(0, 10, 1000)
yfit = model.predict(xfit[:, np.newaxis])
plt.scatter(x, y)
plt.plot(xfit, yfit, 'r')
Out[35]:
In [31]:
print(model.coef_,model.intercept_)
In [39]:
from sklearn.preprocessing import PolynomialFeatures
x = np.array([2, 3, 4])
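# Expand each scalar x into the features [x, x**2, x**3];
# include_bias=False drops the constant column.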
poly = PolynomialFeatures(3, include_bias=False)
x_poly = poly.fit_transform(x[:, np.newaxis])
print(x_poly)
In [41]:
from sklearn.pipeline import make_pipeline
model = make_pipeline(PolynomialFeatures(7),
                      LinearRegression())
rng = np.random.RandomState(42)
x = 10 * rng.rand(50)
y = np.sin(x) + 0.1 * rng.randn(50)
model.fit(x[:, np.newaxis], y)
xfit = np.linspace(0, 10, 1000)
yfit = model.predict(xfit[:, np.newaxis])
plt.scatter(x, y)
plt.plot(xfit, yfit, 'r-.')
Out[41]:
In [49]:
from sklearn.datasets import make_blobs
X, y = make_blobs(n_samples=50, centers=2, random_state=0, cluster_std=1)
plt.scatter(X[:,0], X[:,1], c=y, s=50, cmap='autumn')
plt.colorbar()
Out[49]:
In [46]:
from sklearn.svm import SVC # support vector classifier
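# A very large C leaves essentially no slack, approximating a hard-margin SVM.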
model = SVC(C=1e10, kernel='linear')
model.fit(X, y)
Out[46]:
In [48]:
model.predict([[-1,1]])
Out[48]:
In [50]:
from sklearn.datasets import make_circles
X, y = make_circles(100, factor=0.1, noise=0.1)
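# Concentric circles are not linearly separable in 2D; the RBF kernel
# implicitly lifts the data into a space where a linear separator exists.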
clf = SVC(kernel='rbf', C=1e6)
clf.fit(X, y)
Out[50]:
In [57]:
def plot_svc_decision_function(model, ax=None, plot_support=True):
    """Plot the decision function for a 2D SVC"""
    if ax is None:
        ax = plt.gca()
    xlim = ax.get_xlim()
    ylim = ax.get_ylim()
    # create grid to evaluate model
    x = np.linspace(xlim[0], xlim[1], 30)
    y = np.linspace(ylim[0], ylim[1], 30)
    Y, X = np.meshgrid(y, x)
    xy = np.vstack([X.ravel(), Y.ravel()]).T
    P = model.decision_function(xy).reshape(X.shape)
    # plot decision boundary and margins
    ax.contour(X, Y, P, colors='k',
               levels=[-1, 0, 1], alpha=0.5,
               linestyles=['--', '-', '--'])
    # plot support vectors (edgecolors set so the hollow markers stay visible)
    if plot_support:
        ax.scatter(model.support_vectors_[:, 0],
                   model.support_vectors_[:, 1],
                   s=300, linewidth=1, facecolors='none', edgecolors='black')
    ax.set_xlim(xlim)
    ax.set_ylim(ylim)
In [62]:
plt.scatter(X[:, 0], X[:, 1], c=y, s=50, cmap='autumn')
plot_svc_decision_function(clf)
plt.scatter(clf.support_vectors_[:, 0], clf.support_vectors_[:, 1],
            s=300, lw=1, facecolors='none', edgecolors='black')
In [66]:
from sklearn.datasets import fetch_lfw_people
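# Labeled Faces in the Wild, restricted to people with at least 60 images
# (the dataset is downloaded on first use).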
faces = fetch_lfw_people(min_faces_per_person=60)
plt.imshow(faces.images[0], cmap='viridis')
Out[66]:
In [70]:
from sklearn.svm import SVC
from sklearn.decomposition import PCA
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV, train_test_split
In [71]:
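# Reduce the raw pixel features to 150 whitened PCA components, then classify
# with an RBF SVM; class_weight='balanced' compensates for unequal class sizes.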
pca = PCA(n_components=150, whiten=True, random_state=42)
svc = SVC(kernel='rbf', class_weight='balanced')
model = make_pipeline(pca, svc)
Xtrain, Xtest, ytrain, ytest = train_test_split(faces.data, faces.target, random_state=42)
param_grid = {'svc__C': [1, 5, 10, 50],
              'svc__gamma': [0.0001, 0.0005, 0.001, 0.005]}
grid = GridSearchCV(model, param_grid)
%time grid.fit(Xtrain, ytrain)
print(grid.best_params_)
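# A natural follow-up (an assumption, not part of the original run):
# score the refit best estimator on the held-out test split.
best_model = grid.best_estimator_
print(best_model.score(Xtest, ytest))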
In [83]:
from sklearn.datasets import make_blobs
X, y = make_blobs(centers=4, n_samples=500, cluster_std=1, random_state=42)
plt.scatter(X[:, 0], X[:, 1], c=y, cmap='viridis')
Out[83]:
In [84]:
def visualize_classifier(model, X, y, ax=None, cmap='rainbow'):
    ax = ax or plt.gca()
    # Plot the training points
    ax.scatter(X[:, 0], X[:, 1], c=y, s=30, cmap=cmap,
               clim=(y.min(), y.max()), zorder=3)
    ax.axis('tight')
    ax.axis('off')
    xlim = ax.get_xlim()
    ylim = ax.get_ylim()
    # fit the estimator
    model.fit(X, y)
    xx, yy = np.meshgrid(np.linspace(*xlim, num=200),
                         np.linspace(*ylim, num=200))
    Z = model.predict(np.c_[xx.ravel(), yy.ravel()]).reshape(xx.shape)
    # Create a color plot with the results
    n_classes = len(np.unique(y))
    contours = ax.contourf(xx, yy, Z, alpha=0.3,
                           levels=np.arange(n_classes + 1) - 0.5,
                           cmap=cmap, clim=(y.min(), y.max()),
                           zorder=1)
    ax.set(xlim=xlim, ylim=ylim)
In [85]:
from sklearn.ensemble import RandomForestClassifier
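# An ensemble of 100 randomized decision trees; class predictions are made
# by majority vote over the trees.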
model = RandomForestClassifier(n_estimators=100, random_state=0)
visualize_classifier(model, X, y)
In [87]:
from sklearn.datasets import load_sample_image
china = load_sample_image('china.jpg')
plt.imshow(china)
Out[87]: