In [1]:
import seaborn as sns
iris = sns.load_dataset('iris')
iris.head()
Out[1]:
In [2]:
%matplotlib inline
sns.set()
sns.pairplot(iris, hue='species', height=1.5);
In [3]:
X_iris = iris.drop('species', axis=1)
y_iris = iris['species']
X_iris.shape, y_iris.shape
Out[3]:
In [5]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
# classification
Xtrain, Xtest, ytrain, ytest = train_test_split(X_iris, y_iris, random_state=1)
model = GaussianNB()
model.fit(Xtrain, ytrain)
y_model = model.predict(Xtest)
accuracy_score(ytest, y_model)
Out[5]:
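Beyond a single accuracy number, it can help to peek at the model's class probabilities. A minimal sketch using the standard predict_proba method of scikit-learn classifiers:
In [ ]:
# inspect predicted class probabilities for the first few test points
print(model.classes_)
print(model.predict_proba(Xtest[:5]).round(3))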
In [6]:
# dimension reduction
from sklearn.decomposition import PCA
model = PCA(n_components=2)
model.fit(X_iris)
X_2D = model.transform(X_iris)
iris['PCA1'] = X_2D[:, 0]
iris['PCA2'] = X_2D[:, 1]
sns.lmplot("PCA1", "PCA2", hue='species', data=iris, fit_reg=False)
Out[6]:
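A quick sanity check on the projection: how much of the total variance do the two retained components capture? (explained_variance_ratio_ is a standard PCA attribute.)
In [ ]:
# fraction of total variance captured by each principal component
print(model.explained_variance_ratio_)
print(model.explained_variance_ratio_.sum())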
In [8]:
# clustering
from sklearn.mixture import GaussianMixture
model = GaussianMixture(n_components=3, covariance_type='full')
model.fit(X_iris)
y_gmm = model.predict(X_iris)
iris['cluster'] = y_gmm
sns.lmplot("PCA1", "PCA2", data=iris, hue='species', col='cluster', fit_reg=False);
In [9]:
from sklearn.datasets import load_digits
digits = load_digits()
digits.images.shape
Out[9]:
In [10]:
import matplotlib.pyplot as plt
fig, axes = plt.subplots(10, 10, figsize=(8, 8),
                         subplot_kw={'xticks': [], 'yticks': []},
                         gridspec_kw=dict(hspace=0.1, wspace=0.1))
for i, ax in enumerate(axes.flat):
    ax.imshow(digits.images[i], cmap='binary', interpolation='nearest')
    ax.text(0.05, 0.05, str(digits.target[i]),
            transform=ax.transAxes, color='green')
In [11]:
X = digits.data
y = digits.target
X.shape, y.shape
Out[11]:
In [13]:
# dimension reduction
from sklearn.manifold import Isomap
iso = Isomap(n_components=2)
iso.fit(digits.data)
data_projected = iso.transform(digits.data)
data_projected.shape
Out[13]:
In [14]:
plt.scatter(data_projected[:, 0], data_projected[:, 1], c=digits.target,
            edgecolor='none', alpha=0.5, cmap=plt.get_cmap('Spectral', 10))
plt.colorbar(label='digit label', ticks=range(10))
plt.clim(-0.5, 9.5);
In [15]:
# classification
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, random_state=0)
model = GaussianNB()
model.fit(Xtrain, ytrain)
y_model = model.predict(Xtest)
accuracy_score(ytest, y_model)
Out[15]:
In [17]:
from sklearn.metrics import confusion_matrix
mat = confusion_matrix(ytest, y_model)
sns.heatmap(mat, square=True, annot=True, cbar=False)
plt.xlabel('predicted value')
plt.ylabel('true value');
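As a companion to the confusion matrix, scikit-learn's classification_report summarizes per-class precision and recall in one call:
In [ ]:
# per-class precision, recall, and F1 for the digits classifier
from sklearn.metrics import classification_report
print(classification_report(ytest, y_model))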
In [18]:
fig, axes = plt.subplots(10, 10, figsize=(8, 8),
                         subplot_kw={'xticks': [], 'yticks': []},
                         gridspec_kw=dict(hspace=0.1, wspace=0.1))
test_images = Xtest.reshape(-1, 8, 8)  # show the *test* digits, which y_model labels
for i, ax in enumerate(axes.flat):
    ax.imshow(test_images[i], cmap='binary', interpolation='nearest')
    ax.text(0.05, 0.05, str(y_model[i]), transform=ax.transAxes,
            color='green' if (ytest[i] == y_model[i]) else 'red')
In [19]:
# Model Validation
from sklearn.datasets import load_iris
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
iris = load_iris()
X = iris.data
y = iris.target
model = KNeighborsClassifier(n_neighbors=1).fit(X, y)
cross_val_score(model, X, y, cv=5)
Out[19]:
In [20]:
from sklearn.model_selection import LeaveOneOut
scores = cross_val_score(model, X, y, cv=LeaveOneOut())
scores.mean()
Out[20]:
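When leave-one-out is too costly, an explicit shuffled k-fold splitter is a common middle ground; a minimal sketch:
In [ ]:
# shuffled 5-fold cross-validation as a cheaper alternative to leave-one-out
from sklearn.model_selection import KFold
cross_val_score(model, X, y, cv=KFold(n_splits=5, shuffle=True, random_state=0)).mean()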
In [21]:
import numpy as np
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline
def PolynomialRegression(degree=2, **kwargs):
    return make_pipeline(PolynomialFeatures(degree), LinearRegression(**kwargs))

def make_data(N, err=1.0, rseed=1):
    rng = np.random.RandomState(rseed)
    X = rng.rand(N, 1) ** 2
    y = 10 - 1. / (X.ravel() + 0.1)
    if err > 0:
        y += err * rng.randn(N)
    return X, y
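make_pipeline derives each step's name from its class name; those auto-generated names are what parameter strings like 'polynomialfeatures__degree' below refer to. A quick way to see them:
In [ ]:
# list the auto-generated step names of the pipeline
print(PolynomialRegression(3).named_steps)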
In [24]:
X, y = make_data(40)
X_test = np.linspace(-0.1, 1.1, 500)[:, np.newaxis]
plt.scatter(X.ravel(), y, color='black')
for degree in [1, 3, 5]:
    y_test = PolynomialRegression(degree).fit(X, y).predict(X_test)
    plt.plot(X_test, y_test, label='degree={0}'.format(degree))
plt.xlim(-0.1, 1.0)
plt.ylim(-2, 12)
plt.legend(loc='best');
In [26]:
from sklearn.model_selection import validation_curve
degree = np.arange(0, 21)
train_score, val_score = validation_curve(PolynomialRegression(), X, y,
                                          param_name='polynomialfeatures__degree',
                                          param_range=degree, cv=7)
plt.plot(degree, np.median(train_score, 1), color='blue', label='training score')
plt.plot(degree, np.median(val_score, 1), color='red', label='validation score')
plt.legend(loc='best')
plt.ylim(0, 1)
plt.xlabel('degree')
plt.ylabel('score');
In [29]:
plt.scatter(X, y)
y_test = PolynomialRegression(3).fit(X, y).predict(X_test)
plt.plot(X_test, y_test);
In [30]:
X2, y2 = make_data(200) # more training examples
plt.scatter(X2, y2)
Out[30]:
In [31]:
degree = np.arange(21)
train_score2, val_score2 = validation_curve(PolynomialRegression(), X2, y2,
                                            param_name='polynomialfeatures__degree',
                                            param_range=degree, cv=7)
plt.plot(degree, np.median(train_score2, 1), color='blue', label='training score')
plt.plot(degree, np.median(val_score2, 1), color='red', label='validation score')
plt.plot(degree, np.median(train_score, 1), color='blue', alpha=0.3, linestyle='dashed')
plt.plot(degree, np.median(val_score, 1), color='red', alpha=0.3, linestyle='dashed')
plt.legend(loc='lower center')
plt.ylim(0, 1)
plt.xlabel('degree')
plt.ylabel('score')
Out[31]:
In [32]:
from sklearn.model_selection import learning_curve
fig, ax = plt.subplots(1, 2, figsize=(16, 6))
fig.subplots_adjust(left=0.0625, right=0.95, wspace=0.1)
for i, degree in enumerate([2, 9]):
    N, train_lc, val_lc = learning_curve(PolynomialRegression(degree),
                                         X, y, cv=7,
                                         train_sizes=np.linspace(0.3, 1, 25))
    ax[i].plot(N, np.mean(train_lc, 1), color='blue', label='training score')
    ax[i].plot(N, np.mean(val_lc, 1), color='red', label='validation score')
    ax[i].hlines(np.mean([train_lc[-1], val_lc[-1]]), N[0], N[-1],
                 color='gray', linestyle='dashed')
    ax[i].set_ylim(0, 1)
    ax[i].set_xlim(N[0], N[-1])
    ax[i].set_xlabel('training size')
    ax[i].set_ylabel('score')
    ax[i].set_title('degree = {0}'.format(degree), size=14)
    ax[i].legend(loc='best')
In [34]:
from sklearn.model_selection import GridSearchCV
# note: LinearRegression's 'normalize' option was removed in scikit-learn 1.2
param_grid = {'polynomialfeatures__degree': np.arange(21),
              'linearregression__fit_intercept': [True, False]}
grid = GridSearchCV(PolynomialRegression(), param_grid, cv=7).fit(X, y)
In [35]:
grid.best_params_
Out[35]:
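Alongside the winning parameters, the grid object also records the cross-validated score they achieved:
In [ ]:
# mean cross-validated score of the best parameter combination
print(grid.best_score_)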
In [37]:
model = grid.best_estimator_
plt.scatter(X, y)
y_test = model.fit(X, y).predict(X_test)
plt.plot(X_test, y_test)
Out[37]:
In [38]:
# Categorical Features
from sklearn.feature_extraction import DictVectorizer
data = [
    {'price': 850000, 'rooms': 4, 'neighborhood': 'Queen Anne'},
    {'price': 700000, 'rooms': 3, 'neighborhood': 'Fremont'},
    {'price': 650000, 'rooms': 3, 'neighborhood': 'Wallingford'},
    {'price': 600000, 'rooms': 2, 'neighborhood': 'Fremont'}]
vec = DictVectorizer(sparse=False, dtype=int)
vec.fit_transform(data)
Out[38]:
In [39]:
vec.get_feature_names_out()
Out[39]:
In [40]:
vec = DictVectorizer(sparse=True, dtype=int)
vec.fit_transform(data)
Out[40]:
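The encoding is reversible: DictVectorizer can map rows of the matrix back to feature dictionaries, which is handy for debugging. A small sketch:
In [ ]:
# recover feature dicts from the encoded rows (first two shown)
vec.inverse_transform(vec.transform(data))[:2]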
In [41]:
# Text Features
from sklearn.feature_extraction.text import CountVectorizer
sample = ['problem of evil', 'evil queen', 'horizon problem']
vec = CountVectorizer()
X = vec.fit_transform(sample)
X
Out[41]:
In [42]:
import pandas as pd
pd.DataFrame(X.toarray(), columns=vec.get_feature_names_out())
Out[42]:
In [43]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer()
X = tfidf.fit_transform(sample)
pd.DataFrame(X.toarray(), columns=tfidf.get_feature_names_out())
Out[43]:
In [44]:
# Derived Features
x = np.array([1, 2, 3, 4, 5])
y = np.array([4, 2, 1, 3, 7])
plt.scatter(x, y);
In [46]:
from sklearn.linear_model import LinearRegression
X = x[:, np.newaxis]
model = LinearRegression().fit(X, y)
yfit = model.predict(X)
plt.scatter(x, y)
plt.plot(x, yfit);
In [47]:
from sklearn.preprocessing import PolynomialFeatures
poly = PolynomialFeatures(degree=3)
X2 = poly.fit_transform(X)
print(X2)
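To label which column is which power, PolynomialFeatures can report its output feature names (assuming scikit-learn >= 1.0, where get_feature_names_out was introduced):
In [ ]:
# column labels for the expanded design matrix
print(poly.get_feature_names_out(['x']))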
In [48]:
model = LinearRegression().fit(X2, y)
yfit = model.predict(X2)
plt.scatter(x, y)
plt.plot(x, yfit);
In [49]:
# Imputation of Missing Data
X = np.array([[np.nan, 0,      3],
              [3,      7,      9],
              [3,      5,      2],
              [4,      np.nan, 6],
              [8,      8,      1]])
y = np.array([14, 16, -1, 8, -5])
In [50]:
from sklearn.impute import SimpleImputer
imp = SimpleImputer(strategy='mean')
X2 = imp.fit_transform(X)
X2
Out[50]:
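A quick check on what the imputer actually did: the per-column fill values it learned are stored in statistics_:
In [ ]:
# column means used to replace the NaN entries
print(imp.statistics_)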
In [51]:
# Feature Pipelines
from sklearn.pipeline import make_pipeline
model = make_pipeline(SimpleImputer(strategy='mean'),
                      PolynomialFeatures(degree=2),
                      LinearRegression())
model.fit(X, y)
print(model.predict(X))
In [ ]: