In [1]:
import seaborn as sns
iris = sns.load_dataset('iris')
iris.head()


Out[1]:
   sepal_length  sepal_width  petal_length  petal_width species
0           5.1          3.5           1.4          0.2  setosa
1           4.9          3.0           1.4          0.2  setosa
2           4.7          3.2           1.3          0.2  setosa
3           4.6          3.1           1.5          0.2  setosa
4           5.0          3.6           1.4          0.2  setosa

In [2]:
%matplotlib inline
sns.set()
sns.pairplot(iris, hue='species', height=1.5);



In [3]:
X_iris = iris.drop('species', axis=1)
y_iris = iris['species']
X_iris.shape, y_iris.shape


Out[3]:
((150, 4), (150,))

In [5]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score

# classification
Xtrain, Xtest, ytrain, ytest = train_test_split(X_iris, y_iris, random_state=1)
model = GaussianNB()
model.fit(Xtrain, ytrain)
y_model = model.predict(Xtest)
accuracy_score(ytest, y_model)


Out[5]:
0.97368421052631582
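
As an aside (not in the original notebook), the same number is available directly
from the estimator, since score() chains predict() with accuracy_score():

model.score(Xtest, ytest)  # equivalent shortcut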

In [6]:
# dimension reduction
from sklearn.decomposition import PCA 
model = PCA(n_components=2)
model.fit(X_iris)
X_2D = model.transform(X_iris)
iris['PCA1'] = X_2D[:, 0]
iris['PCA2'] = X_2D[:, 1]
sns.lmplot("PCA1", "PCA2", hue='species', data=iris, fit_reg=False)


Out[6]:
<seaborn.axisgrid.FacetGrid at 0x1194dced0>
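
A brief aside: the fitted PCA object records how much variance each component
retains, which shows why this 2D view of iris is so faithful:

model.explained_variance_ratio_        # roughly [0.92, 0.05]
model.explained_variance_ratio_.sum()  # the two components keep ~98% of the variance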

In [8]:
# clustering
from sklearn.mixture import GaussianMixture
model = GaussianMixture(n_components=3, covariance_type='full')
model.fit(X_iris)
y_gmm = model.predict(X_iris)
iris['cluster'] = y_gmm
sns.lmplot("PCA1", "PCA2", data=iris, hue='species', col='cluster', fit_reg=False);



In [9]:
from sklearn.datasets import load_digits
digits = load_digits()
digits.images.shape


Out[9]:
(1797, 8, 8)
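
For orientation: digits.data, used below, is just these 8x8 images unrolled into
rows of 64 pixel values, which a quick check confirms:

import numpy as np
np.allclose(digits.images.reshape(len(digits.images), -1), digits.data)  # True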

In [10]:
import matplotlib.pyplot as plt
fig, axes = plt.subplots(10, 10, figsize=(8, 8),
                         subplot_kw={'xticks':[], 'yticks':[]},
                         gridspec_kw=dict(hspace=0.1, wspace=0.1))
for i, ax in enumerate(axes.flat):
    ax.imshow(digits.images[i], cmap='binary', interpolation='nearest')
    ax.text(0.05, 0.05, str(digits.target[i]),
            transform=ax.transAxes, color='green')



In [11]:
X = digits.data
y = digits.target
X.shape, y.shape


Out[11]:
((1797, 64), (1797,))

In [13]:
# dimension reduction
from sklearn.manifold import Isomap
iso = Isomap(n_components=2)
iso.fit(digits.data)
data_projected = iso.transform(digits.data)
data_projected.shape


Out[13]:
(1797, 2)

In [14]:
plt.scatter(data_projected[:, 0], data_projected[:, 1], c=digits.target,
            edgecolor='none', alpha=0.5, cmap=plt.get_cmap('Spectral', 10))
plt.colorbar(label='digit label', ticks=range(10))
plt.clim(-0.5, 9.5);



In [15]:
# classification
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, random_state=0)
model = GaussianNB()
model.fit(Xtrain, ytrain)
y_model = model.predict(Xtest)
accuracy_score(ytest, y_model)


Out[15]:
0.83333333333333337

In [17]:
from sklearn.metrics import confusion_matrix
mat = confusion_matrix(ytest, y_model)
sns.heatmap(mat, square=True, annot=True, cbar=False)
plt.xlabel('predicted value')
plt.ylabel('true value');



In [18]:
fig, axes = plt.subplots(10, 10, figsize=(8, 8),
                         subplot_kw={'xticks':[], 'yticks':[]},
                         gridspec_kw=dict(hspace=0.1, wspace=0.1))
for i, ax in enumerate(axes.flat):
    ax.imshow(digits.images[i], cmap='binary', interpolation='nearest')
    ax.text(0.05, 0.05, str(y_model[i]), transform=ax.transAxes,
            color='green' if (ytest[i] == y_model[i]) else 'red')



In [19]:
# Model Validation
from sklearn.datasets import load_iris
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
iris = load_iris()
X = iris.data
y = iris.target
model = KNeighborsClassifier(n_neighbors=1).fit(X, y)
cross_val_score(model, X, y, cv=5)


Out[19]:
array([ 0.96666667,  0.96666667,  0.93333333,  0.93333333,  1.        ])
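
Averaging the five folds gives a single summary estimate (a trivial aside):

cross_val_score(model, X, y, cv=5).mean()  # 0.96, matching the leave-one-out estimate below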

In [20]:
from sklearn.model_selection import LeaveOneOut
scores = cross_val_score(model, X, y, cv=LeaveOneOut())
scores.mean()


Out[20]:
0.95999999999999996

In [21]:
import numpy as np
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline


def PolynomialRegression(degree=2, **kwargs):
    return make_pipeline(PolynomialFeatures(degree), LinearRegression(**kwargs))


def make_data(N, err=1.0, rseed=1):
    rng = np.random.RandomState(rseed)
    X = rng.rand(N, 1) ** 2
    y = 10 - 1. / (X.ravel() + 0.1)
    if err > 0:
        y += err * rng.randn(N)
    return X, y
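
A minimal inspection sketch (not in the original): make_pipeline names each step
after its lowercased class, which is where the 'polynomialfeatures__degree'
parameter string used below comes from:

list(PolynomialRegression(3).named_steps)  # ['polynomialfeatures', 'linearregression']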

In [24]:
X, y = make_data(40)
X_test = np.linspace(-0.1, 1.1, 500)[:, np.newaxis]
plt.scatter(X.ravel(), y, color='black')
for degree in [1, 3, 5]:
    y_test = PolynomialRegression(degree).fit(X, y).predict(X_test)
    plt.plot(X_test, y_test, label='degree={0}'.format(degree))
plt.xlim(-0.1, 1.0)
plt.ylim(-2, 12)
plt.legend(loc='best');



In [26]:
from sklearn.model_selection import validation_curve
degree = np.arange(0, 21)
train_score, val_score = validation_curve(PolynomialRegression(), X, y,
                                          param_name='polynomialfeatures__degree',
                                          param_range=degree, cv=7)
plt.plot(degree, np.median(train_score, 1), color='blue', label='training score')
plt.plot(degree, np.median(val_score, 1), color='red', label='validation score')
plt.legend(loc='best')
plt.ylim(0, 1)
plt.xlabel('degree')
plt.ylabel('score');
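
Reading the optimal degree off the curve programmatically, as a hedged aside;
this should pick out the degree 3 used in the next cell:

degree[np.argmax(np.median(val_score, 1))]  # 3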



In [29]:
plt.scatter(X, y)
y_test = PolynomialRegression(3).fit(X, y).predict(X_test)
plt.plot(X_test, y_test);



In [30]:
X2, y2 = make_data(200)  # more training examples
plt.scatter(X2, y2)


Out[30]:
<matplotlib.collections.PathCollection at 0x132cd7750>

In [31]:
degree = np.arange(21)
train_score2, val_score2 = validation_curve(PolynomialRegression(), X2, y2,
                                            param_name='polynomialfeatures__degree',
                                            param_range=degree, cv=7)
plt.plot(degree, np.median(train_score2, 1), color='blue', label='training score')
plt.plot(degree, np.median(val_score2, 1), color='red', label='validation score')
plt.plot(degree, np.median(train_score, 1), color='blue', alpha=0.3, linestyle='dashed')
plt.plot(degree, np.median(val_score, 1), color='red', alpha=0.3, linestyle='dashed')
plt.legend(loc='lower center')
plt.ylim(0, 1)
plt.xlabel('degree')
plt.ylabel('score')


Out[31]:
<matplotlib.text.Text at 0x132d5b790>

In [32]:
from sklearn.model_selection import learning_curve
fig, ax = plt.subplots(1, 2, figsize=(16, 6))
fig.subplots_adjust(left=0.0625, right=0.95, wspace=0.1)
for i, degree in enumerate([2, 9]):
    N, train_lc, val_lc = learning_curve(PolynomialRegression(degree),
                                         X, y, cv=7,
                                         train_sizes=np.linspace(0.3, 1, 25))
    ax[i].plot(N, np.mean(train_lc, 1), color='blue', label='training score')
    ax[i].plot(N, np.mean(val_lc, 1), color='red', label='validation score')
    ax[i].hlines(np.mean([train_lc[-1], val_lc[-1]]), N[0], N[-1], color='gray', linestyle='dashed')
    ax[i].set_ylim(0, 1)
    ax[i].set_xlim(N[0], N[-1])
    ax[i].set_xlabel('training size')
    ax[i].set_ylabel('score')
    ax[i].set_title('degree = {0}'.format(degree), size=14)
    ax[i].legend(loc='best')



In [34]:
from sklearn.model_selection import GridSearchCV
param_grid = {'polynomialfeatures__degree': np.arange(21),
              'linearregression__fit_intercept': [True, False],
              'linearregression__normalize': [True, False]}
grid = GridSearchCV(PolynomialRegression(), param_grid, cv=7).fit(X, y)

In [35]:
grid.best_params_


Out[35]:
{'linearregression__fit_intercept': False,
 'linearregression__normalize': True,
 'polynomialfeatures__degree': 4}
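
The cross-validated score of the winning combination is stored alongside it;
grid.best_score_ is the mean R^2 of the best estimator across the seven folds:

grid.best_score_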

In [37]:
model = grid.best_estimator_
plt.scatter(X, y)
y_test = model.fit(X, y).predict(X_test)
plt.plot(X_test, y_test)


Out[37]:
[<matplotlib.lines.Line2D at 0x132a83a90>]

In [38]:
# Categorical Features
from sklearn.feature_extraction import DictVectorizer 

data = [
    {'price': 850000, 'rooms': 4, 'neighborhood': 'Queen Anne'},
    {'price': 700000, 'rooms': 3, 'neighborhood': 'Fremont'},
    {'price': 650000, 'rooms': 3, 'neighborhood': 'Wallingford'},
    {'price': 600000, 'rooms': 2, 'neighborhood': 'Fremont'}]
vec = DictVectorizer(sparse=False, dtype=int)
vec.fit_transform(data)


Out[38]:
array([[     0,      1,      0, 850000,      4],
       [     1,      0,      0, 700000,      3],
       [     0,      0,      1, 650000,      3],
       [     1,      0,      0, 600000,      2]])

In [39]:
vec.get_feature_names_out()


Out[39]:
['neighborhood=Fremont',
 'neighborhood=Queen Anne',
 'neighborhood=Wallingford',
 'price',
 'rooms']
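
For data already in a DataFrame, pandas offers a similar one-hot route; a sketch
of an alternative (pd.get_dummies is a pandas built-in, not used in the original):

import pandas as pd
pd.get_dummies(pd.DataFrame(data), columns=['neighborhood'])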

In [40]:
vec = DictVectorizer(sparse=True, dtype=int)
vec.fit_transform(data)


Out[40]:
<4x5 sparse matrix of type '<type 'numpy.int64'>'
	with 12 stored elements in Compressed Sparse Row format>

In [41]:
# Text Features
from sklearn.feature_extraction.text import CountVectorizer

sample = ['problem of evil', 'evil queen', 'horizon problem']
vec = CountVectorizer()
X = vec.fit_transform(sample)
X


Out[41]:
<3x5 sparse matrix of type '<type 'numpy.int64'>'
	with 7 stored elements in Compressed Sparse Row format>

In [42]:
import pandas as pd
pd.DataFrame(X.toarray(), columns=vec.get_feature_names_out())


Out[42]:
   evil  horizon  of  problem  queen
0     1        0   1        1      0
1     1        0   0        0      1
2     0        1   0        1      0

In [43]:
from sklearn.feature_extraction.text import TfidfVectorizer
X = TfidfVectorizer().fit_transform(sample)
pd.DataFrame(X.toarray(), columns=vec.get_feature_names_out())


Out[43]:
       evil   horizon        of   problem     queen
0  0.517856  0.000000  0.680919  0.517856  0.000000
1  0.605349  0.000000  0.000000  0.000000  0.795961
2  0.000000  0.795961  0.000000  0.605349  0.000000
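
The values look odd at first glance; a quick check (an aside) shows each row has
been L2-normalized, which is the TfidfVectorizer default (norm='l2'):

np.linalg.norm(X.toarray(), axis=1)  # array([ 1.,  1.,  1.])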

In [44]:
# Derived Features
x = np.array([1, 2, 3, 4, 5])
y = np.array([4, 2, 1, 3, 7])
plt.scatter(x, y);



In [46]:
from sklearn.linear_model import LinearRegression
X = x[:, np.newaxis]
model = LinearRegression().fit(X, y)
yfit = model.predict(X)
plt.scatter(x, y)
plt.plot(x, yfit);



In [47]:
from sklearn.preprocessing import PolynomialFeatures
poly = PolynomialFeatures(degree=3)
X2 = poly.fit_transform(X)   # columns are x**0, x**1, x**2, x**3
print(X2)


[[   1.    1.    1.    1.]
 [   1.    2.    4.    8.]
 [   1.    3.    9.   27.]
 [   1.    4.   16.   64.]
 [   1.    5.   25.  125.]]

In [48]:
model = LinearRegression().fit(X2, y)
yfit = model.predict(X2)
plt.scatter(x, y)
plt.plot(x, yfit);



In [49]:
# Imputation of Missing Data
X = np.array([[np.nan, 0, 3],
              [3, 7, 9],
              [3, 5, 2],
              [4, np.nan, 6],
              [8, 8, 1]])
y = np.array([14, 16, -1, 8, -5])

In [50]:
from sklearn.impute import SimpleImputer
imp = SimpleImputer(strategy='mean')
X2 = imp.fit_transform(X)
X2


Out[50]:
array([[ 4.5,  0. ,  3. ],
       [ 3. ,  7. ,  9. ],
       [ 3. ,  5. ,  2. ],
       [ 4. ,  5. ,  6. ],
       [ 8. ,  8. ,  1. ]])
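
The filled-in entries are simply the column means of the observed values; a quick
check (an aside):

np.nanmean(X, axis=0)  # array([ 4.5,  5. ,  4.2]); 4.5 and 5. replace the two NaNs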

In [51]:
# Feature Pipelines
from sklearn.pipeline import make_pipeline
model = make_pipeline(SimpleImputer(strategy='mean'),
                      PolynomialFeatures(degree=2),
                      LinearRegression())
model.fit(X, y)
print(model.predict(X))


[ 14.  16.  -1.   8.  -5.]
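
A closing sanity check (an aside): the pipeline reproduces its training targets
exactly here because it is evaluated on the same five points it was fit to, so
this says nothing about generalization:

np.allclose(model.predict(X), y)  # True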
