In [42]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns; sns.set()
import sklearn
from scipy import stats

%matplotlib inline

In [3]:
iris = sns.load_dataset('iris')
iris.head()


Out[3]:
sepal_length sepal_width petal_length petal_width species
0 5.1 3.5 1.4 0.2 setosa
1 4.9 3.0 1.4 0.2 setosa
2 4.7 3.2 1.3 0.2 setosa
3 4.6 3.1 1.5 0.2 setosa
4 5.0 3.6 1.4 0.2 setosa

In [4]:
sns.pairplot(iris, hue='species')
plt.show()



In [5]:
rng = np.random.RandomState(42)
x = 10 * rng.rand(50)
y = 2 * x - 1 + rng.randn(50)
plt.scatter(x, y)
plt.show()



In [6]:
from sklearn.linear_model import LinearRegression
model = LinearRegression(fit_intercept=True)
X = x[:, np.newaxis]
model.fit(X, y)


Out[6]:
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)
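
Since the data were generated as y = 2x - 1, the fitted parameters should land near those values; a quick sanity check:

In [ ]:
# Slope and intercept recovered by the fit; expect roughly 2 and -1 (up to noise).
print(model.coef_, model.intercept_)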

In [7]:
x_test = np.linspace(0, 10, 1000).reshape(1000,1)
y_test = model.predict(x_test)
plt.scatter(x, y)
plt.scatter(x_test, y_test, s=0.5)
plt.show()



In [8]:
iris = sns.load_dataset('iris')
X_iris = iris.drop('species', axis=1)
y_iris = iris['species']

from sklearn.model_selection import train_test_split
Xtrain, Xtest, ytrain, ytest = train_test_split(X_iris, y_iris, random_state=1)

from sklearn.naive_bayes import GaussianNB
model = GaussianNB()
model.fit(Xtrain, ytrain)
y_model = model.predict(Xtest)

from sklearn.metrics import accuracy_score
print(ytest.shape)
print(y_model.shape)
accuracy_score(ytest, y_model)


(38,)
(38,)
Out[8]:
0.97368421052631582
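
A single train/test split can be noisy. As a sketch, cross_val_score (from sklearn.model_selection) repeats the evaluation over five folds:

In [ ]:
from sklearn.model_selection import cross_val_score
# Five-fold cross-validated accuracy of the same Gaussian naive Bayes model;
# the per-fold scores should cluster near the single-split accuracy above.
scores = cross_val_score(GaussianNB(), X_iris, y_iris, cv=5)
print(scores, scores.mean())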

In [9]:
from sklearn.decomposition import PCA
model = PCA(n_components=2)
model.fit(X_iris)
X_2d = model.transform(X_iris)
iris['PCA1'] = X_2d[:, 0]
iris['PCA2'] = X_2d[:, 1]
sns.lmplot(x='PCA1', y='PCA2', hue='species', data=iris, fit_reg=False)
plt.show()
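
The explained_variance_ratio_ attribute reports how much of the total variance each component captures; for iris the first component alone should account for roughly 90%:

In [ ]:
# Fraction of the total variance captured by each of the two components.
print(model.explained_variance_ratio_)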



In [10]:
from sklearn.mixture import GaussianMixture
model = GaussianMixture(n_components=3, covariance_type='full')
model.fit(X_iris)
y_gmm = model.predict(X_iris)
iris['cluster'] = y_gmm
sns.lmplot(x='PCA1', y='PCA2', data=iris, hue='species', col='cluster', fit_reg=False)
plt.show()
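
To check how well the unsupervised clusters line up with the true species, a cross-tabulation is a quick sketch:

In [ ]:
# Rows are true species, columns are GMM cluster labels; a near-diagonal
# table (up to label permutation) means the mixture recovered the species.
print(pd.crosstab(iris['species'], iris['cluster']))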



In [11]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline

def PolynomialRegression(degree=2, **kwargs):
    return make_pipeline(PolynomialFeatures(degree), LinearRegression(**kwargs))
def make_data(N, err=1.0, rseed=1):
    rng = np.random.RandomState(rseed)
    X = rng.rand(N, 1) ** 2
    y = 10 - 1. / (X.ravel() + 0.1)
    if err > 0:
        y += err * rng.randn(N)
    return X, y


X, y = make_data(40)
X_test = np.linspace(-0.1, 1.1, 500)[:, None]

# print(np.linspace(-0.1,1.1,500).shape)
# print(np.linspace(-0.1,1.1,500)[:, None].shape)
# print(np.linspace(-0.1,1.1,500)[:, np.newaxis].shape)

plt.scatter(X.ravel(), y, color='black')
axis = plt.axis()
for degree in [1, 3, 5, 9]:
    y_test = PolynomialRegression(degree).fit(X, y).predict(X_test)
    plt.plot(X_test.ravel(), y_test, label='degree={0}'.format(degree))
plt.ylim(-2, 12)
plt.legend(loc='best')
# plt.savefig('xx.jpg', dpi=500)


Out[11]:
<matplotlib.legend.Legend at 0x116252d30>

In [12]:
from sklearn.model_selection import validation_curve
degree = np.arange(0, 21)
train_score, val_score = validation_curve(PolynomialRegression(), X, y,
                                          param_name='polynomialfeatures__degree',
                                          param_range=degree, cv=7)
plt.plot(degree, np.median(train_score, 1), color='blue', label='training score')
plt.plot(degree, np.median(val_score, 1), color='red', label='validation score')
plt.legend(loc='best')
plt.ylim(0, 1)
plt.xlabel('degree')
plt.ylabel('score')


Out[12]:
<matplotlib.text.Text at 0x117330da0>
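
The optimal degree is wherever the median validation score peaks; reading it off programmatically (for this data it should land around 3):

In [ ]:
# Degree with the highest median cross-validated score.
print(degree[np.argmax(np.median(val_score, 1))])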

In [13]:
plt.scatter(X.ravel(), y)
lim = plt.axis()
y_test = PolynomialRegression(3).fit(X, y).predict(X_test)
plt.plot(X_test.ravel(), y_test, color='red')
# plt.axis(lim)


Out[13]:
[<matplotlib.lines.Line2D at 0x112824ba8>]

In [14]:
X2, y2 = make_data(500)
plt.scatter(X2, y2, s=2)
# plt.savefig('xxxx.jpg', dpi=800)


Out[14]:
<matplotlib.collections.PathCollection at 0x112866198>

In [15]:
from sklearn.model_selection import validation_curve
degree = np.arange(0, 21)
train_score2, val_score2 = validation_curve(PolynomialRegression(), X2, y2,
                                            param_name='polynomialfeatures__degree',
                                            param_range=degree, cv=7)
plt.plot(degree, np.median(train_score2, 1), color='blue', label='training score (N=500)')
plt.plot(degree, np.median(val_score2, 1), color='red', label='validation score (N=500)')
plt.plot(degree, np.median(train_score, 1), color='blue', linestyle='dotted', label='training score (N=40)')
plt.plot(degree, np.median(val_score, 1), color='red', linestyle='dotted', label='validation score (N=40)')
plt.legend(loc='best')
plt.ylim(0, 1)
plt.xlabel('degree')
plt.ylabel('score')
# plt.savefig('xxx.jpg', dpi=800)


Out[15]:
<matplotlib.text.Text at 0x111db7d30>
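
The complement of the validation curve is the learning curve: score as a function of training-set size at a fixed model complexity. A minimal sketch for the degree-3 model on the larger dataset:

In [ ]:
from sklearn.model_selection import learning_curve
# Training and validation scores as the training set grows; the two curves
# should converge once the sample is large enough for a degree-3 fit.
N, train_lc, val_lc = learning_curve(PolynomialRegression(3), X2, y2,
                                     train_sizes=np.linspace(0.1, 1, 10), cv=7)
plt.plot(N, np.mean(train_lc, 1), color='blue', label='training score')
plt.plot(N, np.mean(val_lc, 1), color='red', label='validation score')
plt.xlabel('training size')
plt.ylabel('score')
plt.legend(loc='best')
plt.show()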

In [16]:
data = [
    {'price': 850000, 'rooms': 4, 'neighborhood': 'Queen Anne'},
    {'price': 700000, 'rooms': 3, 'neighborhood': 'Fremont'},
    {'price': 650000, 'rooms': 3, 'neighborhood': 'Wallingford'},
    {'price': 600000, 'rooms': 2, 'neighborhood': 'Fremont'}
    ]
from sklearn.feature_extraction import DictVectorizer
vec = DictVectorizer(sparse=False, dtype=int)
X = vec.fit_transform(data)
print(vec.get_feature_names_out().tolist())  # get_feature_names() before scikit-learn 1.0
y = vec.inverse_transform(X)


['neighborhood=Fremont', 'neighborhood=Queen Anne', 'neighborhood=Wallingford', 'price', 'rooms']
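
With many distinct categories the one-hot columns are mostly zeros, so the default sparse output is usually preferable; the same encoding as a scipy sparse matrix:

In [ ]:
# sparse=True (the default) stores only the nonzero entries.
vec_sparse = DictVectorizer(sparse=True, dtype=int)
X_sparse = vec_sparse.fit_transform(data)
print(X_sparse)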

In [17]:
sample = ['problem of evil', 'evil queen', 'horizon problem']
from sklearn.feature_extraction.text import TfidfVectorizer
vec = TfidfVectorizer()
X = vec.fit_transform(sample)
X.toarray()


Out[17]:
array([[ 0.51785612,  0.        ,  0.68091856,  0.51785612,  0.        ],
       [ 0.60534851,  0.        ,  0.        ,  0.        ,  0.79596054],
       [ 0.        ,  0.79596054,  0.        ,  0.60534851,  0.        ]])
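
To see which word each TF-IDF column corresponds to, inspect the fitted vocabulary (get_feature_names_out in scikit-learn 1.0+; older releases use get_feature_names):

In [ ]:
# Column labels for the matrix above, in sorted vocabulary order:
# 'evil', 'horizon', 'of', 'problem', 'queen'.
print(vec.get_feature_names_out())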

In [18]:
x = np.array([1,2,3,4,5])
y = np.array([4,2,1,3,7])
from sklearn.preprocessing import PolynomialFeatures
poly = PolynomialFeatures(degree=3, include_bias=False)
x = x.reshape(5,1)
X = poly.fit_transform(x)

In [19]:
from sklearn.linear_model import LinearRegression
model = LinearRegression().fit(X, y)
plt.scatter(x, y)
plt.plot(x, model.predict(X))


Out[19]:
[<matplotlib.lines.Line2D at 0x1122abd30>]

In [20]:
X = np.array([
    [1,2,3],
    [4,5,np.nan]
])
from sklearn.impute import SimpleImputer  # Imputer was removed in scikit-learn 0.22
X = SimpleImputer(strategy='mean').fit_transform(X)
print(X)


[[ 1.  2.  3.]
 [ 4.  5.  3.]]
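
Imputation usually belongs inside a pipeline, so the fill values are learned from training data only. A minimal sketch chaining it with a linear model (Xm and ym are hypothetical data invented for illustration):

In [ ]:
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LinearRegression

# Hypothetical feature matrix with missing entries, and made-up targets.
Xm = np.array([[1, 2, 3],
               [np.nan, 5, 6],
               [4, np.nan, 9]])
ym = np.array([14, 16, 8])
pipe = make_pipeline(SimpleImputer(strategy='mean'), LinearRegression())
print(pipe.fit(Xm, ym).predict(Xm))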

In [21]:
from sklearn.datasets import make_blobs
X, y = make_blobs(100, 2, centers=2, random_state=2, cluster_std=1.5)

In [22]:
plt.scatter(X[:,0], X[:,1], c=y, cmap='RdGy', s=50)
plt.show()



In [23]:
from sklearn.naive_bayes import GaussianNB
model = GaussianNB()
model.fit(X, y)
rng = np.random.RandomState(42)
Xnew = rng.rand(2000, 2) * [14, 18] - [6, 14]
ynew = model.predict(Xnew)
plt.scatter(Xnew[:,0], Xnew[:,1], c=ynew, cmap='viridis', s=20, alpha=0.4)
plt.scatter(X[:,0], X[:,1], c=y, cmap='RdGy', s=50, alpha=1)
plt.show()



In [24]:
ynewnew = model.predict_proba(Xnew)
print(ynewnew)


[[  2.91519722e-12   1.00000000e+00]
 [  1.14739388e-06   9.99998853e-01]
 [  1.00000000e+00   2.16388958e-11]
 ..., 
 [  8.30181730e-01   1.69818270e-01]
 [  9.99211375e-01   7.88624968e-04]
 [  9.99999995e-01   4.89784579e-09]]

In [25]:
from sklearn.datasets import fetch_20newsgroups
data = fetch_20newsgroups()

In [26]:
categories = ['talk.religion.misc', 'soc.religion.christian', 'sci.space', 'comp.graphics']
train = fetch_20newsgroups(subset='train', categories=categories)
test = fetch_20newsgroups(subset='test', categories=categories)
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline

model = make_pipeline(TfidfVectorizer(), MultinomialNB())
model.fit(train.data, train.target)
labels = model.predict(test.data)

In [27]:
from sklearn.metrics import confusion_matrix
mat = confusion_matrix(test.target, labels)
sns.heatmap(mat.T, square=True, cbar=False, xticklabels=train.target_names, 
            yticklabels=train.target_names, annot=True, fmt='d')
plt.xlabel('true label')
plt.ylabel('predicted label')


Out[27]:
<matplotlib.text.Text at 0x1115c8da0>
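
With the pipeline fit, classifying an arbitrary string is a single call; a small (hypothetical) convenience wrapper mapping the predicted label index back to a category name:

In [ ]:
def predict_category(s, train=train, model=model):
    # predict() returns a label index; look up its category name.
    pred = model.predict([s])
    return train.target_names[pred[0]]

print(predict_category('sending a payload to the ISS'))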

In [28]:
rng = np.random.RandomState(42)
x = 10 * rng.rand(50)
y = x * 2 + rng.randn(50) - 5
plt.scatter(x, y)


Out[28]:
<matplotlib.collections.PathCollection at 0x112091828>

In [35]:
from sklearn.linear_model import LinearRegression
model = LinearRegression(fit_intercept=True)
model.fit(x[:, np.newaxis], y)
xfit = np.linspace(0, 10, 1000)
yfit = model.predict(xfit[:, np.newaxis])
plt.scatter(x, y)
plt.plot(xfit, yfit, 'r')


Out[35]:
[<matplotlib.lines.Line2D at 0x10f254128>]

In [31]:
print(model.coef_,model.intercept_)


[ 1.9776566] -4.90331072553

In [39]:
from sklearn.preprocessing import PolynomialFeatures
x = np.array([2, 3, 4])
poly = PolynomialFeatures(3, include_bias=False)
x_poly = poly.fit_transform(x[:, np.newaxis])
print(x_poly)


[[  2.   4.   8.]
 [  3.   9.  27.]
 [  4.  16.  64.]]

In [41]:
from sklearn.pipeline import make_pipeline
model = make_pipeline(PolynomialFeatures(7),
                      LinearRegression())
rng = np.random.RandomState(42)
x = 10 * rng.rand(50)
y = np.sin(x) + 0.1 * rng.randn(50)
model.fit(x[:, np.newaxis], y)
xfit = np.linspace(0, 10, 1000)
yfit = model.predict(xfit[:, np.newaxis])
plt.scatter(x, y)
plt.plot(xfit, yfit, 'r-.')


Out[41]:
[<matplotlib.lines.Line2D at 0x1118a8cf8>]

In [49]:
from sklearn.datasets import make_blobs
X, y = make_blobs(n_samples=50, centers=2, random_state=0, cluster_std=1)
plt.scatter(X[:,0], X[:,1], c=y, s=50, cmap='autumn')
plt.colorbar()


Out[49]:
<matplotlib.colorbar.Colorbar at 0x119a4a710>

In [46]:
from sklearn.svm import SVC  # support vector classifier
model = SVC(C=1e10, kernel='linear')
model.fit(X, y)
model.support_vectors_  # the fitted support vectors, displayed below


Out[46]:
array([[ 0.08848433,  2.32299086],
       [ 3.2460247 ,  2.84942165],
       [ 2.06576754,  2.68353415],
       [ 0.89011768,  1.79849015]])

In [48]:
model.predict([[-1,1]])


Out[48]:
array([1])

In [50]:
from sklearn.datasets import make_circles
X, y = make_circles(100, factor=0.1, noise=0.1)

clf = SVC(kernel='rbf', C=1e6)
clf.fit(X, y)


Out[50]:
SVC(C=1000000.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [57]:
def plot_svc_decision_function(model, ax=None, plot_support=True):
    """Plot the decision function for a 2D SVC"""
    if ax is None:
        ax = plt.gca()
    xlim = ax.get_xlim()
    ylim = ax.get_ylim()
    
    # create grid to evaluate model
    x = np.linspace(xlim[0], xlim[1], 30)
    y = np.linspace(ylim[0], ylim[1], 30)
    Y, X = np.meshgrid(y, x)
    xy = np.vstack([X.ravel(), Y.ravel()]).T
    P = model.decision_function(xy).reshape(X.shape)
    
    # plot decision boundary and margins
    ax.contour(X, Y, P, colors='k',
               levels=[-1, 0, 1], alpha=0.5,
               linestyles=['--', '-', '--'])
    
    # plot support vectors
    if plot_support:
        # edgecolor must be set explicitly; with facecolors='none' alone,
        # recent matplotlib draws invisible markers
        ax.scatter(model.support_vectors_[:, 0],
                   model.support_vectors_[:, 1],
                   s=300, linewidth=1, edgecolor='black', facecolors='none')
    ax.set_xlim(xlim)
    ax.set_ylim(ylim)

In [62]:
plt.scatter(X[:, 0], X[:, 1], c=y, s=50, cmap='autumn')
plot_svc_decision_function(clf)
plt.scatter(clf.support_vectors_[:, 0], clf.support_vectors_[:, 1],
            s=300, lw=1, edgecolor='black', facecolors='none');



In [66]:
from sklearn.datasets import fetch_lfw_people
faces = fetch_lfw_people(min_faces_per_person=60)
plt.imshow(faces.images[0], cmap='viridis')


Downloading LFW metadata: http://vis-www.cs.umass.edu/lfw/pairsDevTrain.txt
Downloading LFW metadata: http://vis-www.cs.umass.edu/lfw/pairsDevTest.txt
Downloading LFW metadata: http://vis-www.cs.umass.edu/lfw/pairs.txt
Downloading LFW data (~200MB): http://vis-www.cs.umass.edu/lfw/lfw-funneled.tgz
Out[66]:
<matplotlib.image.AxesImage at 0x11b976b70>
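
Before modeling, it is worth checking who is in the dataset and the image dimensions:

In [ ]:
# People retained by min_faces_per_person=60, and the image array shape
# (n_samples, height, width).
print(faces.target_names)
print(faces.images.shape)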

In [70]:
from sklearn.svm import SVC
from sklearn.decomposition import PCA
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV, train_test_split

In [71]:
pca = PCA(n_components=150, whiten=True, random_state=42)
svc = SVC(kernel='rbf', class_weight='balanced')
model = make_pipeline(pca, svc)
Xtrain, Xtest, ytrain, ytest = train_test_split(faces.data, faces.target, random_state=42)

param_grid = {'svc__C': [1,5,10,50],
              'svc__gamma': [0.0001, 0.0005, 0.001, 0.005]}
grid = GridSearchCV(model, param_grid)
%time grid.fit(Xtrain, ytrain)
print(grid.best_params_)


CPU times: user 1min 15s, sys: 3.35 s, total: 1min 19s
Wall time: 22 s
{'svc__C': 10, 'svc__gamma': 0.001}
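
Since GridSearchCV refits the best parameter combination on the full training set by default, best_estimator_ can be evaluated directly on the held-out split; a sketch with classification_report:

In [ ]:
from sklearn.metrics import classification_report
# Per-person precision/recall/F1 of the best pipeline on the test faces.
yfit = grid.best_estimator_.predict(Xtest)
print(classification_report(ytest, yfit, target_names=faces.target_names))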

In [83]:
from sklearn.datasets import make_blobs
X, y = make_blobs(centers=4, n_samples=500, cluster_std=1, random_state=42)
plt.scatter(X[:, 0], X[:, 1], c=y, cmap='viridis')


Out[83]:
<matplotlib.collections.PathCollection at 0x11b63b748>

In [84]:
def visualize_classifier(model, X, y, ax=None, cmap='rainbow'):
    ax = ax or plt.gca()
    
    # Plot the training points
    ax.scatter(X[:, 0], X[:, 1], c=y, s=30, cmap=cmap,
               clim=(y.min(), y.max()), zorder=3)
    ax.axis('tight')
    ax.axis('off')
    xlim = ax.get_xlim()
    ylim = ax.get_ylim()
    
    # fit the estimator
    model.fit(X, y)
    xx, yy = np.meshgrid(np.linspace(*xlim, num=200),
                         np.linspace(*ylim, num=200))
    Z = model.predict(np.c_[xx.ravel(), yy.ravel()]).reshape(xx.shape)

    # Create a color plot with the results
    n_classes = len(np.unique(y))
    contours = ax.contourf(xx, yy, Z, alpha=0.3,
                           levels=np.arange(n_classes + 1) - 0.5,
                           cmap=cmap, clim=(y.min(), y.max()),
                           zorder=1)

    ax.set(xlim=xlim, ylim=ylim)

In [85]:
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(n_estimators=100, random_state=0)
visualize_classifier(model, X, y)
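
For contrast, the same helper can visualize a single decision tree, whose boundaries are typically noisier than the forest's averaged ones:

In [ ]:
from sklearn.tree import DecisionTreeClassifier
# A lone tree tends to overfit, carving thin slivers between the clusters;
# the 100-tree forest above averages those artifacts away.
visualize_classifier(DecisionTreeClassifier(), X, y)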



In [87]:
from sklearn.datasets import load_sample_image
china = load_sample_image('china.jpg')
plt.imshow(china)


Out[87]:
<matplotlib.image.AxesImage at 0x118fdc2b0>
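
The image is just a (height, width, 3) array of 0-255 RGB values; flattening it to one row per pixel is the usual first step for any color-based analysis:

In [ ]:
# Rescale to [0, 1] and flatten to an (n_pixels, 3) array of RGB colors.
print(china.shape)
pixels = (china / 255.0).reshape(-1, 3)
print(pixels.shape)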

In [89]:
from sklearn.datasets import fetch_species_distributions
data = fetch_species_distributions()

Downloading species data from http://www.cs.princeton.edu/~schapire/maxent/datasets/samples.zip to /Users/morizeyao/scikit_learn_data
---------------------------------------------------------------------------
HTTPError                                 Traceback (most recent call last)
<ipython-input-89-3aca7cfb1259> in <module>()
      1 from sklearn.datasets import fetch_species_distributions
----> 2 data = fetch_species_distributions()

[... urllib.request frames elided ...]

HTTPError: HTTP Error 404: Not Found

The Princeton URL no longer serves this file, so the fetch fails with a 404 under this scikit-learn version; newer releases download the species data from a different host.

In [ ]: