In [1]:
#% matplotlib nbagg
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
In [2]:
# Load the digits dataset from sklearn.datasets and list the fields
# available on the returned dictionary-like object.
from sklearn.datasets import load_digits
digits = load_digits()
digits.keys()
Out[2]:
In [3]:
# Dimensions of the array of digit images (digits.images[0] is a single image).
digits.images.shape
Out[3]:
In [4]:
# Raw pixel values of the first image, printed as a 2-D array.
print(digits.images[0])
In [5]:
# Render the same first image as a picture, using the Greys colormap.
plt.matshow(digits.images[0], cmap=plt.cm.Greys)
Out[5]:
In [6]:
# Shape of the feature matrix that is later passed to train_test_split.
digits.data.shape
Out[6]:
In [7]:
# Shape of the target (label) array that accompanies digits.data.
digits.target.shape
Out[7]:
In [8]:
# Display the target values themselves.
digits.target
Out[8]:
Data is always a numpy array (or sparse matrix) of shape (n_samples, n_features)
Splitting the data:
In [9]:
# train_test_split lives in sklearn.model_selection; the old
# sklearn.cross_validation module was deprecated in 0.18 and removed in 0.20.
from sklearn.model_selection import train_test_split

# Hold out part of the digits data for testing. The seed is fixed so that
# "Restart & Run All" reproduces exactly the same split.
X_train, X_test, y_train, y_test = train_test_split(
    digits.data, digits.target, random_state=0
)
Load the iris dataset from the sklearn.datasets module using the load_iris function.
The function returns a dictionary-like object that has the same attributes as digits.
What is the number of classes, features and data points in this dataset? Use a scatterplot to visualize the dataset.
You can look at the DESCR attribute to learn more about the dataset.
In [10]:
# %load solutions/load_iris.py
In [11]:
from sklearn.datasets import load_iris
In [12]:
# Load the iris dataset; dd is the dictionary-like object used in the cells below.
dd = load_iris()
In [13]:
# List the fields of the iris dataset object. dict.viewkeys() exists only on
# Python 2; keys() works on both Python 2 and 3 and matches digits.keys() above.
dd.keys()
Out[13]:
In [14]:
# Shape of the iris feature matrix.
dd['data'].shape
Out[14]:
In [22]:
# Print the dataset's textual description (the DESCR attribute mentioned in the exercise).
print(dd['DESCR'])
In [15]:
# Split iris into training and test parts (train_test_split is imported in the
# digits section above). The seed is fixed so the split, and every score
# computed from it below, is reproducible across kernel restarts.
X_train2, X_test2, y_train2, y_test2 = train_test_split(
    dd['data'], dd['target'], random_state=0
)
In [16]:
# Shapes of the four pieces of the iris split, in the order
# (X_train2, X_test2, y_train2, y_test2).
tuple(part.shape for part in (X_train2, X_test2, y_train2, y_test2))
Out[16]:
In [47]:
# Label the axes with the names of the two plotted features so the figure
# can be read on its own.
plt.xlabel(dd['feature_names'][0])
plt.ylabel(dd['feature_names'][1])
# Training samples in the plane of the first two features, coloured by class.
# Kept as the last expression so the cell's output value is unchanged.
plt.scatter(X_train2[:, 0], X_train2[:, 1], c=y_train2)
Out[47]:
In [23]:
from sklearn.svm import LinearSVC
In [24]:
# Linear support-vector classifier with regularisation parameter C=0.1.
# random_state pins the solver's internal shuffling so the fit is reproducible.
svm = LinearSVC(C=0.1, random_state=0)
In [32]:
# Fit the linear SVM on the iris training split.
svm.fit(X_train2, y_train2)
Out[32]:
In [33]:
# Print the model's predictions for the training samples, followed by the
# true training labels, for a side-by-side visual comparison.
print(svm.predict(X_train2))
print(y_train2)
In [34]:
# Score of the SVM on the data it was trained on.
svm.score(X_train2, y_train2)
Out[34]:
In [35]:
# Score of the SVM on the held-out test split, which was not used for fitting.
svm.score(X_test2, y_test2)
Out[35]:
In [40]:
from sklearn.ensemble import RandomForestClassifier
In [41]:
# Random forest with 50 trees. Forest construction is randomised, so the seed
# is fixed to make the fitted model and its scores reproducible.
rf = RandomForestClassifier(n_estimators=50, random_state=0)
In [42]:
# Fit the random forest on the same iris training split used for the SVM.
rf.fit(X_train2, y_train2)
Out[42]:
In [44]:
# Score of the random forest on the data it was trained on.
rf.score(X_train2, y_train2)
Out[44]:
In [45]:
# Score of the random forest on the held-out test split.
rf.score(X_test2, y_test2)
Out[45]:
In [ ]: