In [1]:
%pylab inline
sklearn.datasets
contains well-known datasets that you can download and use.
http://scikit-learn.org/stable/datasets/
Methods:
In [2]:
import sklearn
from sklearn.datasets import load_digits
In [3]:
# Load the digits dataset and show the type of the returned container.
digits = load_digits() # Bunch object
type(digits)
Out[3]:
In [4]:
# List the fields available on the digits Bunch.
digits.keys()
Out[4]:
In [5]:
# Names of the target classes in the digits dataset.
digits.target_names
Out[5]:
In [6]:
# Dimensions of the digits feature array.
digits.data.shape
Out[6]:
In [7]:
# Feature vector of the first sample (the same sample is classified later on).
digits.data[0]
Out[7]:
In [8]:
# Draw the first entry of `digits.images` with the binary colormap and no
# pixel interpolation.
plt.imshow(
    digits.images[0],
    interpolation="none",
    cmap=plt.cm.binary,
)
Out[8]:
In [9]:
# Load the California housing dataset. `sklearn.datasets` is reachable as an
# attribute here because the submodule was already imported in an earlier cell.
housing = sklearn.datasets.fetch_california_housing()
In [10]:
# List the fields available on the housing dataset object.
housing.keys()
Out[10]:
In [11]:
# Column names of `housing.data`; use these to interpret the column indices below.
housing.feature_names
Out[11]:
How does each feature affect housing prices?
In [12]:
# Take feature columns 2 and 3 as the two scatter-plot axes.
# NOTE(review): presumably AveRooms and AveBedrms — confirm against
# housing.feature_names.
x = housing.data[:, 2]
y = housing.data[:, 3]
In [13]:
# Scatter the two selected features against each other, passing the target
# value of each sample as its marker size `s`.
vals = housing.target
plt.scatter(x,y, s=vals)
Out[13]:
In [14]:
# `sklearn.cross_validation` was removed in scikit-learn 0.20; the splitter
# now lives in `sklearn.model_selection`.
from sklearn.model_selection import train_test_split

# Hold out 10% of the digits for testing. `random_state` pins the shuffle so
# the split (and every score computed from it) is reproducible on re-run.
Xtrain_d, Xtest_d, ytrain_d, ytest_d = train_test_split(
    digits.data, digits.target, test_size=0.1, random_state=42
)
In [15]:
# Number of samples in the training and test splits.
len(Xtrain_d), len(Xtest_d)
Out[15]:
Let's use kNN classifiers!
In [16]:
from sklearn.neighbors import KNeighborsClassifier as kNN
In [17]:
# k-nearest-neighbours classifier with k = 3.
# NOTE(review): the name `n1_model` suggests k = 1, but n_neighbors is 3.
n1_model = kNN(n_neighbors=3)
In [18]:
# Fit the classifier on the training split only.
n1_model.fit(Xtrain_d, ytrain_d)
Out[18]:
In [19]:
# Let's compare the predicted value with the actual one
# The sample is wrapped in a one-element list and the result indexed with [0]
# because predict is called on a batch of samples here.
X0 = [digits.data[0]]
y0 = digits.target[0]
out0 = n1_model.predict(X0)[0]
print("Equals: ", out0, y0)
In [20]:
# NOTE(review): sample 0 comes from the full dataset, so it may well be part of
# the training split — in that case this confidence is optimistic.
n1_model.predict_proba(X0) # The model is pretty sure about its prediction
Out[20]:
In [21]:
n1_model.score(Xtest_d, ytest_d) # Check the accuracy on the held-out test data
Out[21]:
In [22]:
from sklearn.metrics import confusion_matrix, f1_score

# Predict labels for the held-out test split.
ypred_d = n1_model.predict(Xtest_d)

# confusion_matrix takes (y_true, y_pred): rows are the actual classes and
# columns the predicted ones. Passing them the other way round transposes it.
confusion_matrix(ytest_d, ypred_d)
Out[22]:
In [23]:
# F1 score on the test split, macro-averaged across the classes.
f1_score(ytest_d, ypred_d, average="macro")
Out[23]:
In [25]:
# `sklearn.cross_validation` was removed in scikit-learn 0.20; cross_val_score
# now lives in `sklearn.model_selection`.
from sklearn.model_selection import cross_val_score

# 10-fold cross-validation of the kNN model on the full digits dataset:
# print the per-fold scores, then their mean.
res = cross_val_score(n1_model, digits.data, digits.target, cv=10)
print(res)
print("Average:", np.average(res))
In [26]:
from sklearn.linear_model import LinearRegression
In [27]:
# Linear regression model with default hyperparameters.
lin_model = LinearRegression()
In [28]:
# Dimensions of the housing feature array.
housing.data.shape
Out[28]:
In [29]:
# Rows before `sep` form the training set; only feature columns 2 and 3 are
# used as predictors. The rows are taken in dataset order (no shuffling).
sep = 15000
X_h, y_h = housing.data[:sep, 2:4], housing.target[:sep]
lin_model.fit(X_h, y_h)
Out[29]:
In [30]:
# Predict the target for the first row not used in fitting (index `sep`) and
# show it next to the true target value.
lin_model.predict([housing.data[sep][2:4]]), housing.target[sep]
Out[30]:
In [31]:
# Score the model on the rows from `sep` onward, which were not used for fitting.
# NOTE(review): for LinearRegression, `score` is expected to be R^2 — confirm in
# the scikit-learn docs.
lin_model.score(housing.data[sep:, 2:4], housing.target[sep:])
Out[31]:
Find the best model for the diabetes dataset: http://scikit-learn.org/stable/modules/generated/sklearn.datasets.load_diabetes.html#sklearn.datasets.load_diabetes