In [1]:
# This demonstration shows the typical steps for applying a machine learning algorithm
# to data:
# 1 - prepare the train and test data sets; the train data set has the target values,
# but the test data set does not
# 2 - split the train data set into two parts by a chosen percentage; the smaller part is
# used to validate the model output
# 3 - initialize clf as the classifier, and call its 'fit' method on the train data set
# 4 - predict one or more test samples to generate the predicted results
# train_test_split lives in sklearn.model_selection; the old sklearn.cross_validation
# module was removed in scikit-learn 0.20. Fall back to it only on old installs that
# do not have model_selection yet.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split
In [3]:
from sklearn import preprocessing
In [4]:
# Load the iris data set and keep only its first two feature columns for this demo
from sklearn import datasets
iris = datasets.load_iris()
X_iris = iris.data
y_iris = iris.target
X = X_iris[:, :2]
y = y_iris
In [5]:
# peek at the feature values of the first sample
X[0]
Out[5]:
In [6]:
# target value (class label) of the first sample
y[0]
Out[6]:
In [8]:
# Hold out 25% of the samples as a test split; the fixed random_state keeps the
# split the same on every run
split_parts = train_test_split(X, y, test_size=0.25, random_state=33)
X_train, X_test, y_train, y_test = split_parts
In [9]:
# Sanity-check the split: print the shape of the train part and of the test part.
# Formatting both shapes into one string keeps the output identical on Python 2 and
# Python 3 (the bare `print a, b` statement is a syntax error on Python 3).
print("%s %s" % (X_train.shape, X_test.shape))
In [10]:
# Fit the scaler on the training split only, so the test split does not influence it
scaler = preprocessing.StandardScaler()
scaler.fit(X_train)
# Standardize both splits with that fitted scaler.
# NOTE: X_train / X_test are overwritten here, so re-running this cell would refit
# the scaler on data that has already been standardized.
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)
In [11]:
# inspect the first training sample after standardization: the values differ from
# the raw measurements, while the distribution keeps the same shape
X_train[0]
Out[11]:
In [16]:
# Scatter-plot the standardized training samples, one color per iris class
import matplotlib.pyplot as plt
colors = ['red', 'greenyellow', 'blue']
# range() instead of xrange() so the loop runs on both Python 2 and Python 3
for i in range(len(colors)):
    # boolean mask selecting the training rows whose target is class i
    in_class = y_train == i
    xs = X_train[:, 0][in_class]
    ys = X_train[:, 1][in_class]
    plt.scatter(xs, ys, c=colors[i])
# the labels below are set once, after all classes have been drawn
plt.legend(iris.target_names)
plt.xlabel('Sepal Length')
plt.ylabel('Sepal Width')
plt.show()
# figure_1.png
In [18]:
# Fit a linear SGD classifier on the standardized training split.
# SGD training is stochastic, so a fixed random_state (the same seed value the
# train/test split uses) makes the fitted model repeatable from run to run.
from sklearn import linear_model
clf = linear_model.SGDClassifier(random_state=33)
clf.fit(X_train, y_train)
Out[18]:
In [19]:
# inspect the coefficients learned by the fitted classifier
# (parenthesized so the line is valid, and prints the same, on Python 2 and Python 3)
print(clf.coef_)
In [20]:
# inspect the intercept learned by the fitted classifier
# (parenthesized so the line is valid, and prints the same, on Python 2 and Python 3)
print(clf.intercept_)
In [27]:
# Predict the class of one new sample: sepal length 4.7, sepal width 3.1.
# The raw sample goes through the same fitted scaler as the training data before
# it is handed to the classifier.
import numpy as np
in_X = np.array([[4.7, 3.1]])
# parenthesized so the line is valid, and prints the same, on Python 2 and Python 3
print(clf.predict(scaler.transform(in_X)))
In [29]:
# Evaluate the classifier: accuracy of its predictions on the held-out test split
from sklearn import metrics
y_pred = clf.predict(X_test)
# parenthesized so the line is valid, and prints the same, on Python 2 and Python 3
print(metrics.accuracy_score(y_test, y_pred))
In [30]:
# a detailed report for the test-set predictions, labelled with the iris class names
# (parenthesized so the line is valid, and prints the same, on Python 2 and Python 3)
print(metrics.classification_report(y_test, y_pred, target_names=iris.target_names))
In [ ]: