In [1]:
# This demonstration shows the typical steps for applying a machine learning
# algorithm to data:
# 1 - prepare the train and test data sets; the train set has the target values,
#     but the test set does not
# 2 - split the train data set into two parts with a specific ratio; the smaller
#     part is used to validate the model output
# 3 - initialize the classifier (clf) and call its 'fit' method on the train set
# 4 - predict one or more test samples and examine the results
# note: sklearn.cross_validation was removed in newer scikit-learn versions;
# train_test_split now lives in sklearn.model_selection
from sklearn.model_selection import train_test_split
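
In [ ]:
# a minimal sketch (added, not part of the original session) condensing the
# four steps above into one cell; SGDClassifier is simply the estimator this
# notebook uses later, and any scikit-learn classifier follows the same
# fit/predict pattern
from sklearn import datasets, linear_model
from sklearn.model_selection import train_test_split

d = datasets.load_iris()
Xd_train, Xd_test, yd_train, yd_test = train_test_split(
    d.data, d.target, test_size=0.25)
model = linear_model.SGDClassifier()
model.fit(Xd_train, yd_train)          # step 3: learn from the train set
print(model.predict(Xd_test[:3]))      # step 4: predict unseen samples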

In [3]:
from sklearn import preprocessing

In [4]:
# load the iris dataset and keep only the first two features
# (sepal length and sepal width)
from sklearn import datasets
iris = datasets.load_iris()
X_iris, y_iris = iris.data, iris.target
X, y = X_iris[:,:2], y_iris

In [5]:
# take a quick look at the data
X[0,:]


Out[5]:
array([ 5.1,  3.5])

In [6]:
y[0]


Out[6]:
0

In [8]:
# split the data into train and test sets, holding out 25% for testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=33)
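
In [ ]:
# a quick sanity check (added sketch, not from the original session):
# the split should hold out roughly 25% of the 150 samples and keep
# samples from all three classes in the training set
import numpy as np
print(len(X_train), len(X_test))    # expect 112 and 38
print(np.bincount(y_train))         # per-class counts in the train set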

In [9]:
# check the shapes
print(X_train.shape, X_test.shape)


(112, 2) (38, 2)

In [10]:
# initialize the scaler with statistics from the training data only
scaler = preprocessing.StandardScaler().fit(X_train)
# standardize the train and test sets with the same statistics
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [11]:
# check a standardized value: same distribution shape, different scale
X_train[0,:]


Out[11]:
array([-0.91090798, -1.59761476])
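
In [ ]:
# what StandardScaler actually does (an added sketch): it subtracts the
# per-feature mean and divides by the per-feature standard deviation,
# both learned from the training data and exposed as scaler.mean_ and
# scaler.scale_; recomputing the first row by hand should match X_train[0]
raw = scaler.inverse_transform(X_train[:1])     # recover the raw sample
print((raw - scaler.mean_) / scaler.scale_)     # should equal X_train[:1]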

In [16]:
# plot the training data, colored by class
import matplotlib.pyplot as plt
colors = ['red', 'greenyellow', 'blue']
for i in range(len(colors)):
    xs = X_train[:, 0][y_train == i]
    ys = X_train[:, 1][y_train == i]
    plt.scatter(xs, ys, c=colors[i], label=iris.target_names[i])
plt.legend()
plt.xlabel('Sepal Length')
plt.ylabel('Sepal Width')
plt.show()
# rendered as figure_1.png: sepal length vs. sepal width, one color per class

In [18]:
# fit an SGD-based linear classifier on the training set
from sklearn import linear_model
clf = linear_model.SGDClassifier()
clf.fit(X_train, y_train)


Out[18]:
SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', n_iter=5, n_jobs=1,
       penalty='l2', power_t=0.5, random_state=None, shuffle=True,
       verbose=0, warm_start=False)

In [19]:
# inspect the learned coefficients: one row of weights per class
print(clf.coef_)


[[-26.20653395  17.03102977]
 [  2.98134261 -14.92334899]
 [ 16.15781219   8.28536583]]

In [20]:
print(clf.intercept_)


[-11.45445853  -2.43636472 -16.45025123]
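
In [ ]:
# how coef_ and intercept_ are used (an added sketch of scikit-learn's
# one-vs-all scheme for the multiclass SGDClassifier): each class gets a
# linear score x.w + b, and the class with the highest score wins
import numpy as np
scores = X_test.dot(clf.coef_.T) + clf.intercept_   # shape (38, 3)
print(np.array_equal(np.argmax(scores, axis=1), clf.predict(X_test)))  # True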

In [27]:
# predict a new sample: sepal length 4.7, sepal width 3.1
# (it must be scaled with the same scaler before prediction)
import numpy as np
in_X = np.array([[4.7, 3.1]])
print(clf.predict(scaler.transform(in_X)))


[0]

In [29]:
# evaluate the results on the held-out test set
from sklearn import metrics
y_pred = clf.predict(X_test)
print(metrics.accuracy_score(y_test, y_pred))


0.631578947368
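
In [ ]:
# accuracy is just the fraction of correct predictions; this added sketch
# recomputes it directly to confirm the metrics call above
import numpy as np
print(np.mean(y_pred == y_test))    # should match the score above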

In [30]:
# a detailed per-class report
print(metrics.classification_report(y_test, y_pred, target_names=iris.target_names))


             precision    recall  f1-score   support

     setosa       1.00      1.00      1.00         8
 versicolor       0.42      0.73      0.53        11
  virginica       0.73      0.42      0.53        19

avg / total       0.70      0.63      0.63        38
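
In [ ]:
# where the report's numbers come from (an added sketch): with the
# confusion matrix (rows = true class, columns = predicted class),
# precision is the diagonal over the column sums and recall is the
# diagonal over the row sums, in iris.target_names order
import numpy as np
from sklearn import metrics
cm = metrics.confusion_matrix(y_test, y_pred)
print(cm)
print(np.diag(cm) / cm.sum(axis=0).astype(float))   # precision per class
print(np.diag(cm) / cm.sum(axis=1).astype(float))   # recall per class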

