In [1]:
# This demonstration shows the typical steps for applying a machine learning
# algorithm to data:
# 1 - prepare the train and test data sets; the train set has the target values,
#     but the test set does not
# 2 - split the train data set into two parts with a specific ratio; the smaller
#     part is used to validate the model output
# 3 - initialize the classifier (clf) and call its 'fit' method on the train set
# 4 - predict one or more test samples and examine the results
# note: sklearn.cross_validation was removed in newer scikit-learn versions;
# train_test_split now lives in sklearn.model_selection
from sklearn.model_selection import train_test_split
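
In [ ]:
# a minimal sketch (added, not part of the original session) condensing the
# four steps above into one cell; SGDClassifier is simply the estimator this
# notebook uses later, and any scikit-learn classifier follows the same
# fit/predict pattern
from sklearn import datasets, linear_model
from sklearn.model_selection import train_test_split

d = datasets.load_iris()
Xd_train, Xd_test, yd_train, yd_test = train_test_split(
    d.data, d.target, test_size=0.25)
model = linear_model.SGDClassifier()
model.fit(Xd_train, yd_train)          # step 3: learn from the train set
print(model.predict(Xd_test[:3]))      # step 4: predict unseen samples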

In [3]:
from sklearn import preprocessing

In [4]:
# load the iris dataset and keep only the first two features
# (sepal length and sepal width)
from sklearn import datasets
iris = datasets.load_iris()
X_iris, y_iris = iris.data, iris.target
X, y = X_iris[:,:2], y_iris

In [5]:
# take a quick look at the data
X[0,:]


Out[5]:
array([ 5.1,  3.5])

In [6]:
y[0]


Out[6]:
0

In [8]:
# split the data into train and test sets, holding out 25% for testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=33)
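
In [ ]:
# a quick sanity check (added sketch, not from the original session):
# the split should hold out roughly 25% of the 150 samples and keep
# samples from all three classes in the training set
import numpy as np
print(len(X_train), len(X_test))    # expect 112 and 38
print(np.bincount(y_train))         # per-class counts in the train set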

In [9]:
# check the shapes
print(X_train.shape, X_test.shape)


(112, 2) (38, 2)

In [10]:
# initialize the scaler with statistics from the training data only
scaler = preprocessing.StandardScaler().fit(X_train)
# standardize the train and test sets with the same statistics
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [11]:
# check a standardized value: same distribution shape, different scale
X_train[0,:]


Out[11]:
array([-0.91090798, -1.59761476])
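
In [ ]:
# what StandardScaler actually does (an added sketch): it subtracts the
# per-feature mean and divides by the per-feature standard deviation,
# both learned from the training data and exposed as scaler.mean_ and
# scaler.scale_; recomputing the first row by hand should match X_train[0]
raw = scaler.inverse_transform(X_train[:1])     # recover the raw sample
print((raw - scaler.mean_) / scaler.scale_)     # should equal X_train[:1]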

In [16]:
# plot the training data, colored by class
import matplotlib.pyplot as plt
colors = ['red', 'greenyellow', 'blue']
for i in range(len(colors)):
    xs = X_train[:, 0][y_train == i]
    ys = X_train[:, 1][y_train == i]
    plt.scatter(xs, ys, c=colors[i], label=iris.target_names[i])
plt.legend()
plt.xlabel('Sepal Length')
plt.ylabel('Sepal Width')
plt.show()
# rendered as figure_1.png: sepal length vs. sepal width, one color per class

In [18]:
# fit an SGD-based linear classifier on the training set
from sklearn import linear_model
clf = linear_model.SGDClassifier()
clf.fit(X_train, y_train)


Out[18]:
SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', n_iter=5, n_jobs=1,
       penalty='l2', power_t=0.5, random_state=None, shuffle=True,
       verbose=0, warm_start=False)

In [19]:
# inspect the learned coefficients: one row of weights per class
print(clf.coef_)


[[-26.20653395  17.03102977]
 [  2.98134261 -14.92334899]
 [ 16.15781219   8.28536583]]

In [20]:
print(clf.intercept_)


[-11.45445853  -2.43636472 -16.45025123]
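
In [ ]:
# how coef_ and intercept_ are used (an added sketch of scikit-learn's
# one-vs-all scheme for the multiclass SGDClassifier): each class gets a
# linear score x.w + b, and the class with the highest score wins
import numpy as np
scores = X_test.dot(clf.coef_.T) + clf.intercept_   # shape (38, 3)
print(np.array_equal(np.argmax(scores, axis=1), clf.predict(X_test)))  # True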

In [27]:
# predict a new sample: sepal length 4.7, sepal width 3.1
# (it must be scaled with the same scaler before prediction)
import numpy as np
in_X = np.array([[4.7, 3.1]])
print(clf.predict(scaler.transform(in_X)))


[0]

In [29]:
# evaluate the results on the held-out test set
from sklearn import metrics
y_pred = clf.predict(X_test)
print(metrics.accuracy_score(y_test, y_pred))


0.631578947368
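
In [ ]:
# accuracy is just the fraction of correct predictions; this added sketch
# recomputes it directly to confirm the metrics call above
import numpy as np
print(np.mean(y_pred == y_test))    # should match the score above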

In [30]:
# a detailed per-class report
print(metrics.classification_report(y_test, y_pred, target_names=iris.target_names))


             precision    recall  f1-score   support

     setosa       1.00      1.00      1.00         8
 versicolor       0.42      0.73      0.53        11
  virginica       0.73      0.42      0.53        19

avg / total       0.70      0.63      0.63        38
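
In [ ]:
# where the report's numbers come from (an added sketch): with the
# confusion matrix (rows = true class, columns = predicted class),
# precision is the diagonal over the column sums and recall is the
# diagonal over the row sums, in iris.target_names order
import numpy as np
from sklearn import metrics
cm = metrics.confusion_matrix(y_test, y_pred)
print(cm)
print(np.diag(cm) / cm.sum(axis=0).astype(float))   # precision per class
print(np.diag(cm) / cm.sum(axis=1).astype(float))   # recall per class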

