In [1]:
import math
import numpy as np
import pandas as pd
from sklearn.grid_search import GridSearchCV
from sklearn.linear_model import LogisticRegressionCV
from sklearn.naive_bayes import GaussianNB
from sklearn.cross_validation import cross_val_score, cross_val_predict, StratifiedKFold 
from sklearn import preprocessing, metrics, svm, ensemble
from sklearn.metrics import accuracy_score, classification_report
import tabpy_client


/Users/zhangjunwu/anaconda/envs/Tableau-Python-Server/lib/python2.7/site-packages/sklearn/cross_validation.py:41: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.
  "This module will be removed in 0.20.", DeprecationWarning)
/Users/zhangjunwu/anaconda/envs/Tableau-Python-Server/lib/python2.7/site-packages/sklearn/grid_search.py:42: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. This module will be removed in 0.20.
  DeprecationWarning)

In [2]:
# Iris dataset: http://aima.cs.berkeley.edu/data/iris.csv
df = pd.read_csv('./iris2.csv', header=0)

# Preview the first ten rows to check the column layout
df.head(10)


Out[2]:
sepal_length sepal_width petal_length petal_width Class
0 5.1 3.5 1.4 0.2 setosa
1 4.9 3.0 1.4 0.2 setosa
2 4.7 3.2 1.3 0.2 setosa
3 4.6 3.1 1.5 0.2 setosa
4 5.0 3.6 1.4 0.2 setosa
5 5.4 3.9 1.7 0.4 setosa
6 4.6 3.4 1.4 0.3 setosa
7 5.0 3.4 1.5 0.2 setosa
8 4.4 2.9 1.4 0.2 setosa
9 4.9 3.1 1.5 0.1 setosa

In [3]:
# Convert the textual class labels to integer codes with a LabelEncoder.
# The fitted encoder is kept so predictions can be mapped back to the
# original species names later.
encoder = preprocessing.LabelEncoder()
class_labels = df['Class']
df['Class'] = encoder.fit_transform(class_labels)

# Verify the encoding took effect
df.head(10)


Out[3]:
sepal_length sepal_width petal_length petal_width Class
0 5.1 3.5 1.4 0.2 0
1 4.9 3.0 1.4 0.2 0
2 4.7 3.2 1.3 0.2 0
3 4.6 3.1 1.5 0.2 0
4 5.0 3.6 1.4 0.2 0
5 5.4 3.9 1.7 0.4 0
6 4.6 3.4 1.4 0.3 0
7 5.0 3.4 1.5 0.2 0
8 4.4 2.9 1.4 0.2 0
9 4.9 3.1 1.5 0.1 0

In [4]:
# Split columns into independent/predictor variables vs dependent/response/outcome variable
X = np.array(df.drop(['Class'], axis=1))  # explicit axis keyword (positional axis is deprecated in newer pandas)
y = np.array(df['Class'])

# Scale the data. The same fitted scaler is reused later by the scoring function.
scaler = preprocessing.StandardScaler().fit(X)
X = scaler.transform(X)

# 10-fold stratified cross-validation
# (pre-0.18 sklearn.cross_validation API: labels are passed at construction time)
kf = StratifiedKFold(y, n_folds=10, random_state=None, shuffle=True)

# Parameter grid to use for tuning the Support Vector Machine
parameters = [{'kernel': ['rbf'], 'gamma': [1e-3, 1e-4],
               'C': [1, 10, 100, 1000]},
              {'kernel': ['linear'], 'C': [1, 10, 100, 1000]}]

# Pick accuracy as the goal you're optimizing
scoringmethods = ['accuracy']

# Parenthesized print works on both Python 2 and Python 3
print('pass')


pass

In [5]:
# Iterate through different metrics looking for the best parameter set.
# scoringmethods holds only 'accuracy', so the loop runs once and svmclf
# from the final iteration is reused for the reports below.
for score in scoringmethods:
    print("~~~ Hyper-parameter tuning for best %s ~~~" % score)
    
    # Grid search with the stratified 10-fold CV splits for the SVM.
    # n_jobs=1 runs the folds serially; set n_jobs=-1 to use all cores.
    svmclf = GridSearchCV(svm.SVC(C=1), parameters, cv=kf, scoring=score,n_jobs=1)
    svmclf.fit(X, y)
    
    # Show each result from the grid search. grid_scores_ is the pre-0.18
    # results attribute (replaced by cv_results_ in sklearn.model_selection);
    # each entry holds (parameter dict, mean CV score, per-fold score array).
    print("Scores for different parameter combinations in the grid:")
    for params, mean_score, scores in svmclf.grid_scores_:
        print("  %0.3f (+/-%0.03f) for %r"
              % (mean_score, scores.std() / 2, params)) 
    print("")
# Classification report for the best model, evaluated over the full dataset.
# NOTE(review): this is the same data the model was fit on, so these numbers
# are optimistic training-set estimates, not held-out performance.
print("Classification report:")
y_pred = svmclf.predict(X)
print(classification_report(y, y_pred))
    
# Show the definition of the best model (the refit best_estimator_)
print("Best model:")
print(svmclf.best_estimator_)
    
# Show accuracy on the full dataset
print("Accuracy: %0.3f" % accuracy_score(y, y_pred, normalize=True))
print("")


~~~ Hyper-parameter tuning for best accuracy ~~~
Scores for different parameter combinations in the grid:
  0.853 (+/-0.036) for {'kernel': 'rbf', 'C': 1, 'gamma': 0.001}
  0.853 (+/-0.036) for {'kernel': 'rbf', 'C': 1, 'gamma': 0.0001}
  0.900 (+/-0.031) for {'kernel': 'rbf', 'C': 10, 'gamma': 0.001}
  0.853 (+/-0.036) for {'kernel': 'rbf', 'C': 10, 'gamma': 0.0001}
  0.967 (+/-0.017) for {'kernel': 'rbf', 'C': 100, 'gamma': 0.001}
  0.900 (+/-0.031) for {'kernel': 'rbf', 'C': 100, 'gamma': 0.0001}
  0.967 (+/-0.022) for {'kernel': 'rbf', 'C': 1000, 'gamma': 0.001}
  0.967 (+/-0.017) for {'kernel': 'rbf', 'C': 1000, 'gamma': 0.0001}
  0.973 (+/-0.022) for {'kernel': 'linear', 'C': 1}
  0.967 (+/-0.022) for {'kernel': 'linear', 'C': 10}
  0.973 (+/-0.022) for {'kernel': 'linear', 'C': 100}
  0.980 (+/-0.021) for {'kernel': 'linear', 'C': 1000}

Classification report:
             precision    recall  f1-score   support

          0       1.00      1.00      1.00        50
          1       0.98      0.98      0.98        50
          2       0.98      0.98      0.98        50

avg / total       0.99      0.99      0.99       150

Best model:
SVC(C=1000, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)
Accuracy: 0.987


In [6]:
# Logistic regression with 10-fold stratified cross-validation, using the
# model-specific CV search (LogisticRegressionCV) over a log-spaced C grid.
# NOTE(review): scoring='roc_auc' with a 3-class target is unusual — confirm
# the intended metric for a multiclass problem.
c_grid = list(np.power(10.0, np.arange(-10, 10)))
lgclf = LogisticRegressionCV(Cs=c_grid, penalty='l2', scoring='roc_auc', cv=kf)
lgclf.fit(X, y)
y_pred = lgclf.predict(X)

# Classification report over the full (training) dataset
print("Classification report:")
print(classification_report(y, y_pred))

# Overall accuracy on the same data
print("Accuracy: %0.3f" % accuracy_score(y, y_pred, normalize=True))


Classification report:
             precision    recall  f1-score   support

          0       0.79      0.96      0.86        50
          1       0.85      0.70      0.77        50
          2       0.96      0.92      0.94        50

avg / total       0.87      0.86      0.86       150

Accuracy: 0.860

In [7]:
# Gaussian Naive Bayes scored with the same 10-fold stratified CV splits
nbclf = GaussianNB()
fold_accuracies = cross_val_score(nbclf, X, y, cv=kf, scoring='accuracy')

# Report the mean accuracy across the ten folds
print("Accuracy: %0.3f" % fold_accuracies.mean())


Accuracy: 0.947

In [13]:
# Connect to the TabPy server using the client library
# (assumes a TabPy instance is listening on localhost:9004)
connection = tabpy_client.Client('http://localhost:9004/')
# Parenthesized print works on both Python 2 and Python 3
print('connect success')


connect success

In [14]:
# The scoring function that will use the SVM Classifier to classify new data points.
# (The name keeps the original "classiffier" spelling because it matches the
# endpoint deployed to TabPy below.)
def iris_classiffier2(sepal_length, sepal_width, petal_length, petal_width):
    """Classify iris samples and return their species names.

    The four parameters are parallel sequences of numeric measurements
    (as Tableau passes them). They are stacked column-wise into an
    (n_samples, 4) feature matrix, scaled with the StandardScaler fitted
    on the training data, classified with the tuned SVM, and decoded
    back to the original string labels.

    Returns a list of class-name strings, one per input row.
    """
    # Use a local name distinct from the module-level training matrix X
    features = np.column_stack([sepal_length, sepal_width, petal_length, petal_width])
    # Reuse the fitted scaler so new points are on the training scale
    features = scaler.transform(features)
    # Predict integer classes, then map them back to species names
    return encoder.inverse_transform(svmclf.predict(features)).tolist()

# Parenthesized print works on both Python 2 and Python 3
print('define success')


define success

In [15]:
# Publish the function to the TabPy server so it can be used from Tableau,
# under the name Iris_Classiffier2 with a short description.
# override=True replaces any previously deployed endpoint of the same name.
connection.deploy('Iris_Classiffier2',
                  iris_classiffier2,
                  'Returns Iris dataset prediction', override=True)
# Parenthesized print works on both Python 2 and Python 3
print('deploy success!')


deploy success!

In [ ]: