In [1]:
# NOTE(review): sklearn.grid_search and sklearn.cross_validation are the
# pre-0.18 scikit-learn module names (both replaced by sklearn.model_selection
# and removed in 0.20) -- this notebook requires an old scikit-learn version.
# The Python-2 print statements below pin it to Python 2 as well.
import math
import numpy as np
import pandas as pd
from sklearn.grid_search import GridSearchCV
from sklearn.linear_model import LogisticRegressionCV
from sklearn.naive_bayes import GaussianNB
from sklearn.cross_validation import cross_val_score, cross_val_predict, StratifiedKFold
from sklearn import preprocessing, metrics, svm, ensemble
from sklearn.metrics import accuracy_score, classification_report
import tabpy_client
In [2]:
# Iris dataset: http://aima.cs.berkeley.edu/data/iris.csv
df = pd.read_csv('./iris2.csv', header=0)
# Preview the first ten rows to inspect the file structure
df.head(10)
Out[2]:
In [3]:
# Encode the textual class labels as integers with LabelEncoder.
# The same fitted encoder is reused later to map numeric predictions
# back to the original text labels.
encoder = preprocessing.LabelEncoder()
df['Class'] = encoder.fit_transform(df['Class'])
# Verify the result of the transform
df.head(10)
Out[3]:
In [4]:
# Split columns into independent/predictor variables vs dependent/response variable.
# Use the 'axis' keyword: the positional axis argument to DataFrame.drop is
# deprecated in pandas.
X = np.array(df.drop(['Class'], axis=1))
y = np.array(df['Class'])

# Scale the data. The same fitted scaler is reused later by the scoring function.
scaler = preprocessing.StandardScaler().fit(X)
X = scaler.transform(X)

# 10-fold stratified cross-validation (old sklearn.cross_validation API:
# the labels are passed to the StratifiedKFold constructor).
kf = StratifiedKFold(y, n_folds=10, random_state=None, shuffle=True)

# Parameter grid used for tuning the Support Vector Machine
parameters = [{'kernel': ['rbf'], 'gamma': [1e-3, 1e-4],
               'C': [1, 10, 100, 1000]},
              {'kernel': ['linear'], 'C': [1, 10, 100, 1000]}]

# Pick accuracy as the goal you're optimizing
scoringmethods = ['accuracy']

# print() function form works under both Python 2 and Python 3
print('pass')
In [5]:
# Iterate through the chosen metrics, tuning hyper-parameters for each one
for score in scoringmethods:
    print("~~~ Hyper-parameter tuning for best %s ~~~" % score)

    # Grid search with cross-validation for the Support Vector Machine.
    # n_jobs=-1 runs the search in parallel on all available cores
    # (the code previously passed n_jobs=1, contradicting its own comment).
    svmclf = GridSearchCV(svm.SVC(C=1), parameters, cv=kf, scoring=score, n_jobs=-1)
    svmclf.fit(X, y)

    # Show each result from the grid search (old sklearn grid_scores_ API,
    # removed in favour of cv_results_ in later versions).
    print("Scores for different parameter combinations in the grid:")
    for params, mean_score, scores in svmclf.grid_scores_:
        # NOTE(review): "+/- std/2" is unusual; the sklearn examples report
        # "+/- 2*std" -- confirm the intended interval before changing.
        print(" %0.3f (+/-%0.03f) for %r"
              % (mean_score, scores.std() / 2, params))
    print("")

    # Classification report for the best model, evaluated over the full dataset
    print("Classification report:")
    y_pred = svmclf.predict(X)
    print(classification_report(y, y_pred))

    # Definition of the best model found by the search
    print("Best model:")
    print(svmclf.best_estimator_)

    # Overall accuracy of the best model on the full dataset
    print("Accuracy: %0.3f" % accuracy_score(y, y_pred, normalize=True))
    print("")
In [6]:
# Logistic regression with 10-fold stratified cross-validation, using the
# model-specific cross-validation built into scikit-learn (LogisticRegressionCV).
# NOTE(review): scoring='roc_auc' is binary-only in this sklearn generation and
# the Iris data has three classes -- confirm this scoring choice actually runs.
lgclf = LogisticRegressionCV(Cs=list(np.power(10.0, np.arange(-10, 10))),
                             penalty='l2', scoring='roc_auc', cv=kf)
lgclf.fit(X, y)
y_pred = lgclf.predict(X)

# Classification report for the fitted model, evaluated over the full dataset
print("Classification report:")
print(classification_report(y, y_pred))

# Overall accuracy on the full dataset
print("Accuracy: %0.3f" % accuracy_score(y, y_pred, normalize=True))
In [7]:
# Naive Bayes evaluated with 10-fold stratified cross-validation
nbclf = GaussianNB()
scores = cross_val_score(nbclf, X, y, cv=kf, scoring='accuracy')
# Mean accuracy across the folds
print("Accuracy: %0.3f" % scores.mean())
In [13]:
# Connect to the TabPy server using the client library
connection = tabpy_client.Client('http://localhost:9004/')
# print() function form works under both Python 2 and Python 3
print('connect success')
In [14]:
# The scoring function that uses the tuned SVM classifier to classify new
# data points. Relies on the module-level objects fitted above:
# scaler (StandardScaler), svmclf (GridSearchCV/SVM), encoder (LabelEncoder).
def iris_classiffier2(sepal_length, sepal_width, petal_length, petal_width):
    """Return predicted Iris class labels for the given measurements.

    Each argument is a sequence of equal length (one value per observation).
    The columns are stacked, scaled with the previously fitted scaler,
    classified with the tuned SVM, and the numeric predictions are decoded
    back to the original text labels.
    """
    X = np.column_stack([sepal_length, sepal_width, petal_length, petal_width])
    X = scaler.transform(X)
    return encoder.inverse_transform(svmclf.predict(X)).tolist()

# print() function form works under both Python 2 and Python 3
print('define success')
In [15]:
# Publish the function to the TabPy server so it can be used from Tableau,
# under the name Iris_Classiffier2 with a short description.
# override=True replaces any previously deployed version of the same name.
connection.deploy('Iris_Classiffier2',
                  iris_classiffier2,
                  'Returns Iris dataset prediction', override=True)
# print() function form works under both Python 2 and Python 3
print('deploy success!')
In [ ]: