In [ ]:
import requests
import json
import os

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
import scipy
from scipy import *

%matplotlib inline 

#Scikitlearn imports
from sklearn.datasets.samples_generator import make_blobs, make_moons
from sklearn.cross_validation import train_test_split
from sklearn.cluster import KMeans, SpectralClustering
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, ExtraTreesClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix
from sklearn.svm import SVC # "Support Vector Classifier"

# "talk" context with fonts scaled by a further factor of 1.3.
sns.set_context("talk", font_scale=1.3)

# rc overrides handed to sns.set_style together with the "white" base style.
ben_style = {
    # axes
    'axes.axisbelow': True,
    'axes.edgecolor': '0',
    'axes.facecolor': 'white',
    'axes.grid': True,
    'axes.labelcolor': '.15',
    'axes.linewidth': 1,
    # figure, fonts and text
    'figure.facecolor': 'white',
    'font.family': [u'sans-serif'],
    'font.size': 75,
    'font.sans-serif': [u'Helvetica Neue', u'sans-serif'],
    'text.color': '.15',
    # grid, images, legend and lines
    'grid.color': '0.9',
    'grid.linestyle': u'',
    'image.cmap': u'Greys',
    'legend.frameon': False,
    'legend.numpoints': 1,
    'legend.scatterpoints': 1,
    'lines.solid_capstyle': u'round',
    # ticks
    'xtick.color': '.15',
    'xtick.direction': u'in',
    'xtick.major.size': 3.0,
    'xtick.minor.size': 2.0,
    'ytick.color': '.15',
    'ytick.direction': u'in',
    'ytick.major.size': 3.0,
    'ytick.minor.size': 2.0,
}

sns.set_style("white", ben_style)

# Base URL that the request helpers in this notebook prepend to every endpoint.
service_location = "http://localhost:3000"
# When True, printd() echoes its argument; when False it stays silent.
debug = False

def printd(str):
    """Print ``str`` only while the module-level ``debug`` flag is truthy.

    The parameter name shadows the ``str`` builtin; it is left unchanged so
    that existing keyword callers keep working.
    """
    if debug:
        # Call form of print: behaves the same on Python 2 and Python 3 for a
        # single argument, whereas the statement form is Python 2 only.
        print(str)

def get_collection_member(id=None, **kw):
    """Request the member listing of one collection.

    id -- identifier of the collection; no request is made when it is None.
    kw -- accepted but not used.

    Returns the ``requests.get`` response for
    ``<service_location>/collection/<id>/member``, or None when ``id`` is None.
    """
    if id is not None:
        url = service_location + '/collection/%s/member' % (id)
        return requests.get(url)
    return None

def get_ko(id=None, query=None):
    """Fetch ko documents from the service by id, by id list, or by query.

    id    -- a single ko id, or a list of ids. A list is converted into an
             ``{"_id": {"$in": [...]}}`` query, replacing any ``query`` given.
             None entries in the list are dropped before the query is built.
    query -- dict that is JSON-encoded into the ``where`` URL parameter.

    Returns the ``requests.get`` response, or None (no request made) when
    neither ``id`` nor ``query`` is supplied.
    """
    if not id and not query:
        return None

    if isinstance(id, list):
        # Callers may pass None placeholders for entries that are not kos;
        # they can never match an _id, so leave them out of the $in list.
        ids = [x for x in id if x is not None]
        # Diagnostic output goes through printd so it honours the debug flag.
        printd(ids)
        query = {"_id": {"$in": ids}}
        id = None

    query_string = "?where=%s" % (json.dumps(query)) if query else ""
    if id is None:
        id = ""

    request = service_location + '/ko/%s%s' % (id, query_string)
    printd(request)
    return requests.get(request)

def create_ko(ko):
    """POST a ko to the service.

    ko -- object sent as the JSON body of the request.

    Returns the ``requests.post`` response for ``<service_location>/ko``.
    """
    endpoint = service_location + '/ko'
    return requests.post(endpoint, json=ko)

def create_collection(col):
    """POST a collection (or list of collections) to the service.

    col -- object sent as the JSON body of the request.

    Returns the ``requests.post`` response for ``<service_location>/collection``.
    """
    endpoint = service_location + '/collection'
    return requests.post(endpoint, json=col)

def tag_search(tag):
    """Build a query dict that matches documents carrying the given tag.

    tag -- dict with a mandatory "key" entry and an optional "value" entry.

    Returns ``{"tag": {"$elemMatch": criteria}}`` where ``criteria`` holds the
    key and, only if "value" is present in ``tag``, the value as well.
    Raises KeyError when ``tag`` has no "key" entry.
    """
    criteria = dict(key=tag['key'])
    if "value" in tag:
        criteria['value'] = tag['value']
    return {"tag": {"$elemMatch": criteria}}
    
def get_collection(id=None,query=None):
    """Fetch collections from the service by id and/or by query.

    id    -- collection id appended to the ``/collection/`` path; optional.
    query -- dict that is JSON-encoded into the ``where`` URL parameter.

    Returns the ``requests.get`` response, or None (no request made) when
    neither ``id`` nor ``query`` is supplied.
    """
    if not id and not query:
        return None

    # Without a query no "where" parameter is sent at all, matching get_ko;
    # previously a dangling "?where=" was appended to plain id lookups.
    query_string = "?where=%s" % (json.dumps(query)) if query else ""
    if id is None:
        id = ""

    request = service_location + '/collection/%s%s' % (id, query_string)
    printd(request)
    return requests.get(request)

Create test datasets

In this case we are making n=5 "Gaussian blobs" with st_dev=1. We will look at 3 cases:

  • easy case: The blobs are well separated
  • medium case: Blobs overlap slightly
  • harder case: Blobs are moon shaped and intersect

In [ ]:
# Synthetic-data parameters: 5 blob centres, unit standard deviation, 1000 samples.
num_centers = 5
st_dev = 1
n_samples = 1000
noise = 0.12  # only read by the commented-out make_moons "harder case" below
#Easy Case (well-separated blobs) -- uncomment to use instead of the medium case
#x,y = make_blobs(n_samples=n_samples, centers=num_centers, cluster_std=st_dev, random_state=10)

#Medium Case (blobs overlap slightly) -- the active case
x,y = make_blobs(n_samples=n_samples, centers=num_centers, cluster_std=st_dev, random_state=0)
# Arrays are converted to plain lists so the dataset can be embedded in a JSON request body.
dataset = {"x":x.tolist(), "y":y.tolist()}

#Harder Case (moon-shaped data) -- uncomment both lines to use
# x,y = make_moons(n_samples=n_samples, noise=noise)
# dataset = {"x":x.tolist(), "y":y.tolist()}

Create ko


In [ ]:
# Wrap the dataset in a ko record, POST it, and remember the id from the response.
sample_ko = {"owner":"blaiszik","key":"test","object":"test","uri":["http://google.com"],"data":dataset}
r = create_ko(sample_ko)
result = r.json()
new_id = result['_id']
# Call form of print works on both Python 2 and Python 3.
print("Created ko: %s" % (new_id))

Create a Collection and add ko


In [ ]:
# Collection tagged "tutorial-new6" whose single member is the ko whose
# creation response is currently held in `result`.
sample_collection = {
    "owner": "blaiszik",
    "name": "aps-tutorial",
    "uri": [],
    "tag": [{"key": "tutorial-new6", "value": None}],
    "member": [{"data_type": "ko", "_id": result['_id']}],
}
# The collection is posted wrapped in a one-element list.
r = create_collection([sample_collection])

Load a Collection and read ko members


In [ ]:
# Get a collection based on a tag search; the first match is used.
r = get_collection(query=tag_search({"key":"tutorial-new6"}))
result = r.json()[0]

# Read collection ko members: keep only members of type "ko" instead of
# emitting a None placeholder for every other member.
ids = [member['_id'] for member in result['member'] if member['data_type'] == 'ko']
r = get_ko(id=ids)
# get_ko returns None (no request made) when the id list is empty.
r.json() if r is not None else None

Get ko By ID or by Query


In [ ]:
# Call form of print works on both Python 2 and Python 3.
print("Getting ko %s" % (new_id))
r = get_ko(new_id)

# Alternative: look kos up by query instead of by id.
# print("Getting ko by query:")
# query = {"owner":"wilde"}
# r = get_ko(query=query)

result = r.json()

# Alternative: a list of ids is accepted as well.
#r = get_ko(id=["5679aad366304c16141be297","5679aad366304c16141be297"])

# Display the already-parsed response rather than parsing it a second time.
result

In [ ]:
# Rebuild the stored dataset as one frame: feature columns x1/x2 next to label column y.
ko_data = result['data']
df1 = pd.DataFrame(ko_data['x'], columns=["x1", "x2"])
df2 = pd.DataFrame(ko_data['y'], columns=["y"])
df = pd.concat((df1, df2), axis=1)

In [ ]:
# Scatter of the two features, coloured by the label column.
plt.scatter(
    x=df['x1'],
    y=df['x2'],
    s=50,
    c=df['y'],
    cmap=plt.cm.RdBu_r,
)
sns.despine()

Classifier Testing


In [ ]:
# Classifiers to compare, each paired with the title used for its plot.
clfs = [
    (ExtraTreesClassifier(n_estimators=10), "Extra Trees"),
    (RandomForestClassifier(n_estimators=10), "Random Forest"),
    (GaussianNB(), "Gaussian Naive-Bayes"),
]

# The evaluation mesh depends only on the data, so it is built once rather
# than once per classifier.
h = .01  # step size in the mesh
x_min, x_max = df['x1'].min() - 1, df['x1'].max() + 1
y_min, y_max = df['x2'].min() - 1, df['x2'].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                     np.arange(y_min, y_max, h))
mesh_points = np.c_[xx.ravel(), yy.ravel()]

for clf, title in clfs:
    clf.fit(df[['x1','x2']], df['y'])
    fig = sns.lmplot(x="x1", y="x2", data=df, order=1, hue="y", fit_reg=False, scatter_kws={"s": 50})
    # lmplot draws x1 on the horizontal axis and x2 on the vertical axis.
    fig.ax.set_xlabel('x1')
    fig.ax.set_ylabel('x2')
    fig.ax.set_title(title)

    ## Plot decision contour or probability function
    if len(clf.classes_) > 2:
        # Multi-class: decision_function / predict_proba return one column per
        # class, which neither reshapes to the mesh nor describes every class
        # when a single column is picked, so shade the predicted class instead.
        Z = clf.predict(mesh_points)
    elif hasattr(clf, "decision_function"):
        Z = clf.decision_function(mesh_points)
    else:
        Z = clf.predict_proba(mesh_points)[:, 1]
    Z = Z.reshape(xx.shape)
    fig.ax.contourf(xx, yy, Z, alpha=0.2, cmap=plt.cm.RdBu_r)
Clustering

Find the optimal number of clusters

  • Automatic grouping of similar objects into sets.
  • Applications: Customer segmentation, Grouping experiment outcomes

In [ ]:
# Elbow plot: K-Means inertia (distortion) for each candidate number of clusters.
silhouette_range = range(1, 10)  # candidate cluster counts 1..9
kmeans_options = dict(init='k-means++', n_init=10, max_iter=300, random_state=0)

distortions = []
for i in silhouette_range:
    km = KMeans(n_clusters=i, **kmeans_options)
    km.fit(df[['x1', 'x2']])
    distortions.append(km.inertia_)

plt.plot(silhouette_range, distortions, marker='o')
plt.xlabel('Number of clusters')
plt.ylabel('Distortion')
plt.tight_layout()
#plt.savefig('./figures/elbow.png', dpi=300)
sns.despine()
plt.show()

Silhouette Averages


In [ ]:
import numpy as np
from matplotlib import cm
from sklearn.metrics import silhouette_samples

# Mean silhouette value of a K-Means clustering for each candidate number of
# clusters (starting at 2, the first value used here).
silhouette_avg = []
silhouette_range = range(2,10)

for i in silhouette_range:
    km = KMeans(n_clusters=i,
                init='k-means++',
                n_init=10,
                max_iter=300,
                tol=1e-04,
                random_state=0)
    y_km = km.fit_predict(df[['x1','x2']])

    # Per-sample silhouette values, averaged into one score per cluster count.
    silhouette_vals = silhouette_samples(df[['x1','x2']], y_km, metric='euclidean')
    silhouette_avg.append(np.mean(silhouette_vals))

plt.plot(silhouette_range, silhouette_avg, marker='o')
# Label the axes so the figure stands on its own, like the elbow plot.
plt.xlabel('Number of clusters')
plt.ylabel('Average silhouette value')
sns.despine()
K-Means Clustering


In [ ]:
# Toggles for the two optional overlays drawn below.
show_decision = True
show_centroids = True

#K-Means
# Seeded and with an explicit n_init, like the other K-Means cells, so the
# clustering is reproducible from run to run.
clr = KMeans(n_clusters=num_centers, n_init=10, random_state=0)
y_pred = clr.fit_predict(df[['x1','x2']])

##Decision Boundary
if show_decision:
    # Step size of the mesh. Decrease to increase the quality of the VQ.
    h = .02

    # Plot the decision boundary. For that, we will assign a color to each
    # point in the mesh [x_min, x_max]x[y_min, y_max].
    x_min, x_max = df['x1'].min() - 1, df['x1'].max() + 1
    y_min, y_max = df['x2'].min() - 1, df['x2'].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))

    Z = clr.predict(np.c_[xx.ravel(), yy.ravel()])

    # Put the result into a color plot
    Z = Z.reshape(xx.shape)
    plt.figure(1)
    plt.clf()

    plt.imshow(Z, interpolation='nearest',
               extent=(xx.min(), xx.max(), yy.min(), yy.max()),
               cmap=plt.cm.RdBu_r,
               aspect='auto', origin='lower', alpha=0.25, zorder=0)
    sns.despine()

# Data points are coloured by the label column df['y'], not by y_pred.
plt.scatter(df['x1'],df['x2'], c=df['y'], cmap=plt.cm.RdBu_r, zorder=1)
ax = plt.gca()
ax.set_xlabel('x1')
ax.set_ylabel('x2')

if show_centroids:
    # Plot the centroids as dark-blue circles on top of the data
    centroids = clr.cluster_centers_
    plt.scatter(centroids[:, 0], centroids[:, 1],
                marker='o', s=100, linewidths=8,
                color='darkblue', zorder=2)

Deep Neural Network


In [ ]:
# Print the DataFrame.info() summary for the two feature columns.
feature_columns = ['x1', 'x2']
df[feature_columns].info()

In [ ]:
import skflow
from sklearn import datasets, metrics

iris = datasets.load_iris()
# The iris data set has three target classes, so the network is given
# n_classes=3 (it was previously configured for 4).
clf = skflow.TensorFlowDNNClassifier(hidden_units=[100, 200, 100], n_classes=3)

clf.fit(iris.data, iris.target)
# accuracy_score takes (y_true, y_pred). NOTE: the score is computed on the
# same samples the model was fitted on, so it is a training accuracy.
score = metrics.accuracy_score(iris.target, clf.predict(iris.data))
print("Accuracy: %f" % score)

In [ ]: