In [ ]:
import requests
import json
import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
import scipy
from scipy import *
%matplotlib inline
#Scikitlearn imports
from sklearn.datasets.samples_generator import make_blobs, make_moons
from sklearn.cross_validation import train_test_split
from sklearn.cluster import KMeans, SpectralClustering
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, ExtraTreesClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix
from sklearn.svm import SVC # "Support Vector Classifier"
# Plot styling: seaborn "talk" context with a 1.3x font scale, followed by a
# custom rc override dictionary applied on top of seaborn's "white" style.
sns.set_context("talk", font_scale=1.3)
# rc overrides handed to sns.set_style() below.
ben_style = {'axes.axisbelow': True,
'axes.edgecolor': '0',
'axes.facecolor': 'white',
'axes.grid': True,
'axes.labelcolor': '.15',
'axes.linewidth': 1,
'figure.facecolor': 'white',
'font.family': [u'sans-serif'],
'font.size': 75,
'font.sans-serif': [u'Helvetica Neue',
u'sans-serif'],
'grid.color': '0.9',
# NOTE(review): the grid is switched on above but given an empty linestyle;
# presumably that keeps the grid lines invisible -- confirm against matplotlib.
'grid.linestyle': u'',
'image.cmap': u'Greys',
'legend.frameon': False,
'legend.numpoints': 1,
'legend.scatterpoints': 1,
'lines.solid_capstyle': u'round',
'text.color': '.15',
'xtick.color': '.15',
'xtick.direction': u'in',
'xtick.major.size': 3.0,
'xtick.minor.size': 2.0,
'ytick.color': '.15',
'ytick.direction': u'in',
'ytick.major.size': 3.0,
'ytick.minor.size': 2.0}
sns.set_style("white", ben_style)
# Base URL that the REST helper functions below prepend to every request path.
service_location = "http://localhost:3000"
# When truthy, printd() echoes its argument; otherwise printd() stays silent.
debug = False
def printd(str):
    """Print *str* only when the module-level ``debug`` flag is truthy.

    The parameter keeps its original name (it shadows the builtin ``str``
    inside this function only) so existing callers are unaffected.
    """
    if debug:
        # Single-argument call form: valid on both Python 2 and Python 3,
        # unlike the Python 2-only ``print str`` statement.
        print(str)
def get_collection_member(id=None, **kw):
    """Request the member listing of a single collection.

    Sends ``GET <service_location>/collection/<id>/member`` and returns the
    ``requests`` response object. When ``id`` is ``None`` no request is made
    and ``None`` is returned. Extra keyword arguments are accepted but unused.
    """
    if id is not None:
        member_url = service_location + '/collection/%s/member' % id
        return requests.get(member_url)
    return None
def get_ko(id=None, query=None):
    """Fetch one or more "ko" records from the service.

    Parameters:
        id: a single ko id, or a list of ids. A list is turned into an
            ``{"_id": {"$in": [...]}}`` query (replacing any ``query`` that
            was passed); ``None`` entries in the list are dropped first.
        query: a dict that is JSON-encoded into the ``where`` URL parameter.

    Returns:
        The ``requests`` response of ``GET <service_location>/ko/...``, or
        ``None`` (without contacting the service) when both ``id`` and
        ``query`` are empty.
    """
    if not id and not query:
        return None
    query_string = ""
    if isinstance(id, list):
        # Callers may build the list with None placeholders for members that
        # are not kos; those must not end up inside the $in clause.
        wanted = [x for x in id if x is not None]
        printd(wanted)
        query = {"_id": {"$in": wanted}}
        id = None
    if query:
        query_string = "?where=%s" % (json.dumps(query))
    if id is None:
        # No single id: address the /ko/ listing endpoint itself.
        id = ""
    request = service_location + '/ko/%s%s' % (id, query_string)
    printd(request)
    return requests.get(request)
def create_ko(ko):
    """POST *ko* as a JSON body to ``<service_location>/ko``.

    Returns the ``requests`` response object unchanged.
    """
    endpoint = service_location + '/ko'
    return requests.post(endpoint, json=ko)
def create_collection(col):
    """POST *col* as a JSON body to ``<service_location>/collection``.

    Returns the ``requests`` response object unchanged.
    """
    endpoint = service_location + '/collection'
    return requests.post(endpoint, json=col)
def tag_search(tag):
    """Build a ``$elemMatch`` query dict that matches on a tag.

    ``tag`` must contain ``'key'``; its ``'value'`` entry is included in the
    match criteria only when that key is present in ``tag`` (even if the
    value itself is ``None``). Any other entries of ``tag`` are ignored.
    """
    criteria = dict(key=tag['key'])
    if "value" in tag:
        criteria.update(value=tag['value'])
    return {"tag": {"$elemMatch": criteria}}
def get_collection(id=None, query=None):
    """Fetch collections from the service by id and/or by query.

    Parameters:
        id: id of a single collection; appended to the ``/collection/`` path.
        query: a dict that is JSON-encoded into the ``where`` URL parameter.

    Returns:
        The ``requests`` response of ``GET <service_location>/collection/...``,
        or ``None`` (without contacting the service) when both ``id`` and
        ``query`` are empty.
    """
    if not id and not query:
        return None
    # Start with no query string, as get_ko does, so an id-only lookup is not
    # sent with a dangling empty "?where=" parameter.
    query_string = ""
    if query:
        query_string = "?where=%s" % (json.dumps(query))
    if id is None:
        # No single id: address the /collection/ listing endpoint itself.
        id = ""
    request = service_location + '/collection/%s%s' % (id, query_string)
    printd(request)
    return requests.get(request)
In [ ]:
# Synthetic dataset: `num_centers` blobs with slight overlaps
num_centers = 5
st_dev = 1
n_samples = 1000
noise = 0.12  # only referenced by the commented-out make_moons variant below
#Easy Case
#x,y = make_blobs(n_samples=n_samples, centers=num_centers, cluster_std=st_dev, random_state=10)
#Medium Case
x,y = make_blobs(n_samples=n_samples, centers=num_centers, cluster_std=st_dev, random_state=0)
# Convert both arrays to plain lists so the dataset can be sent as JSON.
dataset = {"x":x.tolist(), "y":y.tolist()}
#Harder Case
# x,y = make_moons(n_samples=n_samples, noise=noise)
# dataset = {"x":x.tolist(), "y":y.tolist()}
In [ ]:
# Store the generated dataset on the service as a new ko and remember its id.
sample_ko = {"owner":"blaiszik","key":"test","object":"test","uri":["http://google.com"],"data":dataset}
r = create_ko(sample_ko)
# Surface a failed request as an HTTP error here instead of a confusing
# KeyError on '_id' below.
r.raise_for_status()
result = r.json()
new_id = result['_id']
# Single-argument call form: valid on both Python 2 and Python 3.
print("Created ko: %s" % (new_id))
In [ ]:
# Create a tagged collection whose only member is the ko created earlier.
# `new_id` is used rather than `result['_id']` because `result` is rebound by
# other cells, while `new_id` always holds the id of the created ko.
sample_collection = {
    "owner": "blaiszik",
    "name": "aps-tutorial",
    "uri": [],
    "tag": [{"key": "tutorial-new6", "value": None}],
    "member": [{"data_type": "ko", "_id": new_id}],
}
# The service is sent a list of collections, here containing just this one.
r = create_collection([sample_collection])
In [ ]:
# Get a collection based on a tag search
r = get_collection(query=tag_search({"key": "tutorial-new6"}))
result = r.json()[0]
# Read the collection's ko members. Members of any other data type are
# skipped outright so that no None placeholders reach the id query.
ids = [member['_id'] for member in result['member']
       if member['data_type'] == 'ko']
r = get_ko(id=ids)
# get_ko returns None when there are no ids to look up.
r.json() if r is not None else []
In [ ]:
# Fetch the ko created earlier by its id and display the decoded document.
# get_ko also accepts a query dict (query={...}) or a list of ids (id=[...]).
# Single-argument print call: valid on both Python 2 and Python 3.
print("Getting ko %s" % (new_id))
r = get_ko(new_id)
result = r.json()
# Show the already-decoded document instead of parsing the response twice.
result
In [ ]:
# Rebuild a single table from the fetched ko: two feature columns (x1, x2)
# taken from data['x'] and one label column (y) taken from data['y'].
df1 = pd.DataFrame(data=result['data']['x'], columns=["x1", "x2"])
df2 = pd.DataFrame(data=result['data']['y'], columns=["y"])
df = pd.concat(objs=[df1, df2], axis=1)
In [ ]:
# Scatter the samples in the (x1, x2) plane, coloured by their label y.
plt.scatter(df['x1'], df['x2'],
            c=df['y'],
            s=50,
            cmap=plt.cm.RdBu_r)
sns.despine()
In [ ]:
# Fit several classifiers on the two features and, for each one, draw the
# samples on top of a filled contour of the classifier's output over a grid.
clfs = [
    (ExtraTreesClassifier(n_estimators=10), "Extra Trees"),
    (RandomForestClassifier(n_estimators=10), "Random Forest"),
    (GaussianNB(), "Gaussian Naive-Bayes"),
]

# The evaluation mesh depends only on the data, so it is built once rather
# than once per classifier.
h = .01  # step size in the mesh
x_min, x_max = df['x1'].min() - 1, df['x1'].max() + 1
y_min, y_max = df['x2'].min() - 1, df['x2'].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                     np.arange(y_min, y_max, h))
mesh_points = np.c_[xx.ravel(), yy.ravel()]

for clf, title in clfs:
    clf.fit(df[['x1', 'x2']], df['y'])
    fig = sns.lmplot(x="x1", y="x2", data=df, order=1, hue="y",
                     fit_reg=False, scatter_kws={"s": 50})
    # lmplot was asked for x="x1" / y="x2", so label the axes to match.
    fig.ax.set_xlabel('x1')
    fig.ax.set_ylabel('x2')
    fig.ax.set_title(title)
    ## Plot decision contour or probability function
    if hasattr(clf, "decision_function"):
        Z = clf.decision_function(mesh_points)
    else:
        # NOTE(review): only the predict_proba column at index 1 is drawn;
        # with more than two labels this shows a single class's probability
        # -- confirm that this is the intended picture.
        Z = clf.predict_proba(mesh_points)[:, 1]
    Z = Z.reshape(xx.shape)
    fig.ax.contourf(xx, yy, Z, alpha=0.2, cmap=plt.cm.RdBu_r)
In [ ]:
# Elbow plot: fit k-means for 1 through 9 clusters and record each fit's
# inertia (plotted as "Distortion") against the number of clusters.
silhouette_range = range(1, 10)
distortions = []
for i in silhouette_range:
    km = KMeans(
        n_clusters=i,
        init='k-means++',
        n_init=10,
        max_iter=300,
        random_state=0,
    ).fit(df[['x1', 'x2']])
    distortions.append(km.inertia_)

plt.plot(silhouette_range, distortions, marker='o')
plt.xlabel('Number of clusters')
plt.ylabel('Distortion')
plt.tight_layout()
#plt.savefig('./figures/elbow.png', dpi=300)
sns.despine()
plt.show()
In [ ]:
import numpy as np
from matplotlib import cm
from sklearn.metrics import silhouette_samples

# Mean silhouette value for k-means with 2 through 9 clusters, plotted
# against the number of clusters.
silhouette_range = range(2, 10)
silhouette_avg = []
for i in silhouette_range:
    km = KMeans(
        n_clusters=i,
        init='k-means++',
        n_init=10,
        max_iter=300,
        tol=1e-04,
        random_state=0,
    )
    y_km = km.fit_predict(df[['x1', 'x2']])
    # Per-sample silhouette values for this clustering, averaged below.
    silhouette_vals = silhouette_samples(df[['x1', 'x2']],
                                         y_km,
                                         metric='euclidean')
    silhouette_avg.append(silhouette_vals.mean())

plt.plot(silhouette_range, silhouette_avg, marker='o')
sns.despine()
In [ ]:
# Toggles for the two optional overlays drawn below.
show_decision = True
show_centroids = True

# K-Means with as many clusters as blobs were generated. random_state is
# pinned, like the other KMeans fits in this notebook, so the figure is
# reproducible from run to run.
clr = KMeans(n_clusters=num_centers, random_state=0)
y_pred = clr.fit_predict(df[['x1', 'x2']])

## Decision boundary
if show_decision:
    # Step size of the mesh. Decrease to increase the quality of the VQ.
    h = .02
    # Plot the decision boundary: predict a cluster for every point of the
    # mesh [x_min, x_max] x [y_min, y_max] and colour the point accordingly.
    x_min, x_max = df['x1'].min() - 1, df['x1'].max() + 1
    y_min, y_max = df['x2'].min() - 1, df['x2'].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                         np.arange(y_min, y_max, h))
    Z = clr.predict(np.c_[xx.ravel(), yy.ravel()])
    # Put the result into a color plot
    Z = Z.reshape(xx.shape)
    plt.figure(1)
    plt.clf()
    plt.imshow(Z, interpolation='nearest',
               extent=(xx.min(), xx.max(), yy.min(), yy.max()),
               cmap=plt.cm.RdBu_r,
               aspect='auto', origin='lower', alpha=0.25, zorder=0)

sns.despine()
# Samples are coloured by their stored label y, not by the predicted cluster.
plt.scatter(df['x1'], df['x2'], c=df['y'], cmap=plt.cm.RdBu_r, zorder=1)
ax = plt.gca()
ax.set_xlabel('x1')
ax.set_ylabel('x2')

if show_centroids:
    # Plot the centroids as dark blue circles on top of everything else
    centroids = clr.cluster_centers_
    plt.scatter(centroids[:, 0], centroids[:, 1],
                marker='o', s=100, linewidths=8,
                color='darkblue', zorder=2)
In [ ]:
# Print pandas' summary of the two feature columns.
df.loc[:, ['x1', 'x2']].info()
In [ ]:
import skflow
from sklearn import datasets, metrics

# Train a skflow DNN classifier on the iris dataset and report its accuracy
# on the same data it was trained on.
iris = datasets.load_iris()
# The class count is taken from the dataset (iris has three target names)
# instead of the previously hard-coded 4.
clf = skflow.TensorFlowDNNClassifier(hidden_units=[100, 200, 100],
                                     n_classes=len(iris.target_names))
clf.fit(iris.data, iris.target)
# accuracy_score takes the true labels first, then the predictions.
score = metrics.accuracy_score(iris.target, clf.predict(iris.data))
print("Accuracy: %f" % score)
In [ ]: