In [10]:
import numpy as np
import scipy as sp
from sklearn.datasets import fetch_mldata
import math
from collections import Counter
from scipy.cluster.vq import vq, kmeans, whiten
from skll.metrics import kappa
In [2]:
# Fetch the benchmark dataset from mldata.org.
# NOTE(review): fetch_mldata was removed from modern scikit-learn and
# mldata.org is defunct — newer environments should use fetch_openml;
# confirm which sklearn version this notebook targets.
dataset = fetch_mldata('banana-ida')
In [3]:
def laplace(stddev, size=None):
    """Draw zero-mean Laplace noise with scale `stddev`.

    Fixes a duplicate definition: the original defined `laplace` twice and
    the one-argument version silently shadowed the two-argument one.  This
    merged version covers both call forms:

    - laplace(s)       -> a single scalar sample (the behavior callers got)
    - laplace(s, size) -> an array of `size` samples (restores the first def)
    """
    # numpy treats size=None as "return one scalar", so one call handles both.
    return np.random.laplace(0, stddev, size)
def noisy_count(data, epsilon):
    """Differentially private count: true length plus Laplace(1/epsilon) noise."""
    noise = laplace(1. / epsilon)
    return len(data) + noise
def noisy_sum(data, epsilon):
    """Differentially private sum.

    Entries are clipped to [-1, 1] so one record changes the sum by at
    most 1 (the sensitivity), then Laplace(1/epsilon) noise is added.
    """
    bounded = np.clip(data, -1, 1)
    return bounded.sum() + laplace(1. / epsilon)
def noisy_average(data, epsilon):
    """Differentially private mean of `data` clipped to [-1, 1].

    An empty input yields a uniform draw from [-1, 1].  Otherwise Laplace
    noise with scale 2/epsilon is added to the clipped sum before dividing
    by the count; draws are rejected and resampled until the noisy mean
    lands inside [-1, 1], matching the original's retry loop.
    """
    bounded = np.clip(data, -1, 1)
    n = len(bounded)
    if n == 0:
        return np.random.uniform(-1, 1)
    total = np.sum(bounded)
    while True:
        avg = (total + laplace(2. / epsilon)) / n
        if -1.0 <= avg <= 1.0:
            return avg
In [4]:
def gen_data(dimensions, length):
    """Return a (length, dimensions) array of uniform [0, 1) samples."""
    flat = np.random.uniform(0, 1, dimensions * length)
    return flat.reshape(length, dimensions)
def gen_datapoint(dimensions):
    """Return one random point: `dimensions` uniform [0, 1) coordinates."""
    point = np.random.uniform(low=0, high=1, size=dimensions)
    return point
In [11]:
def perceptron_step(x, y, normal, epsilon):
    """One differentially private perceptron update.

    Misclassified samples (y * (x . normal) < 0) contribute x*y; each
    coordinate of the update is a DP average of those contributions
    added to the current `normal`.
    """
    dim = normal.shape[0]
    mistakes = [xi * yi for xi, yi in zip(x, y) if yi * np.sum(xi * normal) < 0]
    mistakes = np.asarray(mistakes).reshape((-1, dim))
    updated = np.zeros(normal.shape)
    for j in range(dim):
        updated[j] = normal[j] + noisy_average(mistakes[:, j], epsilon)
    return updated
def svm_step(x, y, normal, epsilon):
    """One differentially private SVM (hinge-loss) update.

    Margin violators (y * (x . normal) < 1) contribute x*y, and ten
    copies of -normal are stacked on as a crude regularizer pulling the
    weights toward zero; each coordinate is a DP average of the stack.
    """
    dim = normal.shape[0]
    violators = [xi * yi for xi, yi in zip(x, y) if yi * np.sum(xi * normal) < 1]
    grads = np.asarray(violators).reshape((-1, dim))
    grads = np.vstack((grads, np.array(10 * [-normal])))
    updated = np.zeros(normal.shape)
    for j in range(dim):
        updated[j] = normal[j] + noisy_average(grads[:, j], epsilon)
    return updated
def logistic_step(x, y, normal, epsilon):
    """One differentially private logistic-regression ascent step.

    Each sample contributes the log-likelihood gradient
    xi * (y01 - sigmoid(xi . normal)), where y01 = (yi + 1)/2 maps labels
    {-1, 1} to {0, 1}.  Per-coordinate gradients are DP-averaged and added
    to `normal`.

    Bug fix: the original computed 1/(1 + exp(+xi.normal)), which is
    sigmoid(-xi.normal) rather than sigmoid(xi.normal) = 1/(1 + exp(-z)),
    flipping the data-dependent part of the gradient.
    """
    grads = np.array([
        xi * ((yi + 1) / 2. - 1. / (1. + np.exp(-np.sum(xi * normal))))
        for xi, yi in zip(x, y)
    ]).reshape((-1, normal.shape[0]))
    newnormal = np.zeros(normal.shape)
    for i in range(len(normal)):
        newnormal[i] = normal[i] + noisy_average(grads[:, i], epsilon)
    return newnormal
def fit_binary(x, y, fn, epsilon, niter=20):
    """Fit a binary linear separator with `niter` DP update steps of `fn`.

    Labels may be {0, 1} or {-1, 1}; zeros are remapped to -1.  Fixes the
    original's in-place mutation of the caller's label array by remapping
    on a copy, and the misspelled error message ("occured").

    Raises ValueError if any label falls outside {-1, 0, 1}.
    """
    y = np.asarray(y)
    if any(abs(yi) != 1 for yi in y):
        # Work on a copy so the caller's array is never clobbered.
        y = np.where(y == 0, -1, y)
    if any(abs(yi) != 1 for yi in y):
        raise ValueError('Unrecognized class label occurred')
    normal = gen_datapoint(x.shape[1])
    for _ in range(niter):
        normal = fn(x, y, normal, epsilon)
    return normal
def eval_binary(x, y, normal):
    """Fraction of samples on the correct side of the hyperplane.

    A sample is wrong when y * (x . normal) < 0; a margin of exactly 0 is
    counted as correct, matching the original's behavior.
    """
    margin = y * x.dot(normal)
    wrong = (margin < 0).astype(float)
    return 1 - np.average(wrong)
def fit_ova(x, y, fn, epsilon, niter=20):
    """One-vs-all training: one binary normal vector per class label.

    For each class a +1/-1 label vector is built (that class vs the rest)
    and every normal receives `niter` rounds of the DP update `fn`.
    Returns an array of shape (n_classes, n_features) whose rows are
    ordered by sorted class label.
    """
    yset = sorted(set(y))
    normal = gen_data(x.shape[1], len(yset))
    # Fix: np.int was deprecated in NumPy 1.20 and removed in 1.24;
    # the builtin int is the documented equivalent.
    labels = np.ones((len(yset), x.shape[0]), dtype=int) * -1
    for idx, yi in enumerate(yset):
        labels[idx, np.where(y == yi)[0]] = 1
    for _ in range(niter):
        for idx, yi in enumerate(yset):
            normal[idx, :] = fn(x, labels[idx, :], normal[idx, :], epsilon)
    return normal
def eval_ova(x, y, normal):
    """Accuracy of one-vs-all weights: predict the argmax class score.

    Class rows of `normal` are assumed ordered by sorted label, matching
    fit_ova's convention.
    """
    classes = sorted(set(y))
    index_of = dict(zip(classes, range(len(classes))))
    truth = np.array([index_of[label] for label in y])
    predicted = np.argmax(x.dot(normal.T), axis=1)
    return 1 - np.average(truth != predicted)
def eval_ova_kappa(x, y, normal):
    """Cohen's kappa between true class indices and argmax predictions.

    Uses the same sorted-label row ordering as fit_ova/eval_ova; `kappa`
    comes from skll.metrics (imported at the top of the notebook).
    """
    classes = sorted(set(y))
    index_of = dict(zip(classes, range(len(classes))))
    truth = [index_of[label] for label in y]
    predicted = np.argmax(x.dot(normal.T), axis=1)
    return kappa(truth, predicted)
In [6]:
# NOTE(review): `derp` is never used afterwards — candidate for removal.
derp = dataset['target']
# Train the one-vs-all model with the SVM-style DP update
# (epsilon = 0.1, 50 iterations).
multinormal = fit_ova(dataset['data'], dataset['target'], svm_step, 0.1, niter=50)
In [7]:
# Accuracy of the OVA model — evaluated on the same data it was fit on,
# so this is a training-set number, not a generalization estimate.
eval_ova(dataset['data'], dataset['target'], multinormal)
Out[7]:
In [12]:
# Cohen's kappa on the same (training) data, for a chance-corrected view.
eval_ova_kappa(dataset['data'], dataset['target'], multinormal)
Out[12]:
In [16]:
# NOTE(review): sklearn.cross_validation was replaced by model_selection
# in scikit-learn 0.18 and removed in 0.20 — this notebook targets the
# older API (consistent with fetch_mldata above).
from sklearn.cross_validation import train_test_split

# Grid experiment: for every dataset x iteration budget x epsilon x update
# rule, average the held-out kappa over 10 runs and print one CSV-ish line.
dsets = ['iris', 'diabetes_scale', 'image-ida', 'diabetes-ida', 'breast-cancer-ida', 'ringnorm-ida', 'thyroid-ida', 'usps']
fns = [(logistic_step, 'log'), (svm_step, 'svm'), (perceptron_step, 'perc')]
for dset in dsets:
    dataset = fetch_mldata(dset)
    X_train, X_test, Y_train, Y_test = train_test_split(dataset['data'], dataset['target'], test_size=0.3, random_state=42)
    for niter in [5, 10, 15, 25]:
        for eps in [0.001, 0.01, 0.1, 0.5, 1]:
            for step_fn, tag in fns:
                total = 0
                for _ in range(10):
                    multinormal = fit_ova(X_train, Y_train, step_fn, eps, niter)
                    total += eval_ova_kappa(X_test, Y_test, multinormal)
                # Single-string print: identical output under Python 2 and 3.
                print(dset + ';' + tag + ';' + str(niter) + ';' + str(eps) + ';' + str(total / 10))
In [ ]:
import seaborn
In [583]:
# Majority-class baseline: the accuracy achieved by always predicting the
# most common label in each dataset, for comparison with the DP models.
# Fixes: removed the unused local `Yset`; converted the Python 2 `print`
# statement to the parenthesized single-argument form (identical output
# under Python 2, valid under Python 3).
dsets = ['iris', 'banana-ida','diabetes_scale', 'image-ida',
         'diabetes-ida', 'breast-cancer-ida', 'ringnorm-ida', 'thyroid-ida']
for dset in dsets:
    dataset = fetch_mldata(dset)
    Y = dataset['target']
    cnt = Counter(Y)
    # most_common(1) -> [(label, count)]; float() forces true division on py2.
    guess_ratio = cnt.most_common(1)[0][1] / float(len(Y))
    print(dset + ';' + str(guess_ratio))
In [ ]:
In [ ]:
In [547]:
# Non-private baseline: scikit-learn's SGD linear classifier.
# NOTE(review): `dataset` here is whatever the previous loop left behind
# (its last iteration) — order-dependent notebook state; confirm intent.
from sklearn import linear_model
from sklearn.metrics import accuracy_score  # NOTE(review): imported but unused in this cell
clf = linear_model.SGDClassifier()
clf.fit(dataset['data'], dataset['target'])
Out[547]:
In [548]:
# Predict on the training data and mark each sample 1 if correct, else 0.
pred = clf.predict(dataset['data'])
Y = dataset['target']
mtx = [int(p == t) for (p, t) in zip(pred, Y)]
In [549]:
# Training accuracy of the SGD baseline (fraction of correct predictions);
# float() forces true division under Python 2.
np.sum(mtx)/float(len(mtx))
Out[549]:
In [ ]: