# In [ ]:
import csv, re
import numpy as np
from random import shuffle
from collections import Counter
from sklearn import cross_validation
from sklearn import svm
# Handwritten-digit feature files: one example per line, the digit label
# followed by its feature values.
# NOTE(review): these handles are opened at module level and never closed —
# consider `with open(...) as f:` blocks.
ftrain = open('features.train')
ftest = open('features.test')
def read_file(f):
    """Parse a digit-features file into a list of float rows.

    Args:
        f: an iterable of lines (e.g. an open file), each holding
           whitespace-separated numbers, label first.

    Returns:
        A list of lists of floats, one inner list per line.

    Notes:
        str.split() with no argument both strips and splits on runs of
        whitespace, so a blank line yields [] instead of crashing on
        float('') as the old re.split('\\s+', r.strip()) version did.
        The list comprehension also behaves identically on Python 2
        and 3, unlike map(), which is lazy on Python 3.
    """
    return [[float(tok) for tok in line.split()] for line in f]
# Parse both files into lists of [label, feature, ...] float rows.
all_train = read_file(ftrain)
all_test = read_file(ftest)
# In [ ]:
def run_svm(X, Y, C, K):
    """Fit a polynomial-kernel SVM and return the fitted classifier.

    Args:
        X: training inputs.
        Y: +/-1 labels.
        C: soft-margin penalty. NOTE(review): the original hard-coded
           C=10000000 and silently ignored this parameter; it is now
           honoured. If a hard-margin approximation was intended,
           callers should pass a large C explicitly.
        K: polynomial kernel degree.

    Returns:
        The fitted svm.SVC instance.
    """
    ssvm = svm.SVC(kernel='poly', C=C, gamma=1, degree=K, coef0=1)
    ssvm.fit(X, Y)
    return ssvm
# In [ ]:
def build_vs_all_set(data, which):
    """Prepare a which-vs-all dataset.

    Every row of `data` is kept; its features (everything after the
    leading label) go into X, and Y gets +1.0 where the label equals
    `which`, -1.0 otherwise.
    """
    target = int(which)
    X = [row[1:] for row in data]
    Y = [1.0 if int(row[0]) == target else -1.0 for row in data]
    return X, Y
def build_vs_vs_set(data, one, other):
    """Prepare a one-vs-other dataset.

    Only rows whose label is `one` (mapped to +1.0) or `other`
    (mapped to -1.0) are kept; all other digits are dropped.
    """
    X, Y = [], []
    for row in data:
        digit = int(row[0])
        if digit == one:
            label = 1.0
        elif digit == other:
            label = -1.0
        else:
            continue  # not part of this pair
        X.append(row[1:])
        Y.append(label)
    return X, Y
# In [ ]:
def train_and_score(train_set, degree, C, test_set=None):
    """Fit a polynomial-kernel SVM and report its scores.

    Args:
        train_set: (X, Y) pair of training inputs and +/-1 labels.
        degree: polynomial kernel degree Q.
        C: soft-margin penalty.
        test_set: optional (X, Y) pair to score against.

    Returns:
        [training accuracy, number of support vectors,
         test accuracy or None when no test set was given].
    """
    ssvm = svm.SVC(kernel='poly',
                   C=C,
                   gamma=1,
                   degree=degree,
                   coef0=1)
    ssvm.fit(*train_set)  # X, Y
    return [ssvm.score(*train_set),
            len(ssvm.support_vectors_),
            # truthiness (not `is not None`) keeps the legacy
            # `test_set=False` calls behaving exactly as before
            ssvm.score(*test_set) if test_set else None]
def train_and_score_rbf(train_set, C, test_set=None):
    """Fit an RBF-kernel SVM (gamma=1) and report its scores.

    Args:
        train_set: (X, Y) pair of training inputs and +/-1 labels.
        C: soft-margin penalty.
        test_set: optional (X, Y) pair to score against.

    Returns:
        [training accuracy, number of support vectors,
         test accuracy or None when no test set was given].
    """
    ssvm = svm.SVC(kernel='rbf', gamma=1, C=C)
    ssvm.fit(*train_set)  # X, Y
    return [ssvm.score(*train_set),
            len(ssvm.support_vectors_),
            # truthiness (not `is not None`) keeps the legacy
            # `test_set=False` calls behaving exactly as before
            ssvm.score(*test_set) if test_set else None]
# In [ ]:
def ex2():
    """Train 0/2/4/6/8-vs-all classifiers (Q=2, C=0.01) and return the
    row for the digit whose classifier has the lowest training
    accuracy, i.e. the highest in-sample error."""
    rows = []
    for digit in (0, 2, 4, 6, 8):
        scores = train_and_score(
            train_set=build_vs_all_set(all_train, digit),
            degree=2,
            C=0.01)
        rows.append([digit] + scores)
    # row layout: [digit, train accuracy, #support vectors, None]
    return min(rows, key=lambda row: row[1])
def ex3():
    """Train 1/3/5/7/9-vs-all classifiers (Q=2, C=0.01) and return the
    row for the digit whose classifier has the highest training
    accuracy, i.e. the lowest in-sample error."""
    rows = [[digit] + train_and_score(
                train_set=build_vs_all_set(all_train, digit),
                degree=2,
                C=0.01)
            for digit in (1, 3, 5, 7, 9)]
    # row layout: [digit, train accuracy, #support vectors, None]
    return max(rows, key=lambda row: row[1])
# Run the vs-all experiments once and keep the winning rows
# ([digit, train accuracy, #support vectors, None]) for use by ex4.
# (Python 2 print statements, as in the rest of this export.)
result_2 = ex2()
result_3 = ex3()
print result_2
print result_3
def ex4():
    """Absolute difference in support-vector counts (row index 2)
    between the ex3 and ex2 winners."""
    return abs(result_2[2] - result_3[2])  # abs() makes order irrelevant
print(ex4())
# In [ ]:
def ex5():
    """1-vs-5 classifiers (Q=2) over several C values: print
    [train accuracy, #support vectors, test accuracy] for each."""
    pair_train = build_vs_vs_set(all_train, 1, 5)
    pair_test = build_vs_vs_set(all_test, 1, 5)
    for penalty in (0.001, 0.01, 0.1, 1):
        print(train_and_score(train_set=pair_train,
                              degree=2,
                              C=penalty,
                              test_set=pair_test))
        # NOTE(review): indentation was lost in this export; the '**'
        # separator is read as printing after each run — confirm.
        print('**')
ex5()
# In [ ]:
def ex6():
    """1-vs-5 classifiers over C and polynomial degree Q: print
    [train accuracy, #support vectors, test accuracy] per (C, Q)."""
    pair_train = build_vs_vs_set(all_train, 1, 5)
    pair_test = build_vs_vs_set(all_test, 1, 5)
    for penalty in (0.0001, 0.001, 0.01, 1):
        print("** C={:f}".format(penalty))
        for q in (2, 5):
            print(train_and_score(train_set=pair_train,
                                  degree=q,
                                  C=penalty,
                                  test_set=pair_test))
        print('')  # blank line between C groups
ex6()
# In [ ]:
def ex7_8():
    """Model selection for 1-vs-5 (Q=2) via 10-fold cross-validation.

    Performs 100 runs; each run reshuffles the data, scores every C by
    mean 10-fold CV accuracy, and keeps the winning (C, accuracy) pair.

    Returns:
        (Counter of winning C values across runs,
         average E_cv = 1 - accuracy of the winners).
    """
    def do_run():
        """Run and cross-validate all models with one random partition."""
        train_set = build_vs_vs_set(all_train, 1, 5)
        # zip, shuffle, and unzip (to keep the X-Y correspondence).
        # list(...) is required on Python 3, where zip() is lazy and
        # shuffle()/indexing below need a concrete sequence; on
        # Python 2 it is a harmless copy of the list zip() returned.
        paired = list(zip(*train_set))
        shuffle(paired)
        train_set = list(zip(*paired))
        this_run_results = []
        # for each C in the question
        for C in [0.0001, 0.001, 0.01, 0.1, 1]:
            ssvm = svm.SVC(kernel='poly',
                           C=C,
                           gamma=1,
                           degree=2,
                           coef0=1)
            scores = cross_validation.cross_val_score(
                ssvm, train_set[0], train_set[1], cv=10)
            this_run_results.append([C, np.mean(scores)])
        # return [C, mean CV accuracy] for the best C found in this run
        return max(this_run_results,
                   key=lambda p: p[1])

    all_runs = [do_run() for _ in range(100)]
    # count occurrences in the results and average out E_cv
    return Counter([x[0] for x in all_runs]), \
        1 - np.mean([x[1] for x in all_runs])
ex7_8()
# In [ ]:
def ex9_10():
    """1-vs-5 with an RBF kernel (gamma=1): print
    [train accuracy, #support vectors, test accuracy] for each C.

    NOTE(review): the original sweep used 10e4 (=1e5) and 10e6 (=1e7);
    the geometric pattern 0.01, 1, 100, ... (ratio 100) implies 1e4 and
    1e6 were intended, so the list is corrected accordingly.
    """
    train_set = build_vs_vs_set(all_train, 1, 5)
    test_set = build_vs_vs_set(all_test, 1, 5)
    for C in [0.01, 1, 100, 1e4, 1e6]:
        print(train_and_score_rbf(
            train_set=train_set,
            C=C,
            test_set=test_set))
ex9_10()