In [2]:
import numpy as np
from bokeh.plotting import HBox, VBox, figure, show, output_file, GridPlot
from bokeh.models.mappers import LinearColorMapper
from bokeh.models import BasicTicker, Grid
from sklearn import metrics
from sklearn import preprocessing
from sklearn.datasets import fetch_olivetti_faces
from sklearn.utils.validation import check_random_state
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.cross_validation import train_test_split
from sklearn.covariance import GraphLassoCV, ledoit_wolf
from sklearn.grid_search import GridSearchCV
from scipy.spatial import distance
import sklearn
import OVFM.Model as md
import OVFM.FeatureMap as fm
import OVFM.Risk as rsk
import OVFM.LearningRate as lr
import OVFM.DataGeneration as dg
import OVFM.SGD as sgd
import time
import sys
In [3]:
def simplex_map( i ):
    """Return an ( i - 1 ) x i matrix whose columns are the vertices of a
    regular simplex centred at the origin in R^( i - 1 ).

    Every column has unit norm and each row sums to zero, so the columns can
    serve as maximally separated code vectors for ``i`` class labels.

    Parameters
    ----------
    i : int
        Number of classes; must be >= 2.

    Returns
    -------
    numpy matrix (i > 2, via np.bmat) or ndarray (i == 2) of shape ( i - 1, i ).

    Raises
    ------
    ValueError
        If ``i`` < 2.
    """
    if i > 2:
        # First vertex is e_1; the remaining i - 1 vertices share first
        # coordinate -1/(i-1) and are a scaled (i-1)-vertex simplex recursed
        # into the lower-dimensional subspace.
        return np.bmat( [ [ [ [ 1 ] ], np.repeat( -1.0 / ( i - 1 ), i - 1 ).reshape( ( 1, i - 1 ) ) ],
                          [ np.zeros( ( i - 2, 1 ) ), simplex_map( i - 1 ) * np.sqrt( 1.0 - 1.0 / ( i - 1 ) ** 2 ) ] ] )
    elif i == 2:
        # Base case: two classes map to +1 / -1 on the real line.
        return np.array( [ [ 1, -1 ] ] )
    else:
        # Bug fix: the original raised a bare string ("invalid number of
        # classes"), which is a TypeError at runtime — string exceptions are
        # not allowed.  Raise a proper exception instead.
        raise ValueError( "invalid number of classes: %d (need >= 2)" % i )
In [54]:
# Load the training and test data sets
# NOTE(review): presumably a Kaggle-style CSV — first column an id, last
# column the class label; confirm against the actual file header.
train = np.genfromtxt('train.csv', delimiter=',',skip_header=1)
# test = np.genfromtxt('test.csv', delimiter=',',skip_header=1)
# Create numpy arrays for use with scikit-learn
# Drop the id column and the label column to get the feature matrix.
train_X = train[:,1:-1].astype( float )
# Keep the label as a 2-D (n, 1) column so LabelBinarizer can consume it later.
train_y = train[:,-1:]
# test_X = test[:,1:]
# Hyperparameters for the random-feature model.
# NOTE(review): these four are all re-assigned in a later cell with different
# values, so the values below are effectively dead — consider removing one set.
D = 1000
gamma = 0.1
C = 0.000
eta0 = 0.5
# Standardize features to zero mean / unit variance before kernel methods.
scaler = preprocessing.StandardScaler( )
train_X = scaler.fit_transform( train_X )
In [55]:
# Baseline: a 500-tree random forest on a 67/33 holdout split.
# NOTE(review): neither the split nor the forest is seeded (no random_state),
# so the scores below are not reproducible across runs — confirm if intended.
rf = RandomForestClassifier( n_estimators = 500 )
X,X_,y,y_ = train_test_split( train_X, train_y.ravel( ), test_size = 0.33 )
rf.fit( X, y )
y_rf = rf.predict( X_ )
# Python 2 print statements: per-class precision/recall/F1, then accuracy.
print metrics.classification_report( y_, y_rf )
print metrics.accuracy_score( y_, y_rf )
In [59]:
# Encode the 7 class labels as simplex-vertex code vectors in R^6:
# one-hot binarize, then project onto the transposed simplex map.
lb = preprocessing.LabelBinarizer( neg_label = 0, pos_label = 1 )
# NOTE(review): this overwrites train_y in place, so the cell is NOT
# idempotent — running it twice corrupts the labels.  Consider a new name
# (e.g. train_y_simplex) instead.
train_y = np.dot( lb.fit_transform( train_y ).astype( float ), simplex_map( 7 ).T )
# --- dead exploratory code below (kernel-approximation diagnostics and an
# --- output-covariance Laplacian); kept for reference, consider deleting. ---
# gff = fm.GaussianFF( gamma, train_X.shape[ 1 ], D )
# Kex = gff.kernel_exact( train_X )
# Kap = gff.kernel_approx( train_X )
# fig, axes = plt.subplots( nrows=1, ncols=2, sharex=False, sharey=False )
# im = axes[ 0 ].imshow( Kex, origin = 'lower' )
# im.set_cmap( 'hot' )
# axes[ 0 ].set_title( 'Kernel exact' )
# im = axes[ 1 ].imshow( Kap, origin = 'lower' )
# im.set_cmap( 'hot' )
# axes[ 1 ].set_title( 'Kernel approximation' )
# plt.show( )
# print 'Kernel approximation MSE:', np.linalg.norm( Kex - Kap ) ** 2 / train_X.size
# M = np.cov( train_y.T )
# Dg = np.diag( np.diag( M ) + np.sum( M, axis = 0 ) )
# L = np.linalg.inv( Dg - M )
In [107]:
# Final hyperparameters for the OVFM model (these override the values set in
# the data-loading cell above).
D = 1000        # number of random Fourier features
gamma = 0.2     # Gaussian kernel bandwidth parameter
C = 1e-5        # ridge regularization strength
eta0 = 1.0      # base learning rate
# NOTE(review): L is never read after this point (the training cell recomputes
# simplex_map( 7 ) into S) — presumably a leftover from the commented-out
# Laplacian experiment; confirm before deleting.
L = simplex_map( 7 )
In [112]:
# Train the operator-valued random-feature (OVFM) model with SGD on a fresh
# 67/33 split, then decode simplex-coded outputs back to class indices.
risk = rsk.Ridge( C )                 # squared-loss + L2 risk, strength C
lc = lr.Constant( 1. * eta0 )         # constant step size for the feature weights
# Bug-risk fix: renamed from `lb`, which shadowed the LabelBinarizer bound to
# `lb` in the encoding cell above (hidden-state hazard on re-runs).
lr_bias = lr.Constant( 0.01 * eta0 )  # smaller constant step size for the bias term
X,X_,y,y_ = train_test_split( train_X, train_y, test_size = .33 )
# B = identity over the 6 simplex dimensions (7 classes): independent outputs.
model = md.Model( fm.DecomposableFF( gamma, train_X.shape[ 1 ], D, B = np.eye( 6 ) ) )
# NOTE(review): the positional arguments 5.0, 10, 10000 are defined by
# OVFM.SGD (presumably clipping/batch/iteration settings) — confirm there.
opt = sgd.SGD( risk, 5.0, lc, lr_bias, 10, 10000 )
opt.fit( model, X, y )
y_rf = model( X_ )
# Decode: the predicted class is the simplex vertex with the largest inner
# product, i.e. argmax over the 7 columns of simplex_map( 7 ).
S = simplex_map( 7 )
y_ = np.argmax( np.dot( y_, S ), axis = 1 )
y_rf = np.argmax( np.dot( y_rf, S ), axis = 1 )
In [113]:
# Report OVFM holdout performance: per-class precision/recall/F1, then
# overall accuracy (Python 2 print statements).
print metrics.classification_report( y_, y_rf )
print metrics.accuracy_score( y_, y_rf )
In [105]:
# Inspect the learned bias term of the fitted OVFM model (bare expression so
# the notebook displays its rich repr as the cell output).
model.bias
Out[105]:
In [ ]: