In [81]:
import numpy as np

from bokeh.plotting import HBox, VBox, figure, show, output_file, GridPlot
from bokeh.models.mappers import LinearColorMapper
from bokeh.models import BasicTicker, Grid 

from sklearn import metrics
from sklearn import preprocessing
from sklearn import cross_validation
from sklearn.datasets import fetch_olivetti_faces
from sklearn.utils.validation import check_random_state
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge, RidgeCV
from sklearn.cross_validation import train_test_split
from sklearn.covariance import GraphLassoCV, ledoit_wolf
from sklearn.grid_search import GridSearchCV
from scipy.spatial import distance
 
import scipy
import sklearn
import OVFM.Model as md
import OVFM.FeatureMap as fm
import OVFM.Risk as rsk
import OVFM.LearningRate as lr
import OVFM.DataGeneration as dg
import OVFM.SGD as sgd
 
import time
import sys

In [9]:
## Parse NCI60 files and match CID for supervised learning
# def parse_nci60_targets( filename ):
#     nci60 = open( filename )
#     n = sum( 1 for line in nci60 ) - 1
#     nci60_targets = np.empty( ( n, 60 ) )
#     cid = np.empty( n, dtype = np.long )

#     nci60.seek( 0 )
#     header = nci60.readline( ).split( )
#     for i, e in enumerate( header ):
#         header[ i ] = e.replace( '\"', '' )
#     for i, line in enumerate( nci60 ):
#         split_line = line.split( ' ' )
#         cid[ i ] = long( split_line[ 0 ].replace( '\"', '' ) )
#         nci60_targets[ i, : ] = np.array( [ float( e ) for e in split_line[ -60: ] ] )
#     nci60.close( )
#     return cid, nci60_targets

# def parse_nci60_paths( filename, cid_match ):
#     nci60 = open( filename )
#     nci60_paths = np.empty( ( cid_match.shape[ 0 ], 17793 ) )
#     cid = np.empty( cid_match.shape[ 0 ], dtype = np.long )

#     nci60.seek( 0 )
#     header = nci60.readline( )
#     for i, line in enumerate( nci60 ):
#         split_line = line.split( ' ' )
#         value = long( split_line[ 0 ].replace( '\"', '' ) )
#         match = np.where( value == cid_match )[ 0 ]
#         if match.size == 1:
#             cid[ match[ 0 ] ] = value
#             nci60_paths[ match[ 0 ], : ] = np.array( [ float( e ) for e in split_line[ 1: ] ] )

#     return cid, nci60_paths
    
# cid_targets, targets = parse_nci60_targets( 'nci60.ztable' )
# np.savetxt( "cid_targets.csv", cid_targets, delimiter= " ",  fmt='%i' )
# np.savetxt( "targets.csv", targets, delimiter= " " )
    
# cid_paths, paths = parse_nci60_paths( 'nci60paths.ztable', cid_targets )
# np.savetxt( "cid_paths.csv", cid_paths, delimiter= " ",  fmt='%i' )
# np.savetxt( "paths.csv", paths, delimiter= " " )

cid_targets = np.genfromtxt( 'cid_targets.csv', delimiter= ' ' )
targets = np.genfromtxt( 'targets.csv', delimiter= ' ' )

cid_paths = np.genfromtxt( 'cid_paths.csv', delimiter= ' ' )
paths = np.genfromtxt( 'paths.csv', delimiter= ' ' )

In [79]:
# Preprocessing:
# remove feature where all data are 0
paths_pruned = np.delete( paths, np.where( np.sum( paths, axis = 0 ) == 0 )[ 0 ], axis = 1 )
# take log of paths count
paths_log = np.concatenate( ( np.log( paths_pruned[ :, 0 ] ).reshape( paths_pruned.shape[ 0 ], 1 ), paths_pruned[ :, 1: ] ), axis = 1 )
# unit variance, 0 mean
paths_scaled = preprocessing.StandardScaler( ).fit_transform( paths_log )

In [83]:
paths_train, paths_test, targets_train, targets_test = cross_validation.train_test_split( paths_scaled, targets, test_size=0.33 )

In [119]:
D = 1000
gamma = 1. / paths_scaled.shape[ 1 ]
L = np.cov( 60 )
C = 0
eta0 = 1

df = fm.DecomposableFF( 1 * gamma, paths_train.shape[ 1 ], D, L )
modelD = md.Model( df )

risk = rsk.Ridge( C, 0 )
lc = lr.Constant( 1 * eta0 )
lb = lr.Constant( 0.0 * eta0 )
est = sgd.SGD( risk, 5.0, lc, lb, 10, 10000 )

In [120]:
est.fit( modelD, paths_train, targets_train )


0 0.812794793852 0.0 0.0 1.20875736469 0.0422164685928
10000 0.605307575153 0.0 0.0 2766.06981454 2.10578072754
20000 0.556057819055 0.0 0.0 8040.48522996 3.25707011116
30000 0.522490560176 0.0 0.0 14316.117504 4.02373124683
40000 0.497711098318 0.0 0.0 21219.5620121 4.73402111131
50000 0.477222052524 0.0 0.0 28419.4066856 5.38661373744
60000 0.460263923777 0.0 0.0 35799.6706209 5.90954029552
70000 0.446181929518 0.0 0.0 43360.0450871 6.35919916736

In [121]:
print np.mean( ( modelD( paths_test ) - targets_test ) ** 2 )


0.75496610424

In [122]:
modelD( paths_train )


Out[122]:
array([[ 0.516871  , -0.3704659 , -0.3739153 , ..., -0.25307039,
        -0.75928157, -0.5401718 ],
       [ 0.37059937, -0.33339466, -0.20259142, ..., -0.0523545 ,
        -0.92856252,  0.2724876 ],
       [ 0.54806733, -0.15257014, -0.32482935, ..., -0.01462882,
        -0.81645348, -0.25451618],
       ..., 
       [ 0.41826852, -0.45895766,  0.18449422, ..., -0.18379023,
        -0.81479074, -0.32042941],
       [ 1.27693972, -0.43947292, -0.12582231, ..., -0.04965663,
        -0.44283768,  0.21488221],
       [ 0.30326741, -0.25379979, -0.07382043, ..., -0.24119259,
        -0.37136806,  0.05377798]])

In [123]:
targets_train


Out[123]:
array([[-0.2 , -0.57, -0.99, ..., -0.28,  0.16, -0.82],
       [-0.18, -0.39, -1.56, ..., -0.37, -0.24,  0.88],
       [ 0.21,  1.12, -0.64, ..., -0.1 , -0.8 , -0.43],
       ..., 
       [-0.21, -1.15, -0.51, ..., -0.78, -0.46,  0.27],
       [ 3.33, -0.46, -0.67, ..., -0.4 , -0.52,  0.94],
       [ 1.22, -0.72, -0.03, ..., -0.51, -0.81, -0.53]])

In [ ]: