In [13]:
import matplotlib.pyplot as plt
%matplotlib inline  

import numpy as np
import pandas as pd

# Read the comma-separated file (it has no header row, hence header=None) and
# keep only the underlying 2-D NumPy array.
# `DataFrame.as_matrix()` was deprecated in pandas 0.23 and removed in 1.0;
# `.values` yields the same ndarray and is available on old and new releases.
dataset = pd.read_csv('./msd/YearPredictionMSD.txt',
                      header=None).values

In [14]:
# Number of leading rows used for training; every row after that is test data.
N_TRAIN = 463715

# Column 0 holds the regression target, the remaining columns are the features.
# `astype` already returns a fresh float32 array for the features. The target
# slices are copied explicitly: a bare slice (even when passed through
# `np.asarray`) is only a view whose base is the full `dataset` buffer, so a
# later `del dataset` would not actually release that memory.
X_train = dataset[:N_TRAIN, 1:].astype(np.float32)
y_train = dataset[:N_TRAIN, 0].copy()

X_test = dataset[N_TRAIN:, 1:].astype(np.float32)
y_test = dataset[N_TRAIN:, 0].copy()

In [15]:
# Size of the raw array's data buffer, converted from bytes to megabytes.
dataset_megabytes = dataset.nbytes / 1E6
print("Dataset is MB:", dataset_megabytes)

# Drop the name bound to the full matrix; the following cells only work with
# the train/test arrays.
del dataset


Dataset is MB: 375.17116

In [16]:
from sklearn.linear_model import LinearRegression, SGDRegressor
from sklearn.cross_validation import KFold
from sklearn.metrics import mean_absolute_error
import time

In [17]:
# Baseline: ordinary least squares fitted on the full training set.
regr = LinearRegression()

# `time.clock()` was removed in Python 3.8. `time.perf_counter()` is the
# supported high-resolution timer; it reports elapsed wall-clock time.
tic = time.perf_counter()
regr.fit(X_train, y_train)
print("Training time [s]:", time.perf_counter() - tic)

# Mean absolute error on the fitted data and on the held-out rows.
mae_train = mean_absolute_error(y_train, regr.predict(X_train))
print("MAE train set:", mae_train)

mae_test = mean_absolute_error(y_test, regr.predict(X_test))
print("MAE test set:", mae_test)


Training time [s]: 6.211522000000059
MAE train set: 6.79557143273
MAE test set: 6.8004976262

In [18]:
# Stochastic gradient descent regressor with library defaults and a fixed
# seed so that repeated runs give the same result.
regr = SGDRegressor(random_state=101)

# `time.clock()` was removed in Python 3.8. `time.perf_counter()` is the
# supported high-resolution timer; it reports elapsed wall-clock time.
tic = time.perf_counter()
regr.fit(X_train, y_train)
print("Training time [s]:", time.perf_counter() - tic)

# Mean absolute error on the fitted data and on the held-out rows.
mae_train = mean_absolute_error(y_train, regr.predict(X_train))
print("MAE train set:", mae_train)

mae_test = mean_absolute_error(y_test, regr.predict(X_test))
print("MAE test set:", mae_test)


Training time [s]: 1.4601459999998951
MAE train set: 4.61352832373e+15
MAE test set: 4.64860022117e+15

In [19]:
# Same SGD experiment as before, this time with n_iter=100.
# NOTE(review): `n_iter` is the spelling used by the scikit-learn release this
# notebook was written against; newer releases expose `max_iter`/`tol`
# instead - confirm against the installed version before re-running.
regr = SGDRegressor(random_state=101, n_iter=100)

# `time.clock()` was removed in Python 3.8. `time.perf_counter()` is the
# supported high-resolution timer; it reports elapsed wall-clock time.
tic = time.perf_counter()
regr.fit(X_train, y_train)
print("Training time [s]:", time.perf_counter() - tic)

# Mean absolute error on the fitted data and on the held-out rows.
mae_train = mean_absolute_error(y_train, regr.predict(X_train))
print("MAE train set:", mae_train)

mae_test = mean_absolute_error(y_test, regr.predict(X_test))
print("MAE test set:", mae_test)


Training time [s]: 23.028584000000137
MAE train set: 8.12486708423e+14
MAE test set: 8.11624417368e+14

In [20]:
from sklearn.preprocessing import PolynomialFeatures
# Expand only the first ten rows with the default PolynomialFeatures settings:
# enough to learn how many columns the expansion produces without paying for
# the full training matrix.
poly_preview = PolynomialFeatures().fit_transform(X_train[:10, :])
poly_preview.shape[1]


Out[20]:
4186

In [21]:
from sklearn.pipeline import Pipeline
from sklearn import feature_selection
from sklearn.feature_selection import SelectKBest
import gc

folds = 12
# Despite its name, `train_idx` is element [0][1] of the shuffled split, i.e.
# the held-out index array of the first fold. That smaller subset is what the
# pipelines below are fitted on.
# NOTE(review): this positional KFold(n, n_folds, ...) call matches the legacy
# `sklearn.cross_validation` import used by this notebook;
# `sklearn.model_selection.KFold` takes `n_splits` plus `.split(X)` instead -
# confirm against the installed version.
train_idx = list(KFold(X_train.shape[0], folds, random_state=101, shuffle=True))[0][1]

# Fancy indexing copies the selected rows. The subset never changes inside the
# loop, so build it once instead of three times per iteration.
X_train_sub = X_train[train_idx]
y_train_sub = y_train[train_idx]

# One (k, train MAE, test MAE) tuple per evaluated k.
to_plot = []


for k_feat in range(50, 2001, 50):

    # Release the previous iteration's pipeline and its large intermediate
    # arrays before building the next one.
    gc.collect()

    print('---------------------------')
    print("K = ", k_feat)

    # Polynomial expansion -> keep the k best columns by univariate
    # f_regression score -> ordinary least squares.
    poly = PolynomialFeatures()
    regr = LinearRegression()
    f_sel = SelectKBest(feature_selection.f_regression, k=k_feat)

    pipeline = Pipeline([('poly', poly), ('f_sel', f_sel), ('regr', regr)])

    # `time.clock()` was removed in Python 3.8; `time.perf_counter()` is the
    # supported high-resolution timer and reports elapsed wall-clock time.
    tic = time.perf_counter()
    pipeline.fit(X_train_sub, y_train_sub)
    print("Training time [s]:", time.perf_counter() - tic)

    mae_train = mean_absolute_error(y_train_sub, pipeline.predict(X_train_sub))
    mae_test = mean_absolute_error(y_test, pipeline.predict(X_test))

    print("MAE train set:", mae_train)

    print("MAE test set:", mae_test)

    to_plot.append((k_feat, mae_train, mae_test))


---------------------------
K =  50
Training time [s]: 9.189624000000094
MAE train set: 7.2167947147
MAE test set: 7.17509061835
---------------------------
K =  100
Training time [s]: 9.120705000000271
MAE train set: 7.15342169823
MAE test set: 7.11426166834
---------------------------
K =  150
Training time [s]: 10.262009999999918
MAE train set: 7.06834013527
MAE test set: 7.06141525342
---------------------------
K =  200
Training time [s]: 9.969074999999975
MAE train set: 7.02144412121
MAE test set: 7.04225179913
---------------------------
K =  250
Training time [s]: 10.463522999999896
MAE train set: 6.97841358686
MAE test set: 7.0199851787
---------------------------
K =  300
Training time [s]: 10.902124000000185
MAE train set: 6.91862958298
MAE test set: 6.9704004744
---------------------------
K =  350
Training time [s]: 10.932276000000002
MAE train set: 6.88969389119
MAE test set: 6.95757659986
---------------------------
K =  400
Training time [s]: 10.988593000000037
MAE train set: 6.86189632215
MAE test set: 6.93971151828
---------------------------
K =  450
Training time [s]: 10.934052000000065
MAE train set: 6.8470951285
MAE test set: 6.93249603911
---------------------------
K =  500
Training time [s]: 11.516423000000032
MAE train set: 6.72959173339
MAE test set: 6.83194648692
---------------------------
K =  550
Training time [s]: 12.424326000000292
MAE train set: 6.71634222538
MAE test set: 6.82796065018
---------------------------
K =  600
Training time [s]: 12.464111999999659
MAE train set: 6.70628319015
MAE test set: 6.83433806849
---------------------------
K =  650
Training time [s]: 13.470016000000214
MAE train set: 6.69552528727
MAE test set: 6.82796657269
---------------------------
K =  700
Training time [s]: 14.409058999999615
MAE train set: 6.67643159459
MAE test set: 6.81717314484
---------------------------
K =  750
Training time [s]: 14.7598109999999
MAE train set: 6.65909392533
MAE test set: 6.82012877525
---------------------------
K =  800
Training time [s]: 16.1001839999999
MAE train set: 6.62333700563
MAE test set: 6.79037693333
---------------------------
K =  850
Training time [s]: 16.856222999999773
MAE train set: 6.60585395287
MAE test set: 6.79795536879
---------------------------
K =  900
Training time [s]: 22.623407000000043
MAE train set: 6.51544505322
MAE test set: 6.71429177541
---------------------------
K =  950
Training time [s]: 22.074413000000277
MAE train set: 6.51045903119
MAE test set: 6.72172702014
---------------------------
K =  1000
Training time [s]: 19.745074999999815
MAE train set: 6.49810675993
MAE test set: 6.72195323088
---------------------------
K =  1050
Training time [s]: 25.120972000000165
MAE train set: 6.48685165912
MAE test set: 6.72579016523
---------------------------
K =  1100
Training time [s]: 24.06064000000015
MAE train set: 6.47600501737
MAE test set: 6.73529947042
---------------------------
K =  1150
Training time [s]: 28.219682999999804
MAE train set: 6.47057203284
MAE test set: 6.74342672984
---------------------------
K =  1200
Training time [s]: 31.512782000000243
MAE train set: 6.46298446662
MAE test set: 6.75134851044
---------------------------
K =  1250
Training time [s]: 35.47639599999957
MAE train set: 6.45400747295
MAE test set: 6.7529763329
---------------------------
K =  1300
Training time [s]: 38.28391099999999
MAE train set: 6.44523423512
MAE test set: 6.76696563111
---------------------------
K =  1350
Training time [s]: 41.944379000000026
MAE train set: 6.44276461289
MAE test set: 6.76906109668
---------------------------
K =  1400
Training time [s]: 45.48775500000011
MAE train set: 6.43744960773
MAE test set: 6.77973099752
---------------------------
K =  1450
Training time [s]: 49.26655900000014
MAE train set: 6.42799902806
MAE test set: 6.78606734133
---------------------------
K =  1500
Training time [s]: 54.279123000000254
MAE train set: 6.41908632884
MAE test set: 6.78776307493
---------------------------
K =  1550
Training time [s]: 59.284466000000066
MAE train set: 6.40760621297
MAE test set: 6.79340352064
---------------------------
K =  1600
Training time [s]: 63.60156000000006
MAE train set: 6.40244215936
MAE test set: 6.80588825081
---------------------------
K =  1650
Training time [s]: 71.3499710000001
MAE train set: 6.39500350681
MAE test set: 6.80674622547
---------------------------
K =  1700
Training time [s]: 77.72881400000006
MAE train set: 6.39144714509
MAE test set: 6.80876654751
---------------------------
K =  1750
Training time [s]: 90.47765600000002
MAE train set: 6.38828227431
MAE test set: 6.81618479538
---------------------------
K =  1800
Training time [s]: 94.09637999999995
MAE train set: 6.38436743183
MAE test set: 6.83793056244
---------------------------
K =  1850
Training time [s]: 108.53475900000012
MAE train set: 6.37654462545
MAE test set: 6.851422917
---------------------------
K =  1900
Training time [s]: 113.16720600000008
MAE train set: 6.37078254348
MAE test set: 6.84987645125
---------------------------
K =  1950
Training time [s]: 117.7559339999998
MAE train set: 6.36538626383
MAE test set: 6.85842654271
---------------------------
K =  2000
Training time [s]: 136.62466099999983
MAE train set: 6.35740356263
MAE test set: 6.86709623004

In [22]:
# Split the collected (k, train MAE, test MAE) tuples into one series each.
num_selected = [k for k, _, _ in to_plot]
train_errors = [train_mae for _, train_mae, _ in to_plot]
test_errors = [test_mae for _, _, test_mae in to_plot]

# Solid blue line for the training error, dashed red line for the test error.
plt.plot(num_selected, train_errors, 'b', label='Train')
plt.plot(num_selected, test_errors, 'r--', label='Test')
plt.xlabel('Num. features selected')
plt.ylabel('MAE train/test')
plt.legend(loc=0)

plt.show()


What about a classifier instead of a regressor?


In [23]:
# Distinct target values present in the training labels, and how many there
# are: these become the class labels when a classifier is used.
unique_years = np.unique(np.ascontiguousarray(y_train))
print(unique_years)
print(len(unique_years))


[ 1922.  1924.  1925.  1926.  1927.  1928.  1929.  1930.  1931.  1932.
  1933.  1934.  1935.  1936.  1937.  1938.  1939.  1940.  1941.  1942.
  1943.  1944.  1945.  1946.  1947.  1948.  1949.  1950.  1951.  1952.
  1953.  1954.  1955.  1956.  1957.  1958.  1959.  1960.  1961.  1962.
  1963.  1964.  1965.  1966.  1967.  1968.  1969.  1970.  1971.  1972.
  1973.  1974.  1975.  1976.  1977.  1978.  1979.  1980.  1981.  1982.
  1983.  1984.  1985.  1986.  1987.  1988.  1989.  1990.  1991.  1992.
  1993.  1994.  1995.  1996.  1997.  1998.  1999.  2000.  2001.  2002.
  2003.  2004.  2005.  2006.  2007.  2008.  2009.  2010.  2011.]
89

In [24]:
from sklearn.linear_model import SGDClassifier
# Classification variant: every distinct value in y_train is treated as a
# class label. The loss is passed by keyword so the call is self-describing.
# The name `regr` is kept although the estimator is a classifier, in case
# other cells refer to it.
regr = SGDClassifier(loss='log', random_state=101)

# `time.clock()` was removed in Python 3.8. `time.perf_counter()` is the
# supported high-resolution timer; it reports elapsed wall-clock time.
tic = time.perf_counter()
regr.fit(X_train, y_train)
print("Training time [s]:", time.perf_counter() - tic)

# The predicted labels are numeric, so MAE is computed the same way as for
# the regressors above.
mae_train = mean_absolute_error(y_train, regr.predict(X_train))
print("MAE train set:", mae_train)

mae_test = mean_absolute_error(y_test, regr.predict(X_test))
print("MAE test set:", mae_test)


Training time [s]: 69.49637499999972
MAE train set: 8.57550866373
MAE test set: 8.52630253728

In [ ]: