In [1]:
import numpy as np
import scipy.sparse as sp

from mnist import MNIST

In [2]:
from ridge_regression import Ridge, RidgeRegularizationPath

In [3]:
import pandas as pd
%matplotlib inline

import seaborn as sns

In [4]:
import matplotlib as mpl

In [5]:
mndata = MNIST('./python-mnist/data')
train_ims, train_labels = mndata.load_training()
test_ims, test_labels = mndata.load_testing()

train_ims = np.array(train_ims)
test_ims = np.array(test_ims)
train_istwo = np.array([int(x==2) for x in train_labels])
test_istwo = np.array([int(x==2) for x in test_labels])

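As an aside, the list comprehensions above can be replaced by a vectorized comparison; a minimal sketch that produces the same 0/1 integer arrays marking the digit 2:

# vectorized equivalent of the comprehensions above (sketch)
train_istwo = (np.asarray(train_labels) == 2).astype(int)
test_istwo = (np.asarray(test_labels) == 2).astype(int)
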
In [6]:
# hold out the last 10,000 training points for validation
num_pts = 50000
train_X = sp.csc_matrix(train_ims[:num_pts, ])
train_y = sp.csc_matrix([train_istwo[:num_pts, ]]).T
val_X = sp.csc_matrix(train_ims[num_pts:, ])
val_y = sp.csc_matrix([train_istwo[num_pts:, ]]).T
print("train X, y shapes: {}, {}".format(train_X.shape, train_y.shape))
print("val X, y shapes: {}, {}".format(val_X.shape, val_y.shape))


train X, y shapes: (50000, 784), (50000, 1)
val X, y shapes: (10000, 784), (10000, 1)

In [7]:
train_X.shape


Out[7]:
(50000, 784)

In [8]:
train_ims.shape


Out[8]:
(60000, 784)

In [ ]:
rp = RidgeRegularizationPath(train_X=train_X, train_y=train_y, lam_max=10, frac_decrease=0.5, steps=10,
                 val_X=val_X, val_y=val_y)

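RidgeRegularizationPath comes from the author's ridge_regression module, so its internals aren't shown here; presumably lam_max, frac_decrease, and steps define a geometrically decreasing grid of regularization strengths. A hedged sketch of that schedule, built only from the constructor arguments above:

# assumed lambda schedule: start at lam_max and halve it at each of the 10 steps
lam_max, frac_decrease, steps = 10, 0.5, 10
lams = [lam_max * frac_decrease**k for k in range(steps)]
# -> [10, 5.0, 2.5, 1.25, 0.625, 0.3125, 0.15625, 0.078125, 0.0390625, 0.01953125]
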
In [ ]:
rp.walk_path()


Loop 1: solving weights.
/Users/janet/miniconda3/envs/mlpy3/lib/python3.5/site-packages/scipy/sparse/linalg/dsolve/linsolve.py:247: SparseEfficiencyWarning: splu requires CSC matrix format
  warn('splu requires CSC matrix format', SparseEfficiencyWarning)
/Users/janet/miniconda3/envs/mlpy3/lib/python3.5/site-packages/scipy/sparse/linalg/dsolve/linsolve.py:165: SparseEfficiencyWarning: spsolve is more efficient when sparse b is in the CSC matrix format
  'is in the CSC matrix format', SparseEfficiencyWarning)
Loop 2: solving weights.
Loop 3: solving weights.

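The SparseEfficiencyWarning comes from scipy's direct solver, which prefers CSC input to spsolve/splu; even though train_X and train_y are CSC here, the system assembled inside the solver (e.g. X^T X + lam*I) can end up in a different sparse format. A small self-contained example of the warning and the usual fix of casting to CSC before solving:

from scipy.sparse.linalg import spsolve

# toy CSR system; passing it to spsolve as-is reproduces the SparseEfficiencyWarning
A = sp.random(100, 100, density=0.05, format='csr') + 10 * sp.identity(100, format='csr')
b = sp.random(100, 1, density=0.5, format='csr')
w = spsolve(A.tocsc(), b.tocsc())  # casting both operands to CSC keeps the solver quiet
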
In [ ]:
assert False  # halts "Run All" here; the cells below are run manually
little_X = train_ims[0:500, :]
little_y = train_istwo[0:500]
print(little_X.shape)
print(little_y.shape)
little_X = sp.csc_matrix(little_X)
little_y = sp.csc_matrix([little_y]).T

In [ ]:
result = Ridge(X = train_X, y = train_y, lam = 1)  # was 0.05 when running my HW

In [ ]:
result.solve_coeffs()

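Ridge and solve_coeffs live in the author's ridge_regression module, so the exact implementation isn't shown in this notebook; a minimal sketch of the standard closed-form ridge solve it presumably performs, w = (X^T X + lam*I)^{-1} X^T y (ridge_closed_form is a hypothetical helper, not part of the module):

from scipy.sparse.linalg import spsolve

def ridge_closed_form(X, y, lam):
    """Hypothetical sketch: closed-form ridge weights w = (X^T X + lam*I)^{-1} X^T y."""
    d = X.shape[1]
    A = (X.T.dot(X) + lam * sp.identity(d)).tocsc()
    b = sp.csc_matrix(X.T.dot(y))
    return spsolve(A, b)

# e.g. w_sketch = ridge_closed_form(train_X, train_y, lam=1)
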
In [ ]:
import analyze_ridge_results
from importlib import reload  # imp is deprecated; importlib.reload behaves the same

In [ ]:
mpl.rcParams['figure.figsize'] = 4, 3

In [ ]:
to_compare = pd.DataFrame({"predictions": result.y_preds, 
                           "label": result.y.toarray()[:,0]})

print(to_compare.head())
print(to_compare.shape)

plot = sns.violinplot(x='label', y='predictions',
               data=to_compare, size=2)

plot.figure.savefig("MNIST_regression_distributions.pdf")
reload(analyze_ridge_results)

In [ ]:
results = analyze_ridge_results.analyze_results(result, 0.30)

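analyze_results (in the author's analyze_ridge_results module) takes the fitted model and a decision threshold of 0.30; judging from the keys inspected below, it reports a 0/1 loss and call fractions. A hedged sketch of that thresholding logic, where the exact definitions are assumptions:

# assumed logic: threshold the regression scores at 0.30 to get binary calls
threshold = 0.30
y_true = result.y.toarray()[:, 0]
calls = (result.y_preds > threshold).astype(int)
loss_01 = np.mean(calls != y_true)   # fraction of points misclassified
call_frac = np.mean(calls)           # fraction of points called a "2"
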
In [ ]:
results.keys()

In [ ]:
results['call_fracs']

In [ ]:
results['loss_01']

In [ ]:
print(result.y.shape)
result.y.toarray()[0:10, ]

In [ ]:
print(result.y_preds.shape)
result.y_preds[0:10]

In [ ]:
print(result.y.shape)
result.y[0:10,]

In [ ]:
diff = result.y.toarray()[:, 0] - result.y_preds  # densify y so the subtraction stays 1-D instead of broadcasting

In [ ]:
result.calc_square_loss()

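calc_square_loss is also part of the Ridge class; a minimal sketch of the mean squared error it presumably computes on the training predictions (whether it averages or sums the residuals is an assumption):

# assumed definition: average squared residual on the training set
residuals = result.y.toarray()[:, 0] - result.y_preds
square_loss_sketch = np.mean(residuals ** 2)
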
In [ ]:
result.y_preds
