In [ ]:
import numpy as np
import pandas as pd
import scipy.io as io
import scipy.sparse as sp

In [ ]:
from lasso import RegularizationPathTrainTest

In [ ]:
import matplotlib.pyplot as plt
import matplotlib as mpl

%matplotlib inline
%config InlineBackend.figure_formats=['svg']

import seaborn as sns

Upvote data


In [ ]:
# Load a text file of integers (the upvote count label for each review).
# NOTE: np.int was removed in NumPy 1.24 -- use the builtin int instead.
y = np.loadtxt("yelp_data/upvote_labels.txt", dtype=int)
# Load a text file with strings identifying the 1000 features:
featureNames = open("yelp_data/upvote_features.txt").read().splitlines()
featureNames = np.array(featureNames)
# Load a csv of floats, which are the values of 1000 features (columns) for 6000 samples (rows):
A = np.genfromtxt("yelp_data/upvote_data.csv", delimiter=",")
# Normalize each feature column to unit L2 norm (per the HW note below).
# np.linalg.norm with axis= is the vectorized form of apply_along_axis(norm, 0, A).
norms = np.linalg.norm(A, axis=0)
A = A / norms
# Sanity check (should be all ones): np.linalg.norm(A, axis=0)

In [ ]:
# Put the samples into a random but reproducible order, so the
# train/validation/test split below is not biased by file order.
np.random.seed(12345)
# permutation(n) draws exactly the same ordering as shuffling arange(n)
# with the same seed, so results are unchanged.
perm = np.random.permutation(len(y))
A = A[perm, :]
y = y[perm]

For the Yelp question in HW1, please normalize the data so that every feature column has the same L2 norm (here each column is scaled to unit L2 norm). We will grade it either way, but please state clearly how you treated the Yelp data, which is not normalized as distributed.

http://stackoverflow.com/questions/7140738/numpy-divide-along-axis

```python
toy = np.array([[0., 1, 100.], [1, 10., 1000]])
print(toy)
norms = np.apply_along_axis(np.linalg.norm, 0, toy)  # np.linalg.norm defaults to L2
toy = toy / norms
print(toy)
# check the norms:
np.apply_along_axis(np.linalg.norm, 0, toy)
```

In [ ]:
# Cut points into the shuffled data: (end of train, end of validation).
#data_splits = (4000, 5000) # HW setting
data_splits = (2000, 2500)  # faster setting

In [ ]:
# Carve the shuffled data into train / validation / test partitions.
train_end, val_end = data_splits
A_train, y_train = A[:train_end, :], y[:train_end]
A_val, y_val = A[train_end:val_end, :], y[train_end:val_end]
A_test, y_test = A[val_end:, :], y[val_end:]

In [ ]:
# Sanity check: display the dimensions of the training split.
A_train.shape

Train models for varying lambda values. Calculate training error for each model.


In [ ]:
# Smoke test on a tiny slice (100 samples x 50 features) to confirm the
# regularization-path machinery runs end to end before the full fit.
# NOTE(review): presumably fits a lasso path starting at lam_max and shrinking
# lambda by frac_decrease each of `steps` steps -- confirm against lasso.py.
result = RegularizationPathTrainTest(X_train=A_train[0:100, 0:50], y_train=y_train[0:100], feature_names=featureNames, lam_max=1, 
                                     X_val=A_val[0:100, 0:50], y_val=y_val[0:100,], steps=2, frac_decrease=0.05,
                                    delta = 0.001)

In [ ]:
# Inspect the raw results table before analyze_path() is called.
result.results_df

In [ ]:
# Compute summary metrics along the regularization path.
# NOTE(review): appears to update result.results_df in place -- verify in lasso.py.
result.analyze_path()

In [ ]:
# Re-display the results table after analyze_path() has run.
result.results_df

In [ ]:
# Full fit on the train/validation split: start at lam_max=100 and shrink
# lambda by 30% per step (frac_decrease=0.7) for 5 steps.
result = RegularizationPathTrainTest(X_train=A_train, y_train=y_train, feature_names=featureNames, lam_max=100, 
                                     X_val=A_val, y_val=y_val, steps=5, frac_decrease=0.7, delta=0.01)

In [ ]:
# Compute RMSE and sparsity metrics along the full regularization path.
result.analyze_path()

In [ ]:
# Report the size of each data partition.
for split in (A_train, A_val, A_test):
    print(split.shape)

In [ ]:
# Plot training vs. validation RMSE as a function of lambda (log-scale x-axis).
fig, ax = plt.subplots(1, 1, figsize=(5, 4))
data = result.results_df.copy()
# label= is required on each curve: plt.legend() skips unlabeled artists,
# so the original legend rendered nothing and emitted a "no handles" warning.
ax.semilogx(data['lam'], data['RMSE (validation)'], linestyle='--', marker='o',
            color='g', label='validation')
ax.semilogx(data['lam'], data['RMSE (training)'], linestyle='--', marker='o',
            color='#D1D1D1', label='training')
ax.legend(loc='best')
ax.set_xlabel('lambda')
ax.set_ylabel('RMSE')
#ax.set_ylim([0.55, 1.05])

In [ ]:
# Plot model sparsity (number of nonzero coefficients) vs. lambda.
fig, ax = plt.subplots(1, 1, figsize=(4, 3))
data = result.results_df.copy()
# A label is required for the legend to show anything (and to avoid the
# "no handles with labels" warning from plt.legend on unlabeled artists).
ax.semilogx(data['lam'], data['# nonzero coefficients'], linestyle='--',
            marker='o', color='b', label='nonzero coefficients')
ax.legend(loc='best')
ax.set_xlabel('lambda')
ax.set_ylabel('num nonzero coefficients')
#ax.set_ylim([0.55, 1.05])

In [ ]:
# Intentional stop: halts "Restart & Run All" here so the much larger
# Star-data section below is not executed by accident.
assert False

Star data


In [ ]:
# Load a text file of integers (the star-rating label for each review).
# NOTE: np.int was removed in NumPy 1.24 -- use the builtin int instead.
y = np.loadtxt("yelp_data/star_labels.txt", dtype=int)
# Load a text file with strings identifying the 2500 features:
featureNames = open("yelp_data/star_features.txt").read().splitlines()
# Load a matrix market matrix with 45000 samples of 2500 features, convert it to csc format:
A = sp.csc_matrix(io.mmread("yelp_data/star_data.mtx"))

In [ ]: