In [ ]:
import numpy as np
import pandas as pd
import scipy.io as io
import scipy.sparse as sp
In [ ]:
from lasso import RegularizationPathTrainTest
In [ ]:
import matplotlib.pyplot as plt
import matplotlib as mpl
%matplotlib inline
%config InlineBackend.figure_formats=['svg']
import seaborn as sns
In [ ]:
# Load a text file of integers (one upvote label per sample).
# NOTE: np.int was deprecated in NumPy 1.20 and removed in 1.24;
# the builtin int is the correct dtype argument here.
y = np.loadtxt("yelp_data/upvote_labels.txt", dtype=int)
# Load a text file with strings identifying the 1000 features.
with open("yelp_data/upvote_features.txt") as fh:
    featureNames = np.array(fh.read().splitlines())
# Load a csv of floats: values of 1000 features (columns) for 6000 samples (rows).
A = np.genfromtxt("yelp_data/upvote_data.csv", delimiter=",")
# Normalize each feature (column) to unit L2 norm so the lasso penalty
# treats all features on the same scale.
norms = np.linalg.norm(A, axis=0)
norms[norms == 0] = 1.0  # guard: leave all-zero columns unchanged rather than divide by 0
A = A / norms
In [ ]:
# Shuffle the samples into a random order (fixed seed for reproducibility).
np.random.seed(12345)
perm = np.arange(len(y))
np.random.shuffle(perm)
A, y = A[perm, :], y[perm]
In the Yelp question in HW1, please normalize the data so that each feature has the same (unit) L2 norm. We will grade it either way, but please state clearly how you treated the Yelp data, which is not normalized as provided.
http://stackoverflow.com/questions/7140738/numpy-divide-along-axis
In [ ]:
# Split boundaries: (end of training set, end of validation set).
# HW setting: data_splits = (4000, 5000)
data_splits = (2000, 2500)  # smaller splits for faster iteration
In [ ]:
# Partition the shuffled data into train / validation / test sets.
train_end, val_end = data_splits
A_train, y_train = A[:train_end, :], y[:train_end]
A_val, y_val = A[train_end:val_end, :], y[train_end:val_end]
A_test, y_test = A[val_end:, :], y[val_end:]
In [ ]:
# Sanity check: display the training matrix dimensions (samples x features).
A_train.shape
In [ ]:
# Smoke test: run the regularization path on a tiny subset (100 samples,
# 50 features, 2 lambda steps) to verify the pipeline end-to-end quickly.
result = RegularizationPathTrainTest(X_train=A_train[0:100, 0:50], y_train=y_train[0:100], feature_names=featureNames, lam_max=1,
X_val=A_val[0:100, 0:50], y_val=y_val[0:100,], steps=2, frac_decrease=0.05,
delta = 0.001)
In [ ]:
# Inspect the per-lambda results table from the smoke-test run.
result.results_df
In [ ]:
# NOTE(review): analyze_path() presumably walks the lambda path and fills in
# results_df -- confirm against the lasso module's implementation.
result.analyze_path()
In [ ]:
# View the results table again after analyze_path() has run.
result.results_df
In [ ]:
# Full run on the complete train/validation splits: 5 lambda steps starting
# from lam_max=100, each step shrinking lambda by frac_decrease=0.7.
result = RegularizationPathTrainTest(X_train=A_train, y_train=y_train, feature_names=featureNames, lam_max=100,
X_val=A_val, y_val=y_val, steps=5, frac_decrease=0.7, delta=0.01)
In [ ]:
# Compute the regularization path for the full-size run (see smoke test above).
result.analyze_path()
In [ ]:
# Report the shape of each data split (train, validation, test).
for split in (A_train, A_val, A_test):
    print(split.shape)
In [ ]:
# Plot training and validation RMSE against lambda on a log-x scale.
fig, ax = plt.subplots(1, 1, figsize=(5, 4))
data = result.results_df.copy()
# Each series needs a label= -- without one, plt.legend() warns
# ("No artists with labels found") and draws an empty legend box.
plt.semilogx(data['lam'], data['RMSE (validation)'], linestyle='--', marker='o',
             color='g', label='validation')
plt.semilogx(data['lam'], data['RMSE (training)'], linestyle='--', marker='o',
             color='#D1D1D1', label='training')
plt.legend(loc='best')
plt.xlabel('lambda')
plt.ylabel('RMSE')
In [ ]:
# Plot the number of nonzero lasso coefficients against lambda (log-x scale).
fig, ax = plt.subplots(1, 1, figsize=(4, 3))
data = result.results_df.copy()
# label= is required for plt.legend() to have an artist to display;
# the original call produced an empty legend plus a warning.
plt.semilogx(data['lam'], data['# nonzero coefficients'], linestyle='--',
             marker='o', color='b', label='nonzero coefficients')
plt.legend(loc='best')
plt.xlabel('lambda')
plt.ylabel('num nonzero coefficients')
In [ ]:
# Intentional guard: stop a "Run All" of the notebook here so the star-data
# cells below do not execute automatically.
assert False
In [ ]:
# Load a text file of integers (one star label per sample).
# np.int was removed in NumPy 1.24; use the builtin int as the dtype.
y = np.loadtxt("yelp_data/star_labels.txt", dtype=int)
# Load a text file with strings identifying the 2500 features.
with open("yelp_data/star_features.txt") as fh:
    featureNames = fh.read().splitlines()
# Load a matrix-market matrix (45000 samples x 2500 features) and convert it
# to CSC format for efficient column slicing.
A = sp.csc_matrix(io.mmread("yelp_data/star_data.mtx"))
In [ ]: