In [1]:
def compute_r_squared(y_train, y_pred):
    """Coefficient of determination: R^2 = 1 - SSE/SST.

    y_train, y_pred -- array-likes of the same shape (targets, predictions).
    Returns a scalar float.  (The original used builtin ``sum`` over (n, 1)
    column vectors, which returned a length-1 array and left stray
    semicolons; ``np.sum`` collapses to a scalar.)
    """
    y_train = np.asarray(y_train)
    y_pred = np.asarray(y_pred)
    # sum of squared errors
    SSE = np.sum((y_train - y_pred) ** 2)
    # total sum of squares
    SST = np.sum((y_train - np.mean(y_train)) ** 2)
    return 1 - SSE / SST
In [2]:
def compute_bucket_correctness(y_train, y_pred):
    """Return True when the prediction is "close enough" to the target.

    Close enough means any of:
      * both values fall into the same EPA PM2.5 bucket,
      * the absolute error is under 5, or
      * the relative error is under 20%.

    y_train -- true PM2.5 reading; y_pred -- model estimate.
    Raises ValueError if either value cannot be bucketed (only possible
    for NaN, since the bucket bounds cover the whole real line).
    """
    # EPA PM2.5 breakpoints, from http://www.epa.gov/pm/2012/decfsstandards.pdf
    buckets = [
        (-float("inf"), 12, 'good'),
        (12, 35.5, 'moderate'),
        (35.5, 55.5, 'unhealthy for sensitive'),
        (55.5, 150.5, 'unhealthy'),
        (150.5, 250.5, 'very unhealthy'),
        # BUG FIX: this bucket was a 2-tuple missing its label, unlike
        # its siblings.
        (250.5, float("inf"), 'hazardous'),
    ]
    correct_bucket = next((b for b in buckets if b[0] <= y_train < b[1]), None)
    guessed_bucket = next((b for b in buckets if b[0] <= y_pred < b[1]), None)
    if correct_bucket is None or guessed_bucket is None:
        # Replaces the old print-then-reraise with an informative exception.
        raise ValueError('could not bucket: train={0!r} pred={1!r}'.format(
            y_train, y_pred))
    if correct_bucket == guessed_bucket:
        return True
    # BUG FIX: check the absolute-error criterion before the relative one,
    # and guard the division, so y_train == 0 no longer raises
    # ZeroDivisionError.  The accepted (y_train, y_pred) pairs are unchanged.
    if abs(y_train - y_pred) < 5:
        return True
    return y_train != 0 and (abs(y_train - y_pred) / y_train) < 0.20
In [3]:
import numpy as np
test_datasets = ['/Users/victorzhong/designproject/'+p+'_raw.mat' for p \
in ['newark', 'mexico', 'newark+mexico']]
# The training data is a subsample of 'newark+mexico'
# It contains randomly sampled images for each of the following PM ranges
# 0-10
# 10-20
# 20-30
# 30-40
# 40-50
# 50-60
# 60+
train_data = '/Users/victorzhong/designproject/merged_raw.mat'
import scipy.io
from sklearn.linear_model import RidgeCV, Ridge, LinearRegression
from IPython.display import Image, display
import pylab as pl
alphas = np.array([ 0.1, 1., 10. ])
print '-'*5, 'Training', '-'*5
print
train_data = scipy.io.loadmat(train_data)
X_train = train_data['feature_matrix']
y_train = train_data['label_vector']
print 'X_train:', X_train.shape
print 'y_train:', y_train.shape
print
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.cross_validation import cross_val_score, KFold
scores = []
fold = 0
correctnesses = []
for train_fold_idx, test_fold_idx in KFold(len(y_train), n_folds=10):
X_train_fold, X_test_fold, y_train_fold, y_test_fold = \
X_train[train_fold_idx], X_train[test_fold_idx], \
y_train[train_fold_idx], y_train[test_fold_idx]
lr = RidgeCV(alphas=alphas, normalize=True, fit_intercept=True)\
.fit(X_train_fold, y_train_fold)
mean_abs_error = np.mean(np.abs(lr.predict(X_test_fold) - y_test_fold))
fold += 1
print 'fold', fold, ':', mean_abs_error
scores.append(mean_abs_error)
plot_train_target = np.transpose(y_train_fold).tolist()[0]
plot_test_target = np.transpose(y_test_fold).tolist()[0]
plot_train_estimate = np.transpose(lr.predict(X_train_fold)).tolist()[0]
plot_test_estimate = np.transpose(lr.predict(X_test_fold)).tolist()[0]
sorted_index = sorted(range(len(plot_train_target)), key=plot_train_target.__getitem__)
x = [ plot_train_target[k] for k in sorted_index ]
y = [ plot_train_estimate[k] for k in sorted_index ]
plt.scatter(x, x, s=10, c='b', marker="s")
plt.scatter(x, y, s=10, c='r', marker="o")
pl.title('Goodness of fit on training data')
pl.xlabel('target PM2.5')
pl.ylabel('estimated PM2.5')
pl.show()
print 'training R^2:', compute_r_squared(y_train_fold, lr.predict(X_train_fold))
bucket_correctness = [compute_bucket_correctness(y_t, y_p) for y_t, y_p in zip(plot_train_target, plot_train_estimate)]
num_right = len([ b for b in bucket_correctness if b == True])
num_total = len(bucket_correctness)
print 'train bucket correctness:', num_right, '/', num_total, '=', float(num_right)/num_total
sorted_index = sorted(range(len(plot_test_target)), key=plot_test_target.__getitem__)
x = [ plot_test_target[k] for k in sorted_index ]
y = [ plot_test_estimate[k] for k in sorted_index ]
plt.scatter(x, x, s=10, c='b', marker="s")
plt.scatter(x, y, s=10, c='r', marker="o")
pl.title('Goodness of fit on test data')
pl.xlabel('target PM2.5')
pl.ylabel('estimated PM2.5')
pl.show()
print 'testing R^2:', compute_r_squared(y_test_fold, lr.predict(X_test_fold))
bucket_correctness = [compute_bucket_correctness(y_t, y_p) for y_t, y_p in zip(plot_test_target, plot_test_estimate)]
num_right = len([ b for b in bucket_correctness if b == True])
num_total = len(bucket_correctness)
print 'test bucket correctness:', num_right, '/', num_total, '=', float(num_right)/num_total
correctnesses.append(float(num_right)/num_total)
print
print "10 fold CV mean absolute error is", np.mean(scores)
print "10 fold mean bucket correctness is", mean(correctnesses)
print 'max PM2.5 in training set:', np.max(y_train)
print 'min PM2.5 in training set:', np.min(y_train)
In [4]:
print 'Linear regression model:'
lr = RidgeCV(alphas=alphas, normalize=True, fit_intercept=True)\
.fit(X_train, y_train)
print lr
means = np.mean(X_train, 0).tolist()
stds = np.std(X_train, 0).tolist()
features = [p.tolist()[0] for p in train_data['parameter_names'][0]]
coefficients = lr.coef_.tolist()[0]
for idx,feature in enumerate(features):
print feature + ':'
print "\t", 'coefficient:', coefficients[idx]
print "\t", 'mean:', means[idx]
print "\t", 'std:', stds[idx]
y_pred_lr = lr.predict(X_train)
# sum of squared errors
SSE = sum((y_train - y_pred_lr) ** 2);
# total sum of squares
SST = sum((y_train - np.mean(y_train)) ** 2);
# Rsquared
Rsquared = 1 - SSE/SST
print 'sum of squared errors:', SSE
print 'total sum of squareds:', SST
print 'R squared:', Rsquared
print "mean abs error: %f" % np.mean(np.abs(y_pred_lr-y_train))
print
print 'bias'
print lr.intercept_
print
print 'coefficients'
print coefficients
print
print 'means'
print means
print
print 'stds'
print stds
print
print 'mean of PM:', np.mean(y_train)
print 'std of PM:', np.std(y_train)
print 'Training goodness of fit:'
plot_train_target = np.transpose(y_train).tolist()[0]
plot_train_estimate = np.transpose(lr.predict(X_train)).tolist()[0]
sorted_index = sorted(range(len(plot_train_target)), key=plot_train_target.__getitem__)
x = [ plot_train_target[k] for k in sorted_index ]
y = [ plot_train_estimate[k] for k in sorted_index ]
plt.scatter(x, x, s=10, c='b', marker="s")
plt.scatter(x, y, s=10, c='r', marker="o")
pl.title('Goodness of fit on training data')
pl.xlabel('target PM2.5')
pl.ylabel('estimated PM2.5')
pl.show()
bucket_correctness = [compute_bucket_correctness(y_t, y_p) for y_t, y_p in zip(plot_train_target, plot_train_estimate)]
num_right = len([ b for b in bucket_correctness if b == True])
num_total = len(bucket_correctness)
print 'train bucket correctness:', num_right, '/', num_total, '=', float(num_right)/num_total
#print 'predictions:'
#print np.transpose(y_pred_lr).tolist()[0]
#print 'targets:'
#print np.transpose(y_train).tolist()[0]
print
In [5]:
for test_dataset in test_datasets:
print '-'*5, 'Testing', test_dataset.split('/')[-1], '-'*5
test_data = scipy.io.loadmat(test_dataset)
X_test = test_data['feature_matrix']
y_test = test_data['label_vector']
print 'X_test:', X_test.shape
means = np.mean(X_test, 0).tolist()
stds = np.std(X_test, 0).tolist()
for idx,feature in enumerate(features):
print feature + ':'
print "\t", 'mean:', means[idx]
print "\t", 'std:', stds[idx]
print
print 'mean of PM:', np.mean(y_test)
print 'std of PM:', np.std(y_test)
print 'Linear Regression:'
y_pred_lr = lr.predict(X_test)
print lr
print
print 'test mean abs error: %f' % np.mean(np.abs(y_pred_lr-y_test))
print 'Training goodness of fit:'
plot_test_target = np.transpose(y_test).tolist()[0]
plot_test_estimate = np.transpose(lr.predict(X_test)).tolist()[0]
sorted_index = sorted(range(len(plot_test_target)), key=plot_test_target.__getitem__)
x = [ plot_test_target[k] for k in sorted_index ]
y = [ plot_test_estimate[k] for k in sorted_index ]
plt.scatter(x, x, s=10, c='b', marker="s")
plt.scatter(x, y, s=10, c='r', marker="o")
pl.title('Goodness of fit on test data: ' + test_dataset.split('/')[-1])
pl.xlabel('target PM2.5')
pl.ylabel('estimated PM2.5')
pl.show()
print 'testing R^2:', compute_r_squared(y_test_fold, lr.predict(X_test_fold))
bucket_correctness = [compute_bucket_correctness(y_t, y_p) for y_t, y_p in zip(plot_test_target, plot_test_estimate)]
num_right = len([ b for b in bucket_correctness if b == True])
num_total = len(bucket_correctness)
print 'test bucket correctness:', num_right, '/', num_total, '=', float(num_right)/num_total
print
print
In [6]:
# Showcase: score hand-picked cropped images with the fitted ridge model.
from IPython.display import Image, display
import os
samples = scipy.io.loadmat(\
'/Users/victorzhong/designproject/showcase/cropped.mat')
# Feature rows as plain Python lists; labels stay a numpy column vector.
feature_matrix = samples['feature_matrix'].tolist()
label_vector = samples['label_vector']
feature_names = [n.tolist()[0] for n in samples['parameter_names'][0]]
print 'features:', feature_names
# Apply the fitted linear model by hand: X . w + bias.
estimates = np.dot(feature_matrix, np.transpose(lr.coef_)) + \
lr.intercept_
print 'mean absolute error:', np.mean(np.abs(estimates - label_vector))
In [7]:
for idx, f in enumerate([f for f in os.listdir(\
'/Users/victorzhong/designproject/showcase/cropped/') \
if f.endswith('.jpg')]):
cropped = '/Users/victorzhong/designproject/showcase/cropped/' + f
full = '/Users/victorzhong/designproject/showcase/full/' + f
print '-'*5, f, '-'*5
print 'full image:'
display(Image(filename=full))
print 'crop:'
display(Image(filename=cropped))
print 'features:'
for k, name in enumerate(feature_names):
print "\t", name + ':', feature_matrix[idx][k]
print 'label:'
print "\t", label_vector.item(idx)
print 'estimate:'
print "\t", estimates.item(idx)
print 'computation:'
for k in range(len(coefficients)):
print "\t", coefficients[k], '*', feature_matrix[idx][k], '+'
print "\t", lr.intercept_.item(0)
print
In [8]:
print 'Evaluating on Newark data'
newark_dataset = '/Users/victorzhong/designproject/newark_raw.mat'
train_data = scipy.io.loadmat(newark_dataset)
X_train = train_data['feature_matrix']
y_train = train_data['label_vector']
print 'X_train:', X_train.shape
print 'y_train:', y_train.shape
print
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.cross_validation import cross_val_score, KFold
scores = []
fold = 0
correctnesses = []
for train_fold_idx, test_fold_idx in KFold(len(y_train), n_folds=10):
X_train_fold, X_test_fold, y_train_fold, y_test_fold = \
X_train[train_fold_idx], X_train[test_fold_idx], \
y_train[train_fold_idx], y_train[test_fold_idx]
lr = RidgeCV(alphas=alphas, normalize=True, fit_intercept=True)\
.fit(X_train_fold, y_train_fold)
mean_abs_error = np.mean(np.abs(lr.predict(X_test_fold) - y_test_fold))
fold += 1
print 'fold', fold, ':', mean_abs_error
scores.append(mean_abs_error)
plot_train_target = np.transpose(y_train_fold).tolist()[0]
plot_test_target = np.transpose(y_test_fold).tolist()[0]
plot_train_estimate = np.transpose(lr.predict(X_train_fold)).tolist()[0]
plot_test_estimate = np.transpose(lr.predict(X_test_fold)).tolist()[0]
sorted_index = sorted(range(len(plot_train_target)), key=plot_train_target.__getitem__)
x = [ plot_train_target[k] for k in sorted_index ]
y = [ plot_train_estimate[k] for k in sorted_index ]
plt.scatter(x, x, s=10, c='b', marker="s")
plt.scatter(x, y, s=10, c='r', marker="o")
pl.title('Goodness of fit on training data')
pl.xlabel('target PM2.5')
pl.ylabel('estimated PM2.5')
pl.show()
print 'training R^2:', compute_r_squared(y_train_fold, lr.predict(X_train_fold))
bucket_correctness = [compute_bucket_correctness(y_t, y_p) for y_t, y_p in zip(plot_train_target, plot_train_estimate)]
num_right = len([ b for b in bucket_correctness if b == True])
num_total = len(bucket_correctness)
print 'train bucket correctness:', num_right, '/', num_total, '=', float(num_right)/num_total
sorted_index = sorted(range(len(plot_test_target)), key=plot_test_target.__getitem__)
x = [ plot_test_target[k] for k in sorted_index ]
y = [ plot_test_estimate[k] for k in sorted_index ]
plt.scatter(x, x, s=10, c='b', marker="s")
plt.scatter(x, y, s=10, c='r', marker="o")
pl.title('Goodness of fit on test data')
pl.xlabel('target PM2.5')
pl.ylabel('estimated PM2.5')
pl.show()
print 'testing R^2:', compute_r_squared(y_test_fold, lr.predict(X_test_fold))
bucket_correctness = [compute_bucket_correctness(y_t, y_p) for y_t, y_p in zip(plot_test_target, plot_test_estimate)]
num_right = len([ b for b in bucket_correctness if b == True])
num_total = len(bucket_correctness)
print 'test bucket correctness:', num_right, '/', num_total, '=', float(num_right)/num_total
correctnesses.append(float(num_right)/num_total)
print
print "10 fold CV mean absolute error is ", np.mean(scores)
print correctnesses
print "10 fold mean bucket correctness is", mean(correctnesses)
print 'max PM2.5 in training set:', np.max(y_train)
print 'min PM2.5 in training set:', np.min(y_train)
In [9]:
print 'Fit on Newark data:'
lr = RidgeCV(alphas=alphas, normalize=True, fit_intercept=True)\
.fit(X_train, y_train)
plot_train_target = np.transpose(y_train).tolist()[0]
plot_train_estimate = np.transpose(lr.predict(X_train)).tolist()[0]
sorted_index = sorted(range(len(plot_train_target)), key=plot_train_target.__getitem__)
x = [ plot_train_target[k] for k in sorted_index ]
y = [ plot_train_estimate[k] for k in sorted_index ]
plt.scatter(x, x, s=10, c='b', marker="s")
plt.scatter(x, y, s=10, c='r', marker="o")
pl.title('Goodness of fit on training data')
pl.xlabel('target PM2.5')
pl.ylabel('estimated PM2.5')
pl.show()
print 'R^2 of training:', compute_r_squared(y_train, lr.predict(X_train))
print 'train mean abs error: %f' % np.mean(np.abs(lr.predict(X_train)-y_train))
bucket_correctness = [compute_bucket_correctness(y_t, y_p) for y_t, y_p in zip(plot_train_target, plot_train_estimate)]
num_right = len([ b for b in bucket_correctness if b == True])
num_total = len(bucket_correctness)
print 'train bucket correctness:', num_right, '/', num_total, '=', float(num_right)/num_total
In [9]: