In [1]:
def compute_r_squared(y_train, y_pred):
    # sum of squared errors
    SSE = sum((y_train - y_pred) ** 2);
    # total sum of squares
    SST = sum((y_train - np.mean(y_train)) ** 2);
    return 1 - SSE/SST

In [2]:
def compute_bucket_correctness(y_train, y_pred):
    # from http://www.epa.gov/pm/2012/decfsstandards.pdf

    buckets = [
               (-float("inf"), 12, 'good'),
               (12, 35.5, 'moderate'),
               (35.5, 55.5, 'unhealthy for sensitive'),
               (55.5, 150.5, 'unhealthy'),
               (150.5, 250.5, 'very unhealthy'),
               (250.5, float("inf")),
               ]
    
    try:
        correct_bucket = [b for b in buckets if b[0] <= y_train and b[1] > y_train][0]
        guessed_bucket = [b for b in buckets if b[0] <= y_pred and b[1] > y_pred][0]
    except Exception as e:
        print 'train', y_train
        print 'test', y_pred
        raise e
    return (correct_bucket == guessed_bucket) or (abs(y_train - y_pred) / y_train) < 0.20 or (abs(y_train - y_pred) < 5)

In [3]:
import numpy as np

test_datasets = ['/Users/victorzhong/designproject/'+p+'_raw.mat' for p \
                 in ['newark', 'mexico', 'newark+mexico']]

# The training data is a subsample of 'newark+mexico'
# It contains randomly sampled images for each of the following PM ranges
# 0-10
# 10-20
# 20-30
# 30-40
# 40-50
# 50-60
# 60+

train_data = '/Users/victorzhong/designproject/merged_raw.mat'

import scipy.io
from sklearn.linear_model import RidgeCV, Ridge, LinearRegression
from IPython.display import Image, display
import pylab as pl

alphas = np.array([ 0.1, 1., 10. ])

print '-'*5, 'Training', '-'*5
print

train_data = scipy.io.loadmat(train_data)
X_train = train_data['feature_matrix']
y_train = train_data['label_vector']

print 'X_train:', X_train.shape
print 'y_train:', y_train.shape
print

from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.cross_validation import cross_val_score, KFold

scores = []
fold = 0
correctnesses = []

for train_fold_idx, test_fold_idx in KFold(len(y_train), n_folds=10):
    X_train_fold, X_test_fold, y_train_fold, y_test_fold = \
        X_train[train_fold_idx], X_train[test_fold_idx], \
        y_train[train_fold_idx], y_train[test_fold_idx]
    
    lr = RidgeCV(alphas=alphas, normalize=True, fit_intercept=True)\
        .fit(X_train_fold, y_train_fold)
    
    mean_abs_error = np.mean(np.abs(lr.predict(X_test_fold) - y_test_fold))
    fold += 1
    print 'fold', fold, ':', mean_abs_error
    scores.append(mean_abs_error)
        
    plot_train_target = np.transpose(y_train_fold).tolist()[0]
    plot_test_target = np.transpose(y_test_fold).tolist()[0]
    plot_train_estimate = np.transpose(lr.predict(X_train_fold)).tolist()[0]
    plot_test_estimate = np.transpose(lr.predict(X_test_fold)).tolist()[0]
    
    sorted_index = sorted(range(len(plot_train_target)), key=plot_train_target.__getitem__)
    
    x = [ plot_train_target[k] for k in sorted_index ]
    y = [ plot_train_estimate[k] for k in sorted_index ]
    
    plt.scatter(x, x, s=10, c='b', marker="s")
    plt.scatter(x, y, s=10, c='r', marker="o")
    
    pl.title('Goodness of fit on training data')
    pl.xlabel('target PM2.5')
    pl.ylabel('estimated PM2.5')
    pl.show()
    
    print 'training R^2:', compute_r_squared(y_train_fold, lr.predict(X_train_fold))
    
    bucket_correctness = [compute_bucket_correctness(y_t, y_p) for y_t, y_p in zip(plot_train_target, plot_train_estimate)]
    num_right = len([ b for b in bucket_correctness if b == True])
    num_total = len(bucket_correctness)
    print 'train bucket correctness:', num_right, '/', num_total, '=', float(num_right)/num_total
    
    sorted_index = sorted(range(len(plot_test_target)), key=plot_test_target.__getitem__)
    
    x = [ plot_test_target[k] for k in sorted_index ]
    y = [ plot_test_estimate[k] for k in sorted_index ]
    
    plt.scatter(x, x, s=10, c='b', marker="s")
    plt.scatter(x, y, s=10, c='r', marker="o")
    
    pl.title('Goodness of fit on test data')
    pl.xlabel('target PM2.5')
    pl.ylabel('estimated PM2.5')
    pl.show()

    print 'testing R^2:', compute_r_squared(y_test_fold, lr.predict(X_test_fold))
    
    bucket_correctness = [compute_bucket_correctness(y_t, y_p) for y_t, y_p in zip(plot_test_target, plot_test_estimate)]
    num_right = len([ b for b in bucket_correctness if b == True])
    num_total = len(bucket_correctness)
    print 'test bucket correctness:', num_right, '/', num_total, '=', float(num_right)/num_total
    correctnesses.append(float(num_right)/num_total)
    
print
print "10 fold CV mean absolute error is", np.mean(scores)
print "10 fold mean bucket correctness is", mean(correctnesses)

print 'max PM2.5 in training set:', np.max(y_train)
print 'min PM2.5 in training set:', np.min(y_train)


----- Training -----

X_train: (60, 8)
y_train: (60, 1)

fold 1 : 10.7636574152
training R^2: 0.574228770245
train bucket correctness: 35 / 54 = 0.648148148148
testing R^2: 0.178187799287
test bucket correctness: 4 / 6 = 0.666666666667
fold 2 : 8.96148198142
training R^2: 0.552861376176
train bucket correctness: 38 / 54 = 0.703703703704
testing R^2: 0.254403408754
test bucket correctness: 2 / 6 = 0.333333333333
fold 3 : 7.30777911444
training R^2: 0.570214872855
train bucket correctness: 37 / 54 = 0.685185185185
testing R^2: 0.0906084268024
test bucket correctness: 5 / 6 = 0.833333333333
fold 4 : 5.8265869547
training R^2: 0.55302671219
train bucket correctness: 36 / 54 = 0.666666666667
testing R^2: 0.38656255764
test bucket correctness: 4 / 6 = 0.666666666667
fold 5 : 20.4657163575
training R^2: 0.656713068128
train bucket correctness: 40 / 54 = 0.740740740741
testing R^2: 0.0848043893084
test bucket correctness: 2 / 6 = 0.333333333333
fold 6 : 5.49143394654
training R^2: 0.569605152974
train bucket correctness: 36 / 54 = 0.666666666667
testing R^2: -0.273437271206
test bucket correctness: 6 / 6 = 1.0
fold 7 : 14.6338298442
training R^2: 0.558964599951
train bucket correctness: 39 / 54 = 0.722222222222
testing R^2: -7.29424163877
test bucket correctness: 3 / 6 = 0.5
fold 8 : 15.2682369291
training R^2: 0.628190262526
train bucket correctness: 38 / 54 = 0.703703703704
testing R^2: 0.0667200381322
test bucket correctness: 3 / 6 = 0.5
fold 9 : 3.29333519361
training R^2: 0.575327302799
train bucket correctness: 36 / 54 = 0.666666666667
testing R^2: 0.059863418744
test bucket correctness: 6 / 6 = 1.0
fold 10 : 12.5139259278
training R^2: 0.593572881869
train bucket correctness: 37 / 54 = 0.685185185185
testing R^2: -0.350893859309
test bucket correctness: 2 / 6 = 0.333333333333

10 fold CV mean absolute error is 10.4525983665
10 fold mean bucket correctness is 0.616666666667
max PM2.5 in training set: 81.0
min PM2.5 in training set: 1.0

In [4]:
print 'Linear regression model:'
lr = RidgeCV(alphas=alphas, normalize=True, fit_intercept=True)\
        .fit(X_train, y_train)
print lr

means = np.mean(X_train, 0).tolist()
stds = np.std(X_train, 0).tolist()

features = [p.tolist()[0] for p in train_data['parameter_names'][0]]
coefficients = lr.coef_.tolist()[0]
for idx,feature in enumerate(features):
    print feature + ':'
    print "\t", 'coefficient:', coefficients[idx]
    print "\t", 'mean:', means[idx]
    print "\t", 'std:', stds[idx]

y_pred_lr = lr.predict(X_train)

# sum of squared errors
SSE = sum((y_train - y_pred_lr) ** 2);
# total sum of squares
SST = sum((y_train - np.mean(y_train)) ** 2);
# Rsquared
Rsquared = 1 - SSE/SST
print 'sum of squared errors:', SSE
print 'total sum of squareds:', SST
print 'R squared:', Rsquared

print "mean abs error: %f" % np.mean(np.abs(y_pred_lr-y_train))

print

print 'bias'
print lr.intercept_
print
print 'coefficients'
print coefficients
print
print 'means'
print means
print
print 'stds'
print stds
print
print 'mean of PM:', np.mean(y_train)
print 'std of PM:', np.std(y_train)

print 'Training goodness of fit:'

plot_train_target = np.transpose(y_train).tolist()[0]
plot_train_estimate = np.transpose(lr.predict(X_train)).tolist()[0]

sorted_index = sorted(range(len(plot_train_target)), key=plot_train_target.__getitem__)

x = [ plot_train_target[k] for k in sorted_index ]
y = [ plot_train_estimate[k] for k in sorted_index ]

plt.scatter(x, x, s=10, c='b', marker="s")
plt.scatter(x, y, s=10, c='r', marker="o")

pl.title('Goodness of fit on training data')
pl.xlabel('target PM2.5')
pl.ylabel('estimated PM2.5')
pl.show()

bucket_correctness = [compute_bucket_correctness(y_t, y_p) for y_t, y_p in zip(plot_train_target, plot_train_estimate)]
num_right = len([ b for b in bucket_correctness if b == True])
num_total = len(bucket_correctness)
print 'train bucket correctness:', num_right, '/', num_total, '=', float(num_right)/num_total

#print 'predictions:'
#print np.transpose(y_pred_lr).tolist()[0]
#print 'targets:'
#print np.transpose(y_train).tolist()[0]

print


Linear regression model:
RidgeCV(alphas=[  0.1   1.   10. ], cv=None, fit_intercept=True,
    gcv_mode=None, loss_func=None, normalize=True, score_func=None,
    scoring=None, store_cv_values=False)
red mean:
	coefficient: 0.113271353219
	mean: 159.202266366
	std: 11.4534559702
red variance:
	coefficient: -0.0902314608418
	mean: 94.6271682155
	std: 102.668315978
green mean:
	coefficient: -0.482407926671
	mean: 185.568984081
	std: 11.5165131172
green variance:
	coefficient: -0.0224936128764
	mean: 50.8034958626
	std: 50.05727111
blue mean:
	coefficient: 0.285058967328
	mean: 196.826537334
	std: 14.464627242
blue variance:
	coefficient: 0.0941005095484
	mean: 70.1403524028
	std: 39.4544388954
red vs blue mean:
	coefficient: -26.7140166708
	mean: 0.813619503715
	std: 0.086281796962
red vs green mean:
	coefficient: 106.872901204
	mean: 0.859399740212
	std: 0.0580927902972
sum of squared errors: 8660.86028966
total sum of squareds: 20653.4618333
R squared: 0.580658179266
mean abs error: 9.407010

bias
[-22.07924974]

coefficients
[0.11327135321853513, -0.09023146084181878, -0.4824079266706217, -0.02249361287640665, 0.2850589673282063, 0.09410050954837819, -26.71401667075946, 106.87290120356111]

means
[159.20226636594103, 94.62716821553927, 185.56898408131386, 50.80349586259108, 196.82653733378706, 70.1403524028437, 0.8136195037148215, 0.859399740211609]

stds
[11.453455970158883, 102.66831597787879, 11.516513117243365, 50.05727110998052, 14.464627241991547, 39.45443889536016, 0.08628179696201313, 0.058092790297244054]

mean of PM: 29.5716666667
std of PM: 18.5532844502
Training goodness of fit:
train bucket correctness: 42 / 60 = 0.7


In [5]:
for test_dataset in test_datasets:

    print '-'*5, 'Testing', test_dataset.split('/')[-1], '-'*5
    
    test_data =  scipy.io.loadmat(test_dataset)
    
    X_test = test_data['feature_matrix']
    y_test = test_data['label_vector']
    
    print 'X_test:', X_test.shape
    
    means = np.mean(X_test, 0).tolist()
    stds = np.std(X_test, 0).tolist()

    for idx,feature in enumerate(features):
        print feature + ':'
        print "\t", 'mean:', means[idx]
        print "\t", 'std:', stds[idx]
    print
    
    print 'mean of PM:', np.mean(y_test)
    print 'std of PM:', np.std(y_test)
    
    print 'Linear Regression:'
    
    y_pred_lr = lr.predict(X_test)
    print lr
    print
    
    print 'test mean abs error: %f' % np.mean(np.abs(y_pred_lr-y_test))
    
    print 'Training goodness of fit:'

    plot_test_target = np.transpose(y_test).tolist()[0]
    plot_test_estimate = np.transpose(lr.predict(X_test)).tolist()[0]
    
    sorted_index = sorted(range(len(plot_test_target)), key=plot_test_target.__getitem__)
    
    x = [ plot_test_target[k] for k in sorted_index ]
    y = [ plot_test_estimate[k] for k in sorted_index ]
    
    plt.scatter(x, x, s=10, c='b', marker="s")
    plt.scatter(x, y, s=10, c='r', marker="o")
    
    pl.title('Goodness of fit on test data: ' + test_dataset.split('/')[-1])
    pl.xlabel('target PM2.5')
    pl.ylabel('estimated PM2.5')
    pl.show()
    
    print 'testing R^2:', compute_r_squared(y_test_fold, lr.predict(X_test_fold))
    bucket_correctness = [compute_bucket_correctness(y_t, y_p) for y_t, y_p in zip(plot_test_target, plot_test_estimate)]
    num_right = len([ b for b in bucket_correctness if b == True])
    num_total = len(bucket_correctness)
    print 'test bucket correctness:', num_right, '/', num_total, '=', float(num_right)/num_total
    
    print
    print


----- Testing newark_raw.mat -----
X_test: (102, 8)
red mean:
	mean: 157.512987667
	std: 14.1547458134
red variance:
	mean: 217.254802953
	std: 97.5947353227
green mean:
	mean: 193.653343687
	std: 7.44164651265
green variance:
	mean: 98.9598535827
	std: 57.3832584987
blue mean:
	mean: 205.636770254
	std: 12.913271971
blue variance:
	mean: 85.2469083247
	std: 39.2774914395
red vs blue mean:
	mean: 0.7706214787
	std: 0.0988997126214
red vs green mean:
	mean: 0.812916730487
	std: 0.0608886631132

mean of PM: 10.2254901961
std of PM: 7.9976919008
Linear Regression:
RidgeCV(alphas=[  0.1   1.   10. ], cv=None, fit_intercept=True,
    gcv_mode=None, loss_func=None, normalize=True, score_func=None,
    scoring=None, store_cv_values=False)

test mean abs error: 7.538682
Training goodness of fit:
testing R^2: -0.0937013807945
test bucket correctness: 71 / 102 = 0.696078431373


----- Testing mexico_raw.mat -----
X_test: (60, 8)
red mean:
	mean: 156.400616965
	std: 4.71778063202
red variance:
	mean: 26.9746008753
	std: 27.5500194193
green mean:
	mean: 179.483001607
	std: 9.31486658144
green variance:
	mean: 17.5293805326
	std: 14.4602030309
blue mean:
	mean: 193.627731864
	std: 15.5315811832
blue variance:
	mean: 52.0037142012
	std: 34.7224632672
red vs blue mean:
	mean: 0.813074654026
	std: 0.071014524447
red vs green mean:
	mean: 0.873481688798
	std: 0.0476883698776

mean of PM: 40.45
std of PM: 17.9345337269
Linear Regression:
RidgeCV(alphas=[  0.1   1.   10. ], cv=None, fit_intercept=True,
    gcv_mode=None, loss_func=None, normalize=True, score_func=None,
    scoring=None, store_cv_values=False)

test mean abs error: 13.942842
Training goodness of fit:
testing R^2: -0.0937013807945
test bucket correctness: 36 / 60 = 0.6


----- Testing newark+mexico_raw.mat -----
X_test: (162, 8)
red mean:
	mean: 157.100998518
	std: 11.6052853122
red variance:
	mean: 146.780654036
	std: 121.331759135
green mean:
	mean: 188.405068843
	std: 10.6690667268
green variance:
	mean: 68.8004191197
	std: 60.8031054357
blue mean:
	mean: 201.188978258
	std: 15.0985984187
blue variance:
	mean: 72.9346142049
	std: 40.9339381731
red vs blue mean:
	mean: 0.786344876969
	std: 0.0919053195584
red vs green mean:
	mean: 0.835348196528
	std: 0.0634979258742

mean of PM: 21.4197530864
std of PM: 19.2984802047
Linear Regression:
RidgeCV(alphas=[  0.1   1.   10. ], cv=None, fit_intercept=True,
    gcv_mode=None, loss_func=None, normalize=True, score_func=None,
    scoring=None, store_cv_values=False)

test mean abs error: 9.910593
Training goodness of fit:
testing R^2: -0.0937013807945
test bucket correctness: 107 / 162 = 0.66049382716



In [6]:
from IPython.display import Image, display
import os

samples = scipy.io.loadmat(\
        '/Users/victorzhong/designproject/showcase/cropped.mat')

feature_matrix = samples['feature_matrix'].tolist()
label_vector = samples['label_vector']
feature_names = [n.tolist()[0] for n in samples['parameter_names'][0]]
print 'features:', feature_names

estimates = np.dot(feature_matrix, np.transpose(lr.coef_)) + \
                    lr.intercept_
print 'mean absolute error:', np.mean(np.abs(estimates - label_vector))


features: [u'red mean', u'red variance', u'green mean', u'green variance', u'blue mean', u'blue variance', u'red vs blue mean', u'red vs green mean']
mean absolute error: 8.04133613806

In [7]:
for idx, f in enumerate([f for f in os.listdir(\
        '/Users/victorzhong/designproject/showcase/cropped/') \
        if f.endswith('.jpg')]):
    
    cropped = '/Users/victorzhong/designproject/showcase/cropped/' + f
    full = '/Users/victorzhong/designproject/showcase/full/' + f
    
    print '-'*5, f, '-'*5
    print 'full image:'
    display(Image(filename=full))
    print 'crop:'
    display(Image(filename=cropped))
    print 'features:'
    for k, name in enumerate(feature_names):
        print "\t", name + ':', feature_matrix[idx][k]
    print 'label:'
    print "\t", label_vector.item(idx)
    print 'estimate:'
    print "\t", estimates.item(idx)
    print 'computation:'
    for k in range(len(coefficients)):
        print "\t", coefficients[k], '*', feature_matrix[idx][k], '+'
    print "\t", lr.intercept_.item(0)
    print


----- newark-2013-09-11-0800-28.5.jpg -----
full image:
crop:
features:
	red mean: 191.024292328
	red variance: 60.1121369782
	green mean: 205.563143307
	green variance: 56.5503542375
	blue mean: 196.431539068
	blue variance: 87.7628185093
	red vs blue mean: 0.972472614301
	red vs green mean: 0.929273065469
label:
	28.5
estimate:
	31.2855693788
computation:
	0.113271353219 * 191.024292328 +
	-0.0902314608418 * 60.1121369782 +
	-0.482407926671 * 205.563143307 +
	-0.0224936128764 * 56.5503542375 +
	0.285058967328 * 196.431539068 +
	0.0941005095484 * 87.7628185093 +
	-26.7140166708 * 0.972472614301 +
	106.872901204 * 0.929273065469 +
	-22.0792497432

----- newark-2013-09-23-1000--0.5.jpg -----
full image:
crop:
features:
	red mean: 144.387540623
	red variance: 238.915014934
	green mean: 201.896350199
	green variance: 96.5587788081
	blue mean: 236.728335042
	blue variance: 31.5360266395
	red vs blue mean: 0.60992927018
	red vs green mean: 0.71515676475
label:
	0.5
estimate:
	3.73602453736
computation:
	0.113271353219 * 144.387540623 +
	-0.0902314608418 * 238.915014934 +
	-0.482407926671 * 201.896350199 +
	-0.0224936128764 * 96.5587788081 +
	0.285058967328 * 236.728335042 +
	0.0941005095484 * 31.5360266395 +
	-26.7140166708 * 0.60992927018 +
	106.872901204 * 0.71515676475 +
	-22.0792497432

----- newark-2013-10-02-0800-23.2.jpg -----
full image:
crop:
features:
	red mean: 174.963890055
	red variance: 200.801520168
	green mean: 199.551905104
	green variance: 135.166803067
	blue mean: 197.481808833
	blue variance: 134.73771451
	red vs blue mean: 0.885974718828
	red vs green mean: 0.876783862141
label:
	23.2
estimate:
	19.3240615471
computation:
	0.113271353219 * 174.963890055 +
	-0.0902314608418 * 200.801520168 +
	-0.482407926671 * 199.551905104 +
	-0.0224936128764 * 135.166803067 +
	0.285058967328 * 197.481808833 +
	0.0941005095484 * 134.73771451 +
	-26.7140166708 * 0.885974718828 +
	106.872901204 * 0.876783862141 +
	-22.0792497432

----- newark-2013-10-04-1000-35.2.jpg -----
full image:
crop:
features:
	red mean: 187.785940123
	red variance: 146.36135831
	green mean: 199.27616071
	green variance: 113.606652281
	blue mean: 193.282356031
	blue variance: 121.59005389
	red vs blue mean: 0.971562764337
	red vs green mean: 0.942340214974
label:
	35.2
estimate:
	28.592138711
computation:
	0.113271353219 * 187.785940123 +
	-0.0902314608418 * 146.36135831 +
	-0.482407926671 * 199.27616071 +
	-0.0224936128764 * 113.606652281 +
	0.285058967328 * 193.282356031 +
	0.0941005095484 * 121.59005389 +
	-26.7140166708 * 0.971562764337 +
	106.872901204 * 0.942340214974 +
	-22.0792497432

----- newark-2014-01-03-1500-11.9.jpg -----
full image:
crop:
features:
	red mean: 127.041387889
	red variance: 205.393846103
	green mean: 178.798789453
	green variance: 60.769453788
	blue mean: 206.107864422
	blue variance: 27.5888656501
	red vs blue mean: 0.616383019859
	red vs green mean: 0.710527114182
label:
	11.9
estimate:
	6.97608463099
computation:
	0.113271353219 * 127.041387889 +
	-0.0902314608418 * 205.393846103 +
	-0.482407926671 * 178.798789453 +
	-0.0224936128764 * 60.769453788 +
	0.285058967328 * 206.107864422 +
	0.0941005095484 * 27.5888656501 +
	-26.7140166708 * 0.616383019859 +
	106.872901204 * 0.710527114182 +
	-22.0792497432

----- simat-2013-12-07-15:00-11.jpg -----
full image:
crop:
features:
	red mean: 143.07964876
	red variance: 54.6033427717
	green mean: 178.81969697
	green variance: 27.7539879589
	blue mean: 206.876687328
	blue variance: 74.775440945
	red vs blue mean: 0.691618038787
	red vs green mean: 0.800133604882
label:
	11.0
estimate:
	35.3574754157
computation:
	0.113271353219 * 143.07964876 +
	-0.0902314608418 * 54.6033427717 +
	-0.482407926671 * 178.81969697 +
	-0.0224936128764 * 27.7539879589 +
	0.285058967328 * 206.876687328 +
	0.0941005095484 * 74.775440945 +
	-26.7140166708 * 0.691618038787 +
	106.872901204 * 0.800133604882 +
	-22.0792497432

----- simat-2013-12-14-12:00-34.jpg -----
full image:
crop:
features:
	red mean: 149.881577135
	red variance: 11.5085570952
	green mean: 165.711088154
	green variance: 11.714080119
	blue mean: 175.893922176
	blue variance: 19.4792171788
	red vs blue mean: 0.852113451565
	red vs green mean: 0.904475245467
label:
	34.0
estimate:
	39.5294356327
computation:
	0.113271353219 * 149.881577135 +
	-0.0902314608418 * 11.5085570952 +
	-0.482407926671 * 165.711088154 +
	-0.0224936128764 * 11.714080119 +
	0.285058967328 * 175.893922176 +
	0.0941005095484 * 19.4792171788 +
	-26.7140166708 * 0.852113451565 +
	106.872901204 * 0.904475245467 +
	-22.0792497432

----- simat-2013-12-18-15:00-56.jpg -----
full image:
crop:
features:
	red mean: 154.885347796
	red variance: 20.036321751
	green mean: 162.241270661
	green variance: 14.6066921646
	blue mean: 163.686398072
	blue variance: 20.9955414948
	red vs blue mean: 0.946232244223
	red vs green mean: 0.954660593849
label:
	56.0
estimate:
	40.4475344705
computation:
	0.113271353219 * 154.885347796 +
	-0.0902314608418 * 20.036321751 +
	-0.482407926671 * 162.241270661 +
	-0.0224936128764 * 14.6066921646 +
	0.285058967328 * 163.686398072 +
	0.0941005095484 * 20.9955414948 +
	-26.7140166708 * 0.946232244223 +
	106.872901204 * 0.954660593849 +
	-22.0792497432

----- simat-2013-12-19-13:00-46.jpg -----
full image:
crop:
features:
	red mean: 156.019851928
	red variance: 14.3292086766
	green mean: 176.574156336
	green variance: 16.6706487484
	blue mean: 188.570196281
	blue variance: 47.5876618014
	red vs blue mean: 0.827383409496
	red vs green mean: 0.883593925441
label:
	46.0
estimate:
	39.3057895488
computation:
	0.113271353219 * 156.019851928 +
	-0.0902314608418 * 14.3292086766 +
	-0.482407926671 * 176.574156336 +
	-0.0224936128764 * 16.6706487484 +
	0.285058967328 * 188.570196281 +
	0.0941005095484 * 47.5876618014 +
	-26.7140166708 * 0.827383409496 +
	106.872901204 * 0.883593925441 +
	-22.0792497432

----- simat-2013-12-21-10:00-22.jpg -----
full image:
crop:
features:
	red mean: 153.563292011
	red variance: 46.497035737
	green mean: 194.142768595
	green variance: 11.4693462837
	blue mean: 217.783006198
	blue variance: 41.0840446137
	red vs blue mean: 0.705120636783
	red vs green mean: 0.790981261483
label:
	22.0
estimate:
	28.8504653243
computation:
	0.113271353219 * 153.563292011 +
	-0.0902314608418 * 46.497035737 +
	-0.482407926671 * 194.142768595 +
	-0.0224936128764 * 11.4693462837 +
	0.285058967328 * 217.783006198 +
	0.0941005095484 * 41.0840446137 +
	-26.7140166708 * 0.705120636783 +
	106.872901204 * 0.790981261483 +
	-22.0792497432


In [8]:
print 'Evaluating on Newark data'

newark_dataset = '/Users/victorzhong/designproject/newark_raw.mat'

train_data = scipy.io.loadmat(newark_dataset)

X_train = train_data['feature_matrix']
y_train = train_data['label_vector']

print 'X_train:', X_train.shape
print 'y_train:', y_train.shape
print

from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.cross_validation import cross_val_score, KFold

scores = []
fold = 0
correctnesses = []

for train_fold_idx, test_fold_idx in KFold(len(y_train), n_folds=10):
    X_train_fold, X_test_fold, y_train_fold, y_test_fold = \
        X_train[train_fold_idx], X_train[test_fold_idx], \
        y_train[train_fold_idx], y_train[test_fold_idx]
    
    lr = RidgeCV(alphas=alphas, normalize=True, fit_intercept=True)\
        .fit(X_train_fold, y_train_fold)
    
    mean_abs_error = np.mean(np.abs(lr.predict(X_test_fold) - y_test_fold))
    fold += 1
    print 'fold', fold, ':', mean_abs_error
    scores.append(mean_abs_error)
        
    plot_train_target = np.transpose(y_train_fold).tolist()[0]
    plot_test_target = np.transpose(y_test_fold).tolist()[0]
    plot_train_estimate = np.transpose(lr.predict(X_train_fold)).tolist()[0]
    plot_test_estimate = np.transpose(lr.predict(X_test_fold)).tolist()[0]
    
    sorted_index = sorted(range(len(plot_train_target)), key=plot_train_target.__getitem__)
    
    x = [ plot_train_target[k] for k in sorted_index ]
    y = [ plot_train_estimate[k] for k in sorted_index ]
    
    plt.scatter(x, x, s=10, c='b', marker="s")
    plt.scatter(x, y, s=10, c='r', marker="o")
    
    pl.title('Goodness of fit on training data')
    pl.xlabel('target PM2.5')
    pl.ylabel('estimated PM2.5')
    pl.show()
    
    print 'training R^2:', compute_r_squared(y_train_fold, lr.predict(X_train_fold))
    
    bucket_correctness = [compute_bucket_correctness(y_t, y_p) for y_t, y_p in zip(plot_train_target, plot_train_estimate)]
    num_right = len([ b for b in bucket_correctness if b == True])
    num_total = len(bucket_correctness)
    print 'train bucket correctness:', num_right, '/', num_total, '=', float(num_right)/num_total
    
    sorted_index = sorted(range(len(plot_test_target)), key=plot_test_target.__getitem__)
    
    x = [ plot_test_target[k] for k in sorted_index ]
    y = [ plot_test_estimate[k] for k in sorted_index ]
    
    plt.scatter(x, x, s=10, c='b', marker="s")
    plt.scatter(x, y, s=10, c='r', marker="o")
    
    pl.title('Goodness of fit on test data')
    pl.xlabel('target PM2.5')
    pl.ylabel('estimated PM2.5')
    pl.show()

    print 'testing R^2:', compute_r_squared(y_test_fold, lr.predict(X_test_fold))
    
    bucket_correctness = [compute_bucket_correctness(y_t, y_p) for y_t, y_p in zip(plot_test_target, plot_test_estimate)]
    num_right = len([ b for b in bucket_correctness if b == True])
    num_total = len(bucket_correctness)
    print 'test bucket correctness:', num_right, '/', num_total, '=', float(num_right)/num_total
    correctnesses.append(float(num_right)/num_total)
    
print
print "10 fold CV mean absolute error is ", np.mean(scores)
print correctnesses
print "10 fold mean bucket correctness is", mean(correctnesses)

print 'max PM2.5 in training set:', np.max(y_train)
print 'min PM2.5 in training set:', np.min(y_train)


Evaluating on Newark data
X_train: (102, 8)
y_train: (102, 1)

fold 1 : 7.56676183583
training R^2: 0.517524046203
train bucket correctness: 81 / 91 = 0.89010989011
testing R^2: 0.530463507937
test bucket correctness: 9 / 11 = 0.818181818182
fold 2 : 4.56282683839
training R^2: 0.53059479818
train bucket correctness: 82 / 91 = 0.901098901099
testing R^2: 0.489936294956
test bucket correctness: 10 / 11 = 0.909090909091
fold 3 : 3.76146397753
training R^2: 0.518051154239
train bucket correctness: 79 / 92 = 0.858695652174
testing R^2: 0.546900858468
test bucket correctness: 10 / 10 = 1.0
fold 4 : 5.55234071409
training R^2: 0.486689191861
train bucket correctness: 80 / 92 = 0.869565217391
testing R^2: 0.301454797266
test bucket correctness: 9 / 10 = 0.9
fold 5 : 4.59859730791
training R^2: 0.530359302571
train bucket correctness: 82 / 92 = 0.891304347826
testing R^2: 0.296902557177
test bucket correctness: 9 / 10 = 0.9
fold 6 : 4.90611724323
training R^2: 0.531299057294
train bucket correctness: 80 / 92 = 0.869565217391
testing R^2: 0.259819358627
test bucket correctness: 7 / 10 = 0.7
fold 7 : 2.46329670319
training R^2: 0.534171002318
train bucket correctness: 80 / 92 = 0.869565217391
testing R^2: -0.401233425474
test bucket correctness: 10 / 10 = 1.0
fold 8 : 3.44584967279
training R^2: 0.514125399841
train bucket correctness: 80 / 92 = 0.869565217391
testing R^2: 0.631483550009
test bucket correctness: 10 / 10 = 1.0
fold 9 : 5.64779590434
training R^2: 0.585836709993
train bucket correctness: 83 / 92 = 0.902173913043
testing R^2: -2.06216222592
test bucket correctness: 6 / 10 = 0.6
fold 10 : 5.5049919179
training R^2: 0.576841259453
train bucket correctness: 83 / 92 = 0.902173913043
testing R^2: -4.57008517872
test bucket correctness: 7 / 10 = 0.7

10 fold CV mean absolute error is  4.80100421152
[0.8181818181818182, 0.9090909090909091, 1.0, 0.9, 0.9, 0.7, 1.0, 1.0, 0.6, 0.7]
10 fold mean bucket correctness is 0.852727272727
max PM2.5 in training set: 36.2
min PM2.5 in training set: 0.2

In [9]:
print 'Fit on Newark data:'

lr = RidgeCV(alphas=alphas, normalize=True, fit_intercept=True)\
        .fit(X_train, y_train)
    
plot_train_target = np.transpose(y_train).tolist()[0]
plot_train_estimate = np.transpose(lr.predict(X_train)).tolist()[0]

sorted_index = sorted(range(len(plot_train_target)), key=plot_train_target.__getitem__)

x = [ plot_train_target[k] for k in sorted_index ]
y = [ plot_train_estimate[k] for k in sorted_index ]

plt.scatter(x, x, s=10, c='b', marker="s")
plt.scatter(x, y, s=10, c='r', marker="o")

pl.title('Goodness of fit on training data')
pl.xlabel('target PM2.5')
pl.ylabel('estimated PM2.5')
pl.show()

print 'R^2 of training:', compute_r_squared(y_train, lr.predict(X_train))

print 'train mean abs error: %f' % np.mean(np.abs(lr.predict(X_train)-y_train))

bucket_correctness = [compute_bucket_correctness(y_t, y_p) for y_t, y_p in zip(plot_train_target, plot_train_estimate)]
num_right = len([ b for b in bucket_correctness if b == True])
num_total = len(bucket_correctness)
print 'train bucket correctness:', num_right, '/', num_total, '=', float(num_right)/num_total


Fit on Newark data:
R^2 of training: 0.530390994729
train mean abs error: 4.391091
train bucket correctness: 90 / 102 = 0.882352941176

In [9]: