In [1]:
%matplotlib inline
from bs4 import BeautifulSoup
from glob import glob
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import collections
import cookielib
import datetime
import locale
import math
import matplotlib.pyplot as plt
import mechanize
import numpy as np
import os
import pandas as pd
import pandas.io.sql as psql
import pickle
from PIL import Image
import pymysql as mdb
import re
import requests
import string
import sys
import time
import unidecode
import urllib
import urllib2
# import MySQLdb
from skimage import io
from skimage import color
from sklearn import preprocessing
from sklearn import linear_model
from sklearn.feature_selection import RFE, RFECV
import astroML
from astroML.plotting import hist
plt.rcParams['figure.figsize'] = 16, 8
locale.setlocale( locale.LC_ALL, '' )
Out[1]:
In [2]:
for infile in sorted(glob("../artists_html/*.html")):
jpg_name = ".".join(infile.split(".")[:-1] + ["jpg"])
if not os.path.isfile(jpg_name):
print infile, jpg_name, "removing", infile
os.remove(infile)
In [3]:
# Extract the artist slug (everything before the trailing "-<id>.html")
# from each scraped filename; one entry per page, duplicates kept so we
# can count lots per artist later.
artists = [
    "-".join(path.split("-")[:-1]).split("/")[-1]
    for path in sorted(glob("../artists_html/*.html"))
]
In [4]:
# Keep only artists with more than 100 scraped lots -- the names with
# enough sales history to be worth modelling.
c = collections.Counter(artists)
artist_list = [name for name, count in c.most_common() if count > 100]
artist_list
Out[4]:
In [6]:
master_dictionary = {}
for running_number, html_file in enumerate(sorted(glob("../artists_html/*.html"))):
if not running_number % 5000:
print running_number
with open(html_file, 'r') as file_handle:
soup = BeautifulSoup(file_handle)
test_artist = "-".join(html_file.split("-")[:-1]).split("/")[-1]
painting_id = html_file.split("-")[-1].strip(".html")
try:
if test_artist not in master_dictionary:
master_dictionary[test_artist] = {}
for cssid in soup.findAll('div', {"class":re.compile("lsc_adjud")}):
if cssid.string:
if not cssid.string.strip().split("\n")[1].strip()[0] == "$":
continue
hammer_price = float("".join([x for x in cssid.string.strip().split("\n")[1].strip("$") if x in string.digits or x == "."]))
master_dictionary[test_artist][painting_id] = {}
master_dictionary[test_artist][painting_id]['hammer_price'] = hammer_price
master_dictionary[test_artist][painting_id]['log_hammer_price'] = np.log(hammer_price)
for cssid in soup.findAll('div', {"class":re.compile("lsc_title")}):
master_dictionary[test_artist][painting_id]['title'] = unidecode.unidecode(cssid.a.text)
year = "".join([number for number in cssid.date.string.strip().strip("(").strip(")").strip("c.") if number in string.digits][:4])
if year:
try:
master_dictionary[test_artist][painting_id]['creation_date'] = datetime.datetime.toordinal(datetime.datetime(int(year), 1, 1))
except:
print "PROBLEM", year
for cssid in soup.findAll('div', {"class":re.compile("lsc_country")}):
sale_country, sale_date = cssid.string.strip().split("\n")
master_dictionary[test_artist][painting_id]['sale_country'] = sale_country.lower().strip(",")
master_dictionary[test_artist][painting_id]['sale_date'] = datetime.datetime.toordinal(datetime.datetime.strptime(sale_date.strip(), "%m-%d-%Y"))
for cssid in soup.findAll('div', {"class":re.compile("lsc_details")}):
details_raw = cssid.string.strip().split("\n")
if len(details_raw) == 4:
category, _, materials_raw, dimensions_raw = details_raw
materials = materials_raw.strip().strip(',')
master_dictionary[test_artist][painting_id]['category'] = category
master_dictionary[test_artist][painting_id]['materials'] = materials
if dimensions_raw.strip().split().count('cm') == 2:
length, width = float(dimensions_raw.strip().split()[0]), float(dimensions_raw.strip().split()[3])
area = length * width
diagonal_length = np.sqrt(length ** 2.0 + width ** 2.0)
master_dictionary[test_artist][painting_id]['area'] = area
master_dictionary[test_artist][painting_id]['diagonal_length'] = length / width
if len(details_raw) != 4:
category = details_raw[0]
materials = details_raw[2].strip().strip(',')
master_dictionary[test_artist][painting_id]['category'] = category
master_dictionary[test_artist][painting_id]['materials'] = materials
for cssid in soup.findAll('div', {"class":re.compile("lsc_estimate")}):
if len(cssid.string.strip().split()) == 4:
low_estimate = float("".join([x for x in cssid.string.strip().split()[1].strip("$") if x in string.digits or x == "."]))
high_estimate = float("".join([x for x in cssid.string.strip().split()[3].strip("$") if x in string.digits or x == "."]))
auction_estimate = np.average([low_estimate, high_estimate])
# master_dictionary[test_artist][painting_id]['low_estimate'] = low_estimate
# master_dictionary[test_artist][painting_id]['high_estimate'] = high_estimate
# master_dictionary[test_artist][painting_id]['auction_estimate'] = auction_estimate
master_dictionary[test_artist][painting_id]['log_low_estimate'] = np.log(low_estimate)
master_dictionary[test_artist][painting_id]['log_high_estimate'] = np.log(high_estimate)
master_dictionary[test_artist][painting_id]['log_auction_estimate'] = np.log(auction_estimate)
for cssid in soup.findAll('div', {"class":re.compile("lsc_auctioneer")}):
master_dictionary[test_artist][painting_id]['auction_house'] = cssid.string.split("\n")[1].strip().strip(',')
except:
print html_file
In [63]:
artist_list = []
c = collections.Counter(artists)
for artist, number in c.most_common():
if number > 100:
artist_list.append(artist)
sale_countries = []
auction_houses = []
features = ['creation_date',
'diagonal_length',
'area',
'sale_date',
# 'low_estimate',
# 'high_estimate',
# 'auction_estimate',
'log_low_estimate',
'log_high_estimate',
'log_auction_estimate',
]
for artist in artist_list:
for x in master_dictionary[artist]:
category = master_dictionary[artist][x]['category']
if not category == "Painting":
continue
if all(feature in master_dictionary[artist][x] for feature in features):
if 'log_low_estimate' in master_dictionary[artist][x]:
# print "hi"
sale_countries.append(master_dictionary[artist][x]['sale_country'].strip())
auction_houses.append(master_dictionary[artist][x]['auction_house'].strip())
auctionhouse_list = []
auc = collections.Counter(auction_houses)
for auction_house, number in auc.most_common():
if number > 25:
auctionhouse_list.append(auction_house)
auctionhouse_list.append("unknown_auction")
country_list = []
country_counter = collections.Counter(sale_countries)
for country, number in country_counter.most_common():
if number > 25:
country_list.append(country)
country_list.append("unknown_country")
print artist_list, country_list, auctionhouse_list
In [64]:
# Quick interactive peek: the feature names plus the last record touched
# by the previous loop (relies on leaked loop variables `artist`, `x`).
features, master_dictionary[artist][x]
Out[64]:
In [9]:
# Run image analysis: attach the mean red/green/blue channel values of
# each painting's JPG to its record.  Lots whose image is missing or
# cannot be decoded as RGB are skipped (deliberate best effort).
for artist in artist_list:
    for x in master_dictionary[artist]:
        if master_dictionary[artist][x]['category'] != "Painting":
            continue
        jpg_name = "../artists_html/" + artist + "-" + str(x) + ".jpg"
        try:
            image = io.imread(jpg_name)
            # Averaging over both spatial axes leaves one value per channel;
            # greyscale (2-D) images fail the unpack and are skipped below.
            red, green, blue = np.average(np.average(image, axis=0), axis=0)
            master_dictionary[artist][x]['red'] = red
            master_dictionary[artist][x]['green'] = green
            master_dictionary[artist][x]['blue'] = blue
        except Exception:
            # Narrowed from a bare `except:` (which also swallowed
            # KeyboardInterrupt/SystemExit); skip behaviour preserved.
            pass
In [70]:
# Build the design matrix.  Each qualifying painting contributes a row of
# numeric features + one-hot artist + one-hot auction house + one-hot
# country.  Rows are split chronologically: training = sales before
# 2013-09-01, testing = sales on or after that date.
features = ['creation_date',
            'diagonal_length',
            'area',
            'sale_date',
            'red',
            'green',
            'blue',
            # 'low_estimate',
            # 'high_estimate',
            # 'auction_estimate',
            'log_low_estimate',
            'log_high_estimate',
            'log_auction_estimate',
            ]
big_X = []
log_big_Y = []
testing_X = []
training_X = []
log_testing_Y = []
log_training_Y = []
restrictions = []
information = []
training_information = []
testing_information = []
# One-hot encoders for the three categorical vocabularies.
label = preprocessing.LabelBinarizer()
label.fit(np.array(artist_list))
auction_label = preprocessing.LabelBinarizer()
auction_label.fit(np.array(auctionhouse_list))
country_label = preprocessing.LabelBinarizer()
country_label.fit(np.array(country_list))
keys = []
# Hoisted loop-invariant: ordinal of the train/test cutoff date.
cutoff = datetime.datetime.toordinal(datetime.datetime(2013, 9, 1))
for artist in artist_list:
    for x in master_dictionary[artist]:
        record = master_dictionary[artist][x]
        log_price = record['log_hammer_price']
        if record['category'] != "Painting":
            continue
        if all(feature in record for feature in features):
            # Bin rare categorical values into the "unknown" buckets.
            sale_country = record['sale_country'].strip()
            if sale_country not in country_list:
                sale_country = "unknown_country"
            auction_house = record['auction_house'].strip()
            if auction_house not in auctionhouse_list:
                auction_house = "unknown_auction"
            feature_list = [record[feature] for feature in features]
            label_list = list(label.transform([artist])[0])
            auction_list = list(auction_label.transform([auction_house])[0])
            temp_country_list = list(country_label.transform([sale_country])[0])
            clever_list = feature_list + label_list + auction_list + temp_country_list
            # Human-readable companion row for each X row.
            info_row = [artist, x, auction_house, sale_country] + feature_list
            big_X.append(clever_list)
            log_big_Y.append(log_price)
            information.append(info_row)
            if record['sale_date'] < cutoff:
                training_X.append(clever_list)
                log_training_Y.append(log_price)
                training_information.append(info_row)
            else:
                # BUG FIX: the original used `elif sale_date > cutoff`,
                # silently dropping lots sold exactly on the cutoff date
                # from both splits.
                testing_X.append(clever_list)
                log_testing_Y.append(log_price)
                testing_information.append(info_row)
big_X = np.array(big_X)
log_big_Y = np.array(log_big_Y)
testing_X = np.array(testing_X)
training_X = np.array(training_X)
log_testing_Y = np.array(log_testing_Y)
log_training_Y = np.array(log_training_Y)
all_features = features + artist_list + auctionhouse_list + country_list
all_features
Out[70]:
In [71]:
# Display the numeric feature names used in the design matrix.
features
Out[71]:
In [72]:
# Auction-house baseline: their own prediction is the log of the mean of
# their low/high estimates, which sits at a fixed column of the
# *_information rows: [artist, x, auction_house, sale_country] followed
# by the 10 numeric features, of which 'log_auction_estimate' is last,
# giving overall index 13 (was a bare magic number before).
LOG_AUCTION_ESTIMATE_COL = 13
log_auction_training_predicted = np.array(
    [row[LOG_AUCTION_ESTIMATE_COL] for row in training_information])
log_auction_testing_predicted = np.array(
    [row[LOG_AUCTION_ESTIMATE_COL] for row in testing_information])
log_auction_predicted = np.array(
    [row[LOG_AUCTION_ESTIMATE_COL] for row in information])
In [73]:
# Plot the mean auction range prediction vs price and error
# Fit ordinary least squares on standardized features and compare
# predicted vs. actual log hammer price for the train/test split.
plt.rcParams['figure.figsize'] = 12, 12
plt.rcParams['legend.scatterpoints'] = 1
# Get transformation function
# Standardize using training-set statistics only, then apply everywhere.
scaler = preprocessing.StandardScaler().fit(training_X)
new_training_X = scaler.transform(training_X)
new_testing_X = scaler.transform(testing_X)
new_big_X = scaler.transform(big_X)
log_linear = linear_model.LinearRegression()
log_linear.fit(new_training_X, log_training_Y)
log_testing_predicted = log_linear.predict(new_testing_X)
log_training_predicted = log_linear.predict(new_training_X)
print "Score:", log_linear.score(new_testing_X, log_testing_Y)
sca1 = plt.scatter(log_training_Y, log_linear.predict(new_training_X), color='blue', alpha=0.35, s=60,
                   label="Training (" + str(len(log_training_Y)) + ")")
sca2 = plt.scatter(log_testing_Y, log_linear.predict(new_testing_X), color='red', alpha=0.75, s=60,
                   label="Testing (" + str(len(log_testing_Y)) + ")")
# Axes are in log-dollars; fixed limits keep all model plots comparable.
price_lower = 6.0
price_upper = 18.0
plt.xlim(price_lower, price_upper)
plt.ylim(price_lower, price_upper)
# Diagonal reference line: perfect prediction.
plt.plot(np.linspace(price_lower, price_upper, 10), np.linspace(price_lower, price_upper, 10), color='black')
fontsize = 25.0
# Relabel log-space ticks as rounded dollar amounts (cents dropped).
plt.axes().set_xticklabels([str(locale.currency(round(np.e**(x),-2), grouping=True))[:-3] for x in plt.axes().get_xticks()])
plt.axes().set_yticklabels([str(locale.currency(round(np.e**(x),-2), grouping=True))[:-3] for x in plt.axes().get_yticks()])
plt.xticks(rotation=55)
plt.xticks(fontsize=fontsize)
plt.yticks(fontsize=fontsize)
plt.ylabel("Predicted [$]", fontsize=fontsize)
plt.xlabel("Actual Hammer Price [$]", fontsize=fontsize)
# Root-mean-square errors in log space for each split.
MSE_training = np.sqrt(np.sum((log_training_Y - log_linear.predict(new_training_X))**2.0/ len(log_training_Y)))
MSPE_testing = np.sqrt(np.sum((log_testing_Y - log_linear.predict(new_testing_X))**2.0/len(log_testing_Y)))
first_legend = plt.legend(fontsize=fontsize, loc=2)
ax = plt.gca().add_artist(first_legend)
plt.legend([sca1, sca2], ["MSE Training " + str(round(MSE_training,3)),
                          "MSPE Testing " + str(round(MSPE_testing,3))], loc=4, fontsize=fontsize)
plt.tight_layout()
# plt.savefig("2014-09-24-price-prediction.pdf")
In [74]:
#log_linear.coef_
for thing, coef in zip(all_features, log_linear.coef_):
print thing, coef
# = features + artist_list + auctionhouse_list + country_list
# things
In [75]:
# Baseline comparison on the test set: the auction houses' own
# log-estimate (raw column 9 of testing_X = 'log_auction_estimate')
# against our model's predictions.
plt.rcParams['figure.figsize'] = 12, 12
plt.rcParams['legend.scatterpoints'] = 1
sca1 = plt.scatter(log_testing_Y, testing_X[:, 9], color='blue', alpha=0.35, s=60,
                   label="Auction (" + str(len(log_testing_Y)) + ")")
sca2 = plt.scatter(log_testing_Y, log_linear.predict(new_testing_X), color='red', alpha=0.75, s=60,
                   label="Testing (" + str(len(log_testing_Y)) + ")")
price_lower = 6.0
price_upper = 18.0
plt.xlim(price_lower, price_upper)
plt.ylim(price_lower, price_upper)
plt.plot(np.linspace(price_lower, price_upper, 10), np.linspace(price_lower, price_upper, 10), color='black')
fontsize = 25.0
# Relabel log-space ticks as rounded dollar amounts.
plt.axes().set_xticklabels([str(locale.currency(round(np.e**(x),-2), grouping=True))[:-3] for x in plt.axes().get_xticks()])
plt.axes().set_yticklabels([str(locale.currency(round(np.e**(x),-2), grouping=True))[:-3] for x in plt.axes().get_yticks()])
plt.xticks(rotation=55)
plt.xticks(fontsize=fontsize)
plt.yticks(fontsize=fontsize)
plt.ylabel("Predicted [$]", fontsize=fontsize)
plt.xlabel("Actual Hammer Price [$]", fontsize=fontsize)
# BUG FIX: this quantity is computed on the *testing* set and shown in
# the legend as "MSAE Testing" -- it was misleadingly named
# MSAE_training before.
MSAE_testing = np.sqrt(np.sum((log_testing_Y - testing_X[:,9])**2.0/ len(log_testing_Y)))
MSPE_testing = np.sqrt(np.sum((log_testing_Y - log_linear.predict(new_testing_X))**2.0/len(log_testing_Y)))
first_legend = plt.legend(fontsize=fontsize, loc=2)
ax = plt.gca().add_artist(first_legend)
plt.legend([sca1, sca2], ["MSAE Testing " + str(round(MSAE_testing, 3)),
                          "MSPE Testing " + str(round(MSPE_testing, 3))], loc=4, fontsize=fontsize)
plt.tight_layout()
# plt.savefig("2014-09-24-price-prediction.pdf")
In [76]:
from sklearn.linear_model import LassoCV, LassoLarsCV, ElasticNetCV
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor, ExtraTreesRegressor
In [78]:
# Random-forest benchmark: identical layout to the linear-model plot.
# NOTE(review): this plotting code is copy-pasted across four model
# cells -- a shared plot_predictions(regressor, title) helper would
# remove the duplication.
rf_regressor = RandomForestRegressor(oob_score=True, n_estimators=2000, max_features='auto')
rf_regressor.fit(new_training_X, log_training_Y)
log_rf_predicted = rf_regressor.predict(new_testing_X)
sca1 = plt.scatter(log_training_Y, rf_regressor.predict(new_training_X), color='blue', alpha=0.35, s=60,
                   label="Training (" + str(len(log_training_Y)) + ")")
sca2 = plt.scatter(log_testing_Y, rf_regressor.predict(new_testing_X), color='red', alpha=0.75, s=60,
                   label="Testing (" + str(len(log_testing_Y)) + ")")
price_lower = 6.0
price_upper = 18.0
plt.xlim(price_lower, price_upper)
plt.ylim(price_lower, price_upper)
plt.plot(np.linspace(price_lower, price_upper, 10), np.linspace(price_lower, price_upper, 10), color='black')
fontsize = 25.0
# Relabel log-space ticks as rounded dollar amounts.
plt.axes().set_xticklabels([str(locale.currency(round(np.e**(x),-2), grouping=True))[:-3] for x in plt.axes().get_xticks()])
plt.axes().set_yticklabels([str(locale.currency(round(np.e**(x),-2), grouping=True))[:-3] for x in plt.axes().get_yticks()])
plt.xticks(rotation=55)
plt.xticks(fontsize=fontsize)
plt.yticks(fontsize=fontsize)
plt.ylabel("Predicted [$]", fontsize=fontsize)
plt.xlabel("Actual Hammer Price [$]", fontsize=fontsize)
# Root-mean-square errors in log space.
MSE_training = np.sqrt(np.sum((log_training_Y - rf_regressor.predict(new_training_X))**2.0/ len(log_training_Y)))
MSPE_testing = np.sqrt(np.sum((log_testing_Y - rf_regressor.predict(new_testing_X))**2.0/len(log_testing_Y)))
first_legend = plt.legend(fontsize=fontsize, loc=2)
ax = plt.gca().add_artist(first_legend)
plt.legend([sca1, sca2], ["MSE Training " + str(round(MSE_training,3)),
                          "MSPE Testing " + str(round(MSPE_testing,3))], loc=4, fontsize=fontsize)
plt.title("Random Forests")
plt.tight_layout()
# plt.savefig("2014-09-24-price-prediction.pdf")
In [59]:
# sklearn.linear_model.LassoCV(eps=0.001, n_alphas=100, alphas=None, fit_intercept=True, normalize=False, precompute='auto', max_iter=1000, tol=0.0001, copy_X=True, cv=None, verbose=False, n_jobs=1, positive=False)
# rf_regressor = RandomForestRegressor(oob_score=True, n_estimators=2000, max_features='sqrt' )
# LassoCV benchmark (5-fold CV over 1500 alphas).
# NOTE(review): the estimator variable is still called rf_regressor --
# a copy-paste leftover from the random-forest cell; misleading, though
# later cells rebind it before use.
rf_regressor = LassoCV(eps=0.01, n_alphas=1500, alphas=None,
                       fit_intercept=True, normalize=False, precompute='auto',
                       max_iter=10000, tol=0.00001,
                       copy_X=True,
                       cv=5,
                       verbose=False,
                       n_jobs=1, positive=False)
rf_regressor.fit(new_training_X, log_training_Y)
log_rf_predicted = rf_regressor.predict(new_testing_X)
sca1 = plt.scatter(log_training_Y, rf_regressor.predict(new_training_X), color='blue', alpha=0.35, s=60,
                   label="Training (" + str(len(log_training_Y)) + ")")
sca2 = plt.scatter(log_testing_Y, rf_regressor.predict(new_testing_X), color='red', alpha=0.75, s=60,
                   label="Testing (" + str(len(log_testing_Y)) + ")")
price_lower = 6.0
price_upper = 18.0
plt.xlim(price_lower, price_upper)
plt.ylim(price_lower, price_upper)
plt.plot(np.linspace(price_lower, price_upper, 10), np.linspace(price_lower, price_upper, 10), color='black')
fontsize = 25.0
# Relabel log-space ticks as rounded dollar amounts.
plt.axes().set_xticklabels([str(locale.currency(round(np.e**(x),-2), grouping=True))[:-3] for x in plt.axes().get_xticks()])
plt.axes().set_yticklabels([str(locale.currency(round(np.e**(x),-2), grouping=True))[:-3] for x in plt.axes().get_yticks()])
plt.xticks(rotation=55)
plt.xticks(fontsize=fontsize)
plt.yticks(fontsize=fontsize)
plt.ylabel("Predicted [$]", fontsize=fontsize)
plt.xlabel("Actual Hammer Price [$]", fontsize=fontsize)
MSE_training = np.sqrt(np.sum((log_training_Y - rf_regressor.predict(new_training_X))**2.0/ len(log_training_Y)))
MSPE_testing = np.sqrt(np.sum((log_testing_Y - rf_regressor.predict(new_testing_X))**2.0/len(log_testing_Y)))
first_legend = plt.legend(fontsize=fontsize, loc=2)
ax = plt.gca().add_artist(first_legend)
plt.legend([sca1, sca2], ["MSE Training " + str(round(MSE_training,3)),
                          "MSPE Testing " + str(round(MSPE_testing,3))], loc=4, fontsize=fontsize)
plt.title("LassoCV")
plt.tight_layout()
# plt.savefig("2014-09-24-price-prediction.pdf")
In [49]:
# LassoLarsCV benchmark (5-fold CV); unused constructor arguments were
# left commented out by the author and are preserved as-is.
rf_regressor = LassoLarsCV(
    eps=0.01,
    # n_alphas=1500, alphas=None,
    fit_intercept=True,
    # normalize=False, precompute='auto',
    # max_iter=10000, tol=0.00001,
    # copy_X=True,
    cv=5,
    # verbose=True,
    # n_jobs=1, positive=False
)
rf_regressor.fit(new_training_X, log_training_Y)
log_rf_predicted = rf_regressor.predict(new_testing_X)
sca1 = plt.scatter(log_training_Y, rf_regressor.predict(new_training_X), color='blue', alpha=0.35, s=60,
                   label="Training (" + str(len(log_training_Y)) + ")")
sca2 = plt.scatter(log_testing_Y, rf_regressor.predict(new_testing_X), color='red', alpha=0.75, s=60,
                   label="Testing (" + str(len(log_testing_Y)) + ")")
price_lower = 6.0
price_upper = 18.0
plt.xlim(price_lower, price_upper)
plt.ylim(price_lower, price_upper)
plt.plot(np.linspace(price_lower, price_upper, 10), np.linspace(price_lower, price_upper, 10), color='black')
fontsize = 25.0
# Relabel log-space ticks as rounded dollar amounts.
plt.axes().set_xticklabels([str(locale.currency(round(np.e**(x),-2), grouping=True))[:-3] for x in plt.axes().get_xticks()])
plt.axes().set_yticklabels([str(locale.currency(round(np.e**(x),-2), grouping=True))[:-3] for x in plt.axes().get_yticks()])
plt.xticks(rotation=55)
plt.xticks(fontsize=fontsize)
plt.yticks(fontsize=fontsize)
plt.ylabel("Predicted [$]", fontsize=fontsize)
plt.xlabel("Actual Hammer Price [$]", fontsize=fontsize)
MSE_training = np.sqrt(np.sum((log_training_Y - rf_regressor.predict(new_training_X))**2.0/ len(log_training_Y)))
MSPE_testing = np.sqrt(np.sum((log_testing_Y - rf_regressor.predict(new_testing_X))**2.0/len(log_testing_Y)))
first_legend = plt.legend(fontsize=fontsize, loc=2)
ax = plt.gca().add_artist(first_legend)
plt.legend([sca1, sca2], ["MSE Training " + str(round(MSE_training,3)),
                          "MSPE Testing " + str(round(MSPE_testing,3))], loc=4, fontsize=fontsize)
plt.title("LassoLarsCV")
plt.tight_layout()
# plt.savefig("2014-09-24-price-prediction.pdf")
In [58]:
# ElasticNetCV benchmark (5-fold CV, l1_ratio=0.75).
# NOTE(review): ShuffleSplit is imported but never used here, and
# sklearn.cross_validation is the pre-0.18 module name (now
# sklearn.model_selection).
from sklearn.cross_validation import ShuffleSplit
rf_regressor = ElasticNetCV(cv=5, l1_ratio=.75)
rf_regressor.fit(new_training_X, log_training_Y)
log_rf_predicted = rf_regressor.predict(new_testing_X)
sca1 = plt.scatter(log_training_Y, rf_regressor.predict(new_training_X), color='blue', alpha=0.35, s=60,
                   label="Training (" + str(len(log_training_Y)) + ")")
sca2 = plt.scatter(log_testing_Y, rf_regressor.predict(new_testing_X), color='red', alpha=0.75, s=60,
                   label="Testing (" + str(len(log_testing_Y)) + ")")
price_lower = 6.0
price_upper = 18.0
plt.xlim(price_lower, price_upper)
plt.ylim(price_lower, price_upper)
plt.plot(np.linspace(price_lower, price_upper, 10), np.linspace(price_lower, price_upper, 10), color='black')
fontsize = 25.0
# Relabel log-space ticks as rounded dollar amounts.
plt.axes().set_xticklabels([str(locale.currency(round(np.e**(x),-2), grouping=True))[:-3] for x in plt.axes().get_xticks()])
plt.axes().set_yticklabels([str(locale.currency(round(np.e**(x),-2), grouping=True))[:-3] for x in plt.axes().get_yticks()])
plt.xticks(rotation=55)
plt.xticks(fontsize=fontsize)
plt.yticks(fontsize=fontsize)
plt.ylabel("Predicted [$]", fontsize=fontsize)
plt.xlabel("Actual Hammer Price [$]", fontsize=fontsize)
MSE_training = np.sqrt(np.sum((log_training_Y - rf_regressor.predict(new_training_X))**2.0/ len(log_training_Y)))
MSPE_testing = np.sqrt(np.sum((log_testing_Y - rf_regressor.predict(new_testing_X))**2.0/len(log_testing_Y)))
first_legend = plt.legend(fontsize=fontsize, loc=2)
ax = plt.gca().add_artist(first_legend)
plt.legend([sca1, sca2], ["MSE Training " + str(round(MSE_training,3)),
                          "MSPE Testing " + str(round(MSPE_testing,3))], loc=4, fontsize=fontsize)
plt.title("ElasticNetCV")
plt.tight_layout()
# plt.savefig("2014-09-24-price-prediction.pdf")
In [ ]:
In [194]:
training_auction_total = np.sum(training_Y - auction_training_predicted)
# training_total = np.sum(training_Y - training_predicted)
# print training_auction_total, training_total
testing_auction_total = np.sum(testing_Y - auction_testing_predicted)
my_testing_predicted = log_linear.predict(new_testing_X)
# log_training_predicted = log_linear.predict(new_training_X)
testing_total = np.sum(testing_Y - my_testing_predicted)
# print testing_auction_total, testing_total
log_training_auction_total = np.sum(log_training_Y - log_auction_training_predicted)
log_training_total = np.sum(log_training_Y - log_training_predicted)
# print log_training_auction_total, log_training_total
log_testing_auction_total = np.sum(log_testing_Y - log_auction_testing_predicted)
log_testing_total = np.sum(log_testing_Y - log_testing_predicted)
# print log_testing_auction_total, log_testing_total
print "This part matters"
log_testing_auction_total = np.sum(np.e ** log_testing_Y - np.e ** log_auction_testing_predicted)
log_testing_total = np.sum(np.e ** log_testing_Y - np.e ** log_testing_predicted)
my_total_miss = np.sum(np.e ** log_testing_Y - np.e ** log_testing_predicted)
my_std = np.std(np.e ** log_testing_Y - np.e ** log_testing_predicted)
auction_total_miss = np.sum(np.e ** log_testing_Y - np.e ** log_auction_testing_predicted)
auction_std = np.std(np.e ** log_testing_Y - np.e ** log_auction_testing_predicted)
print "My miss:", my_total_miss
print "My std:", my_std
print "Auction miss:", auction_total_miss
print "Auction std:", auction_std
plt.rcParams['figure.figsize'] = 8, 8
print "Total sales:", np.sum(np.e ** log_testing_Y)
winby = round(np.abs(auction_total_miss)/1e6 - np.abs(my_total_miss)/1e6,2)
alpha = 0.75
bins = 20
linewidth=25.0
width = 0.5
plt.bar(1, np.abs(auction_total_miss)/1e6, width=width, color="red", alpha=alpha)
# plt.bar(2, np.abs(auction_total_miss)/1e6, width=width, color="green", alpha=alpha)
plt.bar(2, np.abs(my_total_miss)/1e6, width=width, color="green", alpha=alpha)
center = np.abs(my_total_miss)/1e6 + winby/2.0
plt.errorbar(1.8, np.abs(my_total_miss)/1e6 + winby/2.0, yerr=winby/2.0,
color="green",
capsize=20.0,
markeredgewidth=4.0,
elinewidth=20.0,
alpha=alpha)
plt.text(1.95,20., "$" + str(winby) + " million\n saved", fontsize=fontsize)
plt.xticks(fontsize=fontsize)
plt.yticks(fontsize=fontsize)
plt.xlim(0.5, 3)
plt.xticks(np.array([1, 2]) + width/ 2.0, ('Auction', 'HammerPricer'))
plt.ylabel("Cumulative Error [$] (in millions)", fontsize=fontsize)
# plt.title("Difference between predicted value and actual", fontsize=fontsize)
plt.tight_layout()
plt.savefig("2014-09-29-accuracy-comparison.pdf")
print "Win by: ",
In [33]:
# Give me top 10 closest predictions
# Scan the 100 largest auction-house misses (by absolute dollar error)
# and print the lots where our model was both closer than the auction
# house and within ~17% (1/6) of the realized hammer price.
# NOTE(review): the two bare expressions below have no effect mid-cell
# (only the last expression of a cell is displayed).
my_total_miss
auction_total_miss
# my_total_miss = np.sum(np.e ** log_testing_Y - np.e ** log_testing_predicted)
# auction_total_miss = np.sum(np.e ** log_testing_Y - np.e ** log_auction_testing_predicted)
biggest_auction_misses = np.argsort(np.abs(np.e ** log_testing_Y - np.e ** log_auction_testing_predicted))[-100:]
biggest_auction_misses
for index in biggest_auction_misses:
    # Back-transform log-space values to dollars for comparison.
    auction_guess = np.e ** log_auction_testing_predicted[index]
    my_guess = np.e ** log_testing_predicted[index]
    hammer_price = np.e ** log_testing_Y[index]
    if np.abs(hammer_price - auction_guess) > np.abs(hammer_price - my_guess):
        if np.abs(hammer_price - my_guess) < hammer_price / 6.0:
            artist = testing_information[index][0]
            x = testing_information[index][1]
            print hammer_price, auction_guess, my_guess, index, artist + "-" + str(x), master_dictionary[artist][x]['title']
In [69]:
In [38]:
export_me = {}
# for index in [338, 46, 247, 192, 202, 226, 215, 382, 379, 259, 334, 197, 408, 394]:
for index in [338, 46, 247, 192, 226, 215, 382, 379, 259, 334, 197, 408, 394]:
auction_guess = np.e ** log_auction_testing_predicted[index]
my_guess = np.e ** log_testing_predicted[index]
hammer_price = np.e ** log_testing_Y[index]
artist = testing_information[index][0]
x = testing_information[index][1]
other = testing_information[index][2:]
print artist + "-" + str(x), hammer_price, auction_guess, my_guess, index, master_dictionary[artist][x]['title'], other
print things
In [137]:
export_me = {}
for index in [338, 46, 247, 192, 226, 215, 382, 379, 259, 334, 197, 408, 394]:
auction_guess = np.e ** log_auction_testing_predicted[index]
my_guess = np.e ** log_testing_predicted[index]
hammer_price = np.e ** log_testing_Y[index]
artist = testing_information[index][0]
x = testing_information[index][1]
mylogstd = 0.39429513769064306
other = testing_information[index][2:]
print "Title:", master_dictionary[artist][x]['title']
# print "Artist:", " ".join(["".join([name[0].upper(), name[1:]]) for name in artist.split("-")])
# print "Auction Date:", datetime.datetime.strftime(datetime.datetime.fromordinal(master_dictionary[artist][x]['sale_date']), "%Y-%m-%d")
# print "Creation Year:", datetime.datetime.strftime(datetime.datetime.fromordinal(master_dictionary[artist][x]['creation_date']), "%Y")
# print "Auction House:", master_dictionary[artist][x]['auction_house'].lower().capitalize()
# print "Country:", " ".join(["".join([name[0].upper(), name[1:]]) for name in master_dictionary[artist][x]['sale_country'].split()])
# print "Auction House Low Estimate:", locale.currency(master_dictionary[artist][x]['low_estimate'], grouping=True)
# print "Auction House Prediction:", locale.currency(master_dictionary[artist][x]['auction_estimate'], grouping=True)
# print "Auction House High Estimate:", locale.currency(master_dictionary[artist][x]['high_estimate'], grouping=True)
# print "HammerPricer Low:", locale.currency(round(np.e ** (np.log(my_guess) - mylogstd), -2), grouping=True)
# print "HammerPricer Prediction:", locale.currency(round(my_guess, -2), grouping=True)
# print "HammerPricer High:", locale.currency(round(np.e ** (np.log(my_guess) + mylogstd), -2), grouping=True)
# print "Sale Price:", locale.currency(hammer_price, grouping=True)
# print
# print artist + "-" + str(x), hammer_price, auction_guess, my_guess, index, master_dictionary[artist][x]['title'], other
In [87]:
# con = mdb.connect(user="root", host="localhost", passwd="", database='test1')
# Connect to the local MySQL server and make sure the demo database
# exists.  NOTE(review): root with an empty password is acceptable only
# on a throwaway local dev box -- never commit real credentials.
con = mdb.connect('localhost', "root", "")
# mdb.connect()
cursor = con.cursor()
sql = "CREATE DATABASE IF NOT EXISTS test1"
cursor.execute(sql)
Out[87]:
In [118]:
# Column names for the Auctions table, previewed as the comma-joined
# list used in an INSERT statement.
keys = [
    "title", "artist", "auction_date", "creation_date", "auction_house",
    "country", "auction_house_low", "auction_house_prediction",
    "auction_house_high", "hammer_price_low", "hammer_price_prediction",
    "hammer_price_high", "sale_price", "key", "image",
]
','.join(keys)
Out[118]:
In [147]:
con = mdb.connect('localhost', "root", "", 'test1')
with con:
cur = con.cursor()
cur.execute("DROP TABLE IF EXISTS Auctions")
cur.execute("CREATE TABLE Auctions(id INT PRIMARY KEY AUTO_INCREMENT, \
title VARCHAR(250), \
artist VARCHAR(250), \
auction_date VARCHAR(250), \
creation_date VARCHAR(250), \
auction_house VARCHAR(250), \
country VARCHAR(250), \
auction_house_low VARCHAR(250), \
auction_house_prediction VARCHAR(250), \
auction_house_high VARCHAR(250), \
hammer_price_low VARCHAR(250), \
hammer_price_prediction VARCHAR(250), \
hammer_price_high VARCHAR(250), \
sale_price VARCHAR(250), \
artkey VARCHAR(250), \
image VARCHAR(250))")
for index in [338, 46, 247, 192, 226, 215, 382, 379, 259, 334, 197, 408, 394]:
auction_guess = np.e ** log_auction_testing_predicted[index]
my_guess = np.e ** log_testing_predicted[index]
hammer_price = np.e ** log_testing_Y[index]
artist = testing_information[index][0]
x = testing_information[index][1]
mylogstd = 0.39429513769064306
other = testing_information[index][2:]
values = [master_dictionary[artist][x]['title'].strip("\""),
" ".join(["".join([name[0].upper(), name[1:]]) for name in artist.split("-")]),
datetime.datetime.strftime(datetime.datetime.fromordinal(master_dictionary[artist][x]['sale_date']), "%Y-%m-%d"),
datetime.datetime.strftime(datetime.datetime.fromordinal(master_dictionary[artist][x]['creation_date']), "%Y"),
master_dictionary[artist][x]['auction_house'].lower().capitalize(),
" ".join(["".join([name[0].upper(), name[1:]]) for name in master_dictionary[artist][x]['sale_country'].split()]),
locale.currency(master_dictionary[artist][x]['low_estimate'], grouping=True)[:-3],
locale.currency(master_dictionary[artist][x]['auction_estimate'], grouping=True)[:-3],
locale.currency(master_dictionary[artist][x]['high_estimate'], grouping=True)[:-3],
locale.currency(round(np.e ** (np.log(my_guess) - mylogstd), -2), grouping=True)[:-3],
locale.currency(round(my_guess, -2), grouping=True)[:-3],
locale.currency(round(np.e ** (np.log(my_guess) + mylogstd), -2), grouping=True)[:-3],
locale.currency(hammer_price, grouping=True)[:-3],
artist + "-" + str(x),
"../static/images/" + artist + "-" + str(x) + ".jpg",]
build_string = "\",\"".join(values)
cur.execute("INSERT INTO Auctions(title,artist,auction_date,creation_date,auction_house,country,auction_house_low,auction_house_prediction,auction_house_high,hammer_price_low,hammer_price_prediction,hammer_price_high,sale_price,artkey,image ) VALUES(\"%s\")" % (build_string))
cur.execute("SELECT image FROM Auctions")
rows = cur.fetchall()
for row in rows:
print row
In [168]:
# Dictionary Cursor
# Re-read the Auctions table with a DictCursor so columns are addressable by
# name; print title + low estimate as a quick sanity check.
# NOTE(review): the flattened-notebook export lost the indentation of the
# `with` and `for` bodies below -- restore it before running.
with con:
cur = con.cursor(mdb.cursors.DictCursor)
cur.execute("SELECT * FROM Auctions")
rows = cur.fetchall()
for row in rows:
print row['title'], row["auction_house_low"]
In [125]:
# Scratch cell: build the same row-value list as the DB-export loop (minus the
# [:-3] cents-truncation on the currency strings) and preview the '","'-join.
# NOTE(review): depends on `artist`, `x`, `my_guess`, `hammer_price`,
# `mylogstd` leaking from the last iteration of a previous cell (hidden state).
values = [master_dictionary[artist][x]['title'].strip("\""),
" ".join(["".join([name[0].upper(), name[1:]]) for name in artist.split("-")]),
datetime.datetime.strftime(datetime.datetime.fromordinal(master_dictionary[artist][x]['sale_date']), "%Y-%m-%d"),
datetime.datetime.strftime(datetime.datetime.fromordinal(master_dictionary[artist][x]['creation_date']), "%Y"),
master_dictionary[artist][x]['auction_house'].lower().capitalize(),
" ".join(["".join([name[0].upper(), name[1:]]) for name in master_dictionary[artist][x]['sale_country'].split()]),
locale.currency(master_dictionary[artist][x]['low_estimate'], grouping=True),
locale.currency(master_dictionary[artist][x]['auction_estimate'], grouping=True),
locale.currency(master_dictionary[artist][x]['high_estimate'], grouping=True),
locale.currency(round(np.e ** (np.log(my_guess) - mylogstd), -2), grouping=True),
locale.currency(round(my_guess, -2), grouping=True),
locale.currency(round(np.e ** (np.log(my_guess) + mylogstd), -2), grouping=True),
locale.currency(hammer_price, grouping=True),
artist + "-" + str(x),
"../static/images/" + artist + "-" + str(x) + ".jpg",]
"\",\"".join(values)
Out[125]:
In [167]:
Out[167]:
In [75]:
# Print a human-readable report card for each demo painting: metadata, the
# auction house's estimate band, and the model ("HammerPricer") band built
# from the prediction +/- one residual std in log-dollars.
# NOTE(review): `export_me` is created here but never populated.
export_me = {}
for index in [338, 46, 247, 192, 226, 215, 382, 379, 259, 334, 197, 408, 394]:
auction_guess = np.e ** log_auction_testing_predicted[index]
my_guess = np.e ** log_testing_predicted[index]
hammer_price = np.e ** log_testing_Y[index]
artist = testing_information[index][0]
x = testing_information[index][1]
mylogstd = 0.39429513769064306  # model residual std in log-dollars
other = testing_information[index][2:]
print "Title:", master_dictionary[artist][x]['title']
print "Artist:", " ".join(["".join([name[0].upper(), name[1:]]) for name in artist.split("-")])
print "Auction Date:", datetime.datetime.strftime(datetime.datetime.fromordinal(master_dictionary[artist][x]['sale_date']), "%Y-%m-%d")
print "Creation Year:", datetime.datetime.strftime(datetime.datetime.fromordinal(master_dictionary[artist][x]['creation_date']), "%Y")
print "Auction House:", master_dictionary[artist][x]['auction_house'].lower().capitalize()
print "Country:", " ".join(["".join([name[0].upper(), name[1:]]) for name in master_dictionary[artist][x]['sale_country'].split()])
print "Auction House Low Estimate:", locale.currency(master_dictionary[artist][x]['low_estimate'], grouping=True)
print "Auction House Prediction:", locale.currency(master_dictionary[artist][x]['auction_estimate'], grouping=True)
print "Auction House High Estimate:", locale.currency(master_dictionary[artist][x]['high_estimate'], grouping=True)
print "HammerPricer Low:", locale.currency(round(np.e ** (np.log(my_guess) - mylogstd), -2), grouping=True)
print "HammerPricer Prediction:", locale.currency(round(my_guess, -2), grouping=True)
print "HammerPricer High:", locale.currency(round(np.e ** (np.log(my_guess) + mylogstd), -2), grouping=True)
print "Sale Price:", locale.currency(hammer_price, grouping=True)
print "key:", artist + "-" + str(x)
print "image:", "../static/images/" + artist + "-" + str(x) + ".jpg"
print
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [197]:
# First half of the comparison figure: scatter the auction houses' own
# (log) estimates against actual (log) hammer prices on the test set.
# `sca1` and `MSE_training` are reused by the second half of this figure below.
auction_training_predicted = np.array(auction_training_predicted)
auction_testing_predicted = np.array(auction_testing_predicted)
auction_predicted = np.array(auction_predicted)
log_auction_training_predicted = np.array(log_auction_training_predicted)
log_auction_testing_predicted = np.array(log_auction_testing_predicted)
log_auction_predicted = np.array(log_auction_predicted)
# Plot the mean auction range prediction vs price and error
plt.rcParams['figure.figsize'] = 12, 12
plt.rcParams['legend.scatterpoints'] = 1
# Get transformation function
# sca1 = plt.scatter(log_training_Y, log_auction_training_predicted, color='green', alpha=0.35, s=60,
# label="Training (" + str(len(log_training_Y)) + ")")
sca1 = plt.scatter(log_testing_Y, log_auction_testing_predicted, color='green', alpha=0.5, s=60,
label="Auction Estimate (" + str(len(log_testing_Y)) + ")")
price_lower = 6.0
price_upper = 18.0
# plt.xlim(price_lower, price_upper)
# plt.ylim(price_lower, price_upper)
# plt.plot(np.linspace(price_lower, price_upper, 10), np.linspace(price_lower, price_upper, 10), color='black')
fontsize = 25.0
# Relabel the log-space ticks as rounded dollar amounts (cents stripped).
plt.axes().set_xticklabels([str(locale.currency(round(np.e**(x),-2), grouping=True))[:-3] for x in plt.axes().get_xticks()])
plt.axes().set_yticklabels([str(locale.currency(round(np.e**(x),-2), grouping=True))[:-3] for x in plt.axes().get_yticks()])
plt.xticks(rotation=55)
plt.xticks(fontsize=fontsize)
plt.yticks(fontsize=fontsize)
plt.ylabel("Predicted [$]", fontsize=fontsize)
plt.xlabel("Actual Hammer Price [$]", fontsize=fontsize)
# RMS error of the auction houses' estimates on the TRAINING set (in log-$).
MSE_training = np.sqrt(np.sum((log_training_Y - log_auction_training_predicted)**2.0/ len(log_training_Y)))
# MSPE_testing = np.sqrt(np.sum((log_testing_Y - log_auction_testing_predicted)**2.0/len(log_testing_Y)))
first_legend = plt.legend(fontsize=fontsize, loc=2)
ax = plt.gca().add_artist(first_legend)
# plt.legend([sca1, sca2], ["MSE Training " + str(round(MSE_training,2)),
# "MSPE Testing " + str(round(MSPE_testing,2))], loc=4, fontsize=fontsize)
plt.tight_layout()
# Second half of the figure: standardize the features, fit the HammerPricer
# linear model on the training set, and overlay its test-set predictions (red)
# on the auction-house scatter (green, from the previous cell).
# Plot the mean auction range prediction vs price and error
plt.rcParams['figure.figsize'] = 12, 12
plt.rcParams['legend.scatterpoints'] = 1
# Get transformation function
scaler = preprocessing.StandardScaler().fit(training_X)
new_training_X = scaler.transform(training_X)
new_testing_X = scaler.transform(testing_X)
new_big_X = scaler.transform(big_X)
log_linear = linear_model.LinearRegression()
log_linear.fit(new_training_X, log_training_Y)
log_testing_predicted = log_linear.predict(new_testing_X)
log_training_predicted = log_linear.predict(new_training_X)
print "Score:", log_linear.score(new_testing_X, log_testing_Y)
# sca1 = plt.scatter(log_training_Y, log_linear.predict(new_training_X), color='blue', alpha=0.35, s=60,
# label="Training (" + str(len(log_training_Y)) + ")")
sca2 = plt.scatter(log_testing_Y, log_linear.predict(new_testing_X), color='red', alpha=0.5, s=60,
label="HammerPricer Prediction (" + str(len(log_testing_Y)) + ")")
price_lower = 6.0
price_upper = 18.0
plt.xlim(price_lower, price_upper)
plt.ylim(price_lower, price_upper)
# Diagonal y = x reference line (perfect prediction).
plt.plot(np.linspace(price_lower, price_upper, 10), np.linspace(price_lower, price_upper, 10), color='black')
fontsize = 25.0
plt.axes().set_xticklabels([str(locale.currency(round(np.e**(x),-2), grouping=True))[:-3] for x in plt.axes().get_xticks()])
plt.axes().set_yticklabels([str(locale.currency(round(np.e**(x),-2), grouping=True))[:-3] for x in plt.axes().get_yticks()])
plt.xticks(rotation=55)
plt.xticks(fontsize=fontsize)
plt.yticks(fontsize=fontsize)
plt.ylabel("Predicted [$]", fontsize=fontsize)
plt.xlabel("Actual Hammer Price [$]", fontsize=fontsize)
# MSE_training = np.sqrt(np.sum((log_training_Y - log_linear.predict(new_training_X))**2.0/ len(log_training_Y)))
MSPE_testing = np.sqrt(np.sum((log_testing_Y - log_linear.predict(new_testing_X))**2.0/len(log_testing_Y)))
first_legend = plt.legend(fontsize=fontsize, loc=2)
ax = plt.gca().add_artist(first_legend)
# NOTE(review): the "MSPE Auction" legend entry reuses MSE_training, which was
# computed on the TRAINING residuals of the auction estimates in the previous
# cell -- confirm that is the intended comparison against MSPE_testing.
plt.legend([sca1, sca2], ["MSPE Auction " + str(round(MSE_training,3)),
"MSPE HammerPricer " + str(round(MSPE_testing,3))], loc=4, fontsize=fontsize)
plt.tight_layout()
# plt.savefig("2014-09-23-HammerPricer-wins.pdf")
In [269]:
# Count, per test painting, whether the hammer price lands inside an interval
# of the auction house's half-width centered on (a) the auction estimate and
# (b) our prediction -- a like-for-like "hit rate" comparison.
auction_wins = []
my_wins = []
for index, row in enumerate(testing_information):
hammer_price = testing_Y[index]
auction_low = row[11]   # auction-house low estimate
auction_high = row[12]  # auction-house high estimate
auction_pred = auction_testing_predicted[index]
# Half-width of the house's band; the same width is granted to our model.
half_interval = auction_pred - auction_low
my_prediction = np.e ** log_testing_predicted[index]
if auction_pred - half_interval < hammer_price and hammer_price < auction_pred + half_interval:
auction_wins.append(hammer_price)
if my_prediction - half_interval < hammer_price and hammer_price < my_prediction + half_interval:
my_wins.append(hammer_price)
print len(auction_wins), len(my_wins)
In [258]:
print np.std(testing_Y - my_prediction), np.std(testing_Y - auction_testing_predicted)
In [484]:
# Per-painting dollar-space residuals: model prediction minus price, and the
# auction band quantity minus price.
# NOTE(review): np.average() of the scalar (e**high - e**low) is a no-op --
# presumably the intent was the average of [low, high]; verify.
my_delta = []
auction_delta = []
for index, price in enumerate(testing_Y):
# my_delta.append(np.e ** price - np.e ** testing_predicted[index])
# auction_delta.append(np.e ** price - np.average(np.e ** testing_information[index][12] - np.e ** testing_information[index][11]))
my_delta.append(np.e ** testing_predicted[index] - np.e ** price)
auction_delta.append(np.average(np.e ** testing_information[index][12] - np.e ** testing_information[index][11]) - np.e ** price)
my_delta = np.array(my_delta)
auction_delta = np.array(auction_delta)
In [ ]:
In [ ]:
In [80]:
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn import feature_selection
In [63]:
# For k = 1..n_features-1: select the k best columns (f_regression score) and
# plot a model's mean squared error as features are added.  The x tick labels
# name the single NEW feature added at each k.
# NOTE(review): the per-k model is both FIT and EVALUATED on the testing set
# (leakage) -- this shows in-sample fit, not generalization; confirm intent.
# # Plot the mean auction range prediction vs price and error
# plt.rcParams['figure.figsize'] = 12, 12
# plt.rcParams['legend.scatterpoints'] = 1
# Get transformation function
scaler = preprocessing.StandardScaler().fit(training_X)
new_training_X = scaler.transform(training_X)
new_testing_X = scaler.transform(testing_X)
new_big_X = scaler.transform(big_X)
log_linear = linear_model.LinearRegression()
log_linear.fit(new_training_X, log_training_Y)
log_testing_predicted = log_linear.predict(new_testing_X)
log_training_predicted = log_linear.predict(new_training_X)
# print "Score:", log_linear.score(new_testing_X, log_testing_Y)
# sca1 = plt.scatter(log_training_Y, log_linear.predict(new_training_X), color='blue', alpha=0.35, s=60,
# label="Training (" + str(len(log_training_Y)) + ")")
# sca2 = plt.scatter(log_testing_Y, log_linear.predict(new_testing_X), color='red', alpha=0.75, s=60,
# label="Testing (" + str(len(log_testing_Y)) + ")")
plt.rcParams['figure.figsize'] = 16, 12
xlabels = []
feature_length = len(things)
# for k in range(1, feature_length + 1):
for k in range(1, feature_length):
selector = SelectKBest(score_func=feature_selection.f_regression, k=k)
results = selector.fit(new_training_X, log_training_Y)
linear = linear_model.LinearRegression()
linear.fit(new_testing_X[:, [index for index in results.get_support(indices=True)]], log_testing_Y)
plt.scatter(k, np.average(((linear.predict(new_testing_X[:, [index for index in results.get_support(indices=True)]]) - log_testing_Y))**2),
color="blue",
s=120.0)
# Record the one feature index newly selected at this k.
xlabels.append(list(set(results.get_support(indices=True)).difference(xlabels))[0])
xlabels = [things[x] for x in xlabels]
plt.xticks(np.arange(1, feature_length + 1, 1.0))
plt.axes().set_xticklabels([x for x in xlabels])
plt.xticks(rotation=90)
plt.xticks(fontsize=fontsize)
plt.xlim(0, feature_length)
plt.yticks(fontsize=fontsize)
plt.xlabel("Feature", fontsize=fontsize)
plt.ylabel("MSP Error", fontsize=fontsize)
# plt.ylim(0.2, 1.5)
plt.tight_layout()
# plt.savefig("2014-09-23-relative-error-features.pdf")
In [21]:
# Horizontal variant of the previous cell: same k-best error curve with the
# axes swapped (error on x, feature count/name on y).  Reuses new_training_X /
# new_testing_X from the earlier scaling cell.
# # Plot the mean auction range prediction vs price and error
# plt.rcParams['figure.figsize'] = 12, 12
# plt.rcParams['legend.scatterpoints'] = 1
# Get transformation function
plt.rcParams['figure.figsize'] = 16, 12
xlabels = []
feature_length = len(things)
# for k in range(1, feature_length + 1):
for k in range(1, feature_length):
selector = SelectKBest(score_func=feature_selection.f_regression, k=k)
results = selector.fit(new_training_X, log_training_Y)
linear = linear_model.LinearRegression()
linear.fit(new_testing_X[:, [index for index in results.get_support(indices=True)]], log_testing_Y)
plt.scatter(np.average(((linear.predict(new_testing_X[:, [index for index in results.get_support(indices=True)]]) - log_testing_Y))**2),
k,
color="blue",
s=120.0)
# Record the one feature index newly selected at this k.
xlabels.append(list(set(results.get_support(indices=True)).difference(xlabels))[0])
xlabels = [things[x] for x in xlabels]
plt.yticks(np.arange(1, feature_length + 1, 1.0))
plt.axes().set_yticklabels([x for x in xlabels])
# plt.xticks(rotation=90)
plt.xticks(fontsize=fontsize)
plt.ylim(0, feature_length)
plt.yticks(fontsize=fontsize)
plt.ylabel("Feature", fontsize=fontsize)
plt.xlabel("MSP Error", fontsize=fontsize)
# plt.ylim(0.2, 1.5)
plt.tight_layout()
# plt.savefig("2014-09-23-relative-error-features.pdf")
In [326]:
# R^2 variant of the k-best curve, limited to the first two-thirds of the
# features.  NOTE: `2 * feature_length / 3` relies on Python 2 integer
# division; under Python 3 it yields a float and range() would raise.
# # Plot the mean auction range prediction vs price and error
# plt.rcParams['figure.figsize'] = 12, 12
# plt.rcParams['legend.scatterpoints'] = 1
# Get transformation function
scaler = preprocessing.StandardScaler().fit(training_X)
new_training_X = scaler.transform(training_X)
new_testing_X = scaler.transform(testing_X)
new_big_X = scaler.transform(big_X)
log_linear = linear_model.LinearRegression()
log_linear.fit(new_training_X, log_training_Y)
log_testing_predicted = log_linear.predict(new_testing_X)
log_training_predicted = log_linear.predict(new_training_X)
plt.rcParams['figure.figsize'] = 16, 12
xlabels = []
feature_length = len(things)
# for k in range(1, feature_length + 1):
for k in range(1, 2 * feature_length / 3):
selector = SelectKBest(score_func=feature_selection.f_regression, k=k)
results = selector.fit(new_training_X, log_training_Y)
linear = linear_model.LinearRegression()
linear.fit(new_testing_X[:, [index for index in results.get_support(indices=True)]], log_testing_Y)
plt.scatter(k, linear.score(new_testing_X[:, [index for index in results.get_support(indices=True)]],
log_testing_Y),
color="blue",
s=120.0)
# Record the one feature index newly selected at this k.
xlabels.append(list(set(results.get_support(indices=True)).difference(xlabels))[0])
xlabels = [things[x] for x in xlabels]
plt.xticks(np.arange(1, feature_length + 1, 1.0))
plt.axes().set_xticklabels([x for x in xlabels])
plt.xticks(rotation=90)
plt.xticks(fontsize=fontsize)
plt.xlim(0, 2 * feature_length / 3)
plt.yticks(fontsize=fontsize)
plt.xlabel("Feature", fontsize=fontsize)
plt.ylabel(r"R$^2$", fontsize=fontsize)
# plt.ylim(0.2, 1.5)
plt.tight_layout()
# plt.savefig("2014-09-23-relative-error-features.pdf")
In [23]:
# Reference dump: print every feature column with its numeric index.
for index, thing in enumerate(things):
print index, thing
In [197]:
int(" 87")
Out[197]:
In [32]:
# Column-index groups of the design matrix, by feature family.
# (The "indicies" misspellings are load-bearing: later cells look these names
# up as dict keys -- do not rename without updating them everywhere.)
dates_indices = [0, 3]          # sale date, creation date
physical_indices = [1, 2]       # physical dimensions
color_indicies = range(4, 7)    # image color averages
estimate_indices = range(7, 10)     # auction-house estimates ($)
log_estimate_indices = range(10, 13)  # auction-house estimates (log-$)
artist_indices = range(13,50)   # one-hot artist columns
auction_indices = range(50, 68)  # one-hot auction-house columns
country_indicies = range(68, 80)  # one-hot country columns
In [ ]:
# All feature-index groups collected in one list (order matches declaration).
list_of_indices = [dates_indices,
physical_indices,
color_indicies,
estimate_indices,
log_estimate_indices,
artist_indices,
auction_indices,
country_indicies,
]
In [55]:
# Two-point comparison of training error: at x=1 a model using ONLY the
# estimate features (everything else excluded), at x=2 a model using all
# features (empty exclusion list).
# NOTE(review): both scatters plot TRAINING error; noest_testing_X is computed
# but never used here.
k = 1
indexlist = dates_indices + physical_indices + color_indicies + artist_indices + auction_indices + country_indicies
noest_training_X = new_training_X[:,[x for x in range(len(things)) if x not in indexlist]]
noest_testing_X = new_testing_X[:,[x for x in range(len(things)) if x not in indexlist]]
noest_things = [things[x] for x in range(len(things)) if x not in indexlist]
linear = linear_model.LinearRegression()
linear.fit(noest_training_X, log_training_Y)
plt.scatter(k, 1.0 / len(log_training_Y) * np.sqrt(np.sum(((linear.predict(noest_training_X) - log_training_Y))**2)),
color="blue",
s=120.0)
k = 2
indexlist = []
noest_training_X = new_training_X[:,[x for x in range(len(things)) if x not in indexlist]]
noest_testing_X = new_testing_X[:,[x for x in range(len(things)) if x not in indexlist]]
noest_things = [things[x] for x in range(len(things)) if x not in indexlist]
linear = linear_model.LinearRegression()
linear.fit(noest_training_X, log_training_Y)
plt.scatter(k, 1.0 / len(log_training_Y) * np.sqrt(np.sum(((linear.predict(noest_training_X) - log_training_Y))**2)),
color="blue",
s=120.0)
# xlabels.append(list(set(results.get_support(indices=True)).difference(xlabels))[0])
# xlabels = [things[x] for x in xlabels]
# plt.xticks(np.arange(1, feature_length + 1, 1.0))
# plt.axes().set_xticklabels([x for x in xlabels])
Out[55]:
In [90]:
import itertools
In [94]:
# Map each feature-group name to its column-index list, built as a single
# literal instead of repeated item assignments.  The "indicies" spellings are
# preserved because later cells look the groups up by these exact strings.
indices_dict = {
    "dates_indices": dates_indices,
    "physical_indices": physical_indices,
    "color_indicies": color_indicies,
    "estimate_indices": estimate_indices,
    "log_estimate_indices": log_estimate_indices,
    "artist_indices": artist_indices,
    "auction_indices": auction_indices,
    "country_indicies": country_indicies,
}
# Canonical ordering of the groups, kept explicit (dict order is not relied on).
keys = ["dates_indices",
        "physical_indices",
        "color_indicies",
        "estimate_indices",
        "log_estimate_indices",
        "artist_indices",
        "auction_indices",
        "country_indicies",]
In [118]:
# keys = ["dates_indices",
# "physical_indices",
# "color_indicies",
# "estimate_indices",
# "log_estimate_indices",
# "artist_indices",
# "auction_indices",
# "country_indicies",]
# # + ["log_estimate_indices", ]
# temp = []
# for index, guts in enumerate(itertools.combinations(keys, 1)):
# print index, guts[0]
# indexlist = indices_dict[guts[0]]
# noest_training_X = new_training_X[:,[x for x in range(len(things)) if x in indexlist]]
# noest_testing_X = new_testing_X[:,[x for x in range(len(things)) if x in indexlist]]
# linear = linear_model.LinearRegression()
# linear.fit(noest_training_X, log_training_Y)
# plt.scatter(index, 1.0 / len(log_testing_Y) * np.sqrt(np.sum(((linear.predict(noest_testing_X) - log_testing_Y))**2)),
# color="blue",
# s=120.0)
# keys = ["dates_indices",
# "physical_indices",
# "color_indicies",
# "artist_indices",
# "auction_indices",
# "country_indicies",]
# temp = []
# for index, guts in enumerate(itertools.combinations(keys, 1)):
# print index, guts[0]
# indexlist = indices_dict[guts[0]] + ["log_estimate_indices", "estimate_indices",]
# noest_training_X = new_training_X[:,[x for x in range(len(things)) if x in indexlist]]
# noest_testing_X = new_testing_X[:,[x for x in range(len(things)) if x in indexlist]]
# linear = linear_model.LinearRegression()
# linear.fit(noest_training_X, log_training_Y)
# plt.scatter(index, 1.0 / len(log_testing_Y) * np.sqrt(np.sum(((linear.predict(noest_testing_X) - log_testing_Y))**2)),
# color="blue",
# s=120.0)
# keys = ["dates_indices",
# "physical_indices",
# "color_indicies",
# "auction_indices",
# "country_indicies",]
# temp = []
# for index, guts in enumerate(itertools.combinations(keys, 1)):
# print index, guts[0]
# indexlist = indices_dict[guts[0]] + ["log_estimate_indices", "estimate_indices", "artist_indices",]
# noest_training_X = new_training_X[:,[x for x in range(len(things)) if x in indexlist]]
# noest_testing_X = new_testing_X[:,[x for x in range(len(things)) if x in indexlist]]
# linear = linear_model.LinearRegression()
# linear.fit(noest_training_X, log_training_Y)
# plt.scatter(index, 1.0 / len(log_testing_Y) * np.sqrt(np.sum(((linear.predict(noest_testing_X) - log_testing_Y))**2)),
# color="blue",
# s=120.0)
# keys = ["dates_indices",
# "physical_indices",
# "color_indicies",
# "country_indicies",]
# temp = []
# for index, guts in enumerate(itertools.combinations(keys, 1)):
# print index, guts[0]
# indexlist = indices_dict[guts[0]] + ["log_estimate_indices", "estimate_indices", "artist_indices", "auction_indices",]
# noest_training_X = new_training_X[:,[x for x in range(len(things)) if x in indexlist]]
# noest_testing_X = new_testing_X[:,[x for x in range(len(things)) if x in indexlist]]
# linear = linear_model.LinearRegression()
# linear.fit(noest_training_X, log_training_Y)
# plt.scatter(index, 1.0 / len(log_testing_Y) * np.sqrt(np.sum(((linear.predict(noest_testing_X) - log_testing_Y))**2)),
# color="blue",
# s=120.0)
# keys = ["physical_indices",
# "color_indicies",
# "country_indicies",]
# temp = []
# for index, guts in enumerate(itertools.combinations(keys, 1)):
# print index, guts[0]
# indexlist = indices_dict[guts[0]] + ["log_estimate_indices", "estimate_indices", "artist_indices", "auction_indices","dates_indices",]
# noest_training_X = new_training_X[:,[x for x in range(len(things)) if x in indexlist]]
# noest_testing_X = new_testing_X[:,[x for x in range(len(things)) if x in indexlist]]
# linear = linear_model.LinearRegression()
# linear.fit(noest_training_X, log_training_Y)
# plt.scatter(index, 1.0 / len(log_testing_Y) * np.sqrt(np.sum(((linear.predict(noest_testing_X) - log_testing_Y))**2)),
# color="blue",
# s=120.0)
# keys = ["color_indicies",
# "country_indicies",]
# temp = []
# for index, guts in enumerate(itertools.combinations(keys, 1)):
# print index, guts[0]
# indexlist = indices_dict[guts[0]] + ["log_estimate_indices", "estimate_indices", "artist_indices", "auction_indices","dates_indices","physical_indices",]
# noest_training_X = new_training_X[:,[x for x in range(len(things)) if x in indexlist]]
# noest_testing_X = new_testing_X[:,[x for x in range(len(things)) if x in indexlist]]
# linear = linear_model.LinearRegression()
# linear.fit(noest_training_X, log_training_Y)
# plt.scatter(index, 1.0 / len(log_testing_Y) * np.sqrt(np.sum(((linear.predict(noest_testing_X) - log_testing_Y))**2)),
# color="blue",
# s=120.0)
keys = ["color_indicies",
"country_indicies",]
temp = []
for index, guts in enumerate(itertools.combinations(keys, 1)):
print index, guts[0]
indexlist = indices_dict[guts[0]] + ["log_estimate_indices", "estimate_indices", "artist_indices", "auction_indices","dates_indices","physical_indices","country_indicies","color_indicies",]
noest_training_X = new_training_X[:,[x for x in range(len(things)) if x in indexlist]]
noest_testing_X = new_testing_X[:,[x for x in range(len(things)) if x in indexlist]]
linear = linear_model.LinearRegression()
linear.fit(noest_training_X, log_training_Y)
plt.scatter(index, 1.0 / len(log_testing_Y) * np.sqrt(np.sum(((linear.predict(noest_testing_X) - log_testing_Y))**2)),
color="blue",
s=120.0)
In [133]:
orderedlist = ["estimate_indices", "artist_indices", "auction_indices","dates_indices","physical_indices","country_indicies","color_indicies",]
betternames = ["Auction Estimate", "Artist", "Auction House", "Creation/Sale Date", "Physical Dimensions", "Country of Sale", "Image Analysis"]
temp = []
indexlist = ["log_estimate_indices", ]
for index, key in enumerate(orderedlist):
print index, key
indexlist += indices_dict[key]
noest_training_X = new_training_X[:,[x for x in range(len(things)) if x in indexlist]]
noest_testing_X = new_testing_X[:,[x for x in range(len(things)) if x in indexlist]]
linear = linear_model.LinearRegression()
linear.fit(noest_training_X, log_training_Y)
plt.scatter(index, 1.0 / len(log_testing_Y) * np.sum(((linear.predict(noest_testing_X) - log_testing_Y))**2),
color="blue",
label=betternames[index],
s=120.0)
# plt.legend()
plt.xticks(np.arange(len(betternames)))
plt.axes().set_xticklabels([x for x in betternames])
plt.xticks(rotation=90)
plt.xticks(fontsize=fontsize)
# plt.xlim(0, feature_length + 1)
plt.yticks(fontsize=fontsize)
plt.xlabel("Feature", fontsize=fontsize)
plt.ylabel("Mean Squared Predicted Error", fontsize=fontsize)
plt.tight_layout()
In [169]:
plt.rcParams['figure.figsize'] = 14, 9
orderedlist = ["estimate_indices", "artist_indices", "auction_indices","dates_indices","physical_indices","country_indicies","color_indicies",]
betternames = ["Auction Estimate",
" + Artist",
" + Auction House",
" + Creation/Sale Date",
" + Physical Dimensions",
" + Country of Sale",
" + Color Analysis"]
indexlist = ["log_estimate_indices", ]
for index, key in enumerate(orderedlist[::-1]):
print index, key
indexlist += indices_dict[key]
noest_training_X = new_training_X[:,[x for x in range(len(things)) if x in indexlist]]
noest_testing_X = new_testing_X[:,[x for x in range(len(things)) if x in indexlist]]
linear = linear_model.LinearRegression()
linear.fit(noest_training_X, log_training_Y)
plt.barh(len(orderedlist) - index,
1.0 / len(log_testing_Y) * np.sum(((linear.predict(noest_testing_X) - log_testing_Y))**2),
alpha=0.75,
)
# plt.legend()
# plt.yticks(np.arange(len(betternames)))
plt.yticks(np.arange(1, len(betternames)+1) + .35)
plt.axes().set_yticklabels([x for x in betternames[::-1]])
# plt.yticks(rotation=90)
plt.yticks(fontsize=fontsize)
# plt.xlim(0, feature_length + 1)
plt.xticks(fontsize=fontsize)
plt.yticks(fontsize=fontsize)
# plt.ylabel("Feature", fontsize=fontsize)
plt.xlabel("Mean Squared Predicted Error", fontsize=fontsize)
plt.ylim(.7, 8)
plt.tight_layout()
plt.savefig("2014-09-29-feature-importance.pdf")
In [111]:
log_linear.coef_
Out[111]:
In [ ]:
# Reverse ablation: cumulatively REMOVE feature groups (in this order) and
# plot the test-set R^2 of a model trained on the remaining columns.
ordered_features = ["log_estimate_indices",
                    "color_indicies",
                    "country_indicies",
                    "auction_indices",
                    "physical_indices",
                    "artist_indices",
                    "dates_indices",]
indexlist = []
# BUG FIX: `for index, key in range(len(ordered_features))` raises TypeError
# (an int cannot be unpacked into two names), and indices_dict needs the group
# *name* as its key -- enumerate() supplies both the position and the name.
for index, key in enumerate(ordered_features):
    indexlist += indices_dict[key]
    noest_training_X = new_training_X[:, [x for x in range(len(things)) if x not in indexlist]]
    noest_testing_X = new_testing_X[:, [x for x in range(len(things)) if x not in indexlist]]
    linear = linear_model.LinearRegression()
    linear.fit(noest_training_X, log_training_Y)
    plt.scatter(index, linear.score(noest_testing_X, log_testing_Y),
                color="blue",
                s=120.0)
In [24]:
# Recursive feature elimination with 5-fold CV to pick the feature count,
# then score the selected subset on the held-out test set.
selector = RFECV(linear, step=1, cv=5)
results = selector.fit(new_training_X, log_training_Y)
print results.n_features_
results.score(new_testing_X, log_testing_Y)
Out[24]:
In [175]:
In [275]:
Out[275]:
In [25]:
# Fit SelectKBest (k=1) on the training set; the ranking printout below is
# disabled.  NOTE: the target here is training_Y (dollars), not the log target
# used elsewhere.
selector = SelectKBest(score_func=feature_selection.f_regression, k=1)
results = selector.fit(new_training_X, training_Y)
# things = features + artist_list
# for number, index in enumerate(np.argsort(results.ranking_)):
# print number + 1, results.ranking_[index], things[index]
In [26]:
for k in xrange(1, 5):
selector = SelectKBest(score_func=feature_selection.f_regression, k=1)
results = selector.fit(new_training_X, training_Y)
results.get_support(indices=True)
In [27]:
# k-best error curve where BOTH the selector and the per-k model are fit on
# the testing set.
# NOTE(review): fitting and scoring on the same held-out data is leakage --
# this measures in-sample fit of the test set, not generalization.
plt.rcParams['figure.figsize'] = 16, 12
xlabels = []
feature_length = len(things)
for k in range(1, feature_length + 1):
selector = SelectKBest(score_func=feature_selection.f_regression, k=k)
results = selector.fit(new_testing_X, log_testing_Y)
linear = linear_model.LinearRegression()
linear.fit(new_testing_X[:, [index for index in results.get_support(indices=True)]], log_testing_Y)
plt.scatter(k, 1.0 / len(log_testing_Y) * np.sqrt(np.sum(((linear.predict(new_testing_X[:, [index for index in results.get_support(indices=True)]]) - log_testing_Y))**2)),
color="blue",
s=120.0)
# Record the one feature index newly selected at this k.
xlabels.append(list(set(results.get_support(indices=True)).difference(xlabels))[0])
xlabels = [things[x] for x in xlabels]
plt.xticks(np.arange(1, feature_length + 1, 1.0))
plt.axes().set_xticklabels([x for x in xlabels])
plt.xticks(rotation=90)
plt.xticks(fontsize=fontsize)
plt.xlim(0, feature_length + 1)
plt.yticks(fontsize=fontsize)
plt.xlabel("Feature", fontsize=fontsize)
plt.ylabel("Error", fontsize=fontsize)
# plt.ylim(0.2, 1.5)
plt.tight_layout()
# plt.savefig("2014-09-23-relative-error-features.pdf")
In [28]:
# no_estimate =
# Build a "no estimate" variant of the design matrix by dropping columns 7-8
# (auction-house estimate features); show the remaining feature names.
things = np.array(things)
noest_training_X = new_training_X[:,[x for x in range(len(things)) if x not in [7,8]]]
noest_testing_X = new_testing_X[:,[x for x in range(len(things)) if x not in [7,8]]]
noest_things = [things[x] for x in range(len(things)) if x not in [7,8]]
noest_things
Out[28]:
In [147]:
len(things)
Out[147]:
In [255]:
# k-best RMS-error curve on the estimate-free feature set (dollar-space
# target).  NOTE(review): the selector is fit on the TESTING data -- the
# selected features leak test information into the curve.
plt.rcParams['figure.figsize'] = 16, 12
xlabels = []
feature_length = len(noest_things)
for k in range(1, feature_length):
selector = SelectKBest(score_func=feature_selection.f_regression, k=k)
results = selector.fit(noest_testing_X, testing_Y)
linear = linear_model.LinearRegression()
linear.fit(noest_training_X[:, [index for index in results.get_support(indices=True)]], training_Y)
plt.scatter(k, np.sqrt(np.average((linear.predict(noest_testing_X[:, [index for index in results.get_support(indices=True)]]) - testing_Y)**2.0)),
color="blue",
s=120.0)
# plt.scatter(k, linear.score(noest_testing_X[:, [index for index in results.get_support(indices=True)]], testing_Y),
# color="blue",
# s=120.0)
# Record the one feature index newly selected at this k.
xlabels.append(list(set(results.get_support(indices=True)).difference(xlabels))[0])
xlabels = [noest_things[x] for x in xlabels]
plt.xticks(np.arange(1, feature_length, 1.0))
plt.axes().set_xticklabels([x for x in xlabels])
plt.xticks(rotation=90)
plt.xticks(fontsize=fontsize)
plt.xlim(0, feature_length + 1)
plt.yticks(fontsize=fontsize)
plt.xlabel("Feature", fontsize=fontsize)
plt.ylabel("Error", fontsize=fontsize)
plt.tight_layout()
# plt.ylim(0.5, 1.5)
# plt.savefig("2014-09-19-feature-importance.pdf")
In [275]:
auctionhouse_list
Out[275]:
In [151]:
# Over the full data set, count how often the hammer price falls strictly
# inside the auction house's [low, high] estimate band, and collect the band
# statistics for later use.
correct = []
incorrect = []
mean_estimate = []
low_estimates = []
high_estimates = []
for index, row in enumerate(big_X):
low_estimate = row[7]   # auction-house low estimate column
high_estimate = row[8]  # auction-house high estimate column
hammer_price = big_Y[index]
low_estimates.append(low_estimate)
high_estimates.append(high_estimate)
mean_estimate.append(np.average([low_estimate, high_estimate]))
if (hammer_price > low_estimate) and (hammer_price < high_estimate):
correct.append(hammer_price)
else:
incorrect.append(hammer_price)
mean_estimate = np.array(mean_estimate)
low_estimates = np.array(low_estimates)
high_estimates = np.array(high_estimates)
In [153]:
print len(correct), len(incorrect), np.average((high_estimates - low_estimates) / 2.0)
In [271]:
# The plan:
# Measure how often the auction-house estimate band brackets the hammer price
# on the held-out test set, then compare that hit rate against our model's.
# Making the comparison fair (same interval width for both) is the hard part.
In [212]:
# The plan:
# Find out my sigma
# Find out correct-ness
# limit their estimate and myself to my sigma
# How many correct/incorrect does my prediction make on TEST data?
correct = []
incorrect = []
my_correct = []
my_incorrect = []
# Refit on dollar-space targets; half the test residual std becomes the
# shared interval half-width.
# NOTE(review): dividing the std by 2 makes `standard_deviation` half a sigma
# -- confirm that halving is intentional and not a copy of the band formula.
linear.fit(new_training_X, training_Y)
standard_deviation = np.std(linear.predict(new_testing_X) - testing_Y) / 2.0
print standard_deviation
for index, row in enumerate(big_X):
low_estimate = row[7]
high_estimate = row[8]
average = np.average([low_estimate, high_estimate])
hammer_price = big_Y[index]
if (hammer_price > average - standard_deviation) and (hammer_price < average + standard_deviation):
correct.append(hammer_price)
else:
incorrect.append(hammer_price)
print "Auction houses are right:", len(correct)
print "Auction houses are wrong:", len(incorrect)
my_correct = []
my_incorrect = []
bands = []
for index, row in enumerate(new_big_X):
low_estimate = big_X[index][7]
high_estimate = big_X[index][8]
band = (high_estimate - low_estimate) / 2.0
bands.append(band)
hammer_price = big_Y[index]
# NOTE(review): predict() is called twice per row on a single 1-D sample;
# newer sklearn requires a 2-D array (reshape(1, -1)) -- hoist one call.
if (hammer_price > linear.predict(new_big_X[index]) - standard_deviation) and (hammer_price < linear.predict(new_big_X[index]) + standard_deviation):
my_correct.append(hammer_price)
else:
my_incorrect.append(hammer_price)
print "My algorithm is right:", len(my_correct)
print "My algorithm is wrong:", len(my_incorrect)
print "Percentage:", float(len(my_correct)) / (float(len(my_correct)) + float(len(my_incorrect)))
In [61]:
from sklearn.preprocessing import PolynomialFeatures
In [62]:
# Expand features with degree-2 polynomial terms. Fit once (PolynomialFeatures
# only records the number of input columns) and transform each split — the
# original called fit_transform on the test set too, which is the
# train/test-leakage anti-pattern, even though it is numerically harmless
# here because all three matrices share the same columns.
poly = PolynomialFeatures(degree=2, interaction_only=False, include_bias=True)
poly_training_X = poly.fit_transform(new_training_X)
poly_testing_X = poly.transform(new_testing_X)
poly_X = poly.transform(new_big_X)
In [ ]:
# Fit an OLS model on the polynomial-expanded features (log-price target).
plt.rcParams['figure.figsize'] = 12, 12
polylinear = linear_model.LinearRegression()
polylinear.fit(poly_training_X, log_training_Y)
# print polylinear.score(poly_testing_X, log_testing_Y)
In [69]:
plt.scatter(polylinear.predict(poly_testing_X), log_testing_Y, color='blue', alpha=0.25, s=60)
plt.scatter(log_linear.predict(new_testing_X), log_testing_Y, color='red', alpha=0.25, s=60)
print polylinear.score(poly_testing_X, log_testing_Y)
print log_linear.score(new_testing_X, log_testing_Y)
In [70]:
plt.scatter(polylinear.predict(poly_training_X), log_training_Y, color='blue', alpha=0.25, s=60)
plt.scatter(log_linear.predict(new_training_X), log_training_Y, color='red', alpha=0.25, s=60)
print polylinear.score(poly_training_X, log_training_Y)
print log_linear.score(new_training_X, log_training_Y)
In [79]:
things[np.argmin(log_linear.coef_)]
Out[79]:
In [ ]:
In [ ]:
In [8]:
# Show a painting next to a solid swatch of its average color.
huh = io.imread("../artists_html/antoni-tapies-6138135.jpg")
fig, (ax1, ax2) = plt.subplots(1, 2)
ax1.imshow(huh)
# Averaging over rows then columns leaves the per-channel mean (R, G, B).
red, green, blue = np.average(np.average(huh, axis=0), axis=0)
# BUG FIX: PIL's Image.new() takes size as (width, height), but a numpy
# image's .shape is (height, width[, channels]). The original passed
# huh.shape[:2] directly, transposing the swatch's aspect ratio.
ax2.imshow(Image.new("RGB", (huh.shape[1], huh.shape[0]), (int(red), int(green), int(blue))))
Out[8]:
In [10]:
# prices = []
# categories = []
# materials = []
# artist = 'helen-frankenthaler'
# for x in master_dictionary[artist]:
# price = master_dictionary[artist][x]['hammer_price']
# category = master_dictionary[artist][x]['category']
# if category == "Painting":
# prices.append(price)
# materials.append(master_dictionary[artist][x]['materials'])
# jpg_name = "../artists_html/" + artist + "-" + str(x) + ".jpg"
# huh = io.imread(jpg_name)
# red, green, blue = np.average(np.average(huh, axis=0), axis=0)
# master_dictionary[artist][x]['red'] = red
# master_dictionary[artist][x]['green'] = green
# master_dictionary[artist][x]['blue'] = blue
# categories.append(category)
# print set(categories)
# print set(materials)
In [69]:
# Scatter predicted vs. actual prices (blue), then overlay in red the points
# whose absolute prediction error exceeds `cutsize`.
cutsize = .5
# FIX: predict once — the original called linear.predict(new_big_X) three
# separate times for the identical result.
predicted = linear.predict(new_big_X)
plt.scatter(predicted, big_Y, color='blue', alpha=0.25, s=60)
index = np.abs(predicted - big_Y) > cutsize
plt.scatter(predicted[index], big_Y[index], color="red")
# print np.argwhere(index == True)
# # for info in np.argwhere(index == True):
# # print information[info]
Out[69]: