In [1]:
%matplotlib inline

from bs4 import BeautifulSoup
from glob import glob
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import collections
import cookielib
import datetime
import locale
import math
import matplotlib.pyplot as plt
import mechanize
import numpy as np
import os
import pandas as pd
import pandas.io.sql as psql
import pickle
from PIL import Image 
import pymysql as mdb
import re
import requests
import string
import sys
import time
import unidecode 
import urllib
import urllib2

# import MySQLdb  

from skimage import io
from skimage import color
from sklearn import preprocessing
from sklearn import linear_model 
from sklearn.feature_selection import RFE, RFECV

import astroML
from astroML.plotting import hist 

# Wide default figure size for the exploratory plots below.
plt.rcParams['figure.figsize'] = 16, 8

# Use the system default locale so locale.currency() formats hammer prices
# with the local convention (here en_AU.UTF-8, per the cell output).
locale.setlocale( locale.LC_ALL, '' )


Out[1]:
'en_AU.UTF-8'

In [2]:
for infile in sorted(glob("../artists_html/*.html")):
    jpg_name = ".".join(infile.split(".")[:-1] + ["jpg"])
    if not os.path.isfile(jpg_name):
        print infile, jpg_name, "removing", infile
        os.remove(infile)

In [3]:
# Derive the artist slug for every scraped page. Paths look like
# "../artists_html/<artist-name>-<painting_id>.html": dropping the final
# "-" segment removes the id, and taking the last "/" segment removes the dir.
artists = [
    "-".join(path.split("-")[:-1]).split("/")[-1]
    for path in sorted(glob("../artists_html/*.html"))
]

In [4]:
# Keep only artists with more than 100 scraped lots. most_common() ordering
# means artist_list ends up sorted by descending lot count.
c = collections.Counter(artists)
artist_list = [name for name, count in c.most_common() if count > 100]
artist_list


Out[4]:
['karel-appel',
 'sam-francis',
 'antoni-tapies',
 'roberto-matta',
 'jasper-johns',
 'robert-motherwell',
 'mark-tobey',
 'cy-twombly',
 'louise-nevelson',
 'helen-frankenthaler',
 'alberto-burri',
 'aaron-siskind',
 'joan-mitchell',
 'louise-bourgeois',
 'theodoros-stamos',
 'kenneth-noland',
 'isamu-noguchi',
 'adolph-gottlieb',
 'franz-kline',
 'philip-guston',
 'jules-olitski',
 'arshile-gorky',
 'norman-bluhm',
 'gene-davis',
 'al-held',
 'mark-rothko',
 'jackson-pollock',
 'esteban-vicente',
 'morris-cole-graves',
 'ad-reinhardt',
 'rita-letendre',
 'morris-louis',
 'william-baziotes',
 'norman-wilfred-lewis',
 'milton-resnick',
 'lee-krasner',
 'barnett-newman']

In [6]:
master_dictionary = {}
for running_number, html_file in enumerate(sorted(glob("../artists_html/*.html"))):
    if not running_number % 5000:
        print running_number
    with open(html_file, 'r') as file_handle:
        soup = BeautifulSoup(file_handle)
    test_artist = "-".join(html_file.split("-")[:-1]).split("/")[-1]
    painting_id = html_file.split("-")[-1].strip(".html")
    try:
        if test_artist not in master_dictionary:
            master_dictionary[test_artist] = {}
        for cssid in soup.findAll('div', {"class":re.compile("lsc_adjud")}): 
            if cssid.string:
                if not cssid.string.strip().split("\n")[1].strip()[0] == "$":
                    continue
                hammer_price = float("".join([x for x in cssid.string.strip().split("\n")[1].strip("$") if x in string.digits or x == "."]))
                master_dictionary[test_artist][painting_id] = {}
                master_dictionary[test_artist][painting_id]['hammer_price'] = hammer_price
                master_dictionary[test_artist][painting_id]['log_hammer_price'] = np.log(hammer_price)
                for cssid in soup.findAll('div', {"class":re.compile("lsc_title")}):
                    master_dictionary[test_artist][painting_id]['title'] = unidecode.unidecode(cssid.a.text)
                    year = "".join([number for number in cssid.date.string.strip().strip("(").strip(")").strip("c.") if number in string.digits][:4])
                    if year:
                        try:
                            master_dictionary[test_artist][painting_id]['creation_date'] = datetime.datetime.toordinal(datetime.datetime(int(year), 1, 1))
                        except:
                            print "PROBLEM", year
                for cssid in soup.findAll('div', {"class":re.compile("lsc_country")}): 
                    sale_country, sale_date = cssid.string.strip().split("\n")
                    master_dictionary[test_artist][painting_id]['sale_country'] = sale_country.lower().strip(",")
                    master_dictionary[test_artist][painting_id]['sale_date'] = datetime.datetime.toordinal(datetime.datetime.strptime(sale_date.strip(), "%m-%d-%Y"))
                for cssid in soup.findAll('div', {"class":re.compile("lsc_details")}): 
                    details_raw = cssid.string.strip().split("\n")
                    if len(details_raw) == 4:
                        category, _, materials_raw, dimensions_raw = details_raw
                        materials = materials_raw.strip().strip(',')
                        master_dictionary[test_artist][painting_id]['category'] = category
                        master_dictionary[test_artist][painting_id]['materials'] = materials
                        if dimensions_raw.strip().split().count('cm') == 2:
                            length, width = float(dimensions_raw.strip().split()[0]), float(dimensions_raw.strip().split()[3])
                            area = length * width
                            diagonal_length = np.sqrt(length ** 2.0 + width ** 2.0)
                            master_dictionary[test_artist][painting_id]['area'] = area
                            master_dictionary[test_artist][painting_id]['diagonal_length'] = length / width
                    if len(details_raw) != 4:
                        category = details_raw[0]
                        materials = details_raw[2].strip().strip(',')
                        master_dictionary[test_artist][painting_id]['category'] = category
                        master_dictionary[test_artist][painting_id]['materials'] = materials
                for cssid in soup.findAll('div', {"class":re.compile("lsc_estimate")}): 
                    if len(cssid.string.strip().split()) == 4:
                        low_estimate = float("".join([x for x in cssid.string.strip().split()[1].strip("$") if x in string.digits or x == "."]))
                        high_estimate = float("".join([x for x in cssid.string.strip().split()[3].strip("$") if x in string.digits or x == "."]))
                        auction_estimate = np.average([low_estimate, high_estimate])
#                         master_dictionary[test_artist][painting_id]['low_estimate'] = low_estimate
#                         master_dictionary[test_artist][painting_id]['high_estimate'] = high_estimate
#                         master_dictionary[test_artist][painting_id]['auction_estimate'] = auction_estimate
                        master_dictionary[test_artist][painting_id]['log_low_estimate'] = np.log(low_estimate)
                        master_dictionary[test_artist][painting_id]['log_high_estimate'] = np.log(high_estimate)
                        master_dictionary[test_artist][painting_id]['log_auction_estimate'] = np.log(auction_estimate)
                for cssid in soup.findAll('div', {"class":re.compile("lsc_auctioneer")}): 
                        master_dictionary[test_artist][painting_id]['auction_house'] = cssid.string.split("\n")[1].strip().strip(',')
    except:
        print html_file


0
5000
10000
15000
20000
25000
30000

In [63]:
artist_list = []
c = collections.Counter(artists)
for artist, number in c.most_common():
    if number > 100:
        artist_list.append(artist)

sale_countries = []
auction_houses = []

features = ['creation_date',
            'diagonal_length',
            'area',
            'sale_date',
#             'low_estimate',
#             'high_estimate', 
#             'auction_estimate',
            'log_low_estimate',
            'log_high_estimate', 
            'log_auction_estimate',
            ]

for artist in artist_list:
    for x in master_dictionary[artist]:
        category = master_dictionary[artist][x]['category']
        if not category == "Painting":
            continue
        if all(feature in master_dictionary[artist][x] for feature in features):
            if 'log_low_estimate' in master_dictionary[artist][x]:
#                 print "hi"
                sale_countries.append(master_dictionary[artist][x]['sale_country'].strip())
                auction_houses.append(master_dictionary[artist][x]['auction_house'].strip())

auctionhouse_list = []        
auc = collections.Counter(auction_houses)
for auction_house, number in auc.most_common():
    if number > 25:
        auctionhouse_list.append(auction_house)
auctionhouse_list.append("unknown_auction")

country_list = []
country_counter = collections.Counter(sale_countries)
for country, number in country_counter.most_common():
    if number > 25:
        country_list.append(country)
country_list.append("unknown_country")

print artist_list, country_list, auctionhouse_list


['karel-appel', 'sam-francis', 'antoni-tapies', 'roberto-matta', 'jasper-johns', 'robert-motherwell', 'mark-tobey', 'cy-twombly', 'louise-nevelson', 'helen-frankenthaler', 'alberto-burri', 'aaron-siskind', 'joan-mitchell', 'louise-bourgeois', 'theodoros-stamos', 'kenneth-noland', 'isamu-noguchi', 'adolph-gottlieb', 'franz-kline', 'philip-guston', 'jules-olitski', 'arshile-gorky', 'norman-bluhm', 'gene-davis', 'al-held', 'mark-rothko', 'jackson-pollock', 'esteban-vicente', 'morris-cole-graves', 'ad-reinhardt', 'rita-letendre', 'morris-louis', 'william-baziotes', 'norman-wilfred-lewis', 'milton-resnick', 'lee-krasner', 'barnett-newman'] [u'united states', u'united kingdom', u'italy', u'france', u'netherlands', u'germany', u'sweden', u'switzerland', u'canada', u'denmark', u'austria', 'unknown_country'] [u"Christie's", u"Sotheby's", u'Farsetti', u'Bonhams', u'William Doyle', u'Swann Galleries', u'Bukowskis', u'Heffel Fine Art', u'Germann Auktionshaus', u'Bruun Rasmussen', u'Villa Grisebach', u'Finarte', u'Ketterer Kunst GmbH', u'Dorotheum', u'Stockholms Auktionsverk', u'Lempertz', u'Loudmer', 'unknown_auction']

In [64]:
features, master_dictionary[artist][x]


Out[64]:
(['creation_date',
  'diagonal_length',
  'area',
  'sale_date',
  'log_low_estimate',
  'log_high_estimate',
  'log_auction_estimate'],
 {'area': 3850.0,
  'auction_house': u"Sotheby's",
  'category': u'Print-Multiple',
  'creation_date': 716606,
  'diagonal_length': 1.2727272727272727,
  'hammer_price': 250000.0,
  'log_auction_estimate': 12.611537753638338,
  'log_hammer_price': 12.429216196844383,
  'log_high_estimate': 12.765688433465597,
  'log_low_estimate': 12.429216196844383,
  'materials': u'Lithograph in colors',
  'sale_country': u'united states',
  'sale_date': 728784,
  'title': '<<18 Cantos>>'})

In [9]:
# Image analysis: for every painting with a downloaded thumbnail, store the
# mean red/green/blue channel values as colour features.
for artist in artist_list:
    for painting_id in master_dictionary[artist]:
        record = master_dictionary[artist][painting_id]
        if record['category'] != "Painting":
            continue
        jpg_name = "../artists_html/" + artist + "-" + str(painting_id) + ".jpg"
        try:
            image = io.imread(jpg_name)
            # Average over rows then columns, leaving one value per channel.
            red, green, blue = np.average(np.average(image, axis=0), axis=0)
            record['red'] = red
            record['green'] = green
            record['blue'] = blue
        except Exception:
            # Best-effort: skip unreadable or non-RGB (e.g. greyscale) images,
            # but do not swallow KeyboardInterrupt like the bare `except:` did.
            pass

In [70]:
# Build the design matrix: numeric features followed by one-hot columns for
# artist, auction house, and sale country.
features = ['creation_date',
            'diagonal_length',
            'area',
            'sale_date',
            'red',
            'green',
            'blue',
            'log_low_estimate',
            'log_high_estimate', 
            'log_auction_estimate',
            ]

# Temporal split: lots sold before 2013-09-01 train, lots sold after test.
# Hoisted: this ordinal was previously recomputed for every record.
# NOTE(review): the strict </> comparisons below mean a lot sold exactly on
# the split date lands in neither set — confirm that is intended.
split_ordinal = datetime.datetime.toordinal(datetime.datetime(2013, 9, 1))

big_X = []
log_big_Y = []

testing_X = []
training_X = []
log_testing_Y = []
log_training_Y = []

restrictions = []
information = []
training_information = []
testing_information = []

# One-hot encoders for the three categorical columns.
label = preprocessing.LabelBinarizer()
label.fit(np.array(artist_list))

auction_label = preprocessing.LabelBinarizer()
auction_label.fit(np.array(auctionhouse_list))

country_label = preprocessing.LabelBinarizer()
country_label.fit(np.array(country_list))

keys = []
for artist in artist_list:
    for x in master_dictionary[artist]:
        record = master_dictionary[artist][x]
        log_price = record['log_hammer_price']
        if record['category'] != "Painting":
            continue
        if not all(feature in record for feature in features):
            continue
        # Rare labels fall back to the catch-all one-hot bucket.
        sale_country = record['sale_country'].strip()
        if sale_country not in country_list:
            sale_country = "unknown_country"
        auction_house = record['auction_house'].strip()
        if auction_house not in auctionhouse_list:
            auction_house = "unknown_auction"
        # Hoisted: the per-record feature values were previously rebuilt up
        # to three times per lot.
        feature_values = [record[feature] for feature in features]
        row = (feature_values
               + list(label.transform([artist])[0])
               + list(auction_label.transform([auction_house])[0])
               + list(country_label.transform([sale_country])[0]))
        info_row = [artist, x, auction_house, sale_country] + feature_values
        big_X.append(row)
        log_big_Y.append(log_price)
        information.append(info_row)
        if record['sale_date'] < split_ordinal:
            training_X.append(row)
            log_training_Y.append(log_price)
            training_information.append(info_row)
        elif record['sale_date'] > split_ordinal:
            testing_X.append(row)
            log_testing_Y.append(log_price)
            testing_information.append(info_row)

big_X = np.array(big_X)
log_big_Y = np.array(log_big_Y)

testing_X = np.array(testing_X)
training_X = np.array(training_X)

log_testing_Y = np.array(log_testing_Y)
log_training_Y = np.array(log_training_Y)

# Column names aligned with the design-matrix columns, in order.
all_features = features + artist_list + auctionhouse_list + country_list
all_features


Out[70]:
['creation_date',
 'diagonal_length',
 'area',
 'sale_date',
 'red',
 'green',
 'blue',
 'log_low_estimate',
 'log_high_estimate',
 'log_auction_estimate',
 'karel-appel',
 'sam-francis',
 'antoni-tapies',
 'roberto-matta',
 'jasper-johns',
 'robert-motherwell',
 'mark-tobey',
 'cy-twombly',
 'louise-nevelson',
 'helen-frankenthaler',
 'alberto-burri',
 'aaron-siskind',
 'joan-mitchell',
 'louise-bourgeois',
 'theodoros-stamos',
 'kenneth-noland',
 'isamu-noguchi',
 'adolph-gottlieb',
 'franz-kline',
 'philip-guston',
 'jules-olitski',
 'arshile-gorky',
 'norman-bluhm',
 'gene-davis',
 'al-held',
 'mark-rothko',
 'jackson-pollock',
 'esteban-vicente',
 'morris-cole-graves',
 'ad-reinhardt',
 'rita-letendre',
 'morris-louis',
 'william-baziotes',
 'norman-wilfred-lewis',
 'milton-resnick',
 'lee-krasner',
 'barnett-newman',
 u"Christie's",
 u"Sotheby's",
 u'Farsetti',
 u'Bonhams',
 u'William Doyle',
 u'Swann Galleries',
 u'Bukowskis',
 u'Heffel Fine Art',
 u'Germann Auktionshaus',
 u'Bruun Rasmussen',
 u'Villa Grisebach',
 u'Finarte',
 u'Ketterer Kunst GmbH',
 u'Dorotheum',
 u'Stockholms Auktionsverk',
 u'Lempertz',
 u'Loudmer',
 'unknown_auction',
 u'united states',
 u'united kingdom',
 u'italy',
 u'france',
 u'netherlands',
 u'germany',
 u'sweden',
 u'switzerland',
 u'canada',
 u'denmark',
 u'austria',
 'unknown_country']

In [71]:
features


Out[71]:
['creation_date',
 'diagonal_length',
 'area',
 'sale_date',
 'red',
 'green',
 'blue',
 'log_low_estimate',
 'log_high_estimate',
 'log_auction_estimate']

In [72]:
# Baseline predictor: the auction houses' own (log) mid-estimate, read back
# out of the information rows. Each row is
# [artist, id, auction_house, country] + feature values, so the estimate is
# at offset 4 + its position in `features` (previously the magic number 13).
LOG_AUCTION_ESTIMATE_COL = 4 + features.index('log_auction_estimate')

log_auction_training_predicted = np.array(
    [row[LOG_AUCTION_ESTIMATE_COL] for row in training_information])
log_auction_testing_predicted = np.array(
    [row[LOG_AUCTION_ESTIMATE_COL] for row in testing_information])
log_auction_predicted = np.array(
    [row[LOG_AUCTION_ESTIMATE_COL] for row in information])

In [73]:
# Plot the mean auction range prediction vs price and error
plt.rcParams['figure.figsize'] = 12, 12
plt.rcParams['legend.scatterpoints'] = 1 
# Get transformation function
scaler = preprocessing.StandardScaler().fit(training_X)
new_training_X = scaler.transform(training_X)
new_testing_X = scaler.transform(testing_X)
new_big_X = scaler.transform(big_X)
log_linear = linear_model.LinearRegression()
log_linear.fit(new_training_X, log_training_Y)
log_testing_predicted = log_linear.predict(new_testing_X)
log_training_predicted = log_linear.predict(new_training_X)

print "Score:", log_linear.score(new_testing_X, log_testing_Y)

sca1 = plt.scatter(log_training_Y, log_linear.predict(new_training_X), color='blue', alpha=0.35, s=60, 
            label="Training (" + str(len(log_training_Y)) + ")")
sca2 = plt.scatter(log_testing_Y, log_linear.predict(new_testing_X), color='red', alpha=0.75, s=60, 
            label="Testing (" + str(len(log_testing_Y)) + ")")

price_lower = 6.0
price_upper = 18.0
plt.xlim(price_lower, price_upper)
plt.ylim(price_lower, price_upper)
plt.plot(np.linspace(price_lower, price_upper, 10), np.linspace(price_lower, price_upper, 10), color='black')
fontsize = 25.0
plt.axes().set_xticklabels([str(locale.currency(round(np.e**(x),-2), grouping=True))[:-3] for x in plt.axes().get_xticks()]) 
plt.axes().set_yticklabels([str(locale.currency(round(np.e**(x),-2), grouping=True))[:-3] for x in plt.axes().get_yticks()]) 
plt.xticks(rotation=55)
plt.xticks(fontsize=fontsize)
plt.yticks(fontsize=fontsize)
plt.ylabel("Predicted [$]", fontsize=fontsize)
plt.xlabel("Actual Hammer Price [$]", fontsize=fontsize)

MSE_training = np.sqrt(np.sum((log_training_Y - log_linear.predict(new_training_X))**2.0/ len(log_training_Y)))
MSPE_testing = np.sqrt(np.sum((log_testing_Y - log_linear.predict(new_testing_X))**2.0/len(log_testing_Y)))

first_legend = plt.legend(fontsize=fontsize, loc=2)
ax = plt.gca().add_artist(first_legend)
plt.legend([sca1, sca2], ["MSE Training " + str(round(MSE_training,3)), 
                          "MSPE Testing " + str(round(MSPE_testing,3))], loc=4, fontsize=fontsize)
plt.tight_layout()
# plt.savefig("2014-09-24-price-prediction.pdf")


Score: 0.953804469683

In [74]:
#log_linear.coef_
for thing, coef in zip(all_features, log_linear.coef_):
    print thing, coef
#                         = features + artist_list + auctionhouse_list + country_list
# things


creation_date -0.037423941947
diagonal_length -0.00392280484031
area 0.0328059815228
sale_date 0.094482025977
red 0.0198785134218
green 0.00590429414385
blue -0.0276653116465
log_low_estimate 0.948341536355
log_high_estimate 0.446734032538
log_auction_estimate 0.102982779899
karel-appel -181792435363.0
sam-francis -145490854200.0
antoni-tapies -208691893098.0
roberto-matta -144306224308.0
jasper-johns -266343075898.0
robert-motherwell -356514953595.0
mark-tobey -110460699658.0
cy-twombly -56142713879.2
louise-nevelson -203812600511.0
helen-frankenthaler -102312040874.0
alberto-burri -206268022949.0
aaron-siskind -143111357037.0
joan-mitchell -228522926462.0
louise-bourgeois 458360533484.0
theodoros-stamos -100601294618.0
kenneth-noland -115066809717.0
isamu-noguchi -255091599488.0
adolph-gottlieb -215009362544.0
franz-kline -517035138818.0
philip-guston -270572350793.0
jules-olitski -118035118319.0
arshile-gorky -37444980958.8
norman-bluhm -81501480643.0
gene-davis -210290816853.0
al-held -237057556306.0
mark-rothko -163228730417.0
jackson-pollock -110460699658.0
esteban-vicente -186543145708.0
morris-cole-graves -183801243467.0
ad-reinhardt -118035118319.0
rita-letendre -150131815398.0
morris-louis -138224170389.0
william-baziotes -265120222651.0
norman-wilfred-lewis -426412788663.0
milton-resnick -509533017115.0
lee-krasner -292285367085.0
barnett-newman -115066809717.0
Christie's 122232316491.0
Sotheby's 104082424382.0
Farsetti 116956823661.0
Bonhams 670643174352.0
William Doyle 91081933987.6
Swann Galleries 229257332068.0
Bukowskis 100992675638.0
Heffel Fine Art 108548375923.0
Germann Auktionshaus 97803058714.9
Bruun Rasmussen 94503462102.9
Villa Grisebach 92808771069.4
Finarte 89321086446.2
Ketterer Kunst GmbH 648552847655.0
Dorotheum 91081933987.6
Stockholms Auktionsverk 112833206227.0
Lempertz 102549462515.0
Loudmer 127282257069.0
unknown_auction 358441024211.0
united states 918444514761.0
united kingdom 1.26391688048e+12
italy 947440866390.0
france 2.65253857121e+12
netherlands 1.80001218334e+12
germany 3.21881355659e+12
sweden 2.60058482087e+12
switzerland 1.50474771888e+12
canada 1.51354272578e+12
denmark 4.96297697147e+12
austria 6.1916203187e+12
unknown_country 1.11730906018e+12

In [75]:
# Compare the auction houses' own mid-estimate against the model's
# predictions on the testing set, on the same predicted-vs-actual axes.
plt.rcParams['figure.figsize'] = 12, 12
plt.rcParams['legend.scatterpoints'] = 1 

# Hoisted: previously recomputed for both the scatter and the error metric.
log_testing_model_predicted = log_linear.predict(new_testing_X)
# Column 9 of the UNscaled testing_X is the log_auction_estimate feature.
log_testing_auction_estimate = testing_X[:, 9]

sca1 = plt.scatter(log_testing_Y, log_testing_auction_estimate, color='blue', alpha=0.35, s=60, 
            label="Auction (" + str(len(log_testing_Y)) + ")")
sca2 = plt.scatter(log_testing_Y, log_testing_model_predicted, color='red', alpha=0.75, s=60, 
            label="Testing (" + str(len(log_testing_Y)) + ")")

price_lower = 6.0
price_upper = 18.0
plt.xlim(price_lower, price_upper)
plt.ylim(price_lower, price_upper)
plt.plot(np.linspace(price_lower, price_upper, 10), np.linspace(price_lower, price_upper, 10), color='black')
fontsize = 25.0
# Fetch the current axes once via plt.gca() instead of repeated plt.axes().
ax = plt.gca()
ax.set_xticklabels([str(locale.currency(round(np.e**(x), -2), grouping=True))[:-3] for x in ax.get_xticks()])
ax.set_yticklabels([str(locale.currency(round(np.e**(x), -2), grouping=True))[:-3] for x in ax.get_yticks()])
plt.xticks(rotation=55)
plt.xticks(fontsize=fontsize)
plt.yticks(fontsize=fontsize)
plt.ylabel("Predicted [$]", fontsize=fontsize)
plt.xlabel("Actual Hammer Price [$]", fontsize=fontsize)

# BUG FIX (naming): this quantity is computed on the TESTING set; it was
# previously named MSAE_training even though the legend said "MSAE Testing".
MSAE_testing = np.sqrt(np.sum((log_testing_Y - log_testing_auction_estimate)**2.0 / len(log_testing_Y)))
MSPE_testing = np.sqrt(np.sum((log_testing_Y - log_testing_model_predicted)**2.0 / len(log_testing_Y)))

first_legend = plt.legend(fontsize=fontsize, loc=2)
first_legend_artist = plt.gca().add_artist(first_legend)
plt.legend([sca1, sca2], ["MSAE Testing " + str(round(MSAE_testing, 3)), 
                          "MSPE Testing " + str(round(MSPE_testing, 3))], loc=4, fontsize=fontsize)
plt.tight_layout()



In [76]:
from sklearn.linear_model import LassoCV, LassoLarsCV, ElasticNetCV
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor, ExtraTreesRegressor

In [78]:
# Random forest regression on the same split: 2000 trees with OOB scoring.
rf_regressor = RandomForestRegressor(oob_score=True, n_estimators=2000, max_features='auto')
rf_regressor.fit(new_training_X, log_training_Y)
# Hoisted: with 2000 trees each predict() call is expensive, and the same
# predictions were previously recomputed for every scatter and metric.
log_rf_training_predicted = rf_regressor.predict(new_training_X)
log_rf_predicted = rf_regressor.predict(new_testing_X)

sca1 = plt.scatter(log_training_Y, log_rf_training_predicted, color='blue', alpha=0.35, s=60, 
            label="Training (" + str(len(log_training_Y)) + ")")
sca2 = plt.scatter(log_testing_Y, log_rf_predicted, color='red', alpha=0.75, s=60, 
            label="Testing (" + str(len(log_testing_Y)) + ")")

price_lower = 6.0
price_upper = 18.0
plt.xlim(price_lower, price_upper)
plt.ylim(price_lower, price_upper)
plt.plot(np.linspace(price_lower, price_upper, 10), np.linspace(price_lower, price_upper, 10), color='black')
fontsize = 25.0
# Fetch the current axes once via plt.gca() instead of repeated plt.axes().
ax = plt.gca()
ax.set_xticklabels([str(locale.currency(round(np.e**(x), -2), grouping=True))[:-3] for x in ax.get_xticks()])
ax.set_yticklabels([str(locale.currency(round(np.e**(x), -2), grouping=True))[:-3] for x in ax.get_yticks()])
plt.xticks(rotation=55)
plt.xticks(fontsize=fontsize)
plt.yticks(fontsize=fontsize)
plt.ylabel("Predicted [$]", fontsize=fontsize)
plt.xlabel("Actual Hammer Price [$]", fontsize=fontsize)

MSE_training = np.sqrt(np.sum((log_training_Y - log_rf_training_predicted)**2.0 / len(log_training_Y)))
MSPE_testing = np.sqrt(np.sum((log_testing_Y - log_rf_predicted)**2.0 / len(log_testing_Y)))

first_legend = plt.legend(fontsize=fontsize, loc=2)
first_legend_artist = plt.gca().add_artist(first_legend)
plt.legend([sca1, sca2], ["MSE Training " + str(round(MSE_training, 3)), 
                          "MSPE Testing " + str(round(MSPE_testing, 3))], loc=4, fontsize=fontsize)

plt.title("Random Forests")
plt.tight_layout()



In [59]:
# LassoCV: L1-regularized linear regression with the regularization strength
# chosen by 5-fold cross-validation on the training set.
# NOTE(review): the variable is still named rf_regressor because this cell
# was copied from the random-forest cell; kept for name compatibility.
rf_regressor = LassoCV(eps=0.01, n_alphas=1500, alphas=None, 
                       fit_intercept=True, normalize=False, precompute='auto', 
                       max_iter=10000, tol=0.00001, 
                       copy_X=True, 
                       cv=5, 
                       verbose=False, 
                       n_jobs=1, positive=False)

rf_regressor.fit(new_training_X, log_training_Y)
# Hoisted: predictions were previously recomputed per scatter and metric.
log_rf_training_predicted = rf_regressor.predict(new_training_X)
log_rf_predicted = rf_regressor.predict(new_testing_X)

sca1 = plt.scatter(log_training_Y, log_rf_training_predicted, color='blue', alpha=0.35, s=60, 
            label="Training (" + str(len(log_training_Y)) + ")")
sca2 = plt.scatter(log_testing_Y, log_rf_predicted, color='red', alpha=0.75, s=60, 
            label="Testing (" + str(len(log_testing_Y)) + ")")

price_lower = 6.0
price_upper = 18.0
plt.xlim(price_lower, price_upper)
plt.ylim(price_lower, price_upper)
plt.plot(np.linspace(price_lower, price_upper, 10), np.linspace(price_lower, price_upper, 10), color='black')
fontsize = 25.0
# Fetch the current axes once via plt.gca() instead of repeated plt.axes().
ax = plt.gca()
ax.set_xticklabels([str(locale.currency(round(np.e**(x), -2), grouping=True))[:-3] for x in ax.get_xticks()])
ax.set_yticklabels([str(locale.currency(round(np.e**(x), -2), grouping=True))[:-3] for x in ax.get_yticks()])
plt.xticks(rotation=55)
plt.xticks(fontsize=fontsize)
plt.yticks(fontsize=fontsize)
plt.ylabel("Predicted [$]", fontsize=fontsize)
plt.xlabel("Actual Hammer Price [$]", fontsize=fontsize)

MSE_training = np.sqrt(np.sum((log_training_Y - log_rf_training_predicted)**2.0 / len(log_training_Y)))
MSPE_testing = np.sqrt(np.sum((log_testing_Y - log_rf_predicted)**2.0 / len(log_testing_Y)))

first_legend = plt.legend(fontsize=fontsize, loc=2)
first_legend_artist = plt.gca().add_artist(first_legend)
plt.legend([sca1, sca2], ["MSE Training " + str(round(MSE_training, 3)), 
                          "MSPE Testing " + str(round(MSPE_testing, 3))], loc=4, fontsize=fontsize)

plt.title("LassoCV")
plt.tight_layout()



In [49]:
# LassoLarsCV: lasso fit via least-angle regression, alpha chosen by 5-fold
# cross-validation. (Dead commented-out parameters from the copied cell
# removed; only the ones actually passed are kept.)
rf_regressor = LassoLarsCV(eps=0.01,
                           fit_intercept=True,
                           cv=5)

rf_regressor.fit(new_training_X, log_training_Y)
# Hoisted: predictions were previously recomputed per scatter and metric.
log_rf_training_predicted = rf_regressor.predict(new_training_X)
log_rf_predicted = rf_regressor.predict(new_testing_X)

sca1 = plt.scatter(log_training_Y, log_rf_training_predicted, color='blue', alpha=0.35, s=60, 
            label="Training (" + str(len(log_training_Y)) + ")")
sca2 = plt.scatter(log_testing_Y, log_rf_predicted, color='red', alpha=0.75, s=60, 
            label="Testing (" + str(len(log_testing_Y)) + ")")

price_lower = 6.0
price_upper = 18.0
plt.xlim(price_lower, price_upper)
plt.ylim(price_lower, price_upper)
plt.plot(np.linspace(price_lower, price_upper, 10), np.linspace(price_lower, price_upper, 10), color='black')
fontsize = 25.0
# Fetch the current axes once via plt.gca() instead of repeated plt.axes().
ax = plt.gca()
ax.set_xticklabels([str(locale.currency(round(np.e**(x), -2), grouping=True))[:-3] for x in ax.get_xticks()])
ax.set_yticklabels([str(locale.currency(round(np.e**(x), -2), grouping=True))[:-3] for x in ax.get_yticks()])
plt.xticks(rotation=55)
plt.xticks(fontsize=fontsize)
plt.yticks(fontsize=fontsize)
plt.ylabel("Predicted [$]", fontsize=fontsize)
plt.xlabel("Actual Hammer Price [$]", fontsize=fontsize)

MSE_training = np.sqrt(np.sum((log_training_Y - log_rf_training_predicted)**2.0 / len(log_training_Y)))
MSPE_testing = np.sqrt(np.sum((log_testing_Y - log_rf_predicted)**2.0 / len(log_testing_Y)))

first_legend = plt.legend(fontsize=fontsize, loc=2)
first_legend_artist = plt.gca().add_artist(first_legend)
plt.legend([sca1, sca2], ["MSE Training " + str(round(MSE_training, 3)), 
                          "MSPE Testing " + str(round(MSPE_testing, 3))], loc=4, fontsize=fontsize)

plt.title("LassoLarsCV")
plt.tight_layout()


/Library/Frameworks/EPD64.framework/Versions/7.2/lib/python2.7/site-packages/sklearn/linear_model/least_angle.py:295: ConvergenceWarning: Early stopping the lars path, as the residues are small and the current value of alpha is no longer well controlled. 46 iterations, alpha=3.485e-17, previous alpha=1.742e-17, with an active set of 45 regressors.
  ConvergenceWarning)
/Library/Frameworks/EPD64.framework/Versions/7.2/lib/python2.7/site-packages/sklearn/linear_model/least_angle.py:295: ConvergenceWarning: Early stopping the lars path, as the residues are small and the current value of alpha is no longer well controlled. 45 iterations, alpha=1.929e-18, previous alpha=9.646e-19, with an active set of 42 regressors.
  ConvergenceWarning)
/Library/Frameworks/EPD64.framework/Versions/7.2/lib/python2.7/site-packages/sklearn/linear_model/least_angle.py:295: ConvergenceWarning: Early stopping the lars path, as the residues are small and the current value of alpha is no longer well controlled. 32 iterations, alpha=1.091e-18, previous alpha=7.732e-19, with an active set of 31 regressors.
  ConvergenceWarning)

In [58]:
from sklearn.cross_validation import ShuffleSplit
# NOTE(review): ShuffleSplit is imported here but never used in this cell,
# and `rf_regressor` actually holds an ElasticNetCV model -- the name looks
# like a leftover from an earlier random-forest experiment. Consider renaming
# (other cells reference `rf_regressor` from the kernel, so left untouched).
rf_regressor = ElasticNetCV(cv=5, l1_ratio=.75)

# Fit on the training matrix and scatter predicted-vs-actual (both axes in
# log-price space) for the training and held-out sets.
rf_regressor.fit(new_training_X, log_training_Y)
log_rf_predicted = rf_regressor.predict(new_testing_X)
sca1 = plt.scatter(log_training_Y, rf_regressor.predict(new_training_X), color='blue', alpha=0.35, s=60, 
            label="Training (" + str(len(log_training_Y)) + ")")
sca2 = plt.scatter(log_testing_Y, rf_regressor.predict(new_testing_X), color='red', alpha=0.75, s=60, 
            label="Testing (" + str(len(log_testing_Y)) + ")")

# Axis limits in ln(price); the diagonal marks perfect prediction.
price_lower = 6.0
price_upper = 18.0
plt.xlim(price_lower, price_upper)
plt.ylim(price_lower, price_upper)
plt.plot(np.linspace(price_lower, price_upper, 10), np.linspace(price_lower, price_upper, 10), color='black')
fontsize = 25.0
# Re-label the log-space ticks as rounded dollar amounts ([:-3] drops cents).
plt.axes().set_xticklabels([str(locale.currency(round(np.e**(x),-2), grouping=True))[:-3] for x in plt.axes().get_xticks()]) 
plt.axes().set_yticklabels([str(locale.currency(round(np.e**(x),-2), grouping=True))[:-3] for x in plt.axes().get_yticks()]) 
plt.xticks(rotation=55)
plt.xticks(fontsize=fontsize)
plt.yticks(fontsize=fontsize)
plt.ylabel("Predicted [$]", fontsize=fontsize)
plt.xlabel("Actual Hammer Price [$]", fontsize=fontsize)

# RMS error in log-price space on each split.
MSE_training = np.sqrt(np.sum((log_training_Y - rf_regressor.predict(new_training_X))**2.0/ len(log_training_Y)))
MSPE_testing = np.sqrt(np.sum((log_testing_Y - rf_regressor.predict(new_testing_X))**2.0/len(log_testing_Y)))

# Two legends: sample counts (upper left) and error metrics (lower right).
first_legend = plt.legend(fontsize=fontsize, loc=2)
ax = plt.gca().add_artist(first_legend)
plt.legend([sca1, sca2], ["MSE Training " + str(round(MSE_training,3)), 
                          "MSPE Testing " + str(round(MSPE_testing,3))], loc=4, fontsize=fontsize)

plt.title("ElasticNetCV")
plt.tight_layout()
# plt.savefig("2014-09-24-price-prediction.pdf")



In [ ]:


In [194]:
training_auction_total = np.sum(training_Y - auction_training_predicted)
# training_total = np.sum(training_Y - training_predicted)
# print training_auction_total, training_total

testing_auction_total = np.sum(testing_Y - auction_testing_predicted)

my_testing_predicted = log_linear.predict(new_testing_X)
# log_training_predicted = log_linear.predict(new_training_X)

testing_total = np.sum(testing_Y - my_testing_predicted)
# print testing_auction_total, testing_total

log_training_auction_total = np.sum(log_training_Y - log_auction_training_predicted)
log_training_total = np.sum(log_training_Y - log_training_predicted)
# print log_training_auction_total, log_training_total

log_testing_auction_total = np.sum(log_testing_Y - log_auction_testing_predicted)
log_testing_total = np.sum(log_testing_Y - log_testing_predicted)
# print log_testing_auction_total, log_testing_total

print "This part matters"

log_testing_auction_total = np.sum(np.e ** log_testing_Y - np.e ** log_auction_testing_predicted)
log_testing_total = np.sum(np.e ** log_testing_Y - np.e ** log_testing_predicted)
my_total_miss = np.sum(np.e ** log_testing_Y - np.e ** log_testing_predicted)
my_std = np.std(np.e ** log_testing_Y - np.e ** log_testing_predicted)
auction_total_miss = np.sum(np.e ** log_testing_Y - np.e ** log_auction_testing_predicted)
auction_std = np.std(np.e ** log_testing_Y - np.e ** log_auction_testing_predicted)
print "My miss:", my_total_miss
print "My std:", my_std
print "Auction miss:", auction_total_miss
print "Auction std:", auction_std

plt.rcParams['figure.figsize'] = 8, 8

print "Total sales:", np.sum(np.e ** log_testing_Y)
winby = round(np.abs(auction_total_miss)/1e6 - np.abs(my_total_miss)/1e6,2)


alpha = 0.75
bins = 20
linewidth=25.0
width = 0.5
plt.bar(1, np.abs(auction_total_miss)/1e6, width=width, color="red", alpha=alpha)
# plt.bar(2, np.abs(auction_total_miss)/1e6, width=width, color="green", alpha=alpha)
plt.bar(2, np.abs(my_total_miss)/1e6, width=width, color="green", alpha=alpha)
center = np.abs(my_total_miss)/1e6 + winby/2.0
plt.errorbar(1.8, np.abs(my_total_miss)/1e6 + winby/2.0, yerr=winby/2.0,
             color="green",
             capsize=20.0,
             markeredgewidth=4.0,
             elinewidth=20.0,
             alpha=alpha)
plt.text(1.95,20., "$" + str(winby) + " million\n  saved", fontsize=fontsize)
plt.xticks(fontsize=fontsize)
plt.yticks(fontsize=fontsize)
plt.xlim(0.5, 3)
plt.xticks(np.array([1, 2]) + width/ 2.0, ('Auction', 'HammerPricer'))
plt.ylabel("Cumulative Error [$] (in millions)", fontsize=fontsize)
# plt.title("Difference between predicted value and actual", fontsize=fontsize)
plt.tight_layout()
plt.savefig("2014-09-29-accuracy-comparison.pdf")
print "Win by: ",


This part matters
My miss: -18560868.631
My std: 717468.779258
Auction miss: 22911329.5
Auction std: 809186.823287
Total sales: 314620625.0
Win by: 

In [33]:
# Give me top 10 closest predictions 
my_total_miss
auction_total_miss

# my_total_miss = np.sum(np.e ** log_testing_Y - np.e ** log_testing_predicted)
# auction_total_miss = np.sum(np.e ** log_testing_Y - np.e ** log_auction_testing_predicted)

biggest_auction_misses = np.argsort(np.abs(np.e ** log_testing_Y - np.e ** log_auction_testing_predicted))[-100:]
biggest_auction_misses
for index in biggest_auction_misses:
    auction_guess = np.e ** log_auction_testing_predicted[index]
    my_guess = np.e ** log_testing_predicted[index]
    hammer_price = np.e ** log_testing_Y[index]
    if np.abs(hammer_price - auction_guess) > np.abs(hammer_price - my_guess):
        if np.abs(hammer_price - my_guess) < hammer_price / 6.0:
            artist = testing_information[index][0]
            x = testing_information[index][1]
            print hammer_price, auction_guess, my_guess, index, artist + "-" + str(x), master_dictionary[artist][x]['title']


240000.0 175000.0 219592.235356 338 philip-guston-8444429 "Shoes"
274907.0 202137.5 232786.554886 126 sam-francis-7975359 Untitled
639280.0 559370.0 646521.423458 247 alberto-burri-7973646 "Bianco Plastica Bl 3"
716970.0 810899.0 747054.084854 202 robert-motherwell-8067678 The Summer Studio
3100000.0 3000000.0 3119977.11124 384 mark-rothko-8444131 Untitled
2600000.0 2500000.0 2626642.78992 212 cy-twombly-8465027 Untitled
500000.0 400000.0 431728.855339 226 helen-frankenthaler-8024873 "Vivaldi"
800000.0 700000.0 744505.036104 219 cy-twombly-8465053 Untitled
350000.0 250000.0 297883.953093 200 robert-motherwell-8040227 Royal Dirge
700000.0 600000.0 693954.999726 201 robert-motherwell-8444439 "Open #83"
1900000.0 1750000.0 1994345.44753 197 jasper-johns-8020886 Gray Numbers
825000.0 1000000.0 956430.282186 172 roberto-matta-8066111 Morphologie psychologique (fleureur)
513840.0 299740.0 428578.330413 393 ad-reinhardt-8576577 Black and White
1605600.0 1364760.0 1517959.01805 243 alberto-burri-7977035 Rosso plastica
850000.0 600000.0 709371.788022 211 cy-twombly-8024780 Untitled
1705300.0 1449505.0 1601719.30635 238 alberto-burri-8583029 Rosso plastica
1350000.0 1050000.0 1152549.05189 408 morris-louis-8040307 Gamma Kappa
2200000.0 1750000.0 1840419.68338 381 mark-rothko-8444438 Untitled
6500000.0 6000000.0 6564800.70456 215 cy-twombly-8444107 Untitled
2500000.0 3000000.0 2900338.97699 331 franz-kline-8034030 Untitled
2300000.0 1600000.0 1927331.13156 394 ad-reinhardt-8020840 "Abstract Painting, Red"
4174560.0 3452040.0 3842324.87187 259 alberto-burri-7977037 Sacco
10750000.0 10000000.0 10529009.5238 382 mark-rothko-8465028 Untitled
2900000.0 4000000.0 3286514.62515 334 franz-kline-8020858 Composition
41000000.0 30000000.0 41238349.4126 379 mark-rothko-8020839 No. 11 (Untitled)

In [69]:


In [38]:
# Print the full feature row for each hand-picked showcase lot (indices come
# from the biggest-auction-miss scan above), plus the column-name legend.
export_me = {}  # NOTE(review): populated nowhere -- dead variable, candidate for removal
# for index in [338, 46, 247, 192, 202, 226, 215, 382, 379, 259, 334, 197, 408, 394]:
for index in [338, 46, 247, 192, 226, 215, 382, 379, 259, 334, 197, 408, 394]:
    # Back-transform the log-space predictions/target to dollars.
    auction_guess = np.e ** log_auction_testing_predicted[index]
    my_guess = np.e ** log_testing_predicted[index]
    hammer_price = np.e ** log_testing_Y[index]
    artist = testing_information[index][0]
    x = testing_information[index][1]
    other = testing_information[index][2:]  # remaining feature columns for this lot
    print artist + "-" + str(x), hammer_price, auction_guess, my_guess, index, master_dictionary[artist][x]['title'], other

# `things` is defined in an earlier cell -- per the recorded output it holds
# the ordered list of feature-column names.
print things


philip-guston-8444429 240000.0 175000.0 219592.235356 338 "Shoes" [u"Christie's", u'united states', 719893, 45.151522676428087, 990.4499999999999, 735367, 157.86949367088604, 136.32670886075942, 138.73708860759496, 150000.0, 200000.0, 175000.0, 11.918390573078392, 12.206072645530174, 12.072541252905651]
karel-appel-8011173 310500.0 236250.0 255978.29696 46 Deux personnages [u"Christie's", u'netherlands', 712954, 145.81237944701402, 10279.5, 735177, 125.02092105263151, 107.21065789473687, 93.36921052631574, 202500.0, 270000.0, 236250.0, 12.218495165528731, 12.506177237980511, 12.37264584535599]
alberto-burri-7973646 639280.0 559370.0 646521.423458 247 "Bianco Plastica Bl 3" [u"Sotheby's", u'united kingdom', 717702, 90.13878188659973, 3750.0, 735158, 192.14776119402987, 185.69746268656715, 180.96119402985073, 479460.0, 639280.0, 559370.0, 13.080415749596563, 13.368097822048345, 13.234566429423822]
roberto-matta-8033394 380000.0 300000.0 315397.23182 192 La rosa [u"Christie's", u'united states', 709301, 81.319677815397171, 3225.7999999999997, 735191, 111.3535, 72.910749999999993, 60.5, 250000.0, 350000.0, 300000.0, 12.429216196844383, 12.765688433465597, 12.611537753638338]
helen-frankenthaler-8024873 500000.0 400000.0 431728.855339 226 "Vivaldi" [u"Christie's", u'united states', 726103, 218.95764430592507, 23245.42, 735185, 184.60794871794866, 144.49692307692311, 88.229102564102604, 300000.0, 500000.0, 400000.0, 12.611537753638338, 13.122363377404328, 12.899219826090119]
cy-twombly-8444107 6500000.0 6000000.0 6564800.70456 215 Untitled [u"Christie's", u'united states', 716971, 148.51276039451963, 10980.0, 735366, 179.20423913043484, 163.43434782608702, 164.41934782608692, 5000000.0, 7000000.0, 6000000.0, 15.424948470398375, 15.761420707019587, 15.60727002719233]
mark-rothko-8465028 10750000.0 10000000.0 10529009.5238 382 Untitled [u"Sotheby's", u'united states', 711858, 113.2646458520928, 6385.56, 735367, 228.94239130434781, 135.11869565217395, 94.792282608695629, 8000000.0, 12000000.0, 10000000.0, 15.89495209964411, 16.300417207752275, 16.11809565095832]
mark-rothko-8020839 41000000.0 30000000.0 41238349.4126 379 No. 11 (Untitled) [u"Christie's", u'united states', 714415, 268.63255573366382, 35776.68, 735184, 211.93136363636367, 132.63636363636363, 85.746022727272731, 25000000.0, 35000000.0, 30000000.0, 17.034386382832476, 17.370858619453688, 17.216707939626428]
alberto-burri-7977037 4174560.0 3452040.0 3842324.87187 259 Sacco [u"Christie's", u'united kingdom', 712954, 131.38451202481971, 8537.1, 735159, 86.368488372093012, 61.467093023255806, 47.974883720930237, 2890080.0, 4014000.0, 3452040.0, 14.876794741366384, 15.205298808338421, 15.054475918603837]
franz-kline-8020858 2900000.0 4000000.0 3286514.62515 334 Composition [u"Christie's", u'united states', 711858, 94.446175147541041, 4251.96, 735184, 87.032297297297333, 85.148783783783756, 81.419054054054044, 3000000.0, 5000000.0, 4000000.0, 14.914122846632385, 15.424948470398375, 15.201804919084164]
jasper-johns-8020886 1900000.0 1750000.0 1994345.44753 197 Gray Numbers [u"Christie's", u'united states', 715145, 17.780044994318772, 151.93999999999997, 735184, 127.68400000000004, 127.0145333333333, 118.99999999999997, 1500000.0, 2000000.0, 1750000.0, 14.220975666072439, 14.508657738524219, 14.375126345899696]
morris-louis-8040307 1350000.0 1050000.0 1152549.05189 408 Gamma Kappa [u"Sotheby's", u'united states', 715510, 476.78296949450703, 104139.0, 735186, 225.83205882352948, 221.21867647058835, 214.03602941176473, 900000.0, 1200000.0, 1050000.0, 13.710150042306449, 13.997832114758229, 13.864300722133706]
ad-reinhardt-8020840 2300000.0 1600000.0 1927331.13156 394 "Abstract Painting, Red" [u"Christie's", u'united states', 712954, 107.97541386815797, 5829.3, 735184, 218.69049999999999, 76.22829999999999, 97.226800000000026, 1400000.0, 1800000.0, 1600000.0, 14.151982794585487, 14.403297222866392, 14.28551418721001]
['creation_date', 'diagonal_length', 'area', 'sale_date', 'red', 'green', 'blue', 'low_estimate', 'high_estimate', 'auction_estimate', 'log_low_estimate', 'log_high_estimate', 'log_auction_estimate', 'karel-appel', 'sam-francis', 'antoni-tapies', 'roberto-matta', 'jasper-johns', 'robert-motherwell', 'mark-tobey', 'cy-twombly', 'louise-nevelson', 'helen-frankenthaler', 'alberto-burri', 'aaron-siskind', 'joan-mitchell', 'louise-bourgeois', 'theodoros-stamos', 'kenneth-noland', 'isamu-noguchi', 'adolph-gottlieb', 'franz-kline', 'philip-guston', 'jules-olitski', 'arshile-gorky', 'norman-bluhm', 'gene-davis', 'al-held', 'mark-rothko', 'jackson-pollock', 'esteban-vicente', 'morris-cole-graves', 'ad-reinhardt', 'rita-letendre', 'morris-louis', 'william-baziotes', 'norman-wilfred-lewis', 'milton-resnick', 'lee-krasner', 'barnett-newman', u"Christie's", u"Sotheby's", u'Farsetti', u'Bonhams', u'William Doyle', u'Swann Galleries', u'Bukowskis', u'Heffel Fine Art', u'Germann Auktionshaus', u'Bruun Rasmussen', u'Villa Grisebach', u'Finarte', u'Ketterer Kunst GmbH', u'Dorotheum', u'Stockholms Auktionsverk', u'Lempertz', u'Loudmer', 'unknown_auction', u'united states', u'united kingdom', u'italy', u'france', u'netherlands', u'germany', u'sweden', u'switzerland', u'canada', u'denmark', u'austria', 'unknown_country']

In [137]:
# Dry run of the database export: iterate the showcase lots and print just
# the titles. The per-field formatting lines are kept commented out for
# reference -- they became the `values` list in the DB-insert cell.
export_me = {}  # NOTE(review): never populated -- dead variable
for index in [338, 46, 247, 192, 226, 215, 382, 379, 259, 334, 197, 408, 394]:
    # Back-transform the log-space predictions/target to dollars.
    auction_guess = np.e ** log_auction_testing_predicted[index]
    my_guess = np.e ** log_testing_predicted[index]
    hammer_price = np.e ** log_testing_Y[index]
    artist = testing_information[index][0]
    x = testing_information[index][1]
    # Presumably the model's residual std in log-price space, used for the
    # displayed low/high bracket -- TODO confirm where it was measured.
    mylogstd = 0.39429513769064306 
    other = testing_information[index][2:]
    print "Title:", master_dictionary[artist][x]['title']
#     print "Artist:", " ".join(["".join([name[0].upper(), name[1:]]) for name in artist.split("-")])
#     print "Auction Date:", datetime.datetime.strftime(datetime.datetime.fromordinal(master_dictionary[artist][x]['sale_date']), "%Y-%m-%d")
#     print "Creation Year:", datetime.datetime.strftime(datetime.datetime.fromordinal(master_dictionary[artist][x]['creation_date']), "%Y")
#     print "Auction House:", master_dictionary[artist][x]['auction_house'].lower().capitalize()
#     print "Country:", " ".join(["".join([name[0].upper(), name[1:]]) for name in master_dictionary[artist][x]['sale_country'].split()])
#     print "Auction House Low Estimate:", locale.currency(master_dictionary[artist][x]['low_estimate'], grouping=True)
#     print "Auction House Prediction:", locale.currency(master_dictionary[artist][x]['auction_estimate'], grouping=True)
#     print "Auction House High Estimate:", locale.currency(master_dictionary[artist][x]['high_estimate'], grouping=True)
#     print "HammerPricer Low:", locale.currency(round(np.e ** (np.log(my_guess) - mylogstd), -2), grouping=True)
#     print "HammerPricer Prediction:", locale.currency(round(my_guess, -2), grouping=True)
#     print "HammerPricer High:", locale.currency(round(np.e ** (np.log(my_guess) + mylogstd), -2), grouping=True)
#     print "Sale Price:", locale.currency(hammer_price, grouping=True)
#     print
#     print artist + "-" + str(x), hammer_price, auction_guess, my_guess, index, master_dictionary[artist][x]['title'], other


Title: "Shoes"
Title: Deux personnages
Title: "Bianco Plastica Bl 3"
Title: La rosa
Title: "Vivaldi"
Title: Untitled
Title: Untitled
Title: No. 11 (Untitled)
Title: Sacco
Title: Composition
Title: Gray Numbers
Title: Gamma Kappa
Title: "Abstract Painting, Red"

Make a DATABASE: load the showcase lots and their predictions into a local MySQL table that backs the web app.


In [87]:
# Create the showcase database if it does not already exist (idempotent via
# IF NOT EXISTS).
# NOTE(review): connects as root with an empty password -- acceptable only on
# a local dev machine; use real credentials/config anywhere shared.
# con = mdb.connect(user="root", host="localhost", passwd="", database='test1') 
con = mdb.connect('localhost', "root", "")
# mdb.connect()
cursor = con.cursor()
sql = "CREATE DATABASE IF NOT EXISTS test1"
cursor.execute(sql)  # returns the affected-row count (the `1` in the output)


Out[87]:
1

In [118]:
# Column names for the Auctions table, in the order the web app expects.
keys = [
    "title", "artist", "auction_date", "creation_date", "auction_house",
    "country", "auction_house_low", "auction_house_prediction",
    "auction_house_high", "hammer_price_low", "hammer_price_prediction",
    "hammer_price_high", "sale_price", "key", "image",
]
# Comma-joined form, as used in an INSERT column list.
",".join(keys)


Out[118]:
'title,artist,auction_date,creation_date,auction_house,country,auction_house_low,auction_house_prediction,auction_house_high,hammer_price_low,hammer_price_prediction,hammer_price_high,sale_price,key,image'

In [147]:
con = mdb.connect('localhost', "root", "", 'test1') 
with con:
    cur = con.cursor()
    cur.execute("DROP TABLE IF EXISTS Auctions")
    cur.execute("CREATE TABLE Auctions(id INT PRIMARY KEY AUTO_INCREMENT, \
                 title VARCHAR(250), \
                 artist VARCHAR(250), \
                 auction_date VARCHAR(250), \
                 creation_date VARCHAR(250), \
                 auction_house VARCHAR(250), \
                 country VARCHAR(250), \
                 auction_house_low VARCHAR(250), \
                 auction_house_prediction VARCHAR(250), \
                 auction_house_high VARCHAR(250), \
                 hammer_price_low VARCHAR(250), \
                 hammer_price_prediction VARCHAR(250), \
                 hammer_price_high VARCHAR(250), \
                 sale_price VARCHAR(250), \
                 artkey VARCHAR(250), \
                 image VARCHAR(250))")
    
    for index in [338, 46, 247, 192, 226, 215, 382, 379, 259, 334, 197, 408, 394]:
        auction_guess = np.e ** log_auction_testing_predicted[index]
        my_guess = np.e ** log_testing_predicted[index]
        hammer_price = np.e ** log_testing_Y[index]
        artist = testing_information[index][0]
        x = testing_information[index][1]
        mylogstd = 0.39429513769064306 
        other = testing_information[index][2:]

        values = [master_dictionary[artist][x]['title'].strip("\""),
                " ".join(["".join([name[0].upper(), name[1:]]) for name in artist.split("-")]),
                datetime.datetime.strftime(datetime.datetime.fromordinal(master_dictionary[artist][x]['sale_date']), "%Y-%m-%d"),
                datetime.datetime.strftime(datetime.datetime.fromordinal(master_dictionary[artist][x]['creation_date']), "%Y"),
                master_dictionary[artist][x]['auction_house'].lower().capitalize(),
                " ".join(["".join([name[0].upper(), name[1:]]) for name in master_dictionary[artist][x]['sale_country'].split()]),
                locale.currency(master_dictionary[artist][x]['low_estimate'], grouping=True)[:-3],
                locale.currency(master_dictionary[artist][x]['auction_estimate'], grouping=True)[:-3],
                locale.currency(master_dictionary[artist][x]['high_estimate'], grouping=True)[:-3],
                locale.currency(round(np.e ** (np.log(my_guess) - mylogstd), -2), grouping=True)[:-3],
                locale.currency(round(my_guess, -2), grouping=True)[:-3],
                locale.currency(round(np.e ** (np.log(my_guess) + mylogstd), -2), grouping=True)[:-3],
                locale.currency(hammer_price, grouping=True)[:-3],
                artist + "-" + str(x),
                "../static/images/" + artist + "-" + str(x) + ".jpg",]
        build_string = "\",\"".join(values)
        cur.execute("INSERT INTO Auctions(title,artist,auction_date,creation_date,auction_house,country,auction_house_low,auction_house_prediction,auction_house_high,hammer_price_low,hammer_price_prediction,hammer_price_high,sale_price,artkey,image ) VALUES(\"%s\")" % (build_string))
        cur.execute("SELECT image FROM Auctions")
    rows = cur.fetchall()
    for row in rows:
        print row


('../static/images/philip-guston-8444429.jpg',)
('../static/images/karel-appel-8011173.jpg',)
('../static/images/alberto-burri-7973646.jpg',)
('../static/images/roberto-matta-8033394.jpg',)
('../static/images/helen-frankenthaler-8024873.jpg',)
('../static/images/cy-twombly-8444107.jpg',)
('../static/images/mark-rothko-8465028.jpg',)
('../static/images/mark-rothko-8020839.jpg',)
('../static/images/alberto-burri-7977037.jpg',)
('../static/images/franz-kline-8020858.jpg',)
('../static/images/jasper-johns-8020886.jpg',)
('../static/images/morris-louis-8040307.jpg',)
('../static/images/ad-reinhardt-8020840.jpg',)

In [168]:
# Dictionary Cursor
with con:
    cur = con.cursor(mdb.cursors.DictCursor)
    cur.execute("SELECT * FROM Auctions")
    rows = cur.fetchall()
    for row in rows:
        print row['title'], row["auction_house_low"]


---------------------------------------------------------------------------
OperationalError                          Traceback (most recent call last)
<ipython-input-168-226bb2444eb3> in <module>()
      5     rows = cur.fetchall()
      6     for row in rows:
----> 7         print row['title'], row["auction_house_low"]

/Library/Frameworks/EPD64.framework/Versions/7.2/lib/python2.7/site-packages/pymysql/connections.pyc in __exit__(self, exc, value, traceback)
    713         ''' On successful exit, commit. On exception, rollback. '''
    714         if exc:
--> 715             self.rollback()
    716         else:
    717             self.commit()

/Library/Frameworks/EPD64.framework/Versions/7.2/lib/python2.7/site-packages/pymysql/connections.pyc in rollback(self)
    676     def rollback(self):
    677         ''' Roll back the current transaction '''
--> 678         self._execute_command(COM_QUERY, "ROLLBACK")
    679         self._read_ok_packet()
    680 

/Library/Frameworks/EPD64.framework/Versions/7.2/lib/python2.7/site-packages/pymysql/connections.pyc in _execute_command(self, command, sql)
    886 
    887         prelude = struct.pack('<i', chunk_size) + int2byte(command)
--> 888         self._write_bytes(prelude + sql[:chunk_size-1])
    889         if DEBUG: dump_packet(prelude + sql)
    890 

/Library/Frameworks/EPD64.framework/Versions/7.2/lib/python2.7/site-packages/pymysql/connections.pyc in _write_bytes(self, data)
    846             self.socket.sendall(data)
    847         except IOError as e:
--> 848             raise OperationalError(2006, "MySQL server has gone away (%r)" % (e,))
    849 
    850     def _read_query_result(self, unbuffered=False):

OperationalError: (2006, "MySQL server has gone away (error(32, 'Broken pipe'))")

In [125]:
# Re-inspect the formatted values list for one lot.
# NOTE(review): `artist`, `x`, `my_guess`, `hammer_price`, and `mylogstd` are
# whatever the most recently executed loop cell left in the kernel namespace
# (here: the last showcase lot, ad-reinhardt-8020840 per the output).
values = [master_dictionary[artist][x]['title'].strip("\""),
            " ".join(["".join([name[0].upper(), name[1:]]) for name in artist.split("-")]),
            datetime.datetime.strftime(datetime.datetime.fromordinal(master_dictionary[artist][x]['sale_date']), "%Y-%m-%d"),
            datetime.datetime.strftime(datetime.datetime.fromordinal(master_dictionary[artist][x]['creation_date']), "%Y"),
            master_dictionary[artist][x]['auction_house'].lower().capitalize(),
            " ".join(["".join([name[0].upper(), name[1:]]) for name in master_dictionary[artist][x]['sale_country'].split()]),
            locale.currency(master_dictionary[artist][x]['low_estimate'], grouping=True),
            locale.currency(master_dictionary[artist][x]['auction_estimate'], grouping=True),
            locale.currency(master_dictionary[artist][x]['high_estimate'], grouping=True),
            locale.currency(round(np.e ** (np.log(my_guess) - mylogstd), -2), grouping=True),
            locale.currency(round(my_guess, -2), grouping=True),
            locale.currency(round(np.e ** (np.log(my_guess) + mylogstd), -2), grouping=True),
            locale.currency(hammer_price, grouping=True),
            artist + "-" + str(x),
            "../static/images/" + artist + "-" + str(x) + ".jpg",]
# Preview of the hand-built quoted VALUES string used by the INSERT cell;
# note any embedded double quote in a value would break that approach.
"\",\"".join(values)


Out[125]:
u'Abstract Painting, Red","Ad Reinhardt","2013-11-12","1953","Christie\'s ","United States","$1,400,000.00","$1,600,000.00","$1,800,000.00","$1,299,300.00","$1,927,300.00","$2,858,900.00","$2,300,000.00","ad-reinhardt-8020840","../static/images/ad-reinhardt-8020840.jpg'

In [167]:



Out[167]:
<dictionary-keyiterator at 0x11771c788>

In [75]:
# Human-readable report card for each showcase lot: auction-house estimate
# bracket vs. our prediction bracket vs. the realized hammer price.
export_me = {}  # NOTE(review): never populated -- dead variable
for index in [338, 46, 247, 192, 226, 215, 382, 379, 259, 334, 197, 408, 394]:
    # Back-transform the log-space values to dollars.
    auction_guess = np.e ** log_auction_testing_predicted[index]
    my_guess = np.e ** log_testing_predicted[index]
    hammer_price = np.e ** log_testing_Y[index]
    artist = testing_information[index][0]
    x = testing_information[index][1]
    # Model residual std in log-price space; +/- one std around the point
    # prediction gives the HammerPricer low/high bracket below.
    mylogstd = 0.39429513769064306 
    other = testing_information[index][2:]  # remaining feature columns (unused here)
    print "Title:", master_dictionary[artist][x]['title']
    print "Artist:", " ".join(["".join([name[0].upper(), name[1:]]) for name in artist.split("-")])
    print "Auction Date:", datetime.datetime.strftime(datetime.datetime.fromordinal(master_dictionary[artist][x]['sale_date']), "%Y-%m-%d")
    print "Creation Year:", datetime.datetime.strftime(datetime.datetime.fromordinal(master_dictionary[artist][x]['creation_date']), "%Y")
    print "Auction House:", master_dictionary[artist][x]['auction_house'].lower().capitalize()
    print "Country:", " ".join(["".join([name[0].upper(), name[1:]]) for name in master_dictionary[artist][x]['sale_country'].split()])
    print "Auction House Low Estimate:", locale.currency(master_dictionary[artist][x]['low_estimate'], grouping=True)
    print "Auction House Prediction:", locale.currency(master_dictionary[artist][x]['auction_estimate'], grouping=True)
    print "Auction House High Estimate:", locale.currency(master_dictionary[artist][x]['high_estimate'], grouping=True)
    print "HammerPricer Low:", locale.currency(round(np.e ** (np.log(my_guess) - mylogstd), -2), grouping=True)
    print "HammerPricer Prediction:", locale.currency(round(my_guess, -2), grouping=True)
    print "HammerPricer High:", locale.currency(round(np.e ** (np.log(my_guess) + mylogstd), -2), grouping=True)
    print "Sale Price:", locale.currency(hammer_price, grouping=True)
    print "key:", artist + "-" + str(x)
    print "image:", "../static/images/" + artist + "-" + str(x) + ".jpg"
    print


Title: "Shoes"
Artist: Philip Guston
Auction Date: 2014-05-14
Creation Year: 1972
Auction House: Christie's 
Country: United States
Auction House Low Estimate: $150,000.00
Auction House Prediction: $175,000.00
Auction House High Estimate: $200,000.00
HammerPricer Low: $148,000.00
HammerPricer Prediction: $219,600.00
HammerPricer High: $325,700.00
Sale Price: $240,000.00
key: philip-guston-8444429
image: ../static/images/philip-guston-8444429.jpg

Title: Deux personnages
Artist: Karel Appel
Auction Date: 2013-11-05
Creation Year: 1953
Auction House: Christie's 
Country: Netherlands
Auction House Low Estimate: $202,500.00
Auction House Prediction: $236,250.00
Auction House High Estimate: $270,000.00
HammerPricer Low: $172,600.00
HammerPricer Prediction: $256,000.00
HammerPricer High: $379,700.00
Sale Price: $310,500.00
key: karel-appel-8011173
image: ../static/images/karel-appel-8011173.jpg

Title: "Bianco Plastica Bl 3"
Artist: Alberto Burri
Auction Date: 2013-10-17
Creation Year: 1966
Auction House: Sotheby's 
Country: United Kingdom
Auction House Low Estimate: $479,460.00
Auction House Prediction: $559,370.00
Auction House High Estimate: $639,280.00
HammerPricer Low: $435,900.00
HammerPricer Prediction: $646,500.00
HammerPricer High: $959,000.00
Sale Price: $639,280.00
key: alberto-burri-7973646
image: ../static/images/alberto-burri-7973646.jpg

Title: La rosa
Artist: Roberto Matta
Auction Date: 2013-11-19
Creation Year: 1943
Auction House: Christie's 
Country: United States
Auction House Low Estimate: $250,000.00
Auction House Prediction: $300,000.00
Auction House High Estimate: $350,000.00
HammerPricer Low: $212,600.00
HammerPricer Prediction: $315,400.00
HammerPricer High: $467,800.00
Sale Price: $380,000.00
key: roberto-matta-8033394
image: ../static/images/roberto-matta-8033394.jpg

Title: "Vivaldi"
Artist: Helen Frankenthaler
Auction Date: 2013-11-13
Creation Year: 1989
Auction House: Christie's 
Country: United States
Auction House Low Estimate: $300,000.00
Auction House Prediction: $400,000.00
Auction House High Estimate: $500,000.00
HammerPricer Low: $291,100.00
HammerPricer Prediction: $431,700.00
HammerPricer High: $640,400.00
Sale Price: $500,000.00
key: helen-frankenthaler-8024873
image: ../static/images/helen-frankenthaler-8024873.jpg

Title: Untitled
Artist: Cy Twombly
Auction Date: 2014-05-13
Creation Year: 1964
Auction House: Christie's 
Country: United States
Auction House Low Estimate: $5,000,000.00
Auction House Prediction: $6,000,000.00
Auction House High Estimate: $7,000,000.00
HammerPricer Low: $4,425,700.00
HammerPricer Prediction: $6,564,800.00
HammerPricer High: $9,737,800.00
Sale Price: $6,500,000.00
key: cy-twombly-8444107
image: ../static/images/cy-twombly-8444107.jpg

Title: Untitled
Artist: Mark Rothko
Auction Date: 2014-05-14
Creation Year: 1950
Auction House: Sotheby's
Country: United States
Auction House Low Estimate: $8,000,000.00
Auction House Prediction: $10,000,000.00
Auction House High Estimate: $12,000,000.00
HammerPricer Low: $7,098,200.00
HammerPricer Prediction: $10,529,000.00
HammerPricer High: $15,618,100.00
Sale Price: $10,750,000.00
key: mark-rothko-8465028
image: ../static/images/mark-rothko-8465028.jpg

Title: No. 11 (Untitled)
Artist: Mark Rothko
Auction Date: 2013-11-12
Creation Year: 1957
Auction House: Christie's 
Country: United States
Auction House Low Estimate: $25,000,000.00
Auction House Prediction: $30,000,000.00
Auction House High Estimate: $35,000,000.00
HammerPricer Low: $27,801,000.00
HammerPricer Prediction: $41,238,300.00
HammerPricer High: $61,170,400.00
Sale Price: $41,000,000.00
key: mark-rothko-8020839
image: ../static/images/mark-rothko-8020839.jpg

Title: Sacco
Artist: Alberto Burri
Auction Date: 2013-10-18
Creation Year: 1953
Auction House: Christie's 
Country: United Kingdom
Auction House Low Estimate: $2,890,080.00
Auction House Prediction: $3,452,040.00
Auction House High Estimate: $4,014,000.00
HammerPricer Low: $2,590,300.00
HammerPricer Prediction: $3,842,300.00
HammerPricer High: $5,699,500.00
Sale Price: $4,174,560.00
key: alberto-burri-7977037
image: ../static/images/alberto-burri-7977037.jpg

Title: Composition
Artist: Franz Kline
Auction Date: 2013-11-12
Creation Year: 1950
Auction House: Christie's 
Country: United States
Auction House Low Estimate: $3,000,000.00
Auction House Prediction: $4,000,000.00
Auction House High Estimate: $5,000,000.00
HammerPricer Low: $2,215,600.00
HammerPricer Prediction: $3,286,500.00
HammerPricer High: $4,875,000.00
Sale Price: $2,900,000.00
key: franz-kline-8020858
image: ../static/images/franz-kline-8020858.jpg

Title: Gray Numbers
Artist: Jasper Johns
Auction Date: 2013-11-12
Creation Year: 1959
Auction House: Christie's 
Country: United States
Auction House Low Estimate: $1,500,000.00
Auction House Prediction: $1,750,000.00
Auction House High Estimate: $2,000,000.00
HammerPricer Low: $1,344,500.00
HammerPricer Prediction: $1,994,300.00
HammerPricer High: $2,958,300.00
Sale Price: $1,900,000.00
key: jasper-johns-8020886
image: ../static/images/jasper-johns-8020886.jpg

Title: Gamma Kappa
Artist: Morris Louis
Auction Date: 2013-11-14
Creation Year: 1960
Auction House: Sotheby's
Country: United States
Auction House Low Estimate: $900,000.00
Auction House Prediction: $1,050,000.00
Auction House High Estimate: $1,200,000.00
HammerPricer Low: $777,000.00
HammerPricer Prediction: $1,152,500.00
HammerPricer High: $1,709,600.00
Sale Price: $1,350,000.00
key: morris-louis-8040307
image: ../static/images/morris-louis-8040307.jpg

Title: "Abstract Painting, Red"
Artist: Ad Reinhardt
Auction Date: 2013-11-12
Creation Year: 1953
Auction House: Christie's 
Country: United States
Auction House Low Estimate: $1,400,000.00
Auction House Prediction: $1,600,000.00
Auction House High Estimate: $1,800,000.00
HammerPricer Low: $1,299,300.00
HammerPricer Prediction: $1,927,300.00
HammerPricer High: $2,858,900.00
Sale Price: $2,300,000.00
key: ad-reinhardt-8020840
image: ../static/images/ad-reinhardt-8020840.jpg


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [197]:
# Convert the prediction lists (built in an earlier cell) into numpy arrays
# so the vectorized arithmetic and plotting below work.  Re-running is safe:
# np.array on an existing ndarray just returns a copy.
auction_training_predicted = np.array(auction_training_predicted)
auction_testing_predicted = np.array(auction_testing_predicted)
auction_predicted = np.array(auction_predicted)

log_auction_training_predicted = np.array(log_auction_training_predicted)
log_auction_testing_predicted = np.array(log_auction_testing_predicted)
log_auction_predicted = np.array(log_auction_predicted)

# Plot the mean auction range prediction vs price and error
plt.rcParams['figure.figsize'] = 12, 12
plt.rcParams['legend.scatterpoints'] = 1 
# Get transformation function

# sca1 = plt.scatter(log_training_Y, log_auction_training_predicted, color='green', alpha=0.35, s=60, 
#             label="Training (" + str(len(log_training_Y)) + ")")
# Auction-house predictions (green) vs actual hammer price, both in log-dollars.
sca1 = plt.scatter(log_testing_Y, log_auction_testing_predicted, color='green', alpha=0.5, s=60, 
            label="Auction Estimate (" + str(len(log_testing_Y)) + ")")

price_lower = 6.0
price_upper = 18.0
# plt.xlim(price_lower, price_upper)
# plt.ylim(price_lower, price_upper)
# plt.plot(np.linspace(price_lower, price_upper, 10), np.linspace(price_lower, price_upper, 10), color='black')
fontsize = 25.0
# Axes hold log-prices; relabel the ticks as dollar amounts (e**x, rounded to
# hundreds) with the locale-aware currency formatter; [:-3] strips the cents.
plt.axes().set_xticklabels([str(locale.currency(round(np.e**(x),-2), grouping=True))[:-3] for x in plt.axes().get_xticks()]) 
plt.axes().set_yticklabels([str(locale.currency(round(np.e**(x),-2), grouping=True))[:-3] for x in plt.axes().get_yticks()]) 
plt.xticks(rotation=55)
plt.xticks(fontsize=fontsize)
plt.yticks(fontsize=fontsize)
plt.ylabel("Predicted [$]", fontsize=fontsize)
plt.xlabel("Actual Hammer Price [$]", fontsize=fontsize)

# Root-mean-square residual of the auction-house predictions on the TRAINING
# split; consumed by the legend in the second half of this cell, where it is
# labelled "MSPE Auction" -- the name/label mismatch is a known wrinkle.
MSE_training = np.sqrt(np.sum((log_training_Y - log_auction_training_predicted)**2.0/ len(log_training_Y)))
# MSPE_testing = np.sqrt(np.sum((log_testing_Y - log_auction_testing_predicted)**2.0/len(log_testing_Y)))

first_legend = plt.legend(fontsize=fontsize, loc=2)
ax = plt.gca().add_artist(first_legend)
# plt.legend([sca1, sca2], ["MSE Training " + str(round(MSE_training,2)), 
#                           "MSPE Testing " + str(round(MSPE_testing,2))], loc=4, fontsize=fontsize)
plt.tight_layout()
# Plot the mean auction range prediction vs price and error
plt.rcParams['figure.figsize'] = 12, 12
plt.rcParams['legend.scatterpoints'] = 1 
# Get transformation function
# Standardize features with statistics from the TRAINING split only, then fit
# a plain linear model on log-prices.
scaler = preprocessing.StandardScaler().fit(training_X)
new_training_X = scaler.transform(training_X)
new_testing_X = scaler.transform(testing_X)
new_big_X = scaler.transform(big_X)
log_linear = linear_model.LinearRegression()
log_linear.fit(new_training_X, log_training_Y)
log_testing_predicted = log_linear.predict(new_testing_X)
log_training_predicted = log_linear.predict(new_training_X)

print "Score:", log_linear.score(new_testing_X, log_testing_Y)

# sca1 = plt.scatter(log_training_Y, log_linear.predict(new_training_X), color='blue', alpha=0.35, s=60, 
#             label="Training (" + str(len(log_training_Y)) + ")")
# Our model's predictions (red) overlay the auction-house points (green)
# drawn earlier in this cell.
sca2 = plt.scatter(log_testing_Y, log_linear.predict(new_testing_X), color='red', alpha=0.5, s=60, 
            label="HammerPricer Prediction (" + str(len(log_testing_Y)) + ")")

price_lower = 6.0
price_upper = 18.0
plt.xlim(price_lower, price_upper)
plt.ylim(price_lower, price_upper)
# y = x reference line: perfect predictions fall on it.
plt.plot(np.linspace(price_lower, price_upper, 10), np.linspace(price_lower, price_upper, 10), color='black')
fontsize = 25.0
# Relabel log-price ticks as dollar amounts (see first half of cell).
plt.axes().set_xticklabels([str(locale.currency(round(np.e**(x),-2), grouping=True))[:-3] for x in plt.axes().get_xticks()]) 
plt.axes().set_yticklabels([str(locale.currency(round(np.e**(x),-2), grouping=True))[:-3] for x in plt.axes().get_yticks()]) 
plt.xticks(rotation=55)
plt.xticks(fontsize=fontsize)
plt.yticks(fontsize=fontsize)
plt.ylabel("Predicted [$]", fontsize=fontsize)
plt.xlabel("Actual Hammer Price [$]", fontsize=fontsize)

# MSE_training = np.sqrt(np.sum((log_training_Y - log_linear.predict(new_training_X))**2.0/ len(log_training_Y)))
MSPE_testing = np.sqrt(np.sum((log_testing_Y - log_linear.predict(new_testing_X))**2.0/len(log_testing_Y)))

first_legend = plt.legend(fontsize=fontsize, loc=2)
ax = plt.gca().add_artist(first_legend)
# NOTE(review): `sca1` and `MSE_training` come from the first half of this
# cell (auction-house points); the "MSPE Auction" number is actually an RMSE
# over the TRAINING residuals -- confirm this apples-to-apples comparison.
plt.legend([sca1, sca2], ["MSPE Auction " + str(round(MSE_training,3)), 
                          "MSPE HammerPricer " + str(round(MSPE_testing,3))], loc=4, fontsize=fontsize)
plt.tight_layout()
# plt.savefig("2014-09-23-HammerPricer-wins.pdf")


Score: 0.953582095669

In [269]:
auction_wins = []
my_wins = []
for index, row in enumerate(testing_information):
    hammer_price = testing_Y[index]
    auction_low = row[11]
    auction_high = row[12]
    auction_pred = auction_testing_predicted[index]
    half_interval = auction_pred - auction_low
    my_prediction = np.e ** log_testing_predicted[index]
    if auction_pred - half_interval < hammer_price and hammer_price < auction_pred + half_interval:
        auction_wins.append(hammer_price)
    if my_prediction - half_interval < hammer_price and hammer_price < my_prediction + half_interval:
        my_wins.append(hammer_price)
        
print len(auction_wins), len(my_wins)


120 118

In [258]:
print  np.std(testing_Y - my_prediction), np.std(testing_Y - auction_testing_predicted)


2983238.76139 809186.823287

Checking sanity


In [484]:
# Sanity check: signed prediction errors (in dollars) for both models.
my_delta = []
auction_delta = []
for index, price in enumerate(testing_Y):
#     my_delta.append(np.e ** price - np.e ** testing_predicted[index])
#     auction_delta.append(np.e ** price - np.average(np.e ** testing_information[index][12] - np.e ** testing_information[index][11]))
    # Our model: predicted dollars minus actual dollars.
    my_delta.append(np.e ** testing_predicted[index] - np.e ** price)
    # NOTE(review): np.average of a single scalar is a no-op, so this is
    # (e^high - e^low) - e^price.  The midpoint np.average([e^high, e^low])
    # was probably intended -- confirm before interpreting auction_delta.
    auction_delta.append(np.average(np.e ** testing_information[index][12] - np.e ** testing_information[index][11]) - np.e ** price)
    
my_delta = np.array(my_delta)
auction_delta = np.array(auction_delta)

In [ ]:


In [ ]:

Feature-Selection by hand


In [80]:
from sklearn.feature_selection import SelectKBest, f_regression 
from sklearn import feature_selection

In [63]:
# # Plot the mean auction range prediction vs price and error
# plt.rcParams['figure.figsize'] = 12, 12
# plt.rcParams['legend.scatterpoints'] = 1 
# Get transformation function
# Standardize with training statistics, then fit the baseline log-price model.
scaler = preprocessing.StandardScaler().fit(training_X)
new_training_X = scaler.transform(training_X)
new_testing_X = scaler.transform(testing_X)
new_big_X = scaler.transform(big_X)
log_linear = linear_model.LinearRegression()
log_linear.fit(new_training_X, log_training_Y)
log_testing_predicted = log_linear.predict(new_testing_X)
log_training_predicted = log_linear.predict(new_training_X)

# print "Score:", log_linear.score(new_testing_X, log_testing_Y)

# sca1 = plt.scatter(log_training_Y, log_linear.predict(new_training_X), color='blue', alpha=0.35, s=60, 
#             label="Training (" + str(len(log_training_Y)) + ")")
# sca2 = plt.scatter(log_testing_Y, log_linear.predict(new_testing_X), color='red', alpha=0.75, s=60, 
#             label="Testing (" + str(len(log_testing_Y)) + ")")

# For each k, select the k best features (on TRAINING data) and plot the
# mean squared error of a linear model using only those columns.
# NOTE(review): that inner model is fit AND evaluated on the testing split,
# so the curve is in-sample on test data (leakage) -- confirm intent.
plt.rcParams['figure.figsize'] = 16, 12
xlabels = []
feature_length = len(things)
# for k in range(1, feature_length + 1):
for k in range(1, feature_length):
    selector = SelectKBest(score_func=feature_selection.f_regression, k=k)
    results = selector.fit(new_training_X, log_training_Y)
    linear = linear_model.LinearRegression()
    linear.fit(new_testing_X[:, [index for index in results.get_support(indices=True)]], log_testing_Y)
    plt.scatter(k, np.average(((linear.predict(new_testing_X[:, [index for index in results.get_support(indices=True)]]) - log_testing_Y))**2),
                color="blue", 
                s=120.0)
    # Record the one feature that newly entered the support at this k.
    # Assumes exactly one new column per step; if the selection swaps
    # members, the [0] pick is arbitrary.
    xlabels.append(list(set(results.get_support(indices=True)).difference(xlabels))[0])
    
xlabels = [things[x] for x in xlabels]

plt.xticks(np.arange(1, feature_length + 1, 1.0)) 
plt.axes().set_xticklabels([x for x in xlabels])
plt.xticks(rotation=90)
plt.xticks(fontsize=fontsize)
plt.xlim(0, feature_length)
plt.yticks(fontsize=fontsize)
plt.xlabel("Feature", fontsize=fontsize)
plt.ylabel("MSP Error", fontsize=fontsize)
# plt.ylim(0.2, 1.5)
plt.tight_layout()
# plt.savefig("2014-09-23-relative-error-features.pdf")
# plt.savefig("2014-09-23-relative-error-features.pdf")



In [21]:
# # Plot the mean auction range prediction vs price and error
# plt.rcParams['figure.figsize'] = 12, 12
# plt.rcParams['legend.scatterpoints'] = 1 
# Get transformation function

# Horizontal variant of the k-best sweep above: MSP error on x, k on y.
# Same leakage caveat: the inner model is fit and scored on the test split.
plt.rcParams['figure.figsize'] = 16, 12
xlabels = []
feature_length = len(things)
# for k in range(1, feature_length + 1):
for k in range(1, feature_length):
    selector = SelectKBest(score_func=feature_selection.f_regression, k=k)
    results = selector.fit(new_training_X, log_training_Y)
    linear = linear_model.LinearRegression()
    linear.fit(new_testing_X[:, [index for index in results.get_support(indices=True)]], log_testing_Y)
    plt.scatter(np.average(((linear.predict(new_testing_X[:, [index for index in results.get_support(indices=True)]]) - log_testing_Y))**2),
                k, 
                color="blue", 
                s=120.0)
    # Feature that newly entered the support at this k (assumes one per step).
    xlabels.append(list(set(results.get_support(indices=True)).difference(xlabels))[0])
    
xlabels = [things[x] for x in xlabels]

plt.yticks(np.arange(1, feature_length + 1, 1.0)) 
plt.axes().set_yticklabels([x for x in xlabels])
# plt.xticks(rotation=90)
plt.xticks(fontsize=fontsize)
plt.ylim(0, feature_length)
plt.yticks(fontsize=fontsize)
plt.ylabel("Feature", fontsize=fontsize)
plt.xlabel("MSP Error", fontsize=fontsize)
# plt.ylim(0.2, 1.5)
plt.tight_layout()
# plt.savefig("2014-09-23-relative-error-features.pdf")
# plt.savefig("2014-09-23-relative-error-features.pdf")



In [326]:
# # Plot the mean auction range prediction vs price and error
# plt.rcParams['figure.figsize'] = 12, 12
# plt.rcParams['legend.scatterpoints'] = 1 
# Get transformation function
# Re-standardize and refit the baseline model (same as the cells above).
scaler = preprocessing.StandardScaler().fit(training_X)
new_training_X = scaler.transform(training_X)
new_testing_X = scaler.transform(testing_X)
new_big_X = scaler.transform(big_X)
log_linear = linear_model.LinearRegression()
log_linear.fit(new_training_X, log_training_Y)
log_testing_predicted = log_linear.predict(new_testing_X)
log_training_predicted = log_linear.predict(new_training_X)

# R^2 version of the k-best sweep, stopping at 2/3 of the feature count.
# NOTE(review): inner model is fit and scored on the testing split (leakage).
plt.rcParams['figure.figsize'] = 16, 12
xlabels = []
feature_length = len(things)
# for k in range(1, feature_length + 1):
for k in range(1, 2 * feature_length / 3):
    selector = SelectKBest(score_func=feature_selection.f_regression, k=k)
    results = selector.fit(new_training_X, log_training_Y)
    linear = linear_model.LinearRegression()
    linear.fit(new_testing_X[:, [index for index in results.get_support(indices=True)]], log_testing_Y)
    plt.scatter(k, linear.score(new_testing_X[:, [index for index in results.get_support(indices=True)]], 
                                log_testing_Y),
                color="blue", 
                s=120.0)
    # Feature that newly entered the support at this k (assumes one per step).
    xlabels.append(list(set(results.get_support(indices=True)).difference(xlabels))[0])
    
xlabels = [things[x] for x in xlabels]

plt.xticks(np.arange(1, feature_length + 1, 1.0)) 
plt.axes().set_xticklabels([x for x in xlabels])
plt.xticks(rotation=90)
plt.xticks(fontsize=fontsize)
plt.xlim(0, 2 * feature_length / 3)
plt.yticks(fontsize=fontsize)
plt.xlabel("Feature", fontsize=fontsize)
plt.ylabel(r"R$^2$", fontsize=fontsize)
# plt.ylim(0.2, 1.5)
plt.tight_layout()
# plt.savefig("2014-09-23-relative-error-features.pdf")
# plt.savefig("2014-09-23-relative-error-features.pdf")



In [23]:
for index, thing in enumerate(things):
    print index, thing


0 creation_date
1 diagonal_length
2 area
3 sale_date
4 red
5 green
6 blue
7 low_estimate
8 high_estimate
9 auction_estimate
10 log_low_estimate
11 log_high_estimate
12 log_auction_estimate
13 karel-appel
14 sam-francis
15 antoni-tapies
16 roberto-matta
17 jasper-johns
18 robert-motherwell
19 mark-tobey
20 cy-twombly
21 louise-nevelson
22 helen-frankenthaler
23 alberto-burri
24 aaron-siskind
25 joan-mitchell
26 louise-bourgeois
27 theodoros-stamos
28 kenneth-noland
29 isamu-noguchi
30 adolph-gottlieb
31 franz-kline
32 philip-guston
33 jules-olitski
34 arshile-gorky
35 norman-bluhm
36 gene-davis
37 al-held
38 mark-rothko
39 jackson-pollock
40 esteban-vicente
41 morris-cole-graves
42 ad-reinhardt
43 rita-letendre
44 morris-louis
45 william-baziotes
46 norman-wilfred-lewis
47 milton-resnick
48 lee-krasner
49 barnett-newman
50 Christie's
51 Sotheby's
52 Farsetti
53 Bonhams
54 William Doyle
55 Swann Galleries
56 Bukowskis
57 Heffel Fine Art
58 Germann Auktionshaus
59 Bruun Rasmussen
60 Villa Grisebach
61 Finarte
62 Ketterer Kunst GmbH
63 Dorotheum
64 Stockholms Auktionsverk
65 Lempertz
66 Loudmer
67 unknown_auction
68 united states
69 united kingdom
70 italy
71 france
72 netherlands
73 germany
74 sweden
75 switzerland
76 canada
77 denmark
78 austria
79 unknown_country

In [197]:
int(" 87")


Out[197]:
87

In [32]:
# Column-index groups into the feature matrix; see the printed feature table
# above (0-12 numeric features, 13-49 artists, 50-67 auction houses,
# 68-79 countries).
dates_indices = [0, 3]  # creation_date, sale_date
physical_indices = [1, 2]  # diagonal_length, area
color_indicies = range(4, 7)  # red, green, blue channel averages
estimate_indices = range(7, 10)  # low / high / mean auction estimate
log_estimate_indices = range(10, 13)  # log of the three estimates
artist_indices = range(13,50)  # one-hot artist columns
auction_indices = range(50, 68)  # one-hot auction-house columns
country_indicies = range(68, 80)  # one-hot country columns

In [ ]:
# All index groups collected in one list (not referenced by the later cells
# visible here; kept for interactive exploration).
list_of_indices = [dates_indices,
                   physical_indices,
                   color_indicies, 
                   estimate_indices,
                   log_estimate_indices,
                   artist_indices,
                   auction_indices,
                   country_indicies,
                   ]

In [55]:
# Training RMSE (scaled by 1/n) for two feature sets, plotted side by side:
#   x=1 : estimate features only (every other group excluded)
#   x=2 : all features
# The two copy-pasted blocks of the original are folded into one loop; the
# noest_* globals end up holding the k=2 (all-features) slices, exactly as
# the original left them.
feature_sets = [
    (1, dates_indices + physical_indices + color_indicies + artist_indices + auction_indices + country_indicies),
    (2, []),
]
for k, indexlist in feature_sets:
    # Columns kept after excluding this round's groups.
    kept = [x for x in range(len(things)) if x not in indexlist]
    noest_training_X = new_training_X[:, kept]
    noest_testing_X = new_testing_X[:, kept]
    noest_things = [things[x] for x in kept]

    linear = linear_model.LinearRegression()
    linear.fit(noest_training_X, log_training_Y)
    plt.scatter(k, 1.0 / len(log_training_Y) * np.sqrt(np.sum(((linear.predict(noest_training_X) - log_training_Y))**2)),
                color="blue", 
                s=120.0)


Out[55]:
<matplotlib.collections.PathCollection at 0x119d0e750>

In [90]:
import itertools

In [94]:
# Map each feature-group name to its column indices, plus a canonical
# ordering of the group names for the feature-mixing experiments below.
indices_dict = {
    "dates_indices": dates_indices,
    "physical_indices": physical_indices,
    "color_indicies": color_indicies,
    "estimate_indices": estimate_indices,
    "log_estimate_indices": log_estimate_indices,
    "artist_indices": artist_indices,
    "auction_indices": auction_indices,
    "country_indicies": country_indicies,
}

keys = ["dates_indices",
        "physical_indices",
        "color_indicies",
        "estimate_indices",
        "log_estimate_indices",
        "artist_indices",
        "auction_indices",
        "country_indicies",]

In [118]:
# keys = ["dates_indices",
#         "physical_indices",
#         "color_indicies",
#         "estimate_indices",
#         "log_estimate_indices",
#         "artist_indices",
#         "auction_indices",
#         "country_indicies",]
# #  + ["log_estimate_indices", ]
# temp = []
# for index, guts in enumerate(itertools.combinations(keys, 1)):
#     print index, guts[0]
#     indexlist = indices_dict[guts[0]]
#     noest_training_X = new_training_X[:,[x for x in range(len(things)) if x in indexlist]]
#     noest_testing_X = new_testing_X[:,[x for x in range(len(things)) if x in indexlist]]
#     linear = linear_model.LinearRegression()
#     linear.fit(noest_training_X, log_training_Y)    
#     plt.scatter(index, 1.0 / len(log_testing_Y) * np.sqrt(np.sum(((linear.predict(noest_testing_X) - log_testing_Y))**2)),
#             color="blue", 
#             s=120.0)
# keys = ["dates_indices",
#         "physical_indices",
#         "color_indicies",
#         "artist_indices",
#         "auction_indices",
#         "country_indicies",]

# temp = []
# for index, guts in enumerate(itertools.combinations(keys, 1)):
#     print index, guts[0]
#     indexlist = indices_dict[guts[0]] + ["log_estimate_indices", "estimate_indices",]
#     noest_training_X = new_training_X[:,[x for x in range(len(things)) if x in indexlist]]
#     noest_testing_X = new_testing_X[:,[x for x in range(len(things)) if x in indexlist]]
#     linear = linear_model.LinearRegression()
#     linear.fit(noest_training_X, log_training_Y)    
#     plt.scatter(index, 1.0 / len(log_testing_Y) * np.sqrt(np.sum(((linear.predict(noest_testing_X) - log_testing_Y))**2)),
#             color="blue", 
#             s=120.0)

    
# keys = ["dates_indices",
#         "physical_indices",
#         "color_indicies",
#         "auction_indices",
#         "country_indicies",]

# temp = []
# for index, guts in enumerate(itertools.combinations(keys, 1)):
#     print index, guts[0]
#     indexlist = indices_dict[guts[0]] + ["log_estimate_indices", "estimate_indices", "artist_indices",]
#     noest_training_X = new_training_X[:,[x for x in range(len(things)) if x in indexlist]]
#     noest_testing_X = new_testing_X[:,[x for x in range(len(things)) if x in indexlist]]
#     linear = linear_model.LinearRegression()
#     linear.fit(noest_training_X, log_training_Y)    
#     plt.scatter(index, 1.0 / len(log_testing_Y) * np.sqrt(np.sum(((linear.predict(noest_testing_X) - log_testing_Y))**2)),
#             color="blue", 
#             s=120.0)
# keys = ["dates_indices",
#         "physical_indices",
#         "color_indicies",
#         "country_indicies",]

# temp = []
# for index, guts in enumerate(itertools.combinations(keys, 1)):
#     print index, guts[0]
#     indexlist = indices_dict[guts[0]] + ["log_estimate_indices", "estimate_indices", "artist_indices", "auction_indices",]
#     noest_training_X = new_training_X[:,[x for x in range(len(things)) if x in indexlist]]
#     noest_testing_X = new_testing_X[:,[x for x in range(len(things)) if x in indexlist]]
#     linear = linear_model.LinearRegression()
#     linear.fit(noest_training_X, log_training_Y)    
#     plt.scatter(index, 1.0 / len(log_testing_Y) * np.sqrt(np.sum(((linear.predict(noest_testing_X) - log_testing_Y))**2)),
#             color="blue", 
#             s=120.0)
# keys = ["physical_indices",
#         "color_indicies",
#         "country_indicies",]

# temp = []
# for index, guts in enumerate(itertools.combinations(keys, 1)):
#     print index, guts[0]
#     indexlist = indices_dict[guts[0]] + ["log_estimate_indices", "estimate_indices", "artist_indices", "auction_indices","dates_indices",]
#     noest_training_X = new_training_X[:,[x for x in range(len(things)) if x in indexlist]]
#     noest_testing_X = new_testing_X[:,[x for x in range(len(things)) if x in indexlist]]
#     linear = linear_model.LinearRegression()
#     linear.fit(noest_training_X, log_training_Y)    
#     plt.scatter(index, 1.0 / len(log_testing_Y) * np.sqrt(np.sum(((linear.predict(noest_testing_X) - log_testing_Y))**2)),
#             color="blue", 
#             s=120.0)
# keys = ["color_indicies",
#         "country_indicies",]

# temp = []
# for index, guts in enumerate(itertools.combinations(keys, 1)):
#     print index, guts[0]
#     indexlist = indices_dict[guts[0]] + ["log_estimate_indices", "estimate_indices", "artist_indices", "auction_indices","dates_indices","physical_indices",]
#     noest_training_X = new_training_X[:,[x for x in range(len(things)) if x in indexlist]]
#     noest_testing_X = new_testing_X[:,[x for x in range(len(things)) if x in indexlist]]
#     linear = linear_model.LinearRegression()
#     linear.fit(noest_training_X, log_training_Y)    
#     plt.scatter(index, 1.0 / len(log_testing_Y) * np.sqrt(np.sum(((linear.predict(noest_testing_X) - log_testing_Y))**2)),
#             color="blue", 
#             s=120.0)
keys = ["color_indicies",
        "country_indicies",]

temp = []
for index, guts in enumerate(itertools.combinations(keys, 1)):
    print index, guts[0]
    indexlist = indices_dict[guts[0]] + ["log_estimate_indices", "estimate_indices", "artist_indices", "auction_indices","dates_indices","physical_indices","country_indicies","color_indicies",]
    noest_training_X = new_training_X[:,[x for x in range(len(things)) if x in indexlist]]
    noest_testing_X = new_testing_X[:,[x for x in range(len(things)) if x in indexlist]]
    linear = linear_model.LinearRegression()
    linear.fit(noest_training_X, log_training_Y)    
    plt.scatter(index, 1.0 / len(log_testing_Y) * np.sqrt(np.sum(((linear.predict(noest_testing_X) - log_testing_Y))**2)),
            color="blue", 
            s=120.0)


0 color_indicies
1 country_indicies

In [133]:
orderedlist = ["estimate_indices", "artist_indices", "auction_indices","dates_indices","physical_indices","country_indicies","color_indicies",]
betternames = ["Auction Estimate", "Artist", "Auction House", "Creation/Sale Date", "Physical Dimensions", "Country of Sale", "Image Analysis"]
temp = []
indexlist = ["log_estimate_indices", ]
for index, key in enumerate(orderedlist):
    print index, key
    indexlist += indices_dict[key] 
    noest_training_X = new_training_X[:,[x for x in range(len(things)) if x in indexlist]]
    noest_testing_X = new_testing_X[:,[x for x in range(len(things)) if x in indexlist]]
    linear = linear_model.LinearRegression()
    linear.fit(noest_training_X, log_training_Y)    
    plt.scatter(index, 1.0 / len(log_testing_Y) * np.sum(((linear.predict(noest_testing_X) - log_testing_Y))**2),
            color="blue", 
            label=betternames[index],
            s=120.0)
# plt.legend()
plt.xticks(np.arange(len(betternames))) 
plt.axes().set_xticklabels([x for x in betternames])
plt.xticks(rotation=90)
plt.xticks(fontsize=fontsize)
# plt.xlim(0, feature_length + 1)
plt.yticks(fontsize=fontsize)
plt.xlabel("Feature", fontsize=fontsize)
plt.ylabel("Mean Squared Predicted Error", fontsize=fontsize)
plt.tight_layout()


0 estimate_indices
1 artist_indices
2 auction_indices
3 dates_indices
4 physical_indices
5 country_indicies
6 color_indicies

In [169]:
plt.rcParams['figure.figsize'] = 14, 9

orderedlist = ["estimate_indices", "artist_indices", "auction_indices","dates_indices","physical_indices","country_indicies","color_indicies",]
betternames = ["Auction Estimate", 
               " + Artist", 
               " + Auction House", 
               " + Creation/Sale Date", 
               " + Physical Dimensions", 
               " + Country of Sale", 
               " + Color Analysis"]

indexlist = ["log_estimate_indices", ]
for index, key in enumerate(orderedlist[::-1]):
    print index, key
    indexlist += indices_dict[key] 
    noest_training_X = new_training_X[:,[x for x in range(len(things)) if x in indexlist]]
    noest_testing_X = new_testing_X[:,[x for x in range(len(things)) if x in indexlist]]
    linear = linear_model.LinearRegression()
    linear.fit(noest_training_X, log_training_Y)    
    plt.barh(len(orderedlist) - index, 
             1.0 / len(log_testing_Y) * np.sum(((linear.predict(noest_testing_X) - log_testing_Y))**2), 
             alpha=0.75,
            )

# plt.legend()
# plt.yticks(np.arange(len(betternames))) 
plt.yticks(np.arange(1, len(betternames)+1) + .35) 
plt.axes().set_yticklabels([x for x in betternames[::-1]])

# plt.yticks(rotation=90)
plt.yticks(fontsize=fontsize)
# plt.xlim(0, feature_length + 1)
plt.xticks(fontsize=fontsize)
plt.yticks(fontsize=fontsize)
# plt.ylabel("Feature", fontsize=fontsize)
plt.xlabel("Mean Squared Predicted Error", fontsize=fontsize)
plt.ylim(.7, 8)
plt.tight_layout()
plt.savefig("2014-09-29-feature-importance.pdf")


0 color_indicies
1 country_indicies
2 physical_indices
3 dates_indices
4 auction_indices
5 artist_indices
6 estimate_indices

In [111]:
log_linear.coef_


Out[111]:
array([ -4.81690356e-02,   1.05117152e-01,  -4.80674497e-02,
         9.73790564e-02,   1.87287091e-02,   8.48289834e-03,
        -3.12011187e-02,   9.67314345e+10,   1.34758477e+11,
        -2.31353744e+11,   9.59854798e-01,   4.76908541e-01,
         2.30440592e-02,  -3.82038007e+08,   3.82572744e+10,
         5.48761850e+10,   3.79457723e+10,   7.00357435e+10,
         9.37467203e+10,   2.90459859e+10,   1.47629019e+10,
         5.35931597e+10,   2.69032706e+10,   5.42388207e+10,
         3.76315782e+10,   6.00908171e+10,  -1.15575554e+10,
         2.64534246e+10,   3.02571769e+10,   6.70771326e+10,
         5.65373833e+10,   1.35956004e+11,   7.11478445e+10,
         3.10377029e+10,   9.84627467e+09,   2.14310688e+10,
         5.52966269e+10,   6.23350247e+10,   4.29215044e+10,
         2.90459859e+10,   4.90521027e+10,   4.83311109e+10,
         3.10377029e+10,   3.94776297e+10,   3.63464772e+10,
         6.97141904e+10,   1.12126574e+11,   1.33983298e+11,
         7.68573500e+10,   3.02571769e+10,  -9.87140131e+10,
        -8.40562799e+10,  -9.44535599e+10,  -5.41607007e+11,
        -7.35571695e+10,  -1.85146710e+11,  -8.15610192e+10,
        -8.76629529e+10,  -7.89851056e+10,  -7.63203731e+10,
        -7.49517518e+10,  -7.21351207e+10,  -5.23767005e+11,
        -7.35571695e+10,  -9.11233536e+10,  -8.28182700e+10,
        -1.02792312e+11,  -2.89474609e+11,  -5.09805593e+11,
        -7.01568667e+11,  -5.25900743e+11,  -1.47235786e+12,
        -9.99141770e+11,  -1.78668295e+12,  -1.44351963e+12,
        -8.35247846e+11,  -8.40129734e+11,  -2.75482446e+12,
        -3.43681367e+12,  -6.20190331e+11])

In [ ]:
# Ablation sweep: cumulatively EXCLUDE feature groups (in `ordered_features`
# order) and plot the held-out R^2 of a model trained on what remains.
# BUG FIX: the original iterated `for index, key in range(...)`, which
# raises TypeError (an int cannot be unpacked into two names); `enumerate`
# over the name list is what `indices_dict[key]` expects.
# NOTE(review): the final iteration excludes every group, leaving zero
# columns -- LinearRegression will raise there; drop the last name or guard
# if that happens.
ordered_features = ["log_estimate_indices", 
                     "color_indicies", 
                     "country_indicies", 
                     "auction_indices", 
                     "physical_indices",
                     "artist_indices",
                     "dates_indices",]
indexlist = []
for index, key in enumerate(ordered_features):
    indexlist += indices_dict[key]
    noest_training_X = new_training_X[:,[x for x in range(len(things)) if x not in indexlist]]
    noest_testing_X = new_testing_X[:,[x for x in range(len(things)) if x not in indexlist]]
    linear = linear_model.LinearRegression()
    linear.fit(noest_training_X, log_training_Y)
    plt.scatter(index, linear.score(noest_testing_X, log_testing_Y),
                color="blue", 
                s=120.0)

Feature selection


In [24]:
# Recursive feature elimination with 5-fold cross-validation, reusing the
# `linear` estimator from an earlier cell.  Per the output below it keeps a
# single feature and scores only ~0.27 on the test split.
selector = RFECV(linear, step=1, cv=5)
results = selector.fit(new_training_X, log_training_Y)

print results.n_features_
results.score(new_testing_X, log_testing_Y)


1
Out[24]:
0.26572495438280308

In [175]:


In [275]:



Out[275]:
[0]

In [25]:
# Univariate selection of the single best feature.  Note this fits against
# the raw (non-log) prices, unlike most cells above.
selector = SelectKBest(score_func=feature_selection.f_regression, k=1)
results = selector.fit(new_training_X, training_Y)
# things = features + artist_list
# for number, index in enumerate(np.argsort(results.ranking_)):
#     print number + 1, results.ranking_[index], things[index]

In [26]:
for k in xrange(1, 5):
    selector = SelectKBest(score_func=feature_selection.f_regression, k=1)
    results = selector.fit(new_training_X, training_Y)
    results.get_support(indices=True)

In [27]:
# k-best sweep plotting scaled RMSE per k.
# NOTE(review): both the selector and the regression are fit on
# new_testing_X / log_testing_Y, so this curve is in-sample on the test
# split (data leakage); confirm whether that was intentional.
plt.rcParams['figure.figsize'] = 16, 12
xlabels = []
feature_length = len(things)
for k in range(1, feature_length + 1):
    selector = SelectKBest(score_func=feature_selection.f_regression, k=k)
    results = selector.fit(new_testing_X, log_testing_Y)
    linear = linear_model.LinearRegression()
    linear.fit(new_testing_X[:, [index for index in results.get_support(indices=True)]], log_testing_Y)
    plt.scatter(k, 1.0 / len(log_testing_Y) * np.sqrt(np.sum(((linear.predict(new_testing_X[:, [index for index in results.get_support(indices=True)]]) - log_testing_Y))**2)),
                color="blue", 
                s=120.0)
    # Feature that newly entered the support at this k (assumes one per step).
    xlabels.append(list(set(results.get_support(indices=True)).difference(xlabels))[0])
    
xlabels = [things[x] for x in xlabels]

plt.xticks(np.arange(1, feature_length + 1, 1.0)) 
plt.axes().set_xticklabels([x for x in xlabels])
plt.xticks(rotation=90)
plt.xticks(fontsize=fontsize)
plt.xlim(0, feature_length + 1)
plt.yticks(fontsize=fontsize)
plt.xlabel("Feature", fontsize=fontsize)
plt.ylabel("Error", fontsize=fontsize)
# plt.ylim(0.2, 1.5)
plt.tight_layout()
# plt.savefig("2014-09-23-relative-error-features.pdf")
# plt.savefig("2014-09-23-relative-error-features.pdf")



In [28]:
# no_estimate = 
things = np.array(things)
noest_training_X = new_training_X[:,[x for x in range(len(things)) if x not in [7,8]]]
noest_testing_X = new_testing_X[:,[x for x in range(len(things)) if x not in [7,8]]]
noest_things = [things[x] for x in range(len(things)) if x not in [7,8]]
noest_things


Out[28]:
[u'creation_date',
 u'diagonal_length',
 u'area',
 u'sale_date',
 u'red',
 u'green',
 u'blue',
 u'auction_estimate',
 u'log_low_estimate',
 u'log_high_estimate',
 u'log_auction_estimate',
 u'karel-appel',
 u'sam-francis',
 u'antoni-tapies',
 u'roberto-matta',
 u'jasper-johns',
 u'robert-motherwell',
 u'mark-tobey',
 u'cy-twombly',
 u'louise-nevelson',
 u'helen-frankenthaler',
 u'alberto-burri',
 u'aaron-siskind',
 u'joan-mitchell',
 u'louise-bourgeois',
 u'theodoros-stamos',
 u'kenneth-noland',
 u'isamu-noguchi',
 u'adolph-gottlieb',
 u'franz-kline',
 u'philip-guston',
 u'jules-olitski',
 u'arshile-gorky',
 u'norman-bluhm',
 u'gene-davis',
 u'al-held',
 u'mark-rothko',
 u'jackson-pollock',
 u'esteban-vicente',
 u'morris-cole-graves',
 u'ad-reinhardt',
 u'rita-letendre',
 u'morris-louis',
 u'william-baziotes',
 u'norman-wilfred-lewis',
 u'milton-resnick',
 u'lee-krasner',
 u'barnett-newman',
 u"Christie's",
 u"Sotheby's",
 u'Farsetti',
 u'Bonhams',
 u'William Doyle',
 u'Swann Galleries',
 u'Bukowskis',
 u'Heffel Fine Art',
 u'Germann Auktionshaus',
 u'Bruun Rasmussen',
 u'Villa Grisebach',
 u'Finarte',
 u'Ketterer Kunst GmbH',
 u'Dorotheum',
 u'Stockholms Auktionsverk',
 u'Lempertz',
 u'Loudmer',
 u'unknown_auction',
 u'united states',
 u'united kingdom',
 u'italy',
 u'france',
 u'netherlands',
 u'germany',
 u'sweden',
 u'switzerland',
 u'canada',
 u'denmark',
 u'austria',
 u'unknown_country']

In [147]:
len(things)


Out[147]:
37

In [255]:
# k-best sweep over the estimate-free feature set, plotting held-out RMSE.
# NOTE(review): the selector is fit on the TESTING split while the model is
# fit on training -- the selection step leaks test information; confirm.
plt.rcParams['figure.figsize'] = 16, 12
xlabels = []
feature_length = len(noest_things)
for k in range(1, feature_length):
    selector = SelectKBest(score_func=feature_selection.f_regression, k=k)
    results = selector.fit(noest_testing_X, testing_Y)
    linear = linear_model.LinearRegression()
    linear.fit(noest_training_X[:, [index for index in results.get_support(indices=True)]], training_Y)
    plt.scatter(k, np.sqrt(np.average((linear.predict(noest_testing_X[:, [index for index in results.get_support(indices=True)]]) - testing_Y)**2.0)),
                color="blue", 
                s=120.0)
#     plt.scatter(k, linear.score(noest_testing_X[:, [index for index in results.get_support(indices=True)]], testing_Y),
#                 color="blue", 
#                 s=120.0)
    # Feature that newly entered the support at this k (assumes one per step).
    xlabels.append(list(set(results.get_support(indices=True)).difference(xlabels))[0])
    
xlabels = [noest_things[x] for x in xlabels]

plt.xticks(np.arange(1, feature_length, 1.0)) 
plt.axes().set_xticklabels([x for x in xlabels])
plt.xticks(rotation=90)
plt.xticks(fontsize=fontsize)
plt.xlim(0, feature_length + 1)
plt.yticks(fontsize=fontsize)
plt.xlabel("Feature", fontsize=fontsize)
plt.ylabel("Error", fontsize=fontsize)
plt.tight_layout()
# plt.ylim(0.5, 1.5)
# plt.savefig("2014-09-19-feature-importance.pdf")



In [275]:
auctionhouse_list


Out[275]:
[u"Christie's", u"Sotheby's", u'Farsetti', 'unknown_auction']

In [151]:
# Count how often the hammer price lands inside the house's [low, high]
# band, while collecting the band edges and midpoints for later use.
correct = []
incorrect = []
mean_estimate = []
low_estimates = []
high_estimates = []
for i, features in enumerate(big_X):
    lo = features[7]
    hi = features[8]
    price = big_Y[i]
    low_estimates.append(lo)
    high_estimates.append(hi)
    mean_estimate.append(np.average([lo, hi]))
    if lo < price < hi:
        correct.append(price)
    else:
        incorrect.append(price)
mean_estimate = np.array(mean_estimate)
low_estimates = np.array(low_estimates)
high_estimates = np.array(high_estimates)




In [153]:
print len(correct), len(incorrect), np.average((high_estimates - low_estimates) / 2.0)


1023 3394 0.158682048837

In [271]:
# Plan: measure how often each predictor's interval contains the hammer
# price on the held-out testing set.  Doing this fairly (giving both
# predictors comparable interval widths) is the tricky part.

In [212]:
# The plan:
# Find out my sigma
# Find out correct-ness
# limit their estimate and myself to my sigma
# How many correct/incorrect does my prediction make on TEST data?
correct = []
incorrect = []
my_correct = []
my_incorrect = []

linear.fit(new_training_X, training_Y)
standard_deviation = np.std(linear.predict(new_testing_X) - testing_Y) / 2.0
print standard_deviation

for index, row in enumerate(big_X):
    low_estimate = row[7]
    high_estimate = row[8]
    average = np.average([low_estimate, high_estimate])
    hammer_price = big_Y[index]
    if (hammer_price > average - standard_deviation) and (hammer_price < average + standard_deviation):
        correct.append(hammer_price)
    else:
        incorrect.append(hammer_price)
        
print "Auction houses are right:", len(correct)
print "Auction houses are wrong:", len(incorrect)

my_correct = []
my_incorrect = []
bands = []

for index, row in enumerate(new_big_X):
    low_estimate = big_X[index][7]
    high_estimate = big_X[index][8]
    band = (high_estimate - low_estimate) / 2.0
    bands.append(band)
    hammer_price = big_Y[index]
    if (hammer_price > linear.predict(new_big_X[index]) - standard_deviation) and (hammer_price < linear.predict(new_big_X[index]) + standard_deviation):
        my_correct.append(hammer_price)
    else:
        my_incorrect.append(hammer_price)

print "My algorithm is right:", len(my_correct)
print "My algorithm is wrong:", len(my_incorrect)
print "Percentage:", float(len(my_correct)) / (float(len(my_correct)) + float(len(my_incorrect)))


0.177822123448
Auction houses are right: 1696
Auction houses are wrong: 2721
My algorithm is right: 1697
My algorithm is wrong: 2720
Percentage: 0.384197419063

Feature mixing


In [61]:
from sklearn.preprocessing import PolynomialFeatures

In [62]:
# Expand the design matrices with degree-2 polynomial terms (squares and
# all pairwise interactions, plus a bias column).
poly = PolynomialFeatures(degree=2, interaction_only=False, include_bias=True)
poly_X = poly.fit_transform(new_big_X)
# NOTE(review): re-fitting on each split is harmless here only because
# PolynomialFeatures' fit merely records the input width; conventionally
# the train/test splits would use poly.transform(...) after one fit.
poly_training_X = poly.fit_transform(new_training_X)
poly_testing_X = poly.fit_transform(new_testing_X)

In [ ]:
plt.rcParams['figure.figsize'] = 12, 12
# Ordinary least squares on the polynomial feature expansion, against
# log-transformed hammer prices.
polylinear = linear_model.LinearRegression()
polylinear.fit(poly_training_X, log_training_Y)
# print polylinear.score(poly_testing_X, log_testing_Y)

In [69]:
# Held-out comparison: polynomial model (blue) vs. plain log-linear model
# (red), predicted log price against actual log price.
plt.scatter(polylinear.predict(poly_testing_X), log_testing_Y, color='blue', alpha=0.25, s=60)
plt.scatter(log_linear.predict(new_testing_X), log_testing_Y, color='red', alpha=0.25, s=60)

# The polynomial model's test R^2 below is hugely negative while its
# training R^2 (next cell) is ~0.96 -- classic overfitting from the
# degree-2 expansion.
print polylinear.score(poly_testing_X, log_testing_Y)
print log_linear.score(new_testing_X, log_testing_Y)


-4.70294011438e+19
0.95428713476

In [70]:
plt.scatter(polylinear.predict(poly_training_X), log_training_Y, color='blue', alpha=0.25, s=60)
plt.scatter(log_linear.predict(new_training_X), log_training_Y, color='red', alpha=0.25, s=60)

print polylinear.score(poly_training_X, log_training_Y)
print log_linear.score(new_training_X, log_training_Y)


0.959321811901
0.942120679383

In [79]:
# Feature whose fitted coefficient is most negative in the log-linear model
# (i.e. the strongest downward price signal among the encoded features).
things[np.argmin(log_linear.coef_)]


Out[79]:
u'austria'

TODO: look up K-fold cross-validation and LassoCV for model selection:

from sklearn.cross_validation import KFold, cross_val_score
from sklearn.linear_model import LassoCV 

In [ ]:


In [ ]:


In [8]:
# Show a sample image next to a solid swatch of its average RGB colour.
huh = io.imread("../artists_html/antoni-tapies-6138135.jpg")
fig, (ax1, ax2) = plt.subplots(1, 2)
ax1.imshow(huh)
# Average over rows then columns -> one mean value per RGB channel.
red, green, blue = np.average(np.average(huh, axis=0), axis=0)
# BUG FIX: PIL's Image.new takes size as (width, height), but the numpy
# shape is (height, width).  The original passed huh.shape[:2] directly,
# which transposes the swatch for non-square images.
width, height = huh.shape[1], huh.shape[0]
ax2.imshow(Image.new("RGB", (width, height), (int(red), int(green), int(blue))))


Out[8]:
<matplotlib.image.AxesImage at 0x110792e50>

In [10]:
# One-off preprocessing pass, kept commented out so Restart-&-Run-All does
# not redo the image I/O: for every Helen Frankenthaler "Painting" lot it
# read the lot's JPEG and cached the mean R/G/B channel values back onto
# master_dictionary.  The two set() dumps in the preserved output below
# record the category and material vocabularies that were observed.
# prices = []
# categories = []
# materials = []
# artist = 'helen-frankenthaler'
# for x in master_dictionary[artist]:
#     price = master_dictionary[artist][x]['hammer_price']
#     category  = master_dictionary[artist][x]['category']
#     if category == "Painting":
#         prices.append(price)
#         materials.append(master_dictionary[artist][x]['materials'])
#         jpg_name = "../artists_html/" + artist + "-" + str(x) + ".jpg"
#         huh = io.imread(jpg_name)
#         red, green, blue = np.average(np.average(huh, axis=0), axis=0)
#         master_dictionary[artist][x]['red'] = red
#         master_dictionary[artist][x]['green'] = green
#         master_dictionary[artist][x]['blue'] = blue
#     categories.append(category)
    
# print set(categories)
# print set(materials)


set([u'Print-Multiple', u'Ceramic', u'Sculpture-Volume', u'Drawing-Watercolor', u'Painting', u'Tapestry'])
set([u'Acrylic/canvas', u'Oil', u'Oil/masonite', u'Oil/canvas/board', u'Oil/canvas', u'Oil/paper/canvas', u'Oil/paper', u'Acrylic/canvas/board', u'Acrylic/paper', u'Oil/board', u'Mixed media', u'Painting', u'Acrylic'])

Largest error


In [69]:
# Highlight the lots with the largest absolute prediction error
# (> cutsize in log-price units) in red, on top of the full scatter (blue).
cutsize = .5
# Predict once and reuse: the original called linear.predict(new_big_X)
# three times on the same matrix.
predictions = linear.predict(new_big_X)
plt.scatter(predictions, big_Y, color='blue', alpha=0.25, s=60)
index = np.abs(predictions - big_Y) > cutsize
plt.scatter(predictions[index], big_Y[index], color="red")
# print np.argwhere(index == True)
# # for info in np.argwhere(index == True):
# #     print information[info]


Out[69]:
<matplotlib.collections.PathCollection at 0x1140f5b10>