In [1]:
#!/usr/bin/env python

%matplotlib inline

from bs4 import BeautifulSoup
from glob import glob
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import cookielib
import json
import math
import matplotlib.pyplot as plt
import mechanize
import numpy as np
import os
import pandas as pd
import pandas.io.sql as psql
import pickle
import pymysql as mdb
import re
import requests
import string
import sys
import time
import unicodedata
import urllib
import urllib2

In [2]:
artists = {}

for infile in sorted(glob("../artists/names_*txt")):
    keyname = ''.join(infile.split("/")[-1].split(".")[0].split("_")[1:])
    artists[keyname] = []
    with open(infile, 'r') as file_handle:
        for rline in file_handle.readlines():
            artists[keyname].append(rline.strip())

In [3]:
artists['abstractexpressionism'][0]


Out[3]:
'Aaron Siskind'

In [4]:
hand_cultivated_list = ['10251/sam-francis',
                         '10298/helen-frankenthaler',
                         '108323/elmer-nelson-bischoff',
                         '11682/arshile-gorky',
                         '11716/adolph-gottlieb',
                         '11898/morris-cole-graves',
                         '11979/richard-stankiewicz',
                         '120233/perle-fine',
                         '12389/philip-guston',
                         '12850/grace-hartigan',
                         '13123/al-held',
                         '135074/alma-woolsey-thomas',
                         '14751/jasper-johns',
                         '15498/franz-kline',
                         '15820/lee-krasner',
                         '17360/rita-letendre',
                         '1761/william-baziotes',
                         '17952/morris-louis',
                         '18279/george-mcneil',
                         '19187/roberto-matta',
                         '193240/norman-wilfred-lewis',
                         '19945/joan-mitchell',
                         '20537/robert-motherwell',
                         '21043/louise-nevelson',
                         '21052/barnett-newman',
                         '21257/kenneth-noland',
                         '21520/jules-olitski',
                         '21742/alfonso-ossorio',
                         '225523/lester-l--johnson',
                         '23185/jackson-pollock',
                         '24077/ad-reinhardt',
                         '24190/milton-resnick',
                         '25021/mark-rothko',
                         '251530/joseph-marioni',
                         '266633/alfred-viggo-jensen',
                         '26765/aaron-siskind',
                         '27300/theodoros-stamos',
                         '27512/clyfford-e--still',
                         '28011/antoni-tapies',
                         '28495/mark-tobey',
                         '28957/cy-twombly',
                         '28958/jack-tworkov',
                         '2907/norman-bluhm',
                         '29655/esteban-vicente',
                         '31184/adja-yunkers',
                         '32280/ibram-lassaw',
                         '327132/james-brooks',
                         '33409/nell-blaine',
                         '3547/louise-bourgeois',
                         '38579/george-wellman-morrison',
                         '394619/alfred-immanuel-jensen',
                         '3989/james-brooks',
                         '42273/herbert-ferber',
                         '42531/alberto-burri',
                         '444931/raphael-collazo',
                         '567405/david-burnham-smith',
                         '57005/seymour-lipton',
                         '58481/stephen-greene',
                         '58485/cleve-gray',
                         '60862/gertrude-greene',
                         '7082/gene-davis',
                         '711/karel-appel',
                         '71489/balcomb-greene',
                         '72286/lawrence-calcagno',
                         '72967/alfred-v--jensen',
                         '73554/david-hare',
                         '77024/mary-callery',
                         '82853/edward-dugmore',
                         '83534/albert-kotin',
                         '8585/isamu-noguchi',
                         '9718/john-millard-ferren']

In [5]:
def readSqlSelectIntoPandasDf(sql_command):
    ''' Read a select statement into a panda dataframe '''
    mysql_cn= mdb.connect('localhost', 'root','','abstrart_db')
    df = psql.frame_query(sql_command, con=mysql_cn)
    mysql_cn.close()
    return df

def loginToArtPrice():
    browser = mechanize.Browser()
    artprice_url = 'http://www.artprice.com/identity'
    browser.open(artprice_url)
    browser.select_form(nr = 0)
    with open('ap.config', 'r') as file_handle:
        credentials = json.load(file_handle)
    username = credentials["username"]
    password = credentials["password"]
    browser.form['login'] = username
    browser.form['pass'] = password
    browser.submit()
    return browser

In [6]:
class artPriceBrowser(object):
    '''
        Here we login to ArtPrice.com with the user account from the homepage
        Sleeps are required for page to completely load
    '''
    
    def __init__(self, username, password):
        driver = webdriver.Chrome('/usr/bin/chromedriver')
        print 'finished the driver'
        driver.get("http://artprice.com/identity")
        print 'b4 sleep'
        time.sleep(1)
        print 'after sleep'
        elem = driver.find_element_by_id("login")
        elem.send_keys(username)
        elem = driver.find_element_by_id("pass")
        elem.send_keys(password)
        elem = driver.find_element_by_name("commit")
        elem.click()
        print 'supposedly logged in before sleep 2'
        time.sleep(2)
        print 'after sleep 2'
        self._driver = driver
    
    def downloadHtml(self):
        elem = self._driver.find_element_by_xpath("//*")
        source_code = elem.get_attribute("outerHTML")
        return source_code
    
    def saveHtml(self, html_source, file_dir, file_name):
        full_name = os.path.join(file_dir,file_name+'.html') 
        f = open(full_name, 'w')
        f.write(html_source.encode('utf-8'))
        f.close()
        
    def getArtistPage(self, artist_link, page_number, scroll = 5):
        prefix_url = 'http://www.artprice.com/artist/'
        suffix_url = '/lots/futures?iso3=USD&p=%s&sort=price_desc&unite_to=cm' %(page_number)
        tmp_url = '%s%s%s' %(prefix_url, artist_link, suffix_url)
        self._driver.get(tmp_url)
        time.sleep(2)
        for i in range(scroll):
            self._driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(1)
        return self.downloadHtml()
    
    def saveArtistPage(self, artist_link, page_number, file_name, file_dir = '../auctions_html/', scroll = 5):
        prefix_url = 'http://www.artprice.com/artist/'
        suffix_url = '/lots/futures?iso3=USD&p=%s&sort=price_desc&unite_to=cm' %(page_number)
        tmp_url = '%s%s%s' %(prefix_url, artist_link, suffix_url)
        self._driver.get(tmp_url)
        time.sleep(1)
        for i in range(scroll):
            self._driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(1)
        html_source = self.downloadHtml()
        return self.saveHtml(html_source, file_dir, file_name)
    
    def logOut(self):
        elem = self._driver.find_element_by_id("logout_lnk")
        elem.click()
        

def readSqlSelectIntoPandasDf(sql_command):
    ''' Read a select statement into a panda dataframe '''
    mysql_cn= mdb.connect('localhost', 'root','','abstrart_db')
    df = psql.frame_query(sql_command, con=mysql_cn)
    mysql_cn.close()
    return df

In [7]:
with open('ap.config', 'r') as file_handle:
    credentials = json.load(file_handle)
    
username = credentials["username"]
password = credentials["password"]

browser = artPriceBrowser(username, password)


finished the driver
b4 sleep
after sleep
supposedly logged in before sleep 2
after sleep 2

In [8]:
# First, define the directory with the html files
artists = hand_cultivated_list
# Third, define where to save all auctions
auction_id_regex = re.compile("(?:(?m)pasts/([0-9]{1,}))")
npages_regex = re.compile("(?:(?m)amp;p=([0-9]{0,}))")
minimum_tag_names_to_scrape = ['lsc_title', 'lsc_details', 'lsc_adjud', 'lsc_auctioneer', 
                               'lsc_country', 'lsc_link']
optional_tag_names_to_scrape = ['lsc_estimate ', 'lsc_image']
minimum_fields_dict = {'auction_id':0, 'auction_link':1, 'title':2, 'artist_id':3, 'artist_name':4, 
                       'category':5, 'medium':6, 'size_width':7, 'size_length':8, 
                       'sales_date_day':9, 'sales_date_month':10, 'sales_date_year':11, 
                       'auction_house_name':12, 'auction_house_city':13, 'auction_house_country':14, 
                       'hammer_price':15}
optional_fields_dict = {'year_of_creation':16, 'low_estimate':17, 'high_estimate':18, 'img_link':19}

In [12]:
# START SCRAPING ALL AUCTIONS FOR EACH ARTIST PAGE
auction_count = 1
cutoff = 5
count = 1
for artist_link in hand_cultivated_list[8:]:
    page_number = 1 
    artist_id = artist_link.split('/')[0]
    artist_name = artist_link.split('/')[1]
    try:
        time.sleep(2)
        # open the artist page with all the auction info on first page
        artist_html_p1 = browser.getArtistPage(artist_link, page_number, scroll = 5)
        soup = BeautifulSoup(artist_html_p1)
        npages_match = npages_regex.findall(artist_html_p1)
        npages_match = max([int(page) for page in npages_match])
        print 'getting the links for %s from %s pages' %(artist_name, npages_match)
        div = soup.find(id="lots")
        first_result_link = "http://www.artprice.com" + div.find('div', 
                                                                 attrs={'class':"lot_sml_container"}).find('a').get('href')
        print count, first_result_link
        art_id = first_result_link.split("lot/futures/")[1].split("/")[0]
        file_name_stem = artist_id + '-' + artist_name.replace('-','') + '-' + art_id
        for i in range(5):
            browser._driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(1)
        page = browser.downloadHtml()
        soup = BeautifulSoup(page)
        div = soup.find(id="lots")
        for index, whatever in enumerate(div.findAll('div', attrs={'class':"lot_sml_container"})):
            file_name_stem = "../auctions_html/"
            blah = whatever.find('div',attrs={'class':"lsc_title"}).find('a').get('href').split("/")
            file_name_stem += '-'.join([blah[2],blah[3],blah[6]])
            with open(file_name_stem + ".html", 'w') as file_handle:
                print >>file_handle, whatever
            try:
                jpeg_url = whatever.findAll('img',{'class':'img_repro'})[0]['src']
                urllib.urlretrieve(jpeg_url, file_name_stem + ".jpg")
            except:
                print "No jpg for", index
                pass
        for page_number in range(2, npages_match+1):
            time.sleep(2)
            artist_html_p1 = browser.getArtistPage(artist_link, page_number, scroll = 5)
            soup = BeautifulSoup(artist_html_p1)
            div = soup.find(id="lots")
            for index, whatever in enumerate(div.findAll('div', attrs={'class':"lot_sml_container"})):
                file_name_stem = "../auctions_html/"
                blah = whatever.find('div',attrs={'class':"lsc_title"}).find('a').get('href').split("/")
                file_name_stem += '-'.join([blah[2],blah[3],blah[6]])
                with open(file_name_stem + ".html", 'w') as file_handle:
                    print >>file_handle, whatever
                try:
                    jpeg_url = whatever.findAll('img',{'class':'img_repro'})[0]['src']
                    urllib.urlretrieve(jpeg_url, file_name_stem + ".jpg")
                except:
                    print "No jpg for", artist_name, page_number, index
                    pass
    except:
        print '(FirstTryFailed) auction %s, for artist %s, page %s, did not work' %(auction_count, artist_name, page_number)
        print "moving on!"
        time.sleep(5)


getting the links for philip-guston from 1 pages
1 http://www.artprice.com/artist/12389/philip-guston/lot/futures/8715439/magician-s-table?p=1&iso3=USD&sort=price_desc
(FirstTryFailed) auction 1, for artist philip-guston, page 1, did not work
moving on!
getting the links for grace-hartigan from 1 pages
1 http://www.artprice.com/artist/12850/grace-hartigan/lot/futures/8716262/theodora?p=1&iso3=USD&sort=price_desc
(FirstTryFailed) auction 1, for artist grace-hartigan, page 1, did not work
moving on!
getting the links for al-held from 1 pages
1 http://www.artprice.com/artist/13123/al-held/lot/futures/8720067/pace-ii?p=1&iso3=USD&sort=price_desc
(FirstTryFailed) auction 1, for artist al-held, page 1, did not work
moving on!
getting the links for alma-woolsey-thomas from 1 pages
(FirstTryFailed) auction 1, for artist alma-woolsey-thomas, page 1, did not work
moving on!
getting the links for jasper-johns from 1 pages
1 http://www.artprice.com/artist/14751/jasper-johns/lot/futures/8718156/alphabet?p=1&iso3=USD&sort=price_desc
(FirstTryFailed) auction 1, for artist jasper-johns, page 1, did not work
moving on!
getting the links for franz-kline from 1 pages
(FirstTryFailed) auction 1, for artist franz-kline, page 1, did not work
moving on!
getting the links for lee-krasner from 1 pages
(FirstTryFailed) auction 1, for artist lee-krasner, page 1, did not work
moving on!
getting the links for rita-letendre from 1 pages
1 http://www.artprice.com/artist/17360/rita-letendre/lot/futures/8737659/reflet?p=1&iso3=USD&sort=price_desc
(FirstTryFailed) auction 1, for artist rita-letendre, page 1, did not work
moving on!
getting the links for william-baziotes from 1 pages
1 http://www.artprice.com/artist/1761/william-baziotes/lot/futures/8720099/fleur-du-mal?p=1&iso3=USD&sort=price_desc
(FirstTryFailed) auction 1, for artist william-baziotes, page 1, did not work
moving on!
getting the links for morris-louis from 1 pages
(FirstTryFailed) auction 1, for artist morris-louis, page 1, did not work
moving on!
getting the links for george-mcneil from 1 pages
(FirstTryFailed) auction 1, for artist george-mcneil, page 1, did not work
moving on!
getting the links for roberto-matta from 1 pages
1 http://www.artprice.com/artist/19187/roberto-matta/lot/futures/8716875/senza-titolo?p=1&iso3=USD&sort=price_desc
(FirstTryFailed) auction 1, for artist roberto-matta, page 1, did not work
moving on!
getting the links for norman-wilfred-lewis from 1 pages
(FirstTryFailed) auction 1, for artist norman-wilfred-lewis, page 1, did not work
moving on!
getting the links for joan-mitchell from 1 pages
1 http://www.artprice.com/artist/19945/joan-mitchell/lot/futures/8719995/untitled?p=1&iso3=USD&sort=price_desc
(FirstTryFailed) auction 1, for artist joan-mitchell, page 1, did not work
moving on!
getting the links for robert-motherwell from 1 pages
1 http://www.artprice.com/artist/20537/robert-motherwell/lot/futures/8715075/man-in-grey?p=1&iso3=USD&sort=price_desc
(FirstTryFailed) auction 1, for artist robert-motherwell, page 1, did not work
moving on!
getting the links for louise-nevelson from 1 pages
1 http://www.artprice.com/artist/21043/louise-nevelson/lot/futures/8720104/small-column-vi?p=1&iso3=USD&sort=price_desc
(FirstTryFailed) auction 1, for artist louise-nevelson, page 1, did not work
moving on!
getting the links for barnett-newman from 1 pages
1 http://www.artprice.com/artist/21052/barnett-newman/lot/futures/8746023/note-i-from-notes?p=1&iso3=USD&sort=price_desc
(FirstTryFailed) auction 1, for artist barnett-newman, page 1, did not work
moving on!
getting the links for kenneth-noland from 1 pages
1 http://www.artprice.com/artist/21257/kenneth-noland/lot/futures/8720012/sun-bouquet?p=1&iso3=USD&sort=price_desc
(FirstTryFailed) auction 1, for artist kenneth-noland, page 1, did not work
moving on!
getting the links for jules-olitski from 1 pages
(FirstTryFailed) auction 1, for artist jules-olitski, page 1, did not work
moving on!
getting the links for alfonso-ossorio from 1 pages
1 http://www.artprice.com/artist/21742/alfonso-ossorio/lot/futures/8733387/in-a-wood-lightly?p=1&iso3=USD&sort=price_desc
(FirstTryFailed) auction 1, for artist alfonso-ossorio, page 1, did not work
moving on!
getting the links for lester-l--johnson from 1 pages
(FirstTryFailed) auction 1, for artist lester-l--johnson, page 1, did not work
moving on!
getting the links for jackson-pollock from 1 pages
(FirstTryFailed) auction 1, for artist jackson-pollock, page 1, did not work
moving on!
getting the links for ad-reinhardt from 1 pages
1 http://www.artprice.com/artist/24077/ad-reinhardt/lot/futures/8715076/untitled?p=1&iso3=USD&sort=price_desc
(FirstTryFailed) auction 1, for artist ad-reinhardt, page 1, did not work
moving on!
getting the links for milton-resnick from 1 pages
1 http://www.artprice.com/artist/24190/milton-resnick/lot/futures/8724103/you-and-me?p=1&iso3=USD&sort=price_desc
(FirstTryFailed) auction 1, for artist milton-resnick, page 1, did not work
moving on!
getting the links for mark-rothko from 1 pages
(FirstTryFailed) auction 1, for artist mark-rothko, page 1, did not work
moving on!
getting the links for joseph-marioni from 1 pages
1 http://www.artprice.com/artist/251530/joseph-marioni/lot/futures/8720081/painting-1-77?p=1&iso3=USD&sort=price_desc
(FirstTryFailed) auction 1, for artist joseph-marioni, page 1, did not work
moving on!
getting the links for alfred-viggo-jensen from 1 pages
(FirstTryFailed) auction 1, for artist alfred-viggo-jensen, page 1, did not work
moving on!
getting the links for aaron-siskind from 1 pages
1 http://www.artprice.com/artist/26765/aaron-siskind/lot/futures/8716788/st-louis-9?p=1&iso3=USD&sort=price_desc
(FirstTryFailed) auction 1, for artist aaron-siskind, page 1, did not work
moving on!
getting the links for theodoros-stamos from 1 pages
1 http://www.artprice.com/artist/27300/theodoros-stamos/lot/futures/8723987/after-mark-rothko-tapestry?p=1&iso3=USD&sort=price_desc
(FirstTryFailed) auction 1, for artist theodoros-stamos, page 1, did not work
moving on!
getting the links for clyfford-e--still from 1 pages
(FirstTryFailed) auction 1, for artist clyfford-e--still, page 1, did not work
moving on!
getting the links for antoni-tapies from 1 pages
1 http://www.artprice.com/artist/28011/antoni-tapies/lot/futures/8718136/circle-ii-figure?p=1&iso3=USD&sort=price_desc
(FirstTryFailed) auction 1, for artist antoni-tapies, page 1, did not work
moving on!
getting the links for mark-tobey from 1 pages
1 http://www.artprice.com/artist/28495/mark-tobey/lot/futures/8749938/ohne-titel?p=1&iso3=USD&sort=price_desc
(FirstTryFailed) auction 1, for artist mark-tobey, page 1, did not work
moving on!
getting the links for cy-twombly from 1 pages
(FirstTryFailed) auction 1, for artist cy-twombly, page 1, did not work
moving on!
getting the links for jack-tworkov from 1 pages
1 http://www.artprice.com/artist/28958/jack-tworkov/lot/futures/8715440/queen-ii?p=1&iso3=USD&sort=price_desc
(FirstTryFailed) auction 1, for artist jack-tworkov, page 1, did not work
moving on!
getting the links for norman-bluhm from 1 pages
1 http://www.artprice.com/artist/2907/norman-bluhm/lot/futures/8720091/white-over?p=1&iso3=USD&sort=price_desc
(FirstTryFailed) auction 1, for artist norman-bluhm, page 1, did not work
moving on!
getting the links for esteban-vicente from 1 pages
1 http://www.artprice.com/artist/29655/esteban-vicente/lot/futures/8715926/untitled?p=1&iso3=USD&sort=price_desc
(FirstTryFailed) auction 1, for artist esteban-vicente, page 1, did not work
moving on!
getting the links for adja-yunkers from 1 pages
(FirstTryFailed) auction 1, for artist adja-yunkers, page 1, did not work
moving on!
getting the links for ibram-lassaw from 1 pages
(FirstTryFailed) auction 1, for artist ibram-lassaw, page 1, did not work
moving on!
getting the links for james-brooks from 1 pages
(FirstTryFailed) auction 1, for artist james-brooks, page 1, did not work
moving on!
getting the links for nell-blaine from 1 pages
(FirstTryFailed) auction 1, for artist nell-blaine, page 1, did not work
moving on!
(FirstTryFailed) auction 1, for artist louise-bourgeois, page 1, did not work
moving on!
getting the links for george-wellman-morrison from 1 pages
(FirstTryFailed) auction 1, for artist george-wellman-morrison, page 1, did not work
moving on!
getting the links for alfred-immanuel-jensen from 1 pages
(FirstTryFailed) auction 1, for artist alfred-immanuel-jensen, page 1, did not work
moving on!
getting the links for james-brooks from 1 pages
1 http://www.artprice.com/artist/3989/james-brooks/lot/futures/8720102/untitled?p=1&iso3=USD&sort=price_desc
(FirstTryFailed) auction 1, for artist james-brooks, page 1, did not work
moving on!
getting the links for herbert-ferber from 1 pages
(FirstTryFailed) auction 1, for artist herbert-ferber, page 1, did not work
moving on!
getting the links for alberto-burri from 1 pages
1 http://www.artprice.com/artist/42531/alberto-burri/lot/futures/8740665/cretto?p=1&iso3=USD&sort=price_desc
(FirstTryFailed) auction 1, for artist alberto-burri, page 1, did not work
moving on!
getting the links for raphael-collazo from 1 pages
(FirstTryFailed) auction 1, for artist raphael-collazo, page 1, did not work
moving on!
getting the links for david-burnham-smith from 1 pages
1 http://www.artprice.com/artist/567405/david-burnham-smith/lot/futures/8747469/st-george-11-plaque?p=1&iso3=USD&sort=price_desc
(FirstTryFailed) auction 1, for artist david-burnham-smith, page 1, did not work
moving on!
getting the links for seymour-lipton from 1 pages
(FirstTryFailed) auction 1, for artist seymour-lipton, page 1, did not work
moving on!
getting the links for stephen-greene from 1 pages
(FirstTryFailed) auction 1, for artist stephen-greene, page 1, did not work
moving on!
getting the links for cleve-gray from 1 pages
(FirstTryFailed) auction 1, for artist cleve-gray, page 1, did not work
moving on!
getting the links for gertrude-greene from 1 pages
(FirstTryFailed) auction 1, for artist gertrude-greene, page 1, did not work
moving on!
getting the links for gene-davis from 1 pages
1 http://www.artprice.com/artist/7082/gene-davis/lot/futures/8720065/roman-candle?p=1&iso3=USD&sort=price_desc
(FirstTryFailed) auction 1, for artist gene-davis, page 1, did not work
moving on!
getting the links for karel-appel from 1 pages
1 http://www.artprice.com/artist/711/karel-appel/lot/futures/8720113/standing-elephant-circus-series?p=1&iso3=USD&sort=price_desc
(FirstTryFailed) auction 1, for artist karel-appel, page 1, did not work
moving on!
getting the links for balcomb-greene from 1 pages
(FirstTryFailed) auction 1, for artist balcomb-greene, page 1, did not work
moving on!
getting the links for lawrence-calcagno from 1 pages
(FirstTryFailed) auction 1, for artist lawrence-calcagno, page 1, did not work
moving on!
getting the links for alfred-v--jensen from 1 pages
(FirstTryFailed) auction 1, for artist alfred-v--jensen, page 1, did not work
moving on!
getting the links for david-hare from 1 pages
1 http://www.artprice.com/artist/73554/david-hare/lot/futures/8739953/mountain-sky-no-1?p=1&iso3=USD&sort=price_desc
getting the links for mary-callery from 1 pages
(FirstTryFailed) auction 1, for artist mary-callery, page 1, did not work
moving on!
getting the links for edward-dugmore from 1 pages
1 http://www.artprice.com/artist/82853/edward-dugmore/lot/futures/8720097/metart-series-15?p=1&iso3=USD&sort=price_desc
(FirstTryFailed) auction 1, for artist edward-dugmore, page 1, did not work
moving on!
getting the links for albert-kotin from 1 pages
(FirstTryFailed) auction 1, for artist albert-kotin, page 1, did not work
moving on!
getting the links for isamu-noguchi from 1 pages
1 http://www.artprice.com/artist/8585/isamu-noguchi/lot/futures/8715907/mannari?p=1&iso3=USD&sort=price_desc
(FirstTryFailed) auction 1, for artist isamu-noguchi, page 1, did not work
moving on!
getting the links for john-millard-ferren from 1 pages
(FirstTryFailed) auction 1, for artist john-millard-ferren, page 1, did not work
moving on!