In [75]:
from urllib.request import urlopen
from bs4 import BeautifulSoup 
import pandas as pd

html = urlopen("http://www.rosensys.com/previous-auctions")
soup = BeautifulSoup(html, "lxml")

table = soup.table
precords = []
for tr in table.findAll("tr"):
    #cells in each row: tds[1]=auction name, tds[2]=type/start date, tds[3]=detail-page link
    tds = tr.findAll("td")
    record = []
    #auction id is the tail of the maxanet detail link
    ua_link = tds[3].a["href"]
    record.append(ua_link.replace("https://www.maxanet.com/cgi-bin/mndetails.cgi?rosen",""))
    #auction name, type, and start date
    record.append(tds[1].h4.text)
    record.append(tds[2].h5.text)
    record.append(tds[2].find("span", {"class":"date-display-single"}).text)
    precords.append(record)

pdf = pd.DataFrame(data=precords)

#name the columns
pdf.columns = ["aid","ua_name","ua_type","ua_sdate"]

#set index
#pdf.set_index("aid",inplace=True)

#remove the remaining \n
pdf['ua_type'] = pdf['ua_type'].str.replace('\n',"")
#format date
pdf['ua_sdate'] = pd.to_datetime(pdf['ua_sdate'], format='%B %d, %Y - %I:%M%p')
#construct url based on auction id
pdf['URL'] = "https://www.maxanet.com/cgi-bin/mndetails.cgi?rosen" + pdf['aid']  


pdf.to_csv('pauction.csv')
pdf


Out[75]:
aid ua_name ua_type ua_sdate URL
0 20 Quick Sale - FBC Enterprises, LLC. dba Custom ... Online Only 2017-03-10 10:00:00 https://www.maxanet.com/cgi-bin/mndetails.cgi?...
1 335 March Auction eXchange Online Only 2017-03-07 10:00:00 https://www.maxanet.com/cgi-bin/mndetails.cgi?...
2 329 FBC Enterprises, LLC. dba Custom Graphic Servi... Online Only 2017-02-28 10:00:00 https://www.maxanet.com/cgi-bin/mndetails.cgi?...
3 334 Ashtola Exploration Co, Inc. BK Case #16-70406 Online Only 2017-02-23 10:00:00 https://www.maxanet.com/cgi-bin/mndetails.cgi?...
4 333 Pitts Industries LLC Online Only 2017-02-21 10:00:00 https://www.maxanet.com/cgi-bin/mndetails.cgi?...
5 332 James E. Helzer BK Case #13-42626 Online Only 2017-02-16 10:00:00 https://www.maxanet.com/cgi-bin/mndetails.cgi?...
6 330 Ariska Productions Online Only 2017-02-14 10:00:00 https://www.maxanet.com/cgi-bin/mndetails.cgi?...
7 25 Quick Sale January Auction eXchange Online Only 2017-02-01 10:00:00 https://www.maxanet.com/cgi-bin/mndetails.cgi?...
8 328 Frisco International Online Only 2017-01-26 10:00:00 https://www.maxanet.com/cgi-bin/mndetails.cgi?...
9 331 January Auction eXchange Online Only 2017-01-19 10:00:00 https://www.maxanet.com/cgi-bin/mndetails.cgi?...
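Note that the row loop above indexes tds[3] directly, so it assumes every <tr> in the table carries at least four <td> cells with the expected children; a header or malformed row would raise an IndexError. A slightly more defensive sketch of the same scrape (same assumed page layout, not a change to what actually ran above):

from urllib.request import urlopen
from bs4 import BeautifulSoup

soup = BeautifulSoup(urlopen("http://www.rosensys.com/previous-auctions"), "lxml")

precords = []
for tr in soup.table.findAll("tr"):
    tds = tr.findAll("td")
    #skip header rows or rows missing the detail-link cell
    if len(tds) < 4 or tds[3].a is None:
        continue
    aid = tds[3].a["href"].replace("https://www.maxanet.com/cgi-bin/mndetails.cgi?rosen", "")
    precords.append([aid,
                     tds[1].h4.text,
                     tds[2].h5.text,
                     tds[2].find("span", {"class": "date-display-single"}).text])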

In [27]:
#pdf['ua_type']

pdf.iloc[1]


Out[27]:
ua_name     March Auction eXchange
ua_type                Online Only
ua_sdate       2017-03-07 10:00:00
Name: 335, dtype: object

In [3]:
pdf['aid']


Out[3]:
0     20
1    335
2    329
3    334
4    333
5    332
6    330
7     25
8    328
9    331
Name: aid, dtype: object

In [9]:
pdf.iloc[0]


Out[9]:
aid                                                        20
ua_name     Quick Sale - FBC Enterprises, LLC. dba Custom ...
ua_type                                           Online Only
ua_sdate                                  2017-03-10 10:00:00
URL         https://www.maxanet.com/cgi-bin/mndetails.cgi?...
Name: 0, dtype: object

In [44]:
pdf.dtypes


Out[44]:
aid                 object
ua_name             object
ua_type             object
ua_sdate    datetime64[ns]
dtype: object
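Since aid was pulled out of an href string, it stays object (string) dtype, which is what lets plain string concatenation rebuild the URL column. If numeric sorting or merging on the id were ever needed, a one-line conversion (not run in this notebook) would be:

pdf['aid'] = pdf['aid'].astype(int)   #convert the scraped id strings to integers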

In [17]:
#print each detail-page URL to confirm they were built correctly
for url in pdf['URL']:
    print(url)
    print("--")


https://www.maxanet.com/cgi-bin/mndetails.cgi?rosen20
--
https://www.maxanet.com/cgi-bin/mndetails.cgi?rosen335
--
https://www.maxanet.com/cgi-bin/mndetails.cgi?rosen329
--
https://www.maxanet.com/cgi-bin/mndetails.cgi?rosen334
--
https://www.maxanet.com/cgi-bin/mndetails.cgi?rosen333
--
https://www.maxanet.com/cgi-bin/mndetails.cgi?rosen332
--
https://www.maxanet.com/cgi-bin/mndetails.cgi?rosen330
--
https://www.maxanet.com/cgi-bin/mndetails.cgi?rosen25
--
https://www.maxanet.com/cgi-bin/mndetails.cgi?rosen328
--
https://www.maxanet.com/cgi-bin/mndetails.cgi?rosen331
--

In [24]:
#get initial auction details - auction id, number of items


for aid in pdf['URL']:
    #get categories from web page 
    category = soup('table')[0].findAll('tr')[4].findAll('td')[1].text

    #formatting - replace left paren with colon
    new_cat = category.replace(" (" ,":")
    #replace right paren with blank
    new1_cat = new_cat.replace(")","")
    #split on dashes
    category_list = new1_cat.split(" - ")
    #remove "catalog from the list
    #category_list.remove('Catalog')
    #print the list to see if it is good
    #for item in category_list:
        #print(item)

    #set up file name and dictionary

    category = {}

    #put formatted items in a dictionary
    for line in category_list:
        x = line.split(":")
        a=x[0]
        b=x[1]
        category[a]=b

    #remove instructions    
    #del(category["1 INSTRUCTIONS"])
    category.update({'AID':'aid'})

    item_count = category['ALL ITEMS']
    aid = category['AID']

#df = pd.DataFrame({"AID":aid,"item_count":item_count}, index=["AID"])

#df


---------------------------------------------------------------------------
IndexError                                Traceback (most recent call last)
<ipython-input-24-807f35bfc144> in <module>()
     26         x = line.split(":")
     27         a=x[0]
---> 28         b=x[1]
     29         category[a]=b
     30 

IndexError: list index out of range
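The IndexError happens because soup still refers to the previous-auctions listing page; the loop iterates over pdf['URL'] but never fetches any of those detail pages, so the cell text being split contains no ":" and x[1] does not exist. (A second problem is category.update({'AID':'aid'}), which stores the literal string 'aid' rather than the auction id.) A hedged sketch of the missing fetch step, reusing pdf from the first cell and the detail-page layout parsed later in this notebook:

from urllib.request import urlopen
from bs4 import BeautifulSoup

for url in pdf['URL']:
    #fetch each detail page before parsing it
    page = BeautifulSoup(urlopen(url), "lxml")
    category_text = page('table')[0].findAll('tr')[4].findAll('td')[1].text
    #...then apply the same replace/split/dict logic as above to category_text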

In [67]:
pdf['aid'].to_csv('item_count.csv',index=False)

for i in pdf['URL']:
    #get categories from web page 
    category = soup('table')[0].findAll('tr')[4].findAll('td')[1].text
     #formatting - replace left paren with colon
    
    category = category.replace(" (" ,":")
    #replace right paren with blank
    category = category.replace(")","")
    #split on dashes
    category = category.split(" - ")
    category.remove('Catalog')
      
    
    category_list = {}
    
    for line in category:
        x = line.split(":")
        a=x[0]
        b=x[1]
        category_list[a]=b
    del(category_list["INSTRUCTIONS"])
    aid = pdf['aid']
    category_list.update({'AID': aid})
    print(category_list)
    
    item_count = category['ALL ITEMS']
    aid = category['AID']


{'ALL ITEMS': '2', 'PRINTING PRESS': '1', 'AID': 0     20
1    335
2    329
3    334
4    333
5    332
6    330
7     25
8    328
9    331
Name: aid, dtype: object}
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-67-b8b7b15a6d0f> in <module>()
     26     print(category_list)
     27 
---> 28     item_count = category['ALL ITEMS']
     29     aid = category['AID'],index=False
     30 

TypeError: list indices must be integers or slices, not str
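Here the parsing itself succeeded (the dict prints), but the final lookups index category, which after the split is still a plain list, so indexing it with a string key raises the TypeError; the parsed counts live in the category_list dict. The likely intent for those two lines (note that category_list['AID'] currently holds the whole pdf['aid'] Series rather than a single id, another leftover to fix) was roughly:

#look the values up in the dict that was just built, not the split list
item_count = category_list['ALL ITEMS']
aid = category_list['AID']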

In [111]:
#get initial auction details - auction id, number of items

from urllib.request import urlopen
from bs4 import BeautifulSoup 
import re
import csv
import pandas as pd

url = "https://www.maxanet.com/cgi-bin/mndetails.cgi?rosen331"    
html = urlopen(url)
soup = BeautifulSoup(html, "lxml")

#extract the auction id from the detail url via the regex capture group
aid = re.match(r"https://www\.maxanet\.com/cgi-bin/mndetails\.cgi\?rosen(.*)", url).group(1)

#get categories from web page 
category = soup('table')[0].findAll('tr')[4].findAll('td')[1].text

#formatting - replace left paren with colon
new_cat = category.replace(" (" ,":")
#replace right paren with blank
new1_cat = new_cat.replace(")","")
#split on dashes
category_list = new1_cat.split(" - ")
#remove "catalog from the list
category_list.remove('Catalog')
#print the list to see if it is good
#for item in category_list:
    #print(item)

#set up file name and dictionary

category = {}

#put formatted items in a dictionary
for line in category_list:
    x = line.split(":")
    a=x[0]
    b=x[1]
    category[a]=b

#remove instructions    
#del(category["1 INSTRUCTIONS"])
category.update({'AID':aid})

#write category list to file
#with open(aid +'_category.csv', 'w') as file:
#    [file.write('{0},{1}\n'.format(key, value)) for key, value in category.items()]  

item_count = category['ALL ITEMS']
aid = category['AID']

#one-row frame with this auction's id and its item count
df = pd.DataFrame({"AID":aid,"item_count":item_count}, index=["AID"])

#append to the running item_count.csv
df.to_csv('item_count.csv',index=False, mode='a',header=None)
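The single-page version above works end to end; to cover every auction, the same logic can be wrapped in a small helper and mapped over pdf['URL']. This is only a sketch, under the assumption that every detail page shares the table layout parsed above; auction_item_count is a hypothetical name, not something defined earlier in this notebook.

import re
import pandas as pd
from urllib.request import urlopen
from bs4 import BeautifulSoup

def auction_item_count(url):
    #return (auction id, 'ALL ITEMS' count) for one detail-page URL
    soup = BeautifulSoup(urlopen(url), "lxml")
    aid = re.match(r".*\?rosen(.*)", url).group(1)
    text = soup('table')[0].findAll('tr')[4].findAll('td')[1].text
    parts = text.replace(" (", ":").replace(")", "").split(" - ")
    counts = dict(p.split(":", 1) for p in parts if ":" in p)
    return aid, counts.get('ALL ITEMS')

#item_counts = pd.DataFrame([auction_item_count(u) for u in pdf['URL']],
#                           columns=['AID', 'item_count'])
#item_counts.to_csv('item_count.csv', index=False)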

In [112]:
#pdf['aid'].to_csv('item_count.csv',index=False)

cat = pd.read_csv('item_count.csv',header=None)

cat


Out[112]:
0 1
0 20 2
1 335 286
2 329 139
3 324 230
4 333 24
5 332 38
6 330 275
7 25 20
8 328 577
9 331 258
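Because item_count.csv was written without a header, the columns come back as 0 and 1; passing names= when reading it gives them labels (a small optional tweak):

cat = pd.read_csv('item_count.csv', header=None, names=['aid', 'item_count'])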
