In [75]:
from urllib.request import urlopen
from bs4 import BeautifulSoup 
import pandas as pd

html = urlopen("http://www.rosensys.com/previous-auctions")
soup = BeautifulSoup(html, "lxml")

table = soup.table
precords = []
for tr in table.findAll("tr"):
    #cells in each row: tds[1]=auction name, tds[2]=type/start date, tds[3]=detail-page link
    tds = tr.findAll("td")
    record = []
    #auction id is the tail of the maxanet detail link
    ua_link = tds[3].a["href"]
    record.append(ua_link.replace("https://www.maxanet.com/cgi-bin/mndetails.cgi?rosen",""))
    #auction name, type, and start date
    record.append(tds[1].h4.text)
    record.append(tds[2].h5.text)
    record.append(tds[2].find("span", {"class":"date-display-single"}).text)
    precords.append(record)

pdf = pd.DataFrame(data=precords)

#name the columns
pdf.columns = ["aid","ua_name","ua_type","ua_sdate"]

#set index
#pdf.set_index("aid",inplace=True)

#remove the remaining \n
pdf['ua_type'] = pdf['ua_type'].str.replace('\n',"")
#format date
pdf['ua_sdate'] = pd.to_datetime(pdf['ua_sdate'], format='%B %d, %Y - %I:%M%p')
#construct url based on auction id
pdf['URL'] = "https://www.maxanet.com/cgi-bin/mndetails.cgi?rosen" + pdf['aid']  


pdf.to_csv('pauction.csv')
pdf


Out[75]:
aid ua_name ua_type ua_sdate URL
0 20 Quick Sale - FBC Enterprises, LLC. dba Custom ... Online Only 2017-03-10 10:00:00 https://www.maxanet.com/cgi-bin/mndetails.cgi?...
1 335 March Auction eXchange Online Only 2017-03-07 10:00:00 https://www.maxanet.com/cgi-bin/mndetails.cgi?...
2 329 FBC Enterprises, LLC. dba Custom Graphic Servi... Online Only 2017-02-28 10:00:00 https://www.maxanet.com/cgi-bin/mndetails.cgi?...
3 334 Ashtola Exploration Co, Inc. BK Case #16-70406 Online Only 2017-02-23 10:00:00 https://www.maxanet.com/cgi-bin/mndetails.cgi?...
4 333 Pitts Industries LLC Online Only 2017-02-21 10:00:00 https://www.maxanet.com/cgi-bin/mndetails.cgi?...
5 332 James E. Helzer BK Case #13-42626 Online Only 2017-02-16 10:00:00 https://www.maxanet.com/cgi-bin/mndetails.cgi?...
6 330 Ariska Productions Online Only 2017-02-14 10:00:00 https://www.maxanet.com/cgi-bin/mndetails.cgi?...
7 25 Quick Sale January Auction eXchange Online Only 2017-02-01 10:00:00 https://www.maxanet.com/cgi-bin/mndetails.cgi?...
8 328 Frisco International Online Only 2017-01-26 10:00:00 https://www.maxanet.com/cgi-bin/mndetails.cgi?...
9 331 January Auction eXchange Online Only 2017-01-19 10:00:00 https://www.maxanet.com/cgi-bin/mndetails.cgi?...
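Note that the row loop above indexes tds[3] directly, so it assumes every <tr> in the table carries at least four <td> cells with the expected children; a header or malformed row would raise an IndexError. A slightly more defensive sketch of the same scrape (same assumed page layout, not a change to what actually ran above):

from urllib.request import urlopen
from bs4 import BeautifulSoup

soup = BeautifulSoup(urlopen("http://www.rosensys.com/previous-auctions"), "lxml")

precords = []
for tr in soup.table.findAll("tr"):
    tds = tr.findAll("td")
    #skip header rows or rows missing the detail-link cell
    if len(tds) < 4 or tds[3].a is None:
        continue
    aid = tds[3].a["href"].replace("https://www.maxanet.com/cgi-bin/mndetails.cgi?rosen", "")
    precords.append([aid,
                     tds[1].h4.text,
                     tds[2].h5.text,
                     tds[2].find("span", {"class": "date-display-single"}).text])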

In [27]:
#pdf['ua_type']

pdf.iloc[1]


Out[27]:
ua_name     March Auction eXchange
ua_type                Online Only
ua_sdate       2017-03-07 10:00:00
Name: 335, dtype: object

In [3]:
pdf['aid']


Out[3]:
0     20
1    335
2    329
3    334
4    333
5    332
6    330
7     25
8    328
9    331
Name: aid, dtype: object

In [9]:
pdf.iloc[0]


Out[9]:
aid                                                        20
ua_name     Quick Sale - FBC Enterprises, LLC. dba Custom ...
ua_type                                           Online Only
ua_sdate                                  2017-03-10 10:00:00
URL         https://www.maxanet.com/cgi-bin/mndetails.cgi?...
Name: 0, dtype: object

In [44]:
pdf.dtypes


Out[44]:
aid                 object
ua_name             object
ua_type             object
ua_sdate    datetime64[ns]
dtype: object
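Since aid was pulled out of an href string, it stays object (string) dtype, which is what lets plain string concatenation rebuild the URL column. If numeric sorting or merging on the id were ever needed, a one-line conversion (not run in this notebook) would be:

pdf['aid'] = pdf['aid'].astype(int)   #convert the scraped id strings to integers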

In [17]:
#print each detail-page URL to confirm they were built correctly
for url in pdf['URL']:
    print(url)
    print("--")


https://www.maxanet.com/cgi-bin/mndetails.cgi?rosen20
--
https://www.maxanet.com/cgi-bin/mndetails.cgi?rosen335
--
https://www.maxanet.com/cgi-bin/mndetails.cgi?rosen329
--
https://www.maxanet.com/cgi-bin/mndetails.cgi?rosen334
--
https://www.maxanet.com/cgi-bin/mndetails.cgi?rosen333
--
https://www.maxanet.com/cgi-bin/mndetails.cgi?rosen332
--
https://www.maxanet.com/cgi-bin/mndetails.cgi?rosen330
--
https://www.maxanet.com/cgi-bin/mndetails.cgi?rosen25
--
https://www.maxanet.com/cgi-bin/mndetails.cgi?rosen328
--
https://www.maxanet.com/cgi-bin/mndetails.cgi?rosen331
--

In [24]:
#get initial auction details - auction id, number of items


for aid in pdf['URL']:
    #get categories from web page 
    category = soup('table')[0].findAll('tr')[4].findAll('td')[1].text

    #formatting - replace left paren with colon
    new_cat = category.replace(" (" ,":")
    #replace right paren with blank
    new1_cat = new_cat.replace(")","")
    #split on dashes
    category_list = new1_cat.split(" - ")
    #remove "catalog from the list
    #category_list.remove('Catalog')
    #print the list to see if it is good
    #for item in category_list:
        #print(item)

    #set up file name and dictionary

    category = {}

    #put formatted items in a dictionary
    for line in category_list:
        x = line.split(":")
        a=x[0]
        b=x[1]
        category[a]=b

    #remove instructions    
    #del(category["1 INSTRUCTIONS"])
    category.update({'AID':'aid'})

    item_count = category['ALL ITEMS']
    aid = category['AID']

#df = pd.DataFrame({"AID":aid,"item_count":item_count}, index=["AID"])

#df


---------------------------------------------------------------------------
IndexError                                Traceback (most recent call last)
<ipython-input-24-807f35bfc144> in <module>()
     26         x = line.split(":")
     27         a=x[0]
---> 28         b=x[1]
     29         category[a]=b
     30 

IndexError: list index out of range
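The IndexError happens because soup still refers to the previous-auctions listing page; the loop iterates over pdf['URL'] but never fetches any of those detail pages, so the cell text being split contains no ":" and x[1] does not exist. (A second problem is category.update({'AID':'aid'}), which stores the literal string 'aid' rather than the auction id.) A hedged sketch of the missing fetch step, reusing pdf from the first cell and the detail-page layout parsed later in this notebook:

from urllib.request import urlopen
from bs4 import BeautifulSoup

for url in pdf['URL']:
    #fetch each detail page before parsing it
    page = BeautifulSoup(urlopen(url), "lxml")
    category_text = page('table')[0].findAll('tr')[4].findAll('td')[1].text
    #...then apply the same replace/split/dict logic as above to category_text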

In [67]:
pdf['aid'].to_csv('item_count.csv',index=False)

for i in pdf['URL']:
    #get categories from web page 
    category = soup('table')[0].findAll('tr')[4].findAll('td')[1].text
     #formatting - replace left paren with colon
    
    category = category.replace(" (" ,":")
    #replace right paren with blank
    category = category.replace(")","")
    #split on dashes
    category = category.split(" - ")
    category.remove('Catalog')
      
    
    category_list = {}
    
    for line in category:
        x = line.split(":")
        a=x[0]
        b=x[1]
        category_list[a]=b
    del(category_list["INSTRUCTIONS"])
    aid = pdf['aid']
    category_list.update({'AID': aid})
    print(category_list)
    
    item_count = category['ALL ITEMS']
    aid = category['AID']


{'ALL ITEMS': '2', 'PRINTING PRESS': '1', 'AID': 0     20
1    335
2    329
3    334
4    333
5    332
6    330
7     25
8    328
9    331
Name: aid, dtype: object}
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-67-b8b7b15a6d0f> in <module>()
     26     print(category_list)
     27 
---> 28     item_count = category['ALL ITEMS']
     29     aid = category['AID'],index=False
     30 

TypeError: list indices must be integers or slices, not str
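Here the parsing itself succeeded (the dict prints), but the final lookups index category, which after the split is still a plain list, so indexing it with a string key raises the TypeError; the parsed counts live in the category_list dict. The likely intent for those two lines (note that category_list['AID'] currently holds the whole pdf['aid'] Series rather than a single id, another leftover to fix) was roughly:

#look the values up in the dict that was just built, not the split list
item_count = category_list['ALL ITEMS']
aid = category_list['AID']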

In [111]:
#get initial auction details - auction id, number of items

from urllib.request import urlopen
from bs4 import BeautifulSoup 
import re
import csv
import pandas as pd

url = "https://www.maxanet.com/cgi-bin/mndetails.cgi?rosen331"    
html = urlopen(url)
soup = BeautifulSoup(html, "lxml")

#extract the auction id from the detail url via the regex capture group
aid = re.match(r"https://www\.maxanet\.com/cgi-bin/mndetails\.cgi\?rosen(.*)", url).group(1)

#get categories from web page 
category = soup('table')[0].findAll('tr')[4].findAll('td')[1].text

#formatting - replace left paren with colon
new_cat = category.replace(" (" ,":")
#replace right paren with blank
new1_cat = new_cat.replace(")","")
#split on dashes
category_list = new1_cat.split(" - ")
#remove "catalog from the list
category_list.remove('Catalog')
#print the list to see if it is good
#for item in category_list:
    #print(item)

#set up file name and dictionary

category = {}

#put formatted items in a dictionary
for line in category_list:
    x = line.split(":")
    a=x[0]
    b=x[1]
    category[a]=b

#remove instructions    
#del(category["1 INSTRUCTIONS"])
category.update({'AID':aid})

#write category list to file
#with open(aid +'_category.csv', 'w') as file:
#    [file.write('{0},{1}\n'.format(key, value)) for key, value in category.items()]  

item_count = category['ALL ITEMS']
aid = category['AID']

#one-row frame with this auction's id and its item count
df = pd.DataFrame({"AID":aid,"item_count":item_count}, index=["AID"])

#append to the running item_count.csv
df.to_csv('item_count.csv',index=False, mode='a',header=None)
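The single-page version above works end to end; to cover every auction, the same logic can be wrapped in a small helper and mapped over pdf['URL']. This is only a sketch, under the assumption that every detail page shares the table layout parsed above; auction_item_count is a hypothetical name, not something defined earlier in this notebook.

import re
import pandas as pd
from urllib.request import urlopen
from bs4 import BeautifulSoup

def auction_item_count(url):
    #return (auction id, 'ALL ITEMS' count) for one detail-page URL
    soup = BeautifulSoup(urlopen(url), "lxml")
    aid = re.match(r".*\?rosen(.*)", url).group(1)
    text = soup('table')[0].findAll('tr')[4].findAll('td')[1].text
    parts = text.replace(" (", ":").replace(")", "").split(" - ")
    counts = dict(p.split(":", 1) for p in parts if ":" in p)
    return aid, counts.get('ALL ITEMS')

#item_counts = pd.DataFrame([auction_item_count(u) for u in pdf['URL']],
#                           columns=['AID', 'item_count'])
#item_counts.to_csv('item_count.csv', index=False)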

In [112]:
#pdf['aid'].to_csv('item_count.csv',index=False)

cat = pd.read_csv('item_count.csv',header=None)

cat


Out[112]:
0 1
0 20 2
1 335 286
2 329 139
3 324 230
4 333 24
5 332 38
6 330 275
7 25 20
8 328 577
9 331 258
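Because item_count.csv was written without a header, the columns come back as 0 and 1; passing names= when reading it gives them labels (a small optional tweak):

cat = pd.read_csv('item_count.csv', header=None, names=['aid', 'item_count'])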
