In [75]:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import pandas as pd
# Scrape the "previous auctions" listing into a DataFrame with one row per
# auction (id, name, type, start date, detail URL) and save it to pauction.csv.
AUCTION_URL_PREFIX = "https://www.maxanet.com/cgi-bin/mndetails.cgi?rosen"

# BeautifulSoup reads the whole response while parsing, so the connection can
# be closed as soon as the soup has been built.
with urlopen("http://www.rosensys.com/previous-auctions") as html:
    soup = BeautifulSoup(html, "lxml")
table = soup.table

precords = []
for tr in table.find_all("tr"):
    trs = tr.find_all("td")
    # Rows without the four expected <td> cells (e.g. a header row) carry no
    # auction data; skip them instead of failing with IndexError on trs[3].
    if len(trs) < 4:
        continue
    ua_link = trs[3].a["href"]
    precords.append([
        # the auction id is the part of the link that follows the common prefix
        ua_link.replace(AUCTION_URL_PREFIX, ""),
        trs[1].h4.text,
        trs[2].h5.text,
        trs[2].find("span", {"class": "date-display-single"}).text,
    ])

# Naming the columns in the constructor also works when no rows were scraped.
pdf = pd.DataFrame(precords, columns=["aid", "ua_name", "ua_type", "ua_sdate"])
#remove the remaining \n (literal replacement, not a regular expression)
pdf['ua_type'] = pdf['ua_type'].str.replace('\n', "", regex=False)
#format date
pdf['ua_sdate'] = pd.to_datetime(pdf['ua_sdate'], format='%B %d, %Y - %I:%M%p')
#construct url based on auction id
pdf['URL'] = AUCTION_URL_PREFIX + pdf['aid']
pdf.to_csv('pauction.csv')
pdf
Out[75]:
In [ ]:
# Save the auction table `pdf` to pauction.csv in the working directory.
pdf.to_csv('pauction.csv')
In [27]:
#pdf['ua_type']
# Show the row at position 1 (the second row) of pdf.
pdf.iloc[1]
Out[27]:
In [3]:
# Display the auction id column of pdf.
pdf['aid']
Out[3]:
In [9]:
# Show the row at position 0 (the first row) of pdf.
pdf.iloc[0]
Out[9]:
In [44]:
# List the dtype of every column in pdf.
pdf.dtypes
Out[44]:
In [17]:
# Walk over every auction URL in pdf.
# NOTE(review): this cell was only a loop header with no body, which is a
# SyntaxError; printing each URL is an assumed intent — confirm or delete the cell.
for aid in pdf['URL']:
    print(aid)
In [24]:
#get initial auction details - auction id, number of items
# For each auction listed in pdf, download its detail page, parse the
# "NAME (count)" category summary and record the "ALL ITEMS" count.
auction_counts = []
for aid, auction_url in zip(pdf['aid'], pdf['URL']):
    # Each auction has its own page; parsing the listing page's `soup` here
    # would give identical results on every pass through the loop.
    with urlopen(auction_url) as response:
        auction_soup = BeautifulSoup(response, "lxml")
    #get categories from web page
    # NOTE(review): assumes the summary sits in row index 4, cell index 1 of the
    # first table — confirm against the page layout.
    category_text = auction_soup('table')[0].find_all('tr')[4].find_all('td')[1].text
    #formatting - "NAME (12)" becomes "NAME:12", entries are separated by " - "
    category_list = category_text.replace(" (", ":").replace(")", "").split(" - ")
    #put formatted items in a dictionary
    category = {}
    for line in category_list:
        x = line.split(":")
        # entries without a "(count)" part have nothing to record
        if len(x) < 2:
            continue
        category[x[0]] = x[1]
    # store the real auction id (not the literal string 'aid')
    category.update({'AID': aid})
    item_count = category['ALL ITEMS']
    auction_counts.append({"AID": aid, "item_count": item_count})

#one row per auction
item_counts = pd.DataFrame(auction_counts, columns=["AID", "item_count"])
item_counts
In [67]:
# Write the aid column of pdf to item_count.csv.
pdf['aid'].to_csv('item_count.csv',index=False)
# For each auction, download its detail page and print its category counts.
for aid, auction_url in zip(pdf['aid'], pdf['URL']):
    # Each auction has its own page; the shared `soup` of the listing page
    # would give the same text on every iteration.
    with urlopen(auction_url) as response:
        auction_soup = BeautifulSoup(response, "lxml")
    #get categories from web page
    # NOTE(review): assumes the summary sits in row index 4, cell index 1 of the
    # first table — confirm against the page layout.
    category = auction_soup('table')[0].find_all('tr')[4].find_all('td')[1].text
    #formatting - "NAME (12)" becomes "NAME:12", entries are separated by " - "
    category = category.replace(" (", ":").replace(")", "").split(" - ")
    # drop the bare 'Catalog' entry when present (list.remove raises if it is not)
    if 'Catalog' in category:
        category.remove('Catalog')
    category_list = {}
    for line in category:
        x = line.split(":")
        # entries without a "(count)" part have nothing to record
        if len(x) < 2:
            continue
        category_list[x[0]] = x[1]
    # pop() tolerates pages that have no INSTRUCTIONS entry
    category_list.pop("INSTRUCTIONS", None)
    # a single id per auction, not the whole pdf['aid'] column
    category_list.update({'AID': aid})
    print(category_list)
    # the parsed counts live in the dict `category_list`; `category` is the list
    item_count = category_list['ALL ITEMS']
    aid = category_list['AID']
In [111]:
#get initial auction details - auction id, number of items
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
import csv
import pandas as pd
# Fetch one auction page, read its "ALL ITEMS" count and append the
# (auction id, item count) pair as one line to item_count.csv.
url = "https://www.maxanet.com/cgi-bin/mndetails.cgi?rosen331"
# BeautifulSoup reads the whole response while parsing, so the connection can
# be closed as soon as the soup has been built.
with urlopen(url) as html:
    soup = BeautifulSoup(html, "lxml")

# The auction id is whatever follows "rosen" in the URL; take the captured
# group directly instead of stringifying the groups() tuple.
aid_match = re.compile("https:\\/\\/www\\.maxanet\\.com\\/cgi-bin\\/mndetails\\.cgi\\?rosen(.*)").match(url)
if aid_match is None:
    raise ValueError("cannot extract auction id from " + url)
aid = aid_match.group(1)

#get categories from web page
# NOTE(review): assumes the summary sits in row index 4, cell index 1 of the
# first table — confirm against the page layout.
category = soup('table')[0].find_all('tr')[4].find_all('td')[1].text
#formatting - "NAME (12)" becomes "NAME:12", entries are separated by " - "
category_list = category.replace(" (", ":").replace(")", "").split(" - ")
#remove "Catalog" from the list when present (list.remove raises if it is not)
if 'Catalog' in category_list:
    category_list.remove('Catalog')

#put formatted items in a dictionary
category = {}
for line in category_list:
    x = line.split(":")
    # entries without a "(count)" part have nothing to record
    if len(x) < 2:
        continue
    category[x[0]] = x[1]
category.update({'AID':aid})

item_count = category['ALL ITEMS']
aid = category['AID']
df = pd.DataFrame({"AID":aid,"item_count":item_count}, index=["AID"])
# mode='a' appends: re-running this cell adds another line for the same auction.
df.to_csv('item_count.csv',index=False, mode='a',header=False)
# last expression of the cell, so the frame is actually displayed
df
In [112]:
#pdf['aid'].to_csv('item_count.csv',index=False)
# Read item_count.csv back without treating its first line as a header
# (header=None), then display the resulting frame.
cat = pd.read_csv('item_count.csv',header=None)
cat
Out[112]:
In [78]:
In [ ]: