In [96]:
# Boolean Series: True for every row of df whose 'ua_sdate' text contains
# the literal fragment "- 10:00am".
# NOTE(review): `df` is not created in this cell; the only definition visible
# in this notebook is in the upcoming-auctions cell further down, so that cell
# must have been executed first.
df['ua_sdate'].str.contains(
    pat='- 10:00am',
    regex=True,
)
Out[96]:
In [121]:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import pandas as pd
# Scrape the "previous auctions" listing into the DataFrame `pdf`, indexed by
# auction id (`aid`) with the text columns ua_name, ua_type, ua_sdate and
# ua_stime (the start time split out of ua_sdate).
# NOTE(review): this cell fetches from rosensys.com while the upcoming-auctions
# cell fetches from rosensystems.com -- confirm that both hosts are intended.

# Matches a clock time such as "10:00am", "9:30 PM" or "23:15:00".
TIME_PATTERN = r'((?:(?:[0-1][0-9])|(?:[2][0-3])|(?:[0-9])):(?:[0-5][0-9])(?::[0-5][0-9])?(?:\s?(?:am|AM|pm|PM))?)'
# Prefix removed from each detail link so that only the auction id remains.
DETAIL_URL_PREFIX = "https://www.maxanet.com/cgi-bin/mndetails.cgi?rosen"

# Parse inside the `with` block so the HTTP response is closed once it is read.
with urlopen("http://www.rosensys.com/previous-auctions") as html:
    soup = BeautifulSoup(html, "lxml")

table = soup.table
if table is None:
    raise ValueError("no <table> element found on the previous-auctions page")

precords = []
for tr in table.find_all("tr"):
    trs = tr.find_all("td")
    # Rows without at least four <td> cells (e.g. a header row built from
    # <th>) cannot be indexed below, so skip them instead of raising IndexError.
    if len(trs) < 4:
        continue
    ua_link = trs[3].a["href"]
    precords.append([
        ua_link.replace(DETAIL_URL_PREFIX, ""),
        trs[1].h4.text,
        trs[2].h5.text,
        trs[2].find("span", {"class": "date-display-single"}).text,
    ])

# Passing `columns=` here (rather than assigning afterwards) also works when
# no rows were scraped.
pdf = pd.DataFrame(
    data=precords,
    columns=["aid", "ua_name", "ua_type", "ua_sdate"],
).set_index("aid")

# Remove the remaining newline characters from the auction type.
pdf['ua_type'] = pdf['ua_type'].str.replace('\n', "", regex=False)

# Copy the start time out of the combined date/time text.
pdf['ua_stime'] = pdf['ua_sdate'].str.extract(TIME_PATTERN, expand=False)

# Strip the time and the " -" separator from the date text.
# regex=True is required: since pandas 2.0 str.replace treats the pattern as a
# literal string by default, which would leave the time in place.
pdf['ua_sdate'] = (
    pdf['ua_sdate']
    .str.replace(TIME_PATTERN, "", regex=True)
    .str.replace(" -", "", regex=False)
    .str.strip()  # the two removals above leave a dangling space behind
)
pdf
Out[121]:
In [122]:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import pandas as pd
# Scrape the "upcoming auctions" listing into the DataFrame `df`, indexed by
# auction id (`aid`) with the text columns ua_name, ua_type, ua_sdate and
# ua_stime (the start time split out of ua_sdate).

# Matches a clock time such as "10:00am", "9:30 PM" or "23:15:00".
TIME_PATTERN = r'((?:(?:[0-1][0-9])|(?:[2][0-3])|(?:[0-9])):(?:[0-5][0-9])(?::[0-5][0-9])?(?:\s?(?:am|AM|pm|PM))?)'
# Prefix removed from each detail link so that only the auction id remains.
DETAIL_URL_PREFIX = "https://www.maxanet.com/cgi-bin/mndetails.cgi?rosen"

# Parse inside the `with` block so the HTTP response is closed once it is read.
with urlopen("http://www.rosensystems.com/upcoming-auctions") as html:
    soup = BeautifulSoup(html, "lxml")

table = soup.table
if table is None:
    raise ValueError("no <table> element found on the upcoming-auctions page")

records = []
for tr in table.find_all("tr"):
    trs = tr.find_all("td")
    # Rows without at least four <td> cells (e.g. a header row built from
    # <th>) cannot be indexed below, so skip them instead of raising IndexError.
    if len(trs) < 4:
        continue
    ua_link = trs[3].a["href"]
    records.append([
        ua_link.replace(DETAIL_URL_PREFIX, ""),
        trs[1].h4.text,
        trs[2].h5.text,
        trs[2].find("span", {"class": "date-display-single"}).text,
    ])

# Passing `columns=` here (rather than assigning afterwards) also works when
# no rows were scraped.
df = pd.DataFrame(
    data=records,
    columns=["aid", "ua_name", "ua_type", "ua_sdate"],
).set_index("aid")

# Remove the remaining newline characters from the auction type.
df['ua_type'] = df['ua_type'].str.replace('\n', "", regex=False)

# Copy the start time out of the combined date/time text.
df['ua_stime'] = df['ua_sdate'].str.extract(TIME_PATTERN, expand=False)

# Strip the time and the " -" separator from the date text.
# regex=True is required: since pandas 2.0 str.replace treats the pattern as a
# literal string by default, which would leave the time in place.
df['ua_sdate'] = (
    df['ua_sdate']
    .str.replace(TIME_PATTERN, "", regex=True)
    .str.replace(" -", "", regex=False)
    .str.strip()  # the two removals above leave a dangling space behind
)
df
Out[122]:
In [ ]:
#!/usr/bin/python
# URL that generated this code:
# http://txt2re.com/index-python.php3?s=-%2010:00am&-16&8&1
import re
# Demonstrates splitting a "- <time>" fragment into three capture groups:
# the dash, the whitespace after it, and the clock time.
txt = '- 10:00am'

re1 = '(-)'     # A literal dash
re2 = r'(\s+)'  # White Space 1
re3 = r'((?:(?:[0-1][0-9])|(?:[2][0-3])|(?:[0-9])):(?:[0-5][0-9])(?::[0-5][0-9])?(?:\s?(?:am|AM|pm|PM))?)'  # HourMinuteSec 1

rg = re.compile(re1 + re2 + re3, re.IGNORECASE | re.DOTALL)
m = rg.search(txt)
if m:
    c1 = m.group(1)
    ws1 = m.group(2)
    time1 = m.group(3)
    # print() must be called as a function on Python 3; the generated code used
    # the Python 2 print statement, which is a SyntaxError there. The explicit
    # "\n" keeps the blank line the generated code emitted after the result.
    print("(" + c1 + ")" + "(" + ws1 + ")" + "(" + time1 + ")" + "\n")
#-----
# Paste the code into a new Python file. Then in Unix:
# $ python x.py
#-----