notebook.community

Edit and run



In [96]:

    
# Flag the rows of df['ua_sdate'] whose text contains the literal "- 10:00am".
# The needle is plain text, so regex=False matches it verbatim (no regex parsing).
# NOTE(review): relies on `df` from the upcoming-auctions scrape cell having run first.
df['ua_sdate'].str.contains('- 10:00am', regex=False)









    Out[96]:





aid
325    True
326    True
331    True
328    True
329    True
Name: ua_sdate, dtype: bool



In [121]:

    
from urllib.request import urlopen
from bs4 import BeautifulSoup 
import pandas as pd

# Scrape the "previous auctions" listing into `pdf`: one row per auction,
# indexed by auction id, with the start date and start time in separate columns.

# Link prefix removed from each auction URL so only the trailing auction id is kept.
MAXANET_PREFIX = "https://www.maxanet.com/cgi-bin/mndetails.cgi?rosen"

# Clock time such as "10:00am", "1:00 pm" or "23:15:30" (single capture group).
TIME_PATTERN = '((?:(?:[0-1][0-9])|(?:[2][0-3])|(?:[0-9])):(?:[0-5][0-9])(?::[0-5][0-9])?(?:\\s?(?:am|AM|pm|PM))?)'

# NOTE(review): this host (rosensys.com) differs from the rosensystems.com host
# used for the upcoming-auctions scrape in this notebook -- confirm it is intended.
# The `with` block closes the HTTP response once the page has been parsed.
with urlopen("http://www.rosensys.com/previous-auctions") as html:
    soup = BeautifulSoup(html, "lxml")

table = soup.table
precords = []
for tr in table.find_all("tr"):
    trs = tr.find_all("td")
    # Rows lacking the four expected cells or the auction link would raise
    # IndexError/TypeError below, so they are skipped.
    if len(trs) < 4 or trs[3].a is None:
        continue
    ua_link = trs[3].a["href"]
    precords.append([
        ua_link.replace(MAXANET_PREFIX, ""),                          # aid
        trs[1].h4.text,                                               # ua_name
        trs[2].h5.text,                                               # ua_type
        trs[2].find("span", {"class": "date-display-single"}).text,  # ua_sdate
    ])

# Naming the columns at construction also works when no rows were scraped.
pdf = pd.DataFrame(data=precords, columns=["aid", "ua_name", "ua_type", "ua_sdate"])

# Index by auction id (reassignment instead of inplace=True keeps the cell re-runnable).
pdf = pdf.set_index("aid")

# Remove the remaining newlines from the auction type.
pdf['ua_type'] = pdf['ua_type'].str.replace('\n', "", regex=False)

# Pull the start time (e.g. "10:00am") out of the combined date/time text.
pdf['ua_stime'] = pdf['ua_sdate'].str.extract(TIME_PATTERN, expand=False)

# Remove the time and the " -" separator from the date.
# regex=True is required: since pandas 2.0 str.replace treats the pattern as a
# literal by default, which would silently leave the time in place.
pdf['ua_sdate'] = (
    pdf['ua_sdate']
    .str.replace(TIME_PATTERN, "", regex=True)
    .str.replace(" -", "", regex=False)
    .str.strip()  # drop the space left behind by the removed separator
)

pdf









    Out[121]:






  
    
      
      ua_name
      ua_type
      ua_sdate
      ua_stime
    
    
      aid
      
      
      
      
    
  
  
    
      325
      Late Model Automotive Repair Shop
      Online Only
      January 12, 2017
      10:00am
    
    
      327
      Extrusion Dies at PolyWrx
      Online Only
      January 11, 2017
      10:00am
    
    
      6
      Quick Sale - Impact Frac, LLC. BK Case #16-33612
      Online Only
      January 5, 2017
      10:00am
    
    
      20
      Quick Sale PolyWRX, LLC. formerly JMC Killion ...
      Online Only
      January 4, 2017
      10:00am
    
    
      324
      PolyWRX, LLC. formerly JMC Killion Laboratories
      Online Only
      December 20, 2016
      1:00pm
    
    
      322
      Pulsestream Internet Services BK Case #16-41442
      Online Only
      December 20, 2016
      10:00am
    
    
      323
      Collision Equipment Specialists
      Online Only
      December 15, 2016
      10:00am
    
    
      321
      J.L. Myers Company in Conjunction w/ Redrock G...
      Online Only
      December 14, 2016
      10:00am
    
    
      320
      CNC Machine Shop
      Online Only
      December 13, 2016
      10:00am
    
    
      319
      Impact Frac, LLC. BK Case #16-33612
      Online Only
      December 8, 2016
      10:00am



In [122]:

    
from urllib.request import urlopen
from bs4 import BeautifulSoup 
import pandas as pd

# Scrape the "upcoming auctions" listing into `df`: one row per auction,
# indexed by auction id, with the start date and start time in separate columns.

# Link prefix removed from each auction URL so only the trailing auction id is kept.
MAXANET_PREFIX = "https://www.maxanet.com/cgi-bin/mndetails.cgi?rosen"

# Clock time such as "10:00am", "1:00 pm" or "23:15:30" (single capture group).
TIME_PATTERN = '((?:(?:[0-1][0-9])|(?:[2][0-3])|(?:[0-9])):(?:[0-5][0-9])(?::[0-5][0-9])?(?:\\s?(?:am|AM|pm|PM))?)'

# The `with` block closes the HTTP response once the page has been parsed.
with urlopen("http://www.rosensystems.com/upcoming-auctions") as html:
    soup = BeautifulSoup(html, "lxml")

table = soup.table
records = []
for tr in table.find_all("tr"):
    trs = tr.find_all("td")
    # Rows lacking the four expected cells or the auction link would raise
    # IndexError/TypeError below, so they are skipped.
    if len(trs) < 4 or trs[3].a is None:
        continue
    ua_link = trs[3].a["href"]
    records.append([
        ua_link.replace(MAXANET_PREFIX, ""),                          # aid
        trs[1].h4.text,                                               # ua_name
        trs[2].h5.text,                                               # ua_type
        trs[2].find("span", {"class": "date-display-single"}).text,  # ua_sdate
    ])

# Naming the columns at construction also works when no rows were scraped.
df = pd.DataFrame(data=records, columns=["aid", "ua_name", "ua_type", "ua_sdate"])

# Index by auction id (reassignment instead of inplace=True keeps the cell re-runnable).
df = df.set_index("aid")

# Remove the remaining newlines from the auction type.
df['ua_type'] = df['ua_type'].str.replace('\n', "", regex=False)

# Pull the start time (e.g. "10:00am") out of the combined date/time text.
df['ua_stime'] = df['ua_sdate'].str.extract(TIME_PATTERN, expand=False)

# Remove the time and the " -" separator from the date.
# regex=True is required: since pandas 2.0 str.replace treats the pattern as a
# literal by default, which would silently leave the time in place.
df['ua_sdate'] = (
    df['ua_sdate']
    .str.replace(TIME_PATTERN, "", regex=True)
    .str.replace(" -", "", regex=False)
    .str.strip()  # drop the space left behind by the removed separator
)

df









    Out[122]:






  
    
      
      ua_name
      ua_type
      ua_sdate
      ua_stime
    
    
      aid
      
      
      
      
    
  
  
    
      325
      Late Model Automotive Repair Shop
      Online Only
      January 12, 2017
      10:00am
    
    
      326
      Assets formerly of Megas Production
      Online Only
      January 17, 2017
      10:00am
    
    
      331
      January Auction eXchange
      Online Only
      January 19, 2017
      10:00am
    
    
      328
      Frisco International
      Online Only
      January 26, 2017
      10:00am
    
    
      329
      FBC Enterprises, LLC. dba Custom Graphic Servi...
      Online Only
      January 31, 2017
      10:00am



In [ ]:

    
# Demo: split a "- 10:00am" style suffix into dash, whitespace and clock time.
# URL that generated this code:
# http://txt2re.com/index-python.php3?s=-%2010:00am&-16&8&1

import re

txt='- 10:00am'

re1='(-)'	# Literal dash
re2='(\\s+)'	# White Space 1
re3='((?:(?:[0-1][0-9])|(?:[2][0-3])|(?:[0-9])):(?:[0-5][0-9])(?::[0-5][0-9])?(?:\\s?(?:am|AM|pm|PM))?)'	# HourMinuteSec 1

rg = re.compile(re1+re2+re3,re.IGNORECASE|re.DOTALL)
m = rg.search(txt)
if m:
    c1=m.group(1)
    ws1=m.group(2)
    time1=m.group(3)
    # print() as a function: the generated Python 2 print statement is a
    # SyntaxError under the Python 3 kernel the rest of this notebook uses.
    print("("+c1+")"+"("+ws1+")"+"("+time1+")"+"\n")

#-----
# Paste the code into a new Python file. Then in Unix:
# $ python x.py 
#-----

	ua_name	ua_type	ua_sdate	ua_stime
aid
325	Late Model Automotive Repair Shop	Online Only	January 12, 2017	10:00am
327	Extrusion Dies at PolyWrx	Online Only	January 11, 2017	10:00am
6	Quick Sale - Impact Frac, LLC. BK Case #16-33612	Online Only	January 5, 2017	10:00am
20	Quick Sale PolyWRX, LLC. formerly JMC Killion ...	Online Only	January 4, 2017	10:00am
324	PolyWRX, LLC. formerly JMC Killion Laboratories	Online Only	December 20, 2016	1:00pm
322	Pulsestream Internet Services BK Case #16-41442	Online Only	December 20, 2016	10:00am
323	Collision Equipment Specialists	Online Only	December 15, 2016	10:00am
321	J.L. Myers Company in Conjunction w/ Redrock G...	Online Only	December 14, 2016	10:00am
320	CNC Machine Shop	Online Only	December 13, 2016	10:00am
319	Impact Frac, LLC. BK Case #16-33612	Online Only	December 8, 2016	10:00am