notebook.community

Edit and run



In [ ]:

    
from bs4 import BeautifulSoup
import pandas as pd



In [ ]:

    
# Load the raw announcements export and parse it into a BeautifulSoup tree.
# This could link to a database in the future.
# Alternate input path left over from an earlier version: "data/test.html"
# NOTE(review): no encoding is passed, so the platform default is used -- confirm
# the export's encoding and pass it explicitly if it is known.
with open("data/Announcements.txt", 'r') as file:
    # The context manager closes the handle even if read() raises.
    data = file.read()
# Parsing happens after the file is closed; only the in-memory text is needed.
soup = BeautifulSoup(data, "lxml")



In [ ]:

    
# BeautifulSoup finds each email's information inside <table class="content">...</table>.
val = soup.find_all('table', class_="content")
# These two CSS selectors are tried in order to locate the body of a SharePoint
# email; the first selector that matches anything is used for that table.
selectors = ['tr div div', 'td div']
# Collect (title, body) pairs in a plain list and build the DataFrame once at the
# end: DataFrame.append was removed in pandas 2.0, and growing a frame row by row
# copies it on every iteration.
records = []
for content in val:
    title = content.find('td', class_='formbody')
    # A table without a 'formbody' cell yields an empty title rather than an
    # AttributeError on None.
    title_text = title.text.strip() if title is not None else ''
    for selector in selectors:
        body = content.select(selector)
        # Truthiness check: select() returns an empty result when nothing matches.
        if body:
            records.append((title_text, body[0].text.strip()))
            break
# Passing the column names here keeps the frame well-formed even when no table
# matched (zero rows, two columns). The index is a fresh 0..n-1 range.
df = pd.DataFrame(records, columns=['Title', 'Body'])



In [ ]:

    
# Persist the extracted Title/Body table as CSV in the working directory.
# No other options are passed, so pandas' defaults apply (the index is written too).
df.to_csv(path_or_buf="announcement_data.csv")



In [ ]: