In [ ]:
from bs4 import BeautifulSoup
import pandas as pd
In [ ]:
# This could link to a database in the future.
# Alternative input kept for local testing:
# file = open("data/test.html",'r')
# The context manager closes the file even if read() raises, so the handle
# is never leaked; `file` and `data` remain bound for later cells.
with open("data/Announcements.txt", 'r') as file:
    data = file.read()
# Parse the raw markup with the lxml parser.
soup = BeautifulSoup(data, "lxml")
In [ ]:
# BeautifulSoup finds each email's information in <table class="content">...</table>.
val = soup.find_all('table', class_="content")
# These two CSS selectors are tried in order to collect the body of a
# SharePoint email; the first one that matches anything wins.
selectors = ['tr div div', 'td div']
# Accumulate (title, body) tuples in a plain list and build the DataFrame
# once at the end: DataFrame.append was removed in pandas 2.0, and growing a
# frame row by row copies it on every iteration.
rows = []
for content in val:
    # NOTE(review): find() returns None when no <td class="formbody"> exists,
    # in which case title.text below raises AttributeError (same as before).
    title = content.find('td', class_='formbody')
    for selector in selectors:
        body = content.select(selector)
        # Truthiness check replaces `len(body) is not 0`, which compared
        # identity against an int literal.
        if body:
            rows.append((title.text.strip(), body[0].text.strip()))
            break
# Passing the column names here also works when no announcement matched;
# assigning df.columns on a column-less frame raised ValueError. Rows get a
# sequential 0..n-1 index instead of every row being labelled 0.
df = pd.DataFrame(rows, columns=['Title', 'Body'])
In [ ]:
# Write the extracted Title/Body table to a CSV file in the working directory.
df.to_csv(path_or_buf="announcement_data.csv")
In [ ]: