In [86]:
from bs4 import BeautifulSoup # For processing XML
import nltk
import re
In [87]:
# Read the XML source inside a context manager so the file handle is closed
# as soon as its contents are in memory.
with open('garden-party.xml') as xmlFile:
    doc = xmlFile.read()
# NOTE(review): per the Beautiful Soup docs, 'lxml' selects lxml's HTML parser
# ('lxml-xml' / 'xml' is the XML one) — confirm this is the intended parser.
soup = BeautifulSoup(doc, 'lxml')
In [88]:
# Collect every <seg> element in the parsed document.
# find_all is the current Beautiful Soup name; findAll is a legacy alias.
segs = soup.find_all('seg')
In [89]:
# Concatenate the text of every <seg> element, each followed by one space.
# str.join builds the string in a single pass instead of re-copying it on
# every `+=`.
text = "".join(seg.text + " " for seg in segs)
In [90]:
def cleanText(text):
    """Collapse every run of whitespace in ``text`` into a single space.

    Newlines and tabs count as whitespace, so they are normalised by the same
    substitution. Leading and trailing whitespace is removed.

    Args:
        text: The string to normalise.

    Returns:
        The normalised string.
    """
    # Raw string: '\s' in a plain literal is an invalid escape sequence and
    # triggers a warning on recent Python versions.
    return re.sub(r'\s+', ' ', text).strip()
text = cleanText(text)  # normalise whitespace before sentence tokenisation
In [91]:
# NOTE(review): sent_tokenize relies on NLTK's Punkt tokenizer data being
# installed — confirm the environment provides it.
sents = nltk.sent_tokenize(text) # break the text up into a list of sentences
In [92]:
len(sents) # total number of sentences produced by the tokenizer
Out[92]:
In [85]:
# Sentences whose first word is exactly "And". The \b word boundary keeps
# longer words that merely begin with those letters (e.g. "Andrew") from
# being counted.
ands = [sent for sent in sents if re.match(r'And\b', sent)]
ands # sentences that start with "And"
Out[85]:
In [81]:
len(ands) # how many sentences start with "And"
Out[81]:
In [84]:
# Percentage of sentences that start with "And". An empty sentence list
# yields 0.0 instead of raising ZeroDivisionError.
proportionOfAnds = (len(ands) / len(sents)) * 100 if sents else 0.0
proportionOfAnds # percentage of sentences that start with "And"
Out[84]:
Now let's find sentences that begin with "But".
In [94]:
# Sentences whose first word is exactly "But". The \b word boundary keeps
# longer words that merely begin with those letters (e.g. "Butler",
# "Butter") from being counted.
buts = [sent for sent in sents if re.match(r'But\b', sent)]
buts # sentences that start with "But"
Out[94]:
In [95]:
len(buts) # how many sentences start with "But"
Out[95]:
In [96]:
# Percentage of sentences that start with "But". An empty sentence list
# yields 0.0 instead of raising ZeroDivisionError.
proportionOfButs = (len(buts) / len(sents)) * 100 if sents else 0.0
proportionOfButs # percentage of sentences that start with "But"
Out[96]:
In [ ]: