In [1]:
from bs4 import BeautifulSoup
import nltk
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
In [3]:
# Parse the sample TTML subtitle file into (begin, end, text) tuples.
datapath = ''
# was `filename =` — an incomplete assignment (SyntaxError); define it and use it
filename = 'example_ttml.xml'
with open(datapath + filename) as ttml_file:  # close the handle (was leaked)
    soup = BeautifulSoup(ttml_file, 'html.parser')
result = [(p['begin'], p['end'], p.text) for p in soup.find_all('p')]
In [10]:
# Inspect the parsed markup (NOTE: dumps the entire document into the output).
soup
Out[10]:
In [3]:
# Concatenate every caption line's text into one raw transcript string.
raw = ' '.join(line_text for _, _, line_text in result)
In [4]:
# Tokenize and part-of-speech tag the transcript with NLTK.
# Reference: http://www.nltk.org/book/ch05.html
# Run nltk.help.upenn_tagset() to see what each tag means.
tokens = nltk.word_tokenize(raw)
postags = nltk.pos_tag(tokens)
In [5]:
# Load the (word, tag) pairs into a DataFrame for convenient filtering.
df = pd.DataFrame.from_records(postags, columns=['word', 'type'])
In [6]:
# Keep only nouns: every Penn Treebank noun tag (NN, NNS, NNP, NNPS)
# shares the two-character prefix 'NN'.
noun_prefixes = ['NN']
tag_prefix = df.type.str.slice(0, 2)
filtered = df[tag_prefix.isin(noun_prefixes)]
In [7]:
# Plot noun frequency as horizontal bars (most frequent at the bottom).
word_counts = filtered['word'].value_counts().sort_values(ascending=True)
ax = word_counts.plot.barh(figsize=(5, 10))
# barh puts the counts on the x-axis and the words on the y-axis;
# the original labelled the y-axis 'counts', which was wrong.
ax.set_xlabel('counts')
ax.set_ylabel('word')
ax.set_title('Word frequency', fontsize=16);  # ';' suppresses the Text repr
Out[7]:
In [8]:
# Line-level DataFrame: start/end timestamps plus the spoken words.
df2 = pd.DataFrame(result, columns=['sTimestamp', 'eTimestamp', 'words'])
for ts_col in ('sTimestamp', 'eTimestamp'):
    df2[ts_col] = pd.to_datetime(df2[ts_col])
In [9]:
from datetime import datetime, timedelta  # timedelta is reused by later cells

# Duration of each caption line as float seconds
# (.dt.total_seconds() is equivalent to dividing by timedelta(seconds=1)).
df2['durSeconds'] = (df2['eTimestamp'] - df2['sTimestamp']).dt.total_seconds()
In [10]:
# Words per line; vectorized .str accessor replaces the slower row-wise
# .apply(lambda ...) and yields the same len(words.split(' ')) values.
df2['wordcounts'] = df2['words'].str.split(' ').str.len()
In [11]:
# Spot-check one random row of the line-level table.
df2.sample()
Out[11]:
In [12]:
# Fastest and slowest caption line by speech speed (words per second).
# NOTE(review): a zero-duration line would give an inf speed — assumed absent.
df2['speechSpeed'] = df2['wordcounts'] / df2['durSeconds']
vStart = df2['sTimestamp'].min()
df2['offsetVideoStart'] = (df2['sTimestamp'] - vStart) / timedelta(seconds=1)
# idxmin/idxmax locate the extreme rows directly instead of sorting the
# whole frame twice just to read off its first and last row.
print('--------slowest spoken line:----------------')
print(df2.loc[df2['speechSpeed'].idxmin()])
print('--------fastest spoken line:----------------')
print(df2.loc[df2['speechSpeed'].idxmax()])
In [13]:
# Plot speech speed over time, with callouts for the fastest and slowest
# lines (the original comment was copy-pasted from the previous cell, and
# speechSpeed was redundantly recomputed — it is already set in In [12]).
fig, ax = plt.subplots(figsize=(12, 5))
ax.plot(df2['offsetVideoStart'], df2['speechSpeed'], '--')
ax.set_ylabel('words / second')
ax.set_xlabel('time from the start of the video (seconds)')
# xy points hard-coded from the extremes printed by the previous cell
callouts = [
    ('"and I think that I could bring us a stem"',
     (43.45, 3.14465), (-30, -20), 'top'),
    ('"information management"',
     (41.74, 0.551116), (-30, 20), 'bottom'),
]
for label, xy, offset, valign in callouts:
    ax.annotate(label,
                xy=xy, xycoords='data',
                xytext=offset, textcoords='offset points',
                arrowprops=dict(facecolor='black', shrink=0.05),
                horizontalalignment='right', verticalalignment=valign, size=14)
Out[13]:
In [15]:
# Number of caption lines.
len(df2)
Out[15]:
In [16]:
# Full transcript as a single space-joined string (dumps the whole text).
df2["words"].str.cat(sep=' ')
Out[16]:
In [20]:
# Example: find a phrase in the transcript and map it back to caption lines.
from re import finditer


def find_line_span(df, start_pos, end_pos):
    """Map character positions in the space-joined, stripped transcript back
    to DataFrame row indexes.

    Returns (startLineIdx, endLineIdx): the indexes of the rows containing
    start_pos and end_pos respectively, or -1 if a position lies past the end.
    """
    start_idx, end_idx = -1, -1
    pos = 0
    for index, row in df.iterrows():
        pos += len(row["words"].strip()) + 1  # +1: the space joining lines
        if start_idx == -1 and start_pos < pos:
            start_idx = index
        if end_idx == -1 and end_pos < pos:
            end_idx = index
        # Original early-exit test was `> 0`, which never triggered when the
        # match started on row index 0; `!= -1` exits early for every row.
        if start_idx != -1 and end_idx != -1:
            break
    return start_idx, end_idx


# NOTE(review): the phrase is used as a regular expression; escape it
# (re.escape) if it may ever contain regex metacharacters.
searchWords = 'i created a python script on a raspberry pi and mounted a webcam'
transcript = ' '.join(df2["words"].str.strip())
for match in finditer(searchWords, transcript):
    print(match.span(), match.group())
    startLineIdx, endLineIdx = find_line_span(df2, *match.span())
    # verify: show the matched lines with their start timestamps
    print(df2.loc[startLineIdx:endLineIdx, ["sTimestamp", "words"]])
In [ ]: