In [1]:
    
from bs4 import BeautifulSoup
import nltk
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
    
In [3]:
    
# parse sample TTML (timed-text) file into (begin, end, text) triples, one per <p> caption
datapath = ''
filename = 'example_ttml.xml'  # fixed: assignment was left incomplete (SyntaxError) and the variable was never used
# context manager closes the file handle after parsing (original leaked the open file object)
with open(datapath + filename) as f:
    soup = BeautifulSoup(f, 'html.parser')
result = [(p['begin'], p['end'], p.text) for p in soup.find_all('p')]
    
In [10]:
    
# inspect the parsed document (dumps the whole markup into the cell output)
soup
    
    Out[10]:
In [3]:
    
# concatenate every caption's text (third element of each triple) into one transcript string
raw = ' '.join(entry[2] for entry in result)
    
In [4]:
    
# tokenize and part-of-speech tag the transcript using the nltk library
# http://www.nltk.org/book/ch05.html
# to understand the meaning of tags: nltk.help.upenn_tagset()
# NOTE(review): assumes the required nltk models (punkt tokenizer, perceptron tagger)
# are already downloaded — confirm nltk.download() was run in this environment
text = nltk.word_tokenize(raw)
postags= nltk.pos_tag(text)
    
In [5]:
    
# turn the (token, POS-tag) pairs into a dataframe for the convenience of processing
df = pd.DataFrame(postags,columns =['word','type'])
    
In [6]:
    
# keep only noun tokens: any tag whose first two characters are 'NN' (NN, NNS, NNP, NNPS)
typepattern_prefix = ['NN']
mask = df['type'].str[:2].isin(typepattern_prefix)
filtered = df.loc[mask]
    
In [7]:
    
# plot word frequency as horizontal bars (words on the y-axis, counts on the x-axis)
ax = filtered['word'].value_counts().sort_values(ascending=True).plot.barh(figsize=(5, 10))
ax.set_xlabel('counts')  # fixed: on a barh plot the counts run along the x-axis, not the y-axis
ax.set_title('Word frequency', fontsize=16)
    
    Out[7]:
    
In [8]:
    
# one row per caption: start timestamp, end timestamp, spoken text
df2 = pd.DataFrame(result, columns=['sTimestamp', 'eTimestamp', 'words'])
# parse both timestamp columns from string to datetime
for ts_col in ('sTimestamp', 'eTimestamp'):
    df2[ts_col] = pd.to_datetime(df2[ts_col])
    
In [9]:
    
from datetime import datetime, timedelta  # timedelta is also used by a later cell; keep the import here

# duration of each caption in float seconds; .dt.total_seconds() is the idiomatic
# pandas form of dividing a timedelta column by timedelta(seconds=1)
df2['durSeconds'] = (df2['eTimestamp'] - df2['sTimestamp']).dt.total_seconds()
    
In [10]:
    
# words per caption line; vectorized .str accessor replaces the row-wise apply
# (identical result: both count the pieces of a split on a single space)
df2['wordcounts'] = df2['words'].str.split(' ').str.len()
    
In [11]:
    
# spot-check one random caption row (no random_state, so the row shown varies per run)
df2.sample()
    
    Out[11]:
In [12]:
    
# fastest and slowest caption lines by speech speed (words per second)
df2['speechSpeed'] = df2['wordcounts'] / df2['durSeconds']
vStart = df2['sTimestamp'].min()
# offset of each caption from the start of the video, in float seconds
df2['offsetVideoStart'] = (df2['sTimestamp'] - vStart) / timedelta(seconds=1)
# sort once and reuse (original sorted the whole frame twice)
by_speed = df2.sort_values(by=['speechSpeed'])
print('--------slowest spoken line:----------------')
print(by_speed.iloc[0])
print('--------fastest spoken line:----------------')
print(by_speed.iloc[-1])
    
    
In [13]:
    
# plot speech speed over the course of the video, annotating the fastest and slowest lines
fig, ax = plt.subplots(figsize=(12, 5))
df2['speechSpeed'] = df2['wordcounts'] / df2['durSeconds']
ax.plot(df2['offsetVideoStart'], df2['speechSpeed'], '--')
ax.set_ylabel('words / second')
ax.set_xlabel('time from the start of the video (seconds)')
# (label text, data point, text offset in points, vertical alignment)
annotation_specs = [
    ('"and I think that I could bring us a stem"', (43.45, 3.14465), (-30, -20), 'top'),
    ('"information management"', (41.74, 0.551116), (-30, 20), 'bottom'),
]
for label, point, offset, valign in annotation_specs:
    ax.annotate(label,
                xy=point, xycoords='data',
                xytext=offset, textcoords='offset points',
                arrowprops=dict(facecolor='black', shrink=0.05),
                horizontalalignment='right', verticalalignment=valign, size=14)
    
    Out[13]:
    
In [15]:
    
# number of caption lines
len(df2)
    
    Out[15]:
In [16]:
    
# full transcript as a single string (dumps the entire text into the cell output)
' '.join(df2["words"])
    
    Out[16]:
In [20]:
    
# example: search for a sentence in the joined transcript and map each match
# back to the caption rows (and hence timestamps) that contain it
from re import escape, finditer

#test1
#searchWords = 'hardware devices'
#test2
searchWords = 'i created a python script on a raspberry pi and mounted a webcam'
# transcript is built exactly like the per-row position accounting below:
# each row's words stripped, joined by a single space
transcript = ' '.join(df2["words"].str.strip())
# escape() searches the literal sentence; bare finditer would interpret
# regex metacharacters ((), ?, ., ...) in the search string
for match in finditer(escape(searchWords), transcript):
    # print matches
    print(match.span(), match.group())

    startPos, endPos = match.span()

    # find the line indexes of the start and end position of each match
    startLineIdx = -1
    endLineIdx = -1
    pos = 0
    for index, row in df2.iterrows():
        pos += len(row["words"].strip()) + 1  # 1 is the space added between lines
        if startLineIdx == -1 and startPos < pos:
            startLineIdx = index
        if endLineIdx == -1 and endPos < pos:
            endLineIdx = index
        # fixed: was `> 0`, which never broke early when the match started in row 0
        if startLineIdx >= 0 and endLineIdx >= 0:
            break

    # verify: show the caption rows spanned by the match
    print(df2.loc[startLineIdx:endLineIdx, ["sTimestamp", "words"]])
    
    
In [ ]: