In [1]:
    
from bs4 import BeautifulSoup
import nltk
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
    
In [3]:
    
# parse sample TTML (timed-text) file into (begin, end, text) triples, one per <p> caption
datapath = ''
filename = 'example_ttml.xml'  # fixed: assignment was left incomplete (SyntaxError) and the variable was never used
# context manager closes the file handle after parsing (original leaked the open file object)
with open(datapath + filename) as f:
    soup = BeautifulSoup(f, 'html.parser')
result = [(p['begin'], p['end'], p.text) for p in soup.find_all('p')]
    
In [10]:
    
# inspect the parsed document (dumps the whole markup into the cell output)
soup
    
    Out[10]:
In [3]:
    
# concatenate every caption's text (third element of each triple) into one transcript string
raw = ' '.join(entry[2] for entry in result)
    
In [4]:
    
# tokenize and part-of-speech tag the transcript using the nltk library
# http://www.nltk.org/book/ch05.html
# to understand the meaning of tags: nltk.help.upenn_tagset()
# NOTE(review): assumes the required nltk models (punkt tokenizer, perceptron tagger)
# are already downloaded — confirm nltk.download() was run in this environment
text = nltk.word_tokenize(raw)
postags= nltk.pos_tag(text)
    
In [5]:
    
# turn the (token, POS-tag) pairs into a dataframe for the convenience of processing
df = pd.DataFrame(postags,columns =['word','type'])
    
In [6]:
    
# keep only noun tokens: any tag whose first two characters are 'NN' (NN, NNS, NNP, NNPS)
typepattern_prefix = ['NN']
mask = df['type'].str[:2].isin(typepattern_prefix)
filtered = df.loc[mask]
    
In [7]:
    
# plot word frequency as horizontal bars (words on the y-axis, counts on the x-axis)
ax = filtered['word'].value_counts().sort_values(ascending=True).plot.barh(figsize=(5, 10))
ax.set_xlabel('counts')  # fixed: on a barh plot the counts run along the x-axis, not the y-axis
ax.set_title('Word frequency', fontsize=16)
    
    Out[7]:
    
In [8]:
    
# one row per caption: start timestamp, end timestamp, spoken text
df2 = pd.DataFrame(result, columns=['sTimestamp', 'eTimestamp', 'words'])
# parse both timestamp columns from string to datetime
for ts_col in ('sTimestamp', 'eTimestamp'):
    df2[ts_col] = pd.to_datetime(df2[ts_col])
    
In [9]:
    
from datetime import datetime, timedelta  # timedelta is also used by a later cell; keep the import here

# duration of each caption in float seconds; .dt.total_seconds() is the idiomatic
# pandas form of dividing a timedelta column by timedelta(seconds=1)
df2['durSeconds'] = (df2['eTimestamp'] - df2['sTimestamp']).dt.total_seconds()
    
In [10]:
    
# words per caption line; vectorized .str accessor replaces the row-wise apply
# (identical result: both count the pieces of a split on a single space)
df2['wordcounts'] = df2['words'].str.split(' ').str.len()
    
In [11]:
    
# spot-check one random caption row (no random_state, so the row shown varies per run)
df2.sample()
    
    Out[11]:
In [12]:
    
# fastest and slowest caption lines by speech speed (words per second)
df2['speechSpeed'] = df2['wordcounts'] / df2['durSeconds']
vStart = df2['sTimestamp'].min()
# offset of each caption from the start of the video, in float seconds
df2['offsetVideoStart'] = (df2['sTimestamp'] - vStart) / timedelta(seconds=1)
# sort once and reuse (original sorted the whole frame twice)
by_speed = df2.sort_values(by=['speechSpeed'])
print('--------slowest spoken line:----------------')
print(by_speed.iloc[0])
print('--------fastest spoken line:----------------')
print(by_speed.iloc[-1])
    
    
In [13]:
    
# plot speech speed over the course of the video, annotating the fastest and slowest lines
fig, ax = plt.subplots(figsize=(12, 5))
df2['speechSpeed'] = df2['wordcounts'] / df2['durSeconds']
ax.plot(df2['offsetVideoStart'], df2['speechSpeed'], '--')
ax.set_ylabel('words / second')
ax.set_xlabel('time from the start of the video (seconds)')
# (label text, data point, text offset in points, vertical alignment)
annotation_specs = [
    ('"and I think that I could bring us a stem"', (43.45, 3.14465), (-30, -20), 'top'),
    ('"information management"', (41.74, 0.551116), (-30, 20), 'bottom'),
]
for label, point, offset, valign in annotation_specs:
    ax.annotate(label,
                xy=point, xycoords='data',
                xytext=offset, textcoords='offset points',
                arrowprops=dict(facecolor='black', shrink=0.05),
                horizontalalignment='right', verticalalignment=valign, size=14)
    
    Out[13]:
    
In [15]:
    
# number of caption lines
len(df2)
    
    Out[15]:
In [16]:
    
# full transcript as a single string (dumps the entire text into the cell output)
' '.join(df2["words"])
    
    Out[16]:
In [20]:
    
# example: search for a sentence in the joined transcript and map each match
# back to the caption rows (and hence timestamps) that contain it
from re import escape, finditer

#test1
#searchWords = 'hardware devices'
#test2
searchWords = 'i created a python script on a raspberry pi and mounted a webcam'
# transcript is built exactly like the per-row position accounting below:
# each row's words stripped, joined by a single space
transcript = ' '.join(df2["words"].str.strip())
# escape() searches the literal sentence; bare finditer would interpret
# regex metacharacters ((), ?, ., ...) in the search string
for match in finditer(escape(searchWords), transcript):
    # print matches
    print(match.span(), match.group())

    startPos, endPos = match.span()

    # find the line indexes of the start and end position of each match
    startLineIdx = -1
    endLineIdx = -1
    pos = 0
    for index, row in df2.iterrows():
        pos += len(row["words"].strip()) + 1  # 1 is the space added between lines
        if startLineIdx == -1 and startPos < pos:
            startLineIdx = index
        if endLineIdx == -1 and endPos < pos:
            endLineIdx = index
        # fixed: was `> 0`, which never broke early when the match started in row 0
        if startLineIdx >= 0 and endLineIdx >= 0:
            break

    # verify: show the caption rows spanned by the match
    print(df2.loc[startLineIdx:endLineIdx, ["sTimestamp", "words"]])
    
    
In [ ]: