In [1]:
from bs4 import BeautifulSoup
import pandas as pd
import os

In [6]:
# Parse every transcript XML file found in `datapath` into a Python dictionary.
# Keys are the file names; values are lists of (begin, end, text) tuples, one
# tuple per <p> element of the file.
datapath = "./transcripts/"
transcripts = {}

for filename in os.listdir(datapath):
    if not filename.endswith(".xml"):
        continue
    # The context manager closes the handle as soon as parsing is done (the
    # previous bare open() call leaked one open file per transcript).
    # Binary mode lets BeautifulSoup detect the document's encoding itself
    # instead of depending on the platform's default text encoding.
    with open(os.path.join(datapath, filename), "rb") as xml_file:
        soup = BeautifulSoup(xml_file, 'html.parser')
    # 'begin' / 'end' are attributes of each <p>; the spoken words are its text.
    transcripts[filename] = [(p['begin'], p['end'], p.text) for p in soup.find_all('p')]

In [56]:
# Load the parsed transcripts dict into a single pandas DataFrame.
#
# One small frame is built per transcript file and all of them are combined
# with a single pd.concat call at the end. Growing a frame with
# DataFrame.append inside the loop re-copies all accumulated rows on every
# iteration, and DataFrame.append no longer exists in pandas >= 2.0.
transcript_frames = []
for transcript in sorted(transcripts):  # may want to limit the list for convenience/testing
    df2 = pd.DataFrame(transcripts[transcript], columns=['sTimestamp', 'eTimestamp', 'words'])
    # Words don't always seem to line up with the video, so the start time is
    # rounded to the nearest second. Let's see if this works well overall.
    df2['sTime'] = pd.to_datetime(df2['sTimestamp']).dt.round('s').dt.strftime("%Hh%Mm%Ss")
    # Keep the full file name so each row can be traced back to the file it
    # came from (e.g. in case the transcript format changes later).
    df2['videoId'] = transcript
    # Strip the file extension to get the bare ID used in the URL.
    videoId_strip = os.path.splitext(transcript)[0]
    # Create the YouTube permalink for sharing at the specified time.
    df2['share_url'] = "https://youtu.be/" + videoId_strip + "?t=" + df2['sTime']
    transcript_frames.append(df2)

if transcript_frames:
    # No ignore_index: row labels restart at 0 for every file, exactly as the
    # previous append-based loop produced them.
    transcripts_pd = pd.concat(transcript_frames)
else:
    # pd.concat raises on an empty list; fall back to an empty frame that still
    # has the expected columns so later cells can reference them.
    transcripts_pd = pd.DataFrame(
        columns=['sTimestamp', 'eTimestamp', 'words', 'sTime', 'videoId', 'share_url']
    )

In [57]:
# Preview the first five rows of the combined transcript frame.
transcripts_pd.head(n=5)


Out[57]:
sTimestamp eTimestamp words sTime videoId share_url
0 00:00:00.000 00:00:09.960 I have a night class in graph theory 00h00m00s -Q9iAXpWxjA.xml https://youtu.be/-Q9iAXpWxjA?t=00h00m00s
1 00:00:07.830 00:00:12.000 that I hate because it's a night class 00h00m08s -Q9iAXpWxjA.xml https://youtu.be/-Q9iAXpWxjA?t=00h00m08s
2 00:00:09.960 00:00:14.610 and it's filled with algorithms that are 00h00m10s -Q9iAXpWxjA.xml https://youtu.be/-Q9iAXpWxjA?t=00h00m10s
3 00:00:12.000 00:00:17.460 hard to pronounce but last week's class 00h00m12s -Q9iAXpWxjA.xml https://youtu.be/-Q9iAXpWxjA?t=00h00m12s
4 00:00:14.610 00:00:19.380 was particularly weird after the lecture 00h00m15s -Q9iAXpWxjA.xml https://youtu.be/-Q9iAXpWxjA?t=00h00m15s

search function


In [74]:
def search_string_cols(df, string, col, case=True, regex=True):
    """Return the rows of ``df`` whose ``col`` value contains ``string``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to search.
    string : str
        Text to look for. It is interpreted as a regular expression unless
        ``regex=False`` is passed, so characters such as ``.`` or ``(`` are
        special by default.
    col : str
        Name of the string column to search.
    case : bool, default True
        If False, the match ignores upper/lower case.
    regex : bool, default True
        If False, ``string`` is matched literally instead of as a pattern.

    Returns
    -------
    pandas.DataFrame
        A new frame holding only the matching rows, with their original index
        labels. Rows whose ``col`` value is missing never match.
    """
    # na=False makes missing values count as "no match" instead of propagating
    # NaN into the boolean mask (which would break the row selection).
    mask = df[col].str.contains(string, case=case, regex=regex, na=False)
    # .copy() hands the caller an independent frame that is safe to modify.
    return df.loc[mask].copy()

# Find every transcript row whose 'words' text contains 'Chicago'.
results = search_string_cols(df=transcripts_pd, string='Chicago', col='words')

In [75]:
# Preview the first five matching rows.
results.head(n=5)


Out[75]:
sTimestamp eTimestamp words sTime videoId share_url
37 00:01:28.350 00:01:33.750 different hats clerks here in Chicago 00h01m28s -l7k2T1nvow.xml https://youtu.be/-l7k2T1nvow?t=00h01m28s
151 00:05:53.690 00:05:59.030 your office to me in Chicago or even one 00h05m54s -l7k2T1nvow.xml https://youtu.be/-l7k2T1nvow?t=00h05m54s
12 00:00:37.380 00:00:41.489 general for the city of Chicago and if 00h00m37s 1fQWo1BmwAc.xml https://youtu.be/1fQWo1BmwAc?t=00h00m37s
16 00:00:46.530 00:00:51.809 General of the city of Chicago and my 00h00m47s 1fQWo1BmwAc.xml https://youtu.be/1fQWo1BmwAc?t=00h00m47s
23 00:01:05.430 00:01:09.390 all of you follow Chicago politics 31 00h01m05s 1fQWo1BmwAc.xml https://youtu.be/1fQWo1BmwAc?t=00h01m05s

In [79]:
# Show the share link of every matching row.
results.loc[:, 'share_url']


Out[79]:
37     https://youtu.be/-l7k2T1nvow?t=00h01m28s
151    https://youtu.be/-l7k2T1nvow?t=00h05m54s
12     https://youtu.be/1fQWo1BmwAc?t=00h00m37s
16     https://youtu.be/1fQWo1BmwAc?t=00h00m47s
23     https://youtu.be/1fQWo1BmwAc?t=00h01m05s
25     https://youtu.be/1fQWo1BmwAc?t=00h01m09s
37     https://youtu.be/1fQWo1BmwAc?t=00h01m45s
69     https://youtu.be/1fQWo1BmwAc?t=00h03m00s
155    https://youtu.be/1fQWo1BmwAc?t=00h06m21s
158    https://youtu.be/1fQWo1BmwAc?t=00h06m27s
256    https://youtu.be/1fQWo1BmwAc?t=00h09m57s
270    https://youtu.be/1fQWo1BmwAc?t=00h10m30s
320    https://youtu.be/1fQWo1BmwAc?t=00h12m16s
360    https://youtu.be/1fQWo1BmwAc?t=00h13m39s
364    https://youtu.be/1fQWo1BmwAc?t=00h13m45s
393    https://youtu.be/1fQWo1BmwAc?t=00h14m50s
512    https://youtu.be/1fQWo1BmwAc?t=00h19m38s
516    https://youtu.be/1fQWo1BmwAc?t=00h19m45s
541    https://youtu.be/1fQWo1BmwAc?t=00h20m39s
560    https://youtu.be/1fQWo1BmwAc?t=00h21m35s
580    https://youtu.be/1fQWo1BmwAc?t=00h22m15s
605    https://youtu.be/1fQWo1BmwAc?t=00h23m03s
662    https://youtu.be/1fQWo1BmwAc?t=00h25m09s
667    https://youtu.be/1fQWo1BmwAc?t=00h25m21s
738    https://youtu.be/1fQWo1BmwAc?t=00h28m14s
745    https://youtu.be/1fQWo1BmwAc?t=00h28m30s
768    https://youtu.be/1fQWo1BmwAc?t=00h29m35s
777    https://youtu.be/1fQWo1BmwAc?t=00h29m51s
780    https://youtu.be/1fQWo1BmwAc?t=00h29m57s
789    https://youtu.be/1fQWo1BmwAc?t=00h30m14s
                         ...                   
912    https://youtu.be/ulrLTCrHeeo?t=00h37m25s
917    https://youtu.be/ulrLTCrHeeo?t=00h37m33s
66     https://youtu.be/utgpFx-LhkU?t=00h03m49s
89     https://youtu.be/utgpFx-LhkU?t=00h05m04s
341    https://youtu.be/utgpFx-LhkU?t=00h18m08s
376    https://youtu.be/utgpFx-LhkU?t=00h19m55s
465    https://youtu.be/utgpFx-LhkU?t=00h24m24s
5      https://youtu.be/vG1nwjwUlqc?t=00h00m19s
11     https://youtu.be/vG1nwjwUlqc?t=00h00m33s
14     https://youtu.be/vG1nwjwUlqc?t=00h00m40s
20     https://youtu.be/vG1nwjwUlqc?t=00h00m55s
59     https://youtu.be/vG1nwjwUlqc?t=00h02m42s
88     https://youtu.be/vG1nwjwUlqc?t=00h03m46s
95     https://youtu.be/vG1nwjwUlqc?t=00h04m06s
210    https://youtu.be/w6dq9mfDhU4?t=00h07m55s
664    https://youtu.be/w6dq9mfDhU4?t=00h23m31s
5      https://youtu.be/wOaw-H_T-is?t=00h00m15s
29     https://youtu.be/wOaw-H_T-is?t=00h01m19s
85     https://youtu.be/wOaw-H_T-is?t=00h03m54s
168    https://youtu.be/wOaw-H_T-is?t=00h07m21s
182    https://youtu.be/wOaw-H_T-is?t=00h07m56s
279    https://youtu.be/wOaw-H_T-is?t=00h11m26s
343    https://youtu.be/wOaw-H_T-is?t=00h13m53s
379    https://youtu.be/wOaw-H_T-is?t=00h15m21s
398    https://youtu.be/wOaw-H_T-is?t=00h16m07s
436    https://youtu.be/wOaw-H_T-is?t=00h17m27s
476    https://youtu.be/wOaw-H_T-is?t=00h18m43s
34     https://youtu.be/zjxGtP7-elU?t=00h01m18s
226    https://youtu.be/zjxGtP7-elU?t=00h09m54s
387    https://youtu.be/zjxGtP7-elU?t=00h17m01s
Name: share_url, Length: 623, dtype: object

In [ ]: