Extract Parallel Texts from TED talks

Derived from / inspired by Ajinkya Kulkarni's GitHub.


In [ ]:
import requests
from bs4 import BeautifulSoup
import os
import csv
import time, random
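
The scraping below hits ted.com repeatedly, so it can be worth identifying the client up front. A minimal sketch using a shared requests session with a descriptive User-Agent (the header string and contact address are placeholders for illustration, not anything TED requires):

In [ ]:
# Optional: a shared session so every request carries the same descriptive headers.
# The User-Agent value below is a placeholder, not a TED requirement.
session = requests.Session()
session.headers.update({'User-Agent': 'parallel-text-notebook (contact: you@example.com)'})
# To use it, replace requests.get(url) with session.get(url) in the cells below.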

In [ ]:
def enlist_talk_names(url, dict_):
    # Scrape one TED listing page and record every '/talks/...' link found
    time.sleep( random.random()*5.0+5.0 )  # polite rate-limiting: wait 5-10s between requests
    r = requests.get(url)
    print("  Got %d bytes from %s" % (len(r.text), url))
    soup = BeautifulSoup(r.text, 'html.parser')
    talks = soup.find_all("a", class_='')
    for a in talks:
        href = a.attrs.get('href', '')
        if href.startswith('/talks/') and dict_.get(href) != 1:
            dict_[href] = 1
    return dict_
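
As a quick sanity check of the link filter, here is a minimal sketch run against an invented inline snippet (real listing pages are far larger; this applies the same '/talks/' prefix test, without the empty-class filter):

In [ ]:
sample_html = '''
<a href="/talks/jane_doe_example_talk">Example talk</a>
<a class="nav" href="/playlists/1">Playlist</a>
<a href="/talks/john_doe_other_talk">Other talk</a>
'''
sample_soup = BeautifulSoup(sample_html, 'html.parser')
found = {}
for a in sample_soup.find_all('a'):
    href = a.attrs.get('href', '')
    if href.startswith('/talks/'):
        found[href] = 1
print(found)  # expect only the two /talks/... links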

In [ ]:
all_talk_names={}

# Get all pages of talks (crawling all ~60 pages seems a bit abusive)
#for i in range(1,61):
#    url='https://www.ted.com/talks?page=%d'%(i)
#    all_talk_names=enlist_talk_names(url, all_talk_names)

# A specific search term
#url='https://www.ted.com/talks?sort=newest&q=ai'

# Specific topics
url='https://www.ted.com/talks?sort=newest&topics[]=AI'
#url='https://www.ted.com/talks?sort=newest&topics[]=machine+learning'
#url='https://www.ted.com/talks?sort=newest&topics[]=mind'
#url='https://www.ted.com/talks?sort=newest&topics[]=mind&page=2'
all_talk_names=enlist_talk_names(url, all_talk_names)
len(all_talk_names)
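
If more than one page of a topic is wanted, the commented-out page loop above can be generalised to stop once a page contributes nothing new; a sketch, assuming the `page` query parameter paginates as in the commented examples:

In [ ]:
def enlist_all_pages(base_url, dict_):
    # Request ?page=1,2,... until a page adds no new talk links
    page = 1
    while True:
        before = len(dict_)
        dict_ = enlist_talk_names('%s&page=%d' % (base_url, page), dict_)
        if len(dict_) == before:  # nothing new: assume we ran off the end
            break
        page += 1
    return dict_

#all_talk_names = enlist_all_pages('https://www.ted.com/talks?sort=newest&topics[]=AI', all_talk_names)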

In [ ]:
data_path = './data'
if not os.path.exists(data_path):
    os.makedirs(data_path)

def extract_talk_languages(url, talk_name, language_list=['en', 'ko', 'ja']):
    # Skip the network round-trip if every language file already exists on disk
    need_more_data=False
    for lang in language_list:
        talk_lang_file = os.path.join(data_path, talk_name+'-'+lang+'.csv')
        if not os.path.isfile( talk_lang_file ) :
            need_more_data=True
    if not need_more_data:
        print("  Data already retrieved for %s" % (url,))
        return

    time.sleep( random.random()*5.0+5.0 )  # polite rate-limiting: wait 5-10s between requests
    r = requests.get(url)
    print("  Got %d bytes from %s" % (len(r.text), url))
    if len(r.text)<1000: return  # suspiciously small response: treat as a failed fetch
    soup = BeautifulSoup(r.text, 'html.parser')
    for link in soup.find_all('link'):
        # The transcript page advertises per-language alternates via <link hreflang=... href=...?language=...>
        if link.get('href') is not None and link.attrs['href'].find('?language=')!=-1:
            lang=link.attrs['hreflang']
            url_lang=link.attrs['href']
            if lang not in language_list:
                continue

            talk_lang_file = os.path.join(data_path, talk_name+'-'+lang+'.csv')
            if os.path.isfile( talk_lang_file ) :
                continue

            time.sleep( random.random()*5.0+5.0 )
            r_lang = requests.get(url_lang)
            print("    Lang[%s] : Got %d bytes" % (lang, len(r_lang.text), ))
            if len(r_lang.text)<1000: return  # suspiciously small response: treat as a failed fetch
            lang_soup = BeautifulSoup(r_lang.text, 'html.parser')

            # Each transcript fragment carries its start time in milliseconds ('data-time')
            talk_data = []
            for fragment in lang_soup.find_all('span', class_='talk-transcript__fragment'):
                d = [ int( fragment.attrs['data-time'] ), fragment.text.replace('\n',' ') ]
                talk_data.append(d)

            with open(talk_lang_file, 'w', newline='') as csvfile:
                writer = csv.writer(csvfile)
                writer.writerow(['ts', 'txt'])
                writer.writerows(talk_data)

def merge_talk_languages(talk_name, language_list=['en', 'ko', 'ja']):
    # Flatten the per-language CSVs for one talk into a single CSV in time-step order
    talk_data = {}
    for lang in language_list:
        talk_lang_file = os.path.join(data_path, talk_name+'-'+lang+'.csv')
        if not os.path.isfile( talk_lang_file ):
            return  # a language is still missing: nothing to merge yet
        with open(talk_lang_file) as csvfile:
            for row in csv.DictReader(csvfile):
                talk_data.setdefault(row['ts'], {})[lang] = row['txt']

    talk_data_csv = [ ['ts']+language_list, ]
    for ts in sorted(talk_data.keys(), key=int):
        row = [ts] + [ talk_data[ts].get(lang, '') for lang in language_list]
        talk_data_csv.append(row)

    with open(os.path.join(data_path, talk_name+'.csv'), 'w', newline='') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerows(talk_data_csv)

In [ ]:
for name in all_talk_names:
    # name looks like '/talks/<slug>' -- name[7:] keeps just the slug for the filename
    extract_talk_languages('https://www.ted.com'+name+'/transcript', name[7:])
    #break
print("Finished extract_talk_languages for all_talk_names")
