The goal of this notebook is to download an mp3 from NPR's embedded and then transcribe it using pocketsphinx.


In [1]:
from IPython.display import Audio
from pydub import AudioSegment
import glob
from math import ceil
import os
import json
import requests
import csv
import sys
import speech_recognition as sr
r = sr.Recognizer()

In [2]:
#!wget http://play.podtrac.com/npr-510311/npr.mc.tritondigital.com/NPR_510311/media/anon.npr-mp3/npr/embd/2016/05/20160505_embd_embedded_final_audio__-_we_found_joy_1156.mp3

In [3]:
def transform_mp3_wav(AUDIO_FILENAME, AUDIO_SEGMENT_SECONDS):
    filename = AUDIO_FILENAME.replace('.mp3','')
    with open(AUDIO_FILENAME):
        audio = AudioSegment.from_mp3(AUDIO_FILENAME)
        xs = 0
        while xs < audio.duration_seconds:
            ys = min(xs + AUDIO_SEGMENT_SECONDS, ceil(audio.duration_seconds))
            fname = str(xs).rjust(5, '0') + '-' + str(ys).rjust(5, '0') + '.wav'
            audio[xs*1000:ys*1000].export(os.getcwd() + '/' + filename + fname, format='wav')
            print("Saved", fname)
            xs = ys

In [4]:
os.chdir('/Users/sheldon/podcasts/speech_notebook/')
os.getcwd()


Out[4]:
'/Users/sheldon/podcasts/speech_notebook'

In [5]:
transform_mp3_wav('embedded_we_found_joy.mp3', 300)


('Saved', '00000-00300.wav')
('Saved', '00300-00600.wav')
('Saved', '00600-00900.wav')
('Saved', '00900-01200.wav')
('Saved', '01200-01500.wav')
('Saved', '01500-1728.0.wav')

In [6]:
wav_list = glob.glob('embedded_we_found_joy*')
wav_list = filter(lambda x: '.mp3' not in x, wav_list)

In [7]:
wav_list


Out[7]:
['embedded_we_found_joy00000-00300.wav',
 'embedded_we_found_joy00300-00600.wav',
 'embedded_we_found_joy00600-00900.wav',
 'embedded_we_found_joy00900-01200.wav',
 'embedded_we_found_joy01200-01500.wav',
 'embedded_we_found_joy01500-1728.0.wav']

In [8]:
%%time
trans_list = []
transcription = None
for wav_file in wav_list: 
    print 'transcribing: ' + wav_file
    with sr.AudioFile(wav_file) as source:
        audio = r.record(source)
        transcription = r.recognize_sphinx(audio)
        print 'transcription completed'
    trans_list.extend(transcription)


transcribing: embedded_we_found_joy00000-00300.wav
transcription completed
transcribing: embedded_we_found_joy00300-00600.wav
transcription completed
transcribing: embedded_we_found_joy00600-00900.wav
transcription completed
transcribing: embedded_we_found_joy00900-01200.wav
transcription completed
transcribing: embedded_we_found_joy01200-01500.wav
transcription completed
transcribing: embedded_we_found_joy01500-1728.0.wav
transcription completed
CPU times: user 28min 6s, sys: 17.3 s, total: 28min 24s
Wall time: 29min 7s

In [15]:
transcription = ''.join(trans_list)

In [16]:
file = open('transcription_cmu_full.txt','w')
file.write(transcription)
file.close()

Baseline comparison of the results


In [2]:
from nltk.corpus import stopwords
from collections import Counter
import pandas as pd
import numpy as np
import nltk.data
from __future__ import division  # Python 2 users only
import nltk, re, pprint
from nltk import word_tokenize
%matplotlib inline

In [31]:
cmu_trans = open('transcription_cmu_full.txt','rU').read()
stop = set(stopwords.words('english'))
grammar_symbols = [',','"',"'","."]
def tokenize_and_lower(textfile):
    
    lower = [w.lower() for w in tokens]
    filtered_words = [word for word in lower if word not in stop]
    remove_contractions = [word for word in filtered_words if "'" not in word]
    remove_periods = [word for word in remove_contractions if "." not in word]
    tokens = word_tokenize(remove_periods)
    return tokens


cmu = tokenize_and_lower(cmu_trans)
'''
cmu = pd.Series.to_frame(cmu)
cmu.columns = [['words']]
cmu = cmu.groupby('words').size().reset_index()
cmu.columns = [['words','count']]
'''


Out[31]:
"\ncmu = pd.Series.to_frame(cmu)\ncmu.columns = [['words']]\ncmu = cmu.groupby('words').size().reset_index()\ncmu.columns = [['words','count']]\n"

In [32]:
cmu


Out[32]:
['support',
 'comes',
 'hot',
 'spot',
 'ask',
 'todd',
 'spot',
 'usual',
 'message',
 'goes',
 'replied',
 'dear',
 'npr',
 'love',
 'great',
 'stories',
 'really',
 'wan',
 'na',
 'say',
 'thank',
 'sharing',
 'fan',
 'club',
 'spa',
 'kelly',
 'recovers',
 'endure',
 'typecast',
 'take',
 'story',
 'news',
 'anchor',
 'deep',
 'week',
 'gon',
 'na',
 'something',
 'little',
 'different',
 'listening',
 'pod',
 'castro',
 'narrow',
 'first',
 'episode',
 'least',
 'budget',
 'time',
 'town',
 'called',
 'boston',
 'indiana',
 'people',
 'addicted',
 'destroy',
 'called',
 'pan',
 'town',
 'center',
 'biggest',
 'outbreaks',
 'years',
 'many',
 'people',
 'sharing',
 'needles',
 'sunset',
 'episode',
 'lot',
 'told',
 'us',
 'know',
 'intimate',
 'happen',
 'people',
 'met',
 'austin',
 'bevan',
 'samantha',
 'jasmine',
 'joe',
 'way',
 'sell',
 'episode',
 'going',
 'back',
 'indiana',
 'tannenbaum',
 'wearing',
 'number',
 'something',
 'really',
 'want',
 'find',
 'enjoying',
 'still',
 'using',
 'first',
 'name',
 'dead',
 'illegally',
 'enjoy',
 'nurse',
 'heard',
 'back',
 'work',
 'indict',
 'addicted',
 'pain',
 'pills',
 'prescription',
 'ranches',
 'title',
 'street',
 'joy',
 'told',
 'us',
 'never',
 'thought',
 'would',
 'use',
 'needle',
 'never',
 'said',
 'salinas',
 'cook',
 'nap',
 'slow',
 'hey',
 'nice',
 'little',
 'eastern',
 'europe',
 'said',
 'earlier',
 'quarter',
 'know',
 'pat',
 'eighth',
 'silas',
 'love',
 'full',
 'peace',
 'love',
 'guess',
 'famous',
 'changes',
 'think',
 'tough',
 'toast',
 'top',
 'enjoyed',
 'told',
 'us',
 'antiquated',
 'fisher',
 'purse',
 'show',
 'us',
 'bring',
 'appointment',
 'get',
 'treatment',
 'drug',
 'addiction',
 'us',
 'guess',
 'wan',
 'na',
 'hurt',
 'red',
 'army',
 'get',
 'something',
 'like',
 'life',
 'mommy',
 'laughed',
 'wanted',
 'control',
 'get',
 'fixed',
 'appointed',
 'said',
 'thirteenth',
 'next',
 'month',
 'joy',
 'story',
 'really',
 'stuck',
 'us',
 'fans',
 'lauderdale',
 'many',
 'people',
 'struggling',
 'appeal',
 'addiction',
 'country',
 'joy',
 'fit',
 'picture',
 'heads',
 'drug',
 'addict',
 'good',
 'job',
 'three',
 'kids',
 'serbs',
 'girl',
 'scout',
 'leader',
 'honey',
 'go',
 'bad',
 'living',
 'house',
 'bunch',
 'projects',
 'fall',
 'apart',
 'dramatically',
 'tom',
 'ever',
 'get',
 'going',
 'bass',
 'album',
 'boom',
 'boom',
 'boom',
 'reason',
 'enjoy',
 'honestly',
 'find',
 'joy',
 'producer',
 'tom',
 'rise',
 'talk',
 'devon',
 'samantha',
 'young',
 'couple',
 'tested',
 'positive',
 'able',
 'say',
 'great',
 'using',
 'cannon',
 'smith',
 'living',
 'halfway',
 'house',
 'really',
 'hard',
 'find',
 'jeffcoat',
 'iraq',
 'war',
 'got',
 'addicted',
 'pain',
 'pills',
 'tree',
 'hurt',
 'back',
 'ca',
 'find',
 'joy',
 'find',
 'address',
 'turns',
 'peter',
 'parents',
 'house',
 'austin',
 'african',
 'tree',
 'rather',
 'one',
 'morning',
 'till',
 'one',
 'story',
 'house',
 'screen',
 'porch',
 'apart',
 'outback',
 'lots',
 'cut',
 'wed',
 'dog',
 'one',
 'cones',
 'head',
 'one',
 'hot',
 'pink',
 'parking',
 'us',
 'like',
 'crazy',
 'devotee',
 'answers',
 'door',
 'go',
 'back',
 'yesterday',
 'knocked',
 'door',
 'end',
 'alex',
 'jelly',
 'ca',
 'believe',
 'remember',
 'good',
 'even',
 'really',
 'long',
 'time',
 'say',
 'decision',
 'wan',
 'na',
 'talk',
 'ever',
 'parents',
 'tell',
 'days',
 'later',
 'could',
 'grab',
 'threat',
 'mile',
 'house',
 'han',
 'park',
 'outside',
 'country',
 'store',
 'sort',
 'india',
 'said',
 'car',
 'wash',
 'joy',
 'story',
 'tugboat',
 'tube',
 'would',
 'like',
 'say',
 'good',
 'thank',
 'lynn',
 'sherr',
 'message',
 'one',
 'sponsors',
 'stance',
 'dot',
 'com',
 'mailing',
 'shipping',
 'seem',
 'like',
 'win',
 'situation',
 'trips',
 'post',
 'office',
 'time',
 'consuming',
 'increasing',
 'postage',
 'meter',
 'expensive',
 'better',
 'way',
 'stands',
 'dot',
 'com',
 'imprints',
 'official',
 'us',
 'posted',
 'letter',
 'package',
 'using',
 'computer',
 'sign',
 'stamps',
 'dot',
 'com',
 'special',
 'offer',
 'trial',
 'plus',
 'postage',
 'good',
 'stands',
 'dot',
 'com',
 'put',
 'microphone',
 'injured',
 'embedded',
 'bond',
 'rental',
 'car',
 'part',
 'thanks',
 'little',
 'store',
 'screen',
 'see',
 'plenty',
 'time',
 'joy',
 'start',
 'telling',
 'us',
 'life',
 'like',
 'started',
 'using',
 'needle',
 'home',
 'hair',
 'color',
 'household',
 'furniture',
 'linus',
 'big',
 'sectional',
 'sofa',
 'less',
 'coverage',
 'handle',
 'rolled',
 'heavy',
 'cherry',
 'dine',
 'etc',
 'washer',
 'dryer',
 'dishwasher',
 'flat',
 'screen',
 'sets',
 'reflex',
 'pinky',
 'beats',
 'playstation',
 'three',
 'weeks',
 'let',
 'finish',
 'lot',
 'things',
 'happened',
 'ended',
 'selling',
 'staff',
 'buy',
 'drug',
 'addicted',
 'cocaine',
 'lost',
 'house',
 'timely',
 'manner',
 'injecting',
 'pan',
 'see',
 'day',
 'staying',
 'another',
 'house',
 'addicts',
 'constant',
 'terror',
 'go',
 'point',
 'told',
 'us',
 'leave',
 'house',
 'staying',
 'nineteen',
 'left',
 'nowhere',
 'stay',
 'slept',
 'sladek',
 'hard',
 'hit',
 'elementary',
 'school',
 'biggs',
 'general',
 'janet',
 'plastic',
 'scientists',
 'lived',
 'school',
 'houses',
 'often',
 'empty',
 'houses',
 'board',
 'windows',
 'kid',
 'doors',
 'board',
 'salvador',
 'sustained',
 'house',
 'two',
 'nights',
 'anything',
 'happen',
 'hear',
 'giselle',
 'nice',
 'meet',
 'someone',
 'wrinkles',
 'training',
 'camps',
 'nowhere',
 'time',
 'last',
 'year',
 'home',
 'huh',
 'mick',
 'hansen',
 'brings',
 'finger',
 'keogh',
 'money',
 'bank',
 'twelve',
 'less',
 'twelve',
 'gone',
 'joy',
 'reach',
 'two',
 'daughters',
 'grown',
 'person',
 'fourteen',
 'time',
 'lived',
 'bed',
 'time',
 'street',
 'back',
 'live',
 'dad',
 'suffer',
 'joy',
 'everyday',
 'figuring',
 'money',
 'get',
 'high',
 'says',
 'feeling',
 'need',
 'dishes',
 'something',
 'injection',
 'mean',
 'get',
 'warm',
 'rush',
 'celestin',
 'asia',
 'get',
 'say',
 'set',
 'anti',
 'takes',
 'take',
 'full',
 'breadth',
 'good',
 'hal',
 'let',
 'us',
 'make',
 'calendar',
 'euphoria',
 'last',
 'seconds',
 'couple',
 'minutes',
 'think',
 'many',
 'minutes',
 'sarah',
 'monday',
 'mean',
 'unlike',
 'four',
 'five',
 'times',
 'day',
 'rest',
 'time',
 'chasing',
 'exactly',
 'got',
 'chase',
 'crate',
 'rash',
 'every',
 'two',
 'hours',
 'joy',
 'got',
 'monthly',
 'check',
 'divorce',
 'money',
 'ran',
 'would',
 'steal',
 'get',
 'pills',
 'sir',
 'life',
 'months',
 'managed',
 'stand',
 'trial',
 'stay',
 'alive',
 'first',
 'week',
 'july',
 'one',
 'day',
 'things',
 'changed',
 'country',
 'road',
 'place',
 'cops',
 'swarmed',
 'place',
 'seven',
 'eight',
 'nine',
 'cop',
 'cars',
 'came',
 'like',
 'crazy',
 'in-house',
 'officer',
 'smelled',
 'marijuana',
 'house',
 'couple',
 'guys',
 'ally',
 'smoke',
 'enjoying',
 'yellow',
 'find',
 'saying',
 'enough',
 'charge',
 'maintaining',
 'common',
 'incense',
 'charged',
 'visiting',
 'economy',
 'since',
 'charge',
 'something',
 'get',
 'house',
 'people',
 'using',
 'drugs',
 'son',
 'andretti',
 'rested',
 'forty',
 'one',
 'years',
 'never',
 'cut',
 'think',
 'way',
 'enough',
 'went',
 'jail',
 'get',
 'couple',
 'days',
 'wearing',
 'monitor',
 'honor',
 'cold',
 'conditioner',
 'release',
 'fast',
 'curfew',
 'withdrawing',
 'piano',
 'big-time',
 'could',
 'think',
 'us',
 'getting',
 'back',
 'austin',
 'getting',
 'bus',
 'get',
 'home',
 'nine',
 'bed',
 'came',
 'lead',
 'delicate',
 'nomination',
 'skateboard',
 'letting',
 'carry',
 'dance',
 'like',
 'al',
 'nine',
 'timeline',
 'cops',
 'knbc',
 'media',
 'resnick',
 'knew',
 'parents',
 'see',
 'peres',
 'said',
 'thought',
 'act',
 'son',
 'bracelet',
 'marry',
 'escape',
 'war',
 'sell',
 'lead',
 'gps',
 'honey',
 'find',
 'like',
 'beg',
 'unheeded',
 'embellish',
 'camp',
 'hell',
 'would',
 'find',
 'whether',
 'next',
 'morning',
 'bad',
 'days',
 'terrific',
 'fan',
 'getting',
 'pails',
 'case',
 'hightotal',
 'mourner',
 'army',
 'became',
 'schrader',
 'lads',
 'straight',
 'railway',
 'outlast',
 'elderly',
 'closet',
 'door',
 'nice',
 'hiding',
 'place',
 'way',
 'police',
 'arrested',
 'enjoy',
 'went',
 'back',
 'jail',
 'charges',
 'serious',
 'violated',
 'conditions',
 'release',
 'days',
 'schuster',
 'really',
 'trying',
 'piano',
 'everybody',
 'talked',
 'says',
 'feels',
 'like',
 'worst',
 'flew',
 'could',
 'possibly',
 'imagine',
 'fact',
 'picture',
 'line',
 'rested',
 'klooster',
 'mug',
 'shot',
 'county',
 'jail',
 'led',
 'package',
 'seen',
 'horrible',
 'buckshot',
 'world',
 'air',
 'near',
 'mommy',
 'friends',
 'seen',
 'look',
 'worst',
 'drug',
 'abuse',
 'way',
 'lost',
 'lost',
 'protest',
 'left',
 'april',
 'july',
 'love',
 'probably',
 'thirty',
 'five',
 'came',
 'home',
 'alec',
 'five',
 'days',
 'without',
 'even',
 'drink',
 'water',
 'like',
 'come',
 'jason',
 'pills',
 'care',
 'bout',
 'hydration',
 'nutrition',
 'know',
 'touched',
 'white',
 'men',
 'shot',
 'definitely',
 'horror',
 'used',
 'basketball',
 'coach',
 'couch',
 'girl',
 'scout',
 'leader',
 'children',
 'friends',
 'held',
 'infinitely',
 'plastered',
 'war',
 'deceit',
 'listen',
 'top',
 'withdrawing',
 'depression',
 'gon',
 'na',
 'put',
 'bet',
 'life',
 'know',
 'look',
 'lost',
 'live',
 'arrest',
 'jeff',
 'fahey',
 'miami',
 'nine',
 'broad',
 'diner',
 'wires',
 'bra',
 'managed',
 'break',
 'sharpen',
 'emptiness',
 'celsius',
 'guests',
 'tonight',
 'canvas',
 'held',
 'jail',
 'best',
 'six',
 'days',
 'salad',
 'jailed',
 'conscious',
 'lunch',
 'action',
 'pre',
 'med',
 'superficial',
 'damage',
 'noah',
 'enough',
 'actually',
 'get',
 'plane',
 'field',
 'need',
 'medical',
 'attention',
 'relate',
 'really',
 'turn',
 'overreact',
 'time',
 'outset',
 'thought',
 'allies',
 'really',
 'dead',
 'neon',
 'think',
 'attention',
 'get',
 'south',
 'make',
 'hospital',
 'life',
 'bottomed',
 'know',
 'thought',
 'live',
 'know',
 'eventually',
 'joy',
 'started',
 'talking',
 'nurse',
 'worked',
 'jail',
 'first',
 'said',
 'main',
 'tank',
 'battalion',
 'think',
 'done',
 'getting',
 'get',
 'life',
 'going',
 'elson',
 'tail',
 'around',
 ...]

In [33]:
type(cmu)
join_words = " ".join(cmu)

In [34]:
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS




wordcloud = WordCloud(
                      font_path='/Users/sheldon/Library/Fonts/Adelle_light.otf',
                      stopwords=STOPWORDS,
                      background_color='black',
                      width=1800,
                      height=1400
                     ).generate(join_words)

plt.imshow(wordcloud)
plt.axis('off')
plt.savefig('./my_twitter_wordcloud_1.png', dpi=300)
plt.show()



In [ ]: