In [128]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import urllib2
import re
import xmltodict
import wget
import os
os.chdir('/Users/sheldon/git/springboard_capstone/acquire_podcasts')
df = pd.read_csv('pcast.csv',names=['name','url'])
urls = df.url.tolist()
urls = filter(lambda string: 'feeds.' in string or 'feed.' in string, urls)

In [127]:
def homepage(request):
    file = urllib2.urlopen(request)
    data = file.read()
    file.close()

    def get_mp3s(data):
        data = data.split()
        data = filter(lambda word: word.endswith('.mp3"') , data)
        data = list(set(data))
        return data
    data = get_mp3s(data)  

    def parse_mp3(urlstring):
        urlstring = re.split('url=', urlstring)[1]
        return urlstring.replace('"','')
    
    data = map(parse_mp3, data)

    return data

def download_mp3(podcastseries, urls):
    os.chdir('/Users/sheldon/git/springboard_capstone/acquire_podcasts')
    os.mkdir(urls.split('/')[-1])
    os.chdir(urls.split('/')[-1])
    mp3_list = []
    def download(episode):
        print 'downloading: ',episode
        episode = wget.download(episode)
        print 'downloaded: ',episode

    for number, episode in enumerate(podcastseries):
        if len(mp3_list) < 11:
            print number, ': ', episode
            mp3_list.append(episode)
            download(episode)
            print 'length: ',len(mp3_list)
        else:
            break
    os.chdir('/Users/sheldon/git/springboard_capstone/acquire_podcasts')
    
for number, series in enumerate(urls):
    print 'starting: ',number, ' - ',series
    data = homepage(series)
    download_mp3(data, series)
    print 'completed: ',number, ' - ',series
    urls.pop(number)
    print 'remaining: ',len(urls)


starting:  0  -  http://feeds.feedburner.com/HollywoodBabbleOnPod
0 :  http://feeds.soundcloud.com/stream/146162002-hollywoodbabbleon-hollywood-babble-on-75-75-1.mp3
downloading:  http://feeds.soundcloud.com/stream/146162002-hollywoodbabbleon-hollywood-babble-on-75-75-1.mp3
---------------------------------------------------------------------------
IOError                                   Traceback (most recent call last)
<ipython-input-127-69940e65c642> in <module>()
     42     print 'starting: ',number, ' - ',series
     43     data = homepage(series)
---> 44     download_mp3(data, series)
     45     print 'completed: ',number, ' - ',series
     46     urls.pop(number)

<ipython-input-127-69940e65c642> in download_mp3(podcastseries, urls)
     33             print number, ': ', episode
     34             mp3_list.append(episode)
---> 35             download(episode)
     36             print 'length: ',len(mp3_list)
     37         else:

<ipython-input-127-69940e65c642> in download(episode)
     26     def download(episode):
     27         print 'downloading: ',episode
---> 28         episode = wget.download(episode)
     29         print 'downloaded: ',episode
     30 

/Users/sheldon/anaconda/envs/capstone/lib/python2.7/site-packages/wget.pyc in download(url, out, bar)
    524     else:
    525         binurl = url
--> 526     (tmpfile, headers) = ulib.urlretrieve(binurl, tmpfile, callback)
    527     filename = detect_filename(url, out, headers)
    528     if outdir:

/Users/sheldon/anaconda/envs/capstone/lib/python2.7/urllib.pyc in urlretrieve(url, filename, reporthook, data, context)
     96     else:
     97         opener = _urlopener
---> 98     return opener.retrieve(url, filename, reporthook, data)
     99 def urlcleanup():
    100     if _urlopener:

/Users/sheldon/anaconda/envs/capstone/lib/python2.7/urllib.pyc in retrieve(self, url, filename, reporthook, data)
    243             except IOError:
    244                 pass
--> 245         fp = self.open(url, data)
    246         try:
    247             headers = fp.info()

/Users/sheldon/anaconda/envs/capstone/lib/python2.7/urllib.pyc in open(self, fullurl, data)
    211         try:
    212             if data is None:
--> 213                 return getattr(self, name)(url)
    214             else:
    215                 return getattr(self, name)(url, data)

/Users/sheldon/anaconda/envs/capstone/lib/python2.7/urllib.pyc in open_http(self, url, data)
    362         else:
    363             if data is None:
--> 364                 return self.http_error(url, fp, errcode, errmsg, headers)
    365             else:
    366                 return self.http_error(url, fp, errcode, errmsg, headers, data)

/Users/sheldon/anaconda/envs/capstone/lib/python2.7/urllib.pyc in http_error(self, url, fp, errcode, errmsg, headers, data)
    375             method = getattr(self, name)
    376             if data is None:
--> 377                 result = method(url, fp, errcode, errmsg, headers)
    378             else:
    379                 result = method(url, fp, errcode, errmsg, headers, data)

/Users/sheldon/anaconda/envs/capstone/lib/python2.7/urllib.pyc in http_error_301(self, url, fp, errcode, errmsg, headers, data)
    669     def http_error_301(self, url, fp, errcode, errmsg, headers, data=None):
    670         """Error 301 -- also relocated (permanently)."""
--> 671         return self.http_error_302(url, fp, errcode, errmsg, headers, data)
    672 
    673     def http_error_303(self, url, fp, errcode, errmsg, headers, data=None):

/Users/sheldon/anaconda/envs/capstone/lib/python2.7/urllib.pyc in http_error_302(self, url, fp, errcode, errmsg, headers, data)
    639                         "Internal Server Error: Redirect Recursion", headers)
    640         result = self.redirect_internal(url, fp, errcode, errmsg, headers,
--> 641                                         data)
    642         self.tries = 0
    643         return result

/Users/sheldon/anaconda/envs/capstone/lib/python2.7/urllib.pyc in redirect_internal(self, url, fp, errcode, errmsg, headers, data)
    665                           headers)
    666 
--> 667         return self.open(newurl)
    668 
    669     def http_error_301(self, url, fp, errcode, errmsg, headers, data=None):

/Users/sheldon/anaconda/envs/capstone/lib/python2.7/urllib.pyc in open(self, fullurl, data)
    211         try:
    212             if data is None:
--> 213                 return getattr(self, name)(url)
    214             else:
    215                 return getattr(self, name)(url, data)

/Users/sheldon/anaconda/envs/capstone/lib/python2.7/urllib.pyc in open_http(self, url, data)
    362         else:
    363             if data is None:
--> 364                 return self.http_error(url, fp, errcode, errmsg, headers)
    365             else:
    366                 return self.http_error(url, fp, errcode, errmsg, headers, data)

/Users/sheldon/anaconda/envs/capstone/lib/python2.7/urllib.pyc in http_error(self, url, fp, errcode, errmsg, headers, data)
    375             method = getattr(self, name)
    376             if data is None:
--> 377                 result = method(url, fp, errcode, errmsg, headers)
    378             else:
    379                 result = method(url, fp, errcode, errmsg, headers, data)

/Users/sheldon/anaconda/envs/capstone/lib/python2.7/urllib.pyc in http_error_401(self, url, fp, errcode, errmsg, headers, data)
    687         if not 'www-authenticate' in headers:
    688             URLopener.http_error_default(self, url, fp,
--> 689                                          errcode, errmsg, headers)
    690         stuff = headers['www-authenticate']
    691         import re

/Users/sheldon/anaconda/envs/capstone/lib/python2.7/urllib.pyc in http_error_default(self, url, fp, errcode, errmsg, headers)
    384         """Default error handler: close the connection and raise IOError."""
    385         fp.close()
--> 386         raise IOError, ('http error', errcode, errmsg, headers)
    387 
    388     if _have_ssl:

IOError: ('http error', 401, 'Unauthorized', <httplib.HTTPMessage instance at 0x106292488>)

In [ ]: