In [1]:
#!/usr/bin/env python
# Copyright 2016 Aaron Ciuffo

version = '''NPR Podcast Downloader V5.2

by Aaron Ciuffo (txoof.com)
released without warranty under GPLV3:
http://www.gnu.org/licenses/gpl-3.0.html
Please don't sue me.
'''

programName = 'podcastdownload'

# Imports
from datetime import datetime # for time stuff
#import pytz
import logging # logging library
#from urllib2 import urlopen # standard library for interfacing with web resources
import urllib2 # standard library for interfacing with web resources 
from urllib2 import URLError
import re # regular expressions
import json # handle JSON objects
import os # Operating System interface
import sys # internal operations including a list of imported modules
import fnmatch # used by cleanup method in Episode
import glob # used by m3u method - consider replacing with some other library
import shutil # used by cleanup method
import argparse # parse command line arguments
import ConfigParser # parse config files
from random import SystemRandom

In [2]:
releaseNotes = '''Release Notes
V 5.2
* Added html headers to HTML download method for dealing with cookies
V 5.1
* Added "Artist" tag to NPR Segments
* Added date to album name
V5.0
* Rewrite and cleanup 
 - Cleanup of variables
 - Tidy messy loops
* Adapt NPREpisode object to use new class attributes for output paths
'''

TO DO

Other

  • Add command line option to show HTML that was downloaded for debugging

Downloading

  • add User-Agent string to NPREpisode class getEpisode https://docs.python.org/2/library/urllib2.html
  • add command line option to download a show at a specific URL
  • flawed logic causes the download method to return False if any segment fails to download, so no m3u is written later
  • add feature to retry failed segments up to N times

Configuration

  • Add configuration check - offer to create a configuration
  • Add configuration option to download album art from a specific URL and shove it into each episode folder
  • Add configuration option to specify cookie parameters or files

Completed

  • Move configuration to ~/.config/podcastdownload/config.ini
  • General rewrite and cleanup
    • Move variables to one place
    • reconsider some of the messier loops
  • remove % in front of section names in configuration
  • change name from 'Default' to 'Main'
  • Adapt NPREpisode object to use new class attributes for output paths
  • complete the cleanup method
  • remove any 'stale' episodes
  • add a check to see if a program is already downloaded (maybe look for m3u) or at the download log
  • -v overrides configuration file
  • remove download logging - this is not necessary; it's a holdover from previous versions
  • reorganize configuration options to allow commandline to influence logging
    • only log to a file if a logfile is specified
    • add support for setting log from configuration file, setting logging level
  • consider removing all the day and time checking for episodes; it's not relevant for HTML queries
    • the day and time checking may be needed for API queries if this is implemented
    • consider removing date and time check from showConfig class
  • implement User-Agent in urllib2 request
  • consider changing import from urllib2; 2x import because of URLError AND urlopen
  • Add option to generate configuration file if it is missing
  • change default name of configuration file to ~/.programname.ini
  • Test command line
    • test all command line options
    • test all configuration options (remove options, sections, and otherwise break the config file)

In [3]:
def loadModules():
    '''load non standard python modules'''
    import logging
    logging.basicConfig()
    logging.debug('loading module: requests')
    try:
        global requests
        import requests
    except Exception as e:
        logging.critical('Fatal Error\nFailed to load module: requests\n%s', e)
        logging.critical('Please install requests module: http://docs.python-requests.org/')
        exit(2)
        return(False)

    logging.debug('loading module: mutagen.mp3')
    # create a global list of all the taggers available
    global taggers
    taggers = {}
    try:
        global MP3
        from mutagen.mp3 import EasyMP3 as MP3
        taggers['mp3'] = MP3
    except Exception as e:
        logging.critical('Failed to load module: mutagen.mp3\n%s', e)
        logging.critical('mp3 tagging may not be available')

    
    logging.debug('loading module: mutagen.mp4')
    try:
        global MP4
        from mutagen.mp4 import MP4
        taggers['mp4'] = MP4
    except Exception as e:
        logging.critical('Failed to load module: mutagen.mp4\n%s', e)
        logging.critical('mp4 tagging may not be available')

    return(True)
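
A minimal sketch of how the taggers dictionary built by loadModules() is consumed (tagSegments() below does the same per downloaded file); 'segment.mp3' is a hypothetical filename used only for illustration.

loadModules()
audio = taggers['mp3']('segment.mp3')  # EasyMP3 object from mutagen
audio['title'] = 'First story'         # EasyID3-style keys: title, artist, album, tracknumber
audio['tracknumber'] = '1'
audio.save()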

In [4]:
def div(num = 10, char = '*'):
    '''
    returns multiple copies of a passed string
    Args:
        num (int): number of times to repeat string
        char (string): characters to repeat
    Returns:
        char * num (string)
    '''
    if isinstance(num, int):
        return(str(str(char)*num))
    else:
        return(str(char))
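
A quick illustration of div(); the values in the comments assume the defaults above.

div()        # '**********'
div(5, '-')  # '-----'
div('x')     # num is not an int, so the char itself is returned: '*'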

In [5]:
class Episode():
    '''Podcast episode object'''

    def __init__(self, name = 'No Name', programURL = 'undef', outputBasePath = './', 
                 m3u = 'playlist.m3u', downloadLog = 'download.log', keep = 3, showDate = None,):
        '''
        Args:
            name (str): name of episode/podcast
            programURL (str): Index URL containing list of files to download
            showDate (str): date of episode
            outputBasePath (str): base path to use for output of files (default is ./)
            m3u (str): m3u playlist filename
            downloadLog (str): download log filename
            keep (int): maximum number of programs to keep
            
        Attributes:
            name (str): name of episode/podcast
            programURL (str): Index URL containing list of files to download
            segments (list): Segment() objects to be downloaded
            showDate (str): date of episode
            outputBasePath (str): base path to use for output of files (default is ./)
            outputShowPath (str): path within outputBasePath - slugified version of name
            outputPath (str): path within outputShowPath - set to outputShowPath by default
            m3u (str): m3u playlist filename
            downloadLog (str): download log filename
            keep (int): maximum number of programs to keep
        '''
        self.name = name # str
        self.programURL = programURL # str
        self.segments = [] # list
        self.segmentsFailed = [] #
        self.showDate = showDate # str
        self.outputBasePath = self._slash(outputBasePath) # str
        self.outputShowPath = self.outputBasePath + self._slash(self._slugify(self.name))
        self.outputPath = self.outputShowPath
        self.m3u = m3u
        self.downloadLog = downloadLog  
        self.keep = keep
    
    def attributes(self, display = None):
        '''
        method to show relevant attributes of the episode
        Args:
            display (list): list of specific attributes to display
        Returns:
            Specific attributes
        '''
        if not isinstance(display, list):
            display = ['name', 'programURL', 'showDate', 'outputBasePath', 'outputShowPath', 'outputPath', 
                       'm3u', 'downloadLog', 'keep']
        attributes = {}
        for key in self.__dict__:
            if (key in display) and (key in self.__dict__):
                attributes[key] = self.__dict__[key]
        
        return(attributes)
                
        
    
    def _slugify(self, value):
        """
        Normalizes string, removes non-word characters,
        and converts spaces to hyphens.

        From Django's "django/template/defaultfilters.py".
        Args:
            value (str): string to be normalized for use with a filename
        
        Returns:
            unicode: slugified string
        """
        _slugify_strip_re = re.compile(r'[^\w\s-]')
        _slugify_hyphenate_re = re.compile(r'[-\s]+')

        import unicodedata
        if not isinstance(value, unicode):
            value = unicode(value)
        value = unicodedata.normalize('NFKD', value).encode('ascii', 'ignore')
        value = unicode(_slugify_strip_re.sub('', value).strip())
        return _slugify_hyphenate_re.sub('-', value)

    def _slash(self, value):
        '''
        Ensures path has a trailing slash
        
        Args:
            value (str): string to check and modify
        
        Returns:
            value (str): string with trailing slash
            
        '''
        if not re.match('.*\/$', value):
            logging.debug('adding trailing slash to path: %s', value)
            return(value + '/')
        else:
            return(value)
    
    def setOutputPath(self, outputShowPath = None, outputEpisodePath = None):
        '''
        Method to update the output paths
        Args:
            outputShowPath (str): path within the outputBasePath
            outputEpisodePath (str): path within outputShowPath
        Returns:
            outputEpisodePath (str)
        '''
        if outputShowPath:
            self.outputShowPath = self._slash(self.outputBasePath) + self._slash(outputShowPath)
        
        if outputEpisodePath:
            self.outputPath = self._slash(self.outputShowPath) + self._slash(outputEpisodePath)
        else:
            self.outputPath = self.outputShowPath
            
        return(self.outputPath)
    
    def setM3U(self, name = 'playlist'):
        '''
        Update the m3u file name
        Args:
            name (str): filename for the m3u
        '''
        self.m3u = self._slugify(name) + '.m3u'
        return(True)
    
    def writeM3U(self, filename = False):
        '''
        Write M3U playlist for the episode in the root of the output directory
        Args:
            filename (str): path to output filename
        Returns:
            bool: True
        '''
        
        logging.info('opening m3u playlist: %s for writing', self.m3u)
        if filename:
            self.setM3U(filename)
        
        try:
            #m3ufile = open(self.outputBasePath + self.m3u, 'w')
            m3ufile = open(self._slash(self.outputPath) + self.m3u, 'w')
        except Exception as e:
            logging.error('could not open m3u file: %s\n%s', self.m3u, e)
            return(False)
        logging.debug('writing segments to: %s', self.m3u)
        # recurse all the segments 
        for segment in self.segments:
            # if it was successfully downloaded write it to the m3u file
            if segment.downloaded:
                logging.debug('writing segment to m3u file: %s', segment.filename)
                try:
                    #m3ufile.write(self.outputPath + segment.filename + '\n')
                    m3ufile.write(segment.filename + '\n')
                except Exception as e:
                    logging.error('could not write to: %s\n%s', self.m3u, e)
                    logging.error('halting m3u writing')
                    return(False)
        # cleanup
        try:
            m3ufile.close()
        except Exception as e:
            logging.error('could not close m3u file: %s\n%s', self.m3u, e)
            return(False)
        
        return(True)
    
    
    def download(self, dryrun = False, timeout = 5, useragent = ''):
        '''
        Download all segments in self.segments into self.outputPath
        Args:
            dryrun (bool): when True, perform all other steps but do not actually download; the method returns False
            timeout (real): time in seconds to wait for a download to complete before timing out
            useragent (str): User-Agent header string to send with each request
        
        Returns: 
            bool: True for successful download of one or more segments
        '''
        
        success = False
        lockfile = self.outputPath + '.' + programName + '.lock'
        logging.info('downloading program: %s', self.name)
        
        # check for output path
        logging.debug('checking for output directory: %s', self.outputPath)
        if not os.path.isdir(self.outputPath):
            logging.debug('output directory (%s) not found', self.outputPath)
            logging.debug('attempting to create output directory')
            try:
                os.makedirs(self.outputPath)
            except Exception as e:
                logging.error('could not create outputpath for this episode at: %s\n%s', self.outputPath, e)
                logging.error('download failed')
                return(False)
            
            # make a 'lock file' in the folder to help with cleanup later  
            logging.debug('writing lockfile: %s', lockfile)
            try:
                with open(lockfile, 'a'):
                    os.utime(lockfile, None)
            except Exception as e:
                logging.error('could not create lockfile: %s', lockfile)
                logging.error('file error: %s', e)
        
        # check for existing m3u files; stop downloading if it exists
        if len(glob.glob(self.outputPath + '/*.m3u')) > 0:
            logging.info('episode previously downloaded; skipping')
            return(False)
        
        logging.debug('dryrun = %s', dryrun)
        if dryrun:
            logging.info('downloads will be simulated')
        # begin downloading
        for segment in self.segments:
            # update the path for the current segment
            filePath = self.outputPath + segment.filename
            logging.debug('downloading segment from URL: %s', segment.audioURL)
            logging.debug('using User-Agent: %s', useragent)
            if not dryrun:
                try:
#                     audioFile = urlopen(segment.audioURL, timeout = timeout, 
#                           data = {'User-Agent' : useragent}).read()
# #                     audioFile = urlopen(segment.audioURL, timeout = timeout).read()
                    request = urllib2.Request(segment.audioURL, headers = {'User-Agent' : useragent})
                    audioFile = urllib2.urlopen(request, timeout = timeout).read()
                except (urllib2.URLError, ValueError) as e:
                    logging.warning('could not download segment number: %s', segment.number)
                    logging.warning('error: %s; timeout: %s', e, timeout)
                    continue
                # if one segment was downloaded report a successful download
                success=True
            
            logging.info('writing file to %s', filePath)
            
            if not dryrun:
                try:
                    with open(filePath, 'wb') as code:
                        code.write(audioFile)
                        # record if the writing was successful
                        segment.downloaded = True
                except Exception as e:
                    logging.warning('could not write segment number %s to %s\nerrors follow', segment.number, filePath)
                    logging.warning(e)
                    success = False
                    continue
            else:
                # record successful downloading of all segments when doing a dry run
                segment.downloaded = True
                # Dry runs return "false"
                success = False
            
        
        # This is a holdover from a previous version; it is not really needed
        #self.logDownload()
            
        return(success)       
            
    def logDownload(self):
        '''
        Holdover from a previous version as a method for tracking files that were downloaded; no longer needed
        Log successfully downloaded episodes
        Args:
        Returns: 
            bool: True
        '''
        logFile = self.outputBasePath + self.downloadLog
        
        logging.debug('opening log file: %s', logFile)
        try:
            f = open(logFile, 'a')
        except Exception as e:
            logging.error('could not open log file: %s\n%s', logFile, e)
            return(False)
        
        try: 
            f.write(self.outputPath + '\n')
        except Exception as e:
            logging.error('could not write to log file: %s\n%s', logFile, e)
            return(False)
        
        try:
            f.close()
        except Exception as e:
            logging.error('could not close log file: %s\n%s', logFile, e)
            return(False)
        
        return(True)
            
    
    def addSegment(self, segment):
        '''
        Add a downloadable segment to the segment list
        Args:
            segment (Segment): Segment() object containing information
        Returns:
            bool: True
        '''
        self.segments.append(segment)
        return(True)
        
            
    def tagSegments(self):
        '''
        Tag all downloaded segments
        Args:

        Returns:
            bool: True
        '''
        logging.info('tagging segments')
        
        for segment in self.segments:
            if segment.downloaded:
                logging.debug('title: %s,\n tracknumber: %s,\n album: %s,\n artist: %s', segment.title, segment.number, 
                              segment.programName, segment.artist)

                filename = self.outputPath + segment.filename
                try:
                    # find the file extension and guess at the type based on the extension
                    filetype = re.search('\.(\w+$)', filename).group(1)
                except AttributeError:
                    filetype = None

                if filetype and filetype.lower() in taggers: # check to see if this is a known filetype
                    logging.debug('tagging %s', filename)
                    myTagger = taggers[filetype] # create a tagger object with the appropriate mutagen module
                    audio = myTagger(filename) 

                    # write the appropriate tags
                    audio['title'] = segment.title
                    audio['tracknumber'] = str(segment.number)
                    audio['album'] = segment.programName + '-' + self.showDate
                    audio['artist'] = segment.artist

                    try:
                        audio.save()
                    except Exception as e:
                        logging.error('could not write tags for: %s\n%s', filename, e)        
                else:
                    logging.info('could not tag, unknown filetype: %s', filename)
            else:
                logging.warn('segment %s not downloaded; skipping tagging', segment.title)
                
    def cleanUp(self, dryrun = False, lockfile = '*.lock', keep = None):
        '''
        Remove stale episodes, keeping at maximum self.keep episodes

        Args:
            dryrun (bool): when true, do not actually delete anything
            lockfile (str): lockfile pattern glob to use when searching for lockfiles; default:*.lock
            keep (int): maximum number of episodes to keep
        Returns:
            removed (list): removed paths
        '''
      
        if keep:
            self.keep = keep
        if self.keep <= 0:
            self.keep = 1
            
        logging.info('cleaning up stale shows for %s', self.name)
        if not isinstance(self.keep, int):
            logging.error('keep value is not an integer: %s', self.keep)
        logging.info('keeping a maximum of %s shows', self.keep)
        # candidate directories that contain lockfiles for deletion
        matchdir = {}
        logging.debug('searching path: %s', self.outputShowPath)
        for root, dirnames, filenames in os.walk(self.outputShowPath):
            logging.debug('%s', root)
            for filename in fnmatch.filter(filenames, lockfile):
                logging.debug('      %s', filename)
                matchdir[root] = filename
        
        logging.debug('previously downloaded episodes found: %s', len(matchdir))
        # files to delete
        delete = []
        
        # files successfully deleted:
        removed = []
        for directory in range(0, len(sorted(matchdir))-self.keep):
            logging.debug('flagged for deletion: %s', sorted(matchdir)[directory])
            delete.append(sorted(matchdir)[directory])
        
        for key, val in enumerate(delete):
            lockfile = os.path.join(delete[key], matchdir[delete[key]])
            logging.debug('attempting to clean episode files in: %s', delete[key])
            # double check that a *.lock file exists before attempting a delete
            if os.path.isfile(lockfile):
                logging.debug('found lock file in path: %s', delete[key])

                if dryrun:
                    logging.info('dryrun: simulating deletion (nothing will be removed)')
                else:
                    logging.debug('deleting path: %s\n', delete[key])
                    try:
                        shutil.rmtree(delete[key])
                        # record those paths removed
                        removed.append(delete[key])
                    except OSError as e:
                        logging.error('could not delete path: %s', e)
                    
                
            else:
                logging.warn('discovered missing lock file when attempting cleanup: %s', lockfile)
                logging.warn('manual deletion required: %s', delete[key])
                logging.warn('skipping path: %s\n', delete[key])

        return(removed)
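
A minimal usage sketch of the Episode path handling above; the show name and paths are hypothetical.

episode = Episode(name = 'all things considered', outputBasePath = '/tmp/shows')
episode.outputShowPath                              # '/tmp/shows/all-things-considered/'
episode.setOutputPath(outputEpisodePath = '2018-05-27_ATC')
episode.outputPath                                  # '/tmp/shows/all-things-considered/2018-05-27_ATC/'
episode.setM3U('2018-05-27 all things considered')  # self.m3u becomes '2018-05-27-all-things-considered.m3u'
episode.download(dryrun = True)                     # no segments yet; creates the episode directory and lockfile, returns False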

In [6]:
class NPREpisode(Episode, object):
    '''NPR program episode object
        Args:
            name (str): name of episode/podcast
            programURL (str): Index URL containing list of files to download
            showDate (str): date of episode
            outputBasePath (str): base path to use for output of files (default is ./)
            m3u (str): m3u playlist filename
            downloadLog (str): download log filename
            jsonData (json obj): JSON listing of episodes scraped from the program page
    '''
    
    
    def __init__(self, name = 'unknown', programURL = None, outputBasePath = './', m3u ='playlist.m3u', 
                 downloadLog = 'download.log', keep = 3, htmlheaders = {}):
        super(NPREpisode, self).__init__(name = name, programURL = programURL, outputBasePath = outputBasePath, 
                                         m3u = m3u, downloadLog = downloadLog, keep = keep)
        self.jsonData = None
        self.htmlheaders = htmlheaders

    def recentEpisodes(self):
        '''Identify the most recent episodes
        Not yet implemented
        '''
        pass
        
    def addHeader(self, key, string):
        '''
        Add headers to be used when making url request
        Attributes set here:
            self.htmlheaders (dictionary) - {'Name': 'Content string'} optional headers to send with request
        Parameters:
            key (str) - header key name
            string (str) - header content
        '''
        self.htmlheaders[key] = string
        
    def getepisode_API(self):
        '''
        Use the NPR API to get a list of episodes
        Not yet implemented
        '''
        pass
    
    def getepisode_HTML(self):
        '''
        Scrape the HTML for JSON containing the date segment and title information
        Attributes set here:
            self.jsonData (json obj) - JSON listing of episodes from NPR
            self.showDate (str) - YYYY-MM-DD formatted string
            self.name (str) - human readable show name 
            self.segments (:obj: Segment) - episode segments are populated and added
            
        Returns: 
            bool: True if episode information is scraped from the HTML, False otherwise
        '''
        
        logging.debug('fetching episode info via HTML method')
        logging.debug('source: %s' % self.programURL)
        
        # search terms hardcoded here
        search_PlayAll = "<b.*data-play-all='({.*})'><\/b>" #re search string for JSON data in program HTML
        search_FileName = "(^[\s|\w|\.|'|-]*)\[?|$]" #(anySpaces OR anyWords OR anyPeriod OR any' OR any-)? OR EOL
        search_showDate = "datetime=\"(\d{4}-\d{2}-\d{2})" #re search for show date
               
        
        # variables defined here
        filename = '' # extracted filename for each segment
        defaultArtist = 'National Public Radio' # default artist for NPR Episodes
        
        # add an extension to help differentiate between episodes; set to epoch seconds to prevent clobbering
        # if no valid extension is set elsewhere
        output_extension = int((datetime.now() - datetime.utcfromtimestamp(0)).total_seconds())
        
        # FIXME - build a header for the request here including the appropriate cookie data
        opener = urllib2.build_opener()
        for header in self.htmlheaders:
            logging.debug('adding html header: {0}: {1}'.format(header, self.htmlheaders[header]))
            opener.addheaders.append((header, self.htmlheaders[header]))
        try: # fetch the full show HTML
            programHTML = opener.open(self.programURL).read()
#            programHTML = urllib2.urlopen(self.programURL).read()
        except Exception as e:
            logging.warning('could not fetch episode information from %s' % self.programURL)
            logging.error(e)
            return(False)
        logging.debug('HTML retrieved successfully')
        
        # find the show date and record it; fall back to a default if no date is found
        try:
            self.showDate = re.search(search_showDate, programHTML).group(1)
        except AttributeError as e:
            logging.warning('no date found in HTML; setting to 2000-01-01')
            self.showDate = '2000-01-01'
        
        if len(self.showDate) < 1:
            logging.warning('no valid showDate found')
        else: logging.debug('show date: %s', self.showDate)
        
        try: # find the JSON program data
            self.jsonData = json.loads(re.search(search_PlayAll, programHTML).group(1))
        except Exception as e:
            logging.error('no valid JSON episode listing found in HTML from %s', self.programURL)
            logging.error(e)
            return(False)
        
        # check that some JSON data was found - not terribly robust
        if len(self.jsonData['audioData']) > 0:
            logging.debug('JSON program information found for %s', self.jsonData['audioData'][0]['program'].upper())
            self.name = self.jsonData['audioData'][0]['program'].upper() # set the episode name
            logging.debug('setting name to: %s', self.name)
            logging.debug('segments found: %s', len(self.jsonData['audioData']))
        else:
            logging.warn('no valid audioData found in JSON object for program (%s)', self.name)
            return(False)
        
        # grab the first character of each word in the program name; grab the first two characters of the last word
        if len(self.name) > 0:
            short_name = '_'
            output_extension = '_'
            for each, val in enumerate(self.name.split(' ')):
                if each + 1 >= len(self.name.split(' ')):
                    char = 2
                else: 
                    char = 1
                output_extension = output_extension + val[:char]
                short_name = short_name + val[:char]

        # create a sub directory within the output path
        self.setOutputPath(outputEpisodePath = self.showDate + short_name) 
        logging.debug('output path set to: %s', self.outputPath)
        
        #set m3u name
        self.setM3U(self.showDate + '-' + self.name)
        logging.debug('m3u filename set to: %s', self.m3u)
        
        # recurse the JSON object and find all the audioData information
        for key, val in enumerate(self.jsonData['audioData']):
            artist = '' # set the artist to an empty string for each loop
            
            logging.debug('%s - %s', int(key)+1, val['title'] )
            try:
                audioURL = val['audioUrl'] 
                title = val['title']
            except Exception as e:
                logging.warning('failed to find URL or title data: %s; dropping segment', e)
                continue

            # search for artist data
            try:
                artist = val['artist']
            except Exception as e:
                logging.warning('failed to find artist data: %s', e)
            
            if len(artist)<1:
                logging.info('no artist data provided in JSON; using default: %s', defaultArtist)
                artist = defaultArtist
                    
            number = int(key)+1 # set the human readable segment number
            filename = re.search(search_FileName, val['audioUrl'].split('/')[-1:][0]).group(1) # set the filename
            
            if len(filename) < 1:
                logging.warning('no filename found; dropping segment')
                continue

            # prepend the segment number
            filename = str(number).zfill(3) + '_' + filename

            self.addSegment(Segment(audioURL = audioURL, filename = filename, 
                                    number = number, programName = self.name,
                                    title = title, artist = artist))
            
        return(True)
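
An illustration of the two regular expressions getepisode_HTML() relies on, applied to a tiny made-up HTML fragment; real NPR program pages are much larger, and the URL and title below are hypothetical.

import re, json

html = '''<time datetime="2018-05-27T12:00">
<b class="play" data-play-all='{"audioData": [{"audioUrl": "http://example.org/atc/01_story.mp3", "title": "First story", "program": "atc", "artist": ""}]}'></b>'''

showDate = re.search("datetime=\"(\d{4}-\d{2}-\d{2})", html).group(1)               # '2018-05-27'
jsonData = json.loads(re.search("<b.*data-play-all='({.*})'><\/b>", html).group(1))
jsonData['audioData'][0]['audioUrl']                                                # 'http://example.org/atc/01_story.mp3'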

In [7]:
class Segment():
    '''One segment of a podcast'''
    
    def __init__(self, audioURL = None, filename = None, number = 0, programName = None, artist = None, title = None):
        '''
        Args:
            audioURL (str): URL to specific downloadable content
            number (int): ordinal number of segment
            filename (str): output filename
            programName (str): program Name
            artist(str): artist
            title (str): human readable segment title
        Attributes:
            downloaded (bool): True if segment was successfully downloaded
            
        '''
        self.audioURL = audioURL
        self.number = number
        self.filename = filename
        self.title = title
        self.programName = programName
        self.artist = artist
        self.downloaded = False

In [8]:
class showConfig():
    '''Configuration object for a downloadable show'''
   
    def __init__(self, optionsDict = {}):
        '''
        Args:
            optionsDict (dict): dictionary of options to be used in configuration
                showname (str): human readable string
                fetchmethod (str): method for downloading show (NPR_HTML or NPR_API)
                programs (int): number of programs to keep
                updatedays (list): integers [0-6] representing days of the week to update (sun-sat)
                updatetime (str): time in 24H HH:MM format after which an update should be attempted
                timezone (str): timezone in which to perform time calculations
                url (str): url to NPR program page
        Attributes:
            options (dict): dictionary of options
            showName (str): human readable name of show
            fetchMethod (str): method for downloading show (NPR_HTML or NPR_API)
            programs (int): number of programs to keep
            updateDays (list): integers [0-6] representing days of the week to update (sun-sat)
            updateTime (str): time in HH:MM after which an update should be attempted
            timezone (str): timezone in which to perform time calculations
            url (str): url to NPR program page
    
        '''
        
        self.options = optionsDict
        self.showName = 'No Name'
        self.fetchMethod = 'NPR_HTML'
        self.programs = 1
        self.updateDays = []
        self.updateTime = ''
        self.timezone = 'EST'
        self.url = None
        
    def verifyConfig(self):
        '''
        
        Validates and sets configuration parameters for a downloadable show:
        
        Attributes:
            showName (str): human readable name of show
            fetchMethod (str): method for downloading show (NPR_HTML or NPR_API)
            programs (int): number of programs to keep
            updateDays (list): integers [0-6] representing days of the week to update (sun-sat)
            updateTime (str): time in HH:MM after which an update should be attempted
            timezone (str): timezone in which to perform time calculations
            
        Args:
            None
        
        Returns: 
            bool: True - configuration is OK or has been made OK
            
        '''
        
        logging.debug('verifying configuration')
        
        if 'showname' in self.options:
            self.showName = self.options['showname']
            logging.debug('show name set to: %s', self.showName)
        else: 
            logging.warn('no show name found; set to: %s', self.showName)
        
        if 'programs' in self.options:
            try:
                self.programs = int(self.options['programs'])
            except ValueError as e:
                logging.error('programs option not an integer: %s', e)
                logging.error('programs set to: %s', self.programs)
        else:
            logging.warning('no programs setting found in configuration file for %s; set to: %s', self.showName, self.programs)
        
        
        if 'url' in self.options:
            if re.match('^http:\/\/.*', self.options['url'].lower()):
                self.url = self.options['url']
            else:
                logging.error('no valid URL found for %s: %s', self.showName, self.options['url'])
                return(False)
        else:
            logging.error('no valid URL found for %s', self.showName)
            logging.error('valid url format: http://host.com/show/')
            return(False)
        
        
        if 'fetchmethod' in self.options:
            self.fetchMethod = self.options['fetchmethod']
            logging.debug('fetchmethod set to: %s', self.fetchMethod)
        else:
            logging.warning('no fetchmethod set; setting to: %s', self.fetchMethod)
        
        # This all may be unneeded; consider removing all of this.
        # use cmd+/ to uncomment the block below
#         defaultUpdateDays = [1, 2, 3, 4, 5, 6, 7]
#         if 'updatedays' in self.options:
#             # remove any non-numerals, -, or commas
#             self.options['updatedays'] = re.sub('[^\,0-9]+', '', self.options['updatedays'])
#             # clear out any superflous commas
#             self.options['updatedays'] = re.sub('\,\,', ',', self.options['updatedays']) 
            
#             try:
#                 self.updateDays = map(int, self.options['updatedays'].split(','))
#             except ValueError as e:
#                 logging.warn('bad or missing update date format: %s',e )
#                 logging.warn('using sun through sat')
#                 self.updateDays = defaultUpdateDays
 
#             badValues = []
#             for index in self.updateDays:
#                 # check for bad values that are less than 1 or greater than 7
#                 if index > 7 or index < 1:
#                     logging.warn('found invalid day in configuration file: %s',index)
#                     badValues.append(index)   
                    
#             # get rid of bad values
#             for index in badValues:
#                 logging.warn('removing invalid day: %s', index)
#                 self.updateDays.remove(index)
#             # sort the list 
#             self.updateDays.sort()
#         else:
#             # supply a list if none is supplied
#             logging.warn('no update days were supplied using sun through sat')
#             self.updateDays = defaultUpdateDays
        
        
#         # do some validation of valid timezones
#         if 'timezone' in self.options:
#             if self.options['timezone'].upper() in pytz.all_timezones:
#                 self.timezone = self.options['timezone'].upper()
#             else: 
#                 logging.error('specified timezone not found in database: %s', self.options['timezone'])
#                 logging.error('setting timezone to: UTC')
#                 self.timezone = 'UTC'
                
#         else:
#             logging.warning('no timezone found; setting timezone to: %s', self.timezone)

    
        
        # do some validation of valid times
        # time format
        timeFMT = '%H:%M'
        defaultTime = '23:59'
        if 'updatetime' in self.options:
            # sanitize the time string datetime.time(datetime.strptime('13:55', timeFMT))
            try:
                self.updateTime = datetime.time(datetime.strptime(re.sub('[^0-9\:]+', '', self.options['updatetime']), timeFMT))
            except ValueError as e:
                logging.error('bad updatetime time format: %s', self.options['updatetime'])
                logging.error('setting updatetime to: %s', defaultTime)
                self.updateTime = datetime.time(datetime.strptime(defaultTime, timeFMT))    
        else:
            self.updateTime = datetime.time(datetime.strptime(defaultTime, timeFMT))
            
        
        return(True)
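
A minimal sketch of how a show section read from the configuration file flows into showConfig; the option values are hypothetical.

options = {'showname': 'All Things Considered',
           'url': 'http://www.npr.org/programs/all-things-considered/',
           'fetchmethod': 'NPR_HTML',
           'programs': '2'}
show = showConfig(options)
show.verifyConfig()               # True; show.programs is now the integer 2
print show.showName, show.url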

In [10]:
def main(argv=None):
    ############### init variables 
    
    ##### LOGGING INIT
    # init the log; this removes any old log handlers (this is particularly useful when testing in an IDE)
    log = logging.getLogger()
    if len(log.handlers) > 0:
        for each in range(0, len(log.handlers)):
            log.removeHandler(log.handlers[0])
            
    # set the log format:
    # [  DEBUG 2017-02-12 19:14] loading module: requests
    logFormatter = logging.Formatter('[%(levelname)8s %(asctime)s] %(message)s', '%Y-%m-%d %H:%M')
    consoleFormatter = logging.Formatter('[%(levelname)-8s] %(message)s')
    # set root logger
    rootLogger = logging.getLogger()       
    
    # add a console handler to the root logger
    consoleHandler = logging.StreamHandler(sys.stdout)
    consoleHandler.setFormatter(logFormatter)
    rootLogger.addHandler(consoleHandler) 
    
    ############### CONFIGURATION VARIABLES
    # default configuration file
    homeDir = os.path.expanduser('~')
    cfgFile = homeDir + '/.config/podcastdownload/config.ini' 
    
    # set the configuration parser
    configParser = ConfigParser.SafeConfigParser()

    # required options in the 'Main' section
    # dict {'option name' : [configParser getter (get, getfloat, getboolean), 'default value']}
    mainSection = 'Main'    
    # list any special reserved section names here
    reservedSectionNames = [mainSection]
    # required items in the main section
    required = {'outputpath' : [configParser.get, homeDir + '/DownloadedShows']}
    
    # optional items in the configuration file
    optional = {'dryrun' : [configParser.getboolean, False],
                'timeout' : [configParser.getfloat, 5], 
                'loglevel': [configParser.get, 'ERROR'],
                'logfile' : [configParser.getboolean, False],
                'useragent': [configParser.get, '']}
    
    
    # sample show for creating a configuration file
    sampleShow = {
            '#showname': 'human readable name of the show',
            'showname' : 'SAMPLE SHOW: All Things Considered',
            '#url': 'url to index page for current NPR show',
            'url' : 'http://www.npr.org/programs/all-things-considered/',
            '#fetchmethod': 'method to use when fetching "NPR_HTML", "NPR_API" <-not yet implemented',
            'fetchmethod' : 'NPR_HTML',
            '#programs': 'number of downloaded programs to keep',
            'programs' : 2,
            '#loglevel': 'level of logging: "CRITICAL", "ERROR", "WARNING", "INFO", "DEBUG"',
            'loglevel': 'ERROR',
            '#useragent': 'list of strings to send with request separated with a "|"',
            'useragent': 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; FSL 7.0.6.01001)|Mozilla/5.0 (Windows NT 6.0) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.112 Safari/535.1'}
    
    ############### SHOW/DOWNLOAD VARIABLES
    # list of show configurations found in configuration file
    shows = []
    
    # list of program episodes to download
    downloadEpisodes = []
    
    # random generator object
    randomGenerator = SystemRandom()
    

    ############### READ AND ACT ON COMMAND LINE ARGUMENTS  
    # disable -h for help so the second parser can deal with this
    # http://stackoverflow.com/questions/3609852/which-is-the-best-way-to-allow-configuration-options-be-overridden-at-the-comman
    cmdlineParser = argparse.ArgumentParser(description = __doc__, 
                                           formatter_class = argparse.RawDescriptionHelpFormatter,
                                          add_help = False)
    # handle the jupyter -f option while developing in jupyter ipython notebook
    #cmdlineParser.add_argument('-f', '--fconfig', help='fake config file', action='store')
    # set the configuration file
    cmdlineParser.add_argument('-c', '--configfile', help='configuration file', metavar='FILE',
                              action='store', default = cfgFile)
    cmdlineParser.add_argument('-C', '--createconfig', help='create configuration file (can be used with -c)', 
                              action='store_true', default=False)
    # determine if this is a dry run or not
    cmdlineParser.add_argument('-d', '--dryrun', help='perform a dry-run with no downloads',
                              action='store_true', default=False)
    cmdlineParser.add_argument('-L', '--logfile', help = 'enable logging to file', 
                               action = 'store_true', default = False)
    cmdlineParser.add_argument('-o', '--outputpath', action = 'store', metavar = 'PATH', 
                        help = 'path to output downloaded files')
    cmdlineParser.add_argument('-t', '--timeout', action = 'store', type = float,
                        help = 'download timeout in seconds')
    cmdlineParser.add_argument('-v', '--verbose', action = 'count', 
                        help = 'verbose mode; add more -v to increase verbosity')
    cmdlineParser.add_argument('-V', '--version', action = 'store_true', default = False, help = 'print version and quit')

  
    # remaining arguments stored in unknownArgs
    args, unknownArgs = cmdlineParser.parse_known_args()
    
    if args.version:
        print version
        sys.exit()
        
    # set the logging level based on command line options
    if args.verbose:
        # subtract 10 for each -v, bringing the level down from 40 (ERROR)
        logLevel = logging.ERROR - args.verbose * 10
        # if the log level should somehow end up above 50 or below 10 it is set to 10 (DEBUG)
        if (50 < logLevel) or (logLevel < 10):
            logLevel = logging.DEBUG
        rootLogger.setLevel(logLevel)
    else:
        # the default level is ERROR 
        rootLogger.setLevel(logging.ERROR)    
    
    # create the configuration file and exit 
    if args.createconfig:
        logging.info('%s writing sample configuration file: %s', div(10, '-'), args.configfile)
        configParser.add_section(mainSection)
        logging.debug('adding section: %s', mainSection)
        logging.debug('adding required options: ')
        for value in required:
            logging.debug('     %s = %s', value, required[value][1])
            configParser.set(mainSection, str(value), str(required[value][1]))

        logging.debug('adding optional options:')
        for value in optional:
            logging.debug('     %s = %s', value, optional[value][1])            
            configParser.set(mainSection, str(value), str(optional[value][1]))

        configParser.add_section(sampleShow['showname'])
        logging.debug('adding sample show: %s', sampleShow['showname'])
        logging.debug('with options: ')
        for value in sampleShow:
            logging.debug('     %s = %s', value, sampleShow[value])
            configParser.set(sampleShow['showname'], str(value), str(sampleShow[value]))
        if os.path.isfile(args.configfile):
            print 'cowardly refusing to overwrite existing configuration file:', args.configfile
            print 'remove or rename existing config file before attempting to create a new one'
        else:
            try:
                with open(args.configfile, 'wb') as configoutput:
                    configParser.write(configoutput)
            except (IOError, OSError) as e:
                print 'error writing to configuration file', e
    
    ############### READ AND ACT ON CONFIGURATION FILE
    configParser.read(args.configfile)
    
    if mainSection not in configParser.sections():
        logging.error('No "%s" section in configuration file: %s', mainSection, args.configfile)
        logging.error('exiting')
        sys.exit()
    
    # look for each required option and set to default specified above if not found
    # container for all default settings read from config file
    default = {}
        
    for key in required:
        try:
            #default[key] = configParser.get(mainSection, key)
            default[key] = required[key][0](mainSection, key)
        except (ConfigParser.NoSectionError, ConfigParser.NoOptionError) as e:
            logging.error('problem in configuration file: %s', e)
            logging.error('using default value: %s = %s', key, required[key][1])
            default[key] = required[key][1]     
    
    
    for key in optional:
        try:
            default[key] = optional[key][0](mainSection, key)
        except (ConfigParser.NoSectionError, ConfigParser.NoOptionError) as e:
            logging.info('"%s" optional setting not found in configuration file "%s" section', key, mainSection)
            logging.info('this is OK!')
            logging.info('using default value: %s', optional[key][1])
            default[key] = optional[key][1]     
            
    
    ############### MERGE COMMANDLINE AND CONFIGURATION FILES TOGETHER
    # add in commandline arguments
    parser = argparse.ArgumentParser(parents=[cmdlineParser])
    # add in configuration file defaults
    parser.set_defaults(**default)
    
    # add all the known arguments to the parserArgs namespace, discard any unknown arguments
    parserArgs, uknownArgs = parser.parse_known_args()
 
    # add a file handler for the file log if needed
    if parserArgs.logfile:
        # Add a file handler to the root logger
        fileHandler = logging.FileHandler(programName+'.log')
        fileHandler.setFormatter(logFormatter)
        rootLogger.addHandler(fileHandler)

    # match the logging level set in the config file or on the command line
    # commandline -v options override
    if parserArgs.loglevel and not args.verbose:
        if isinstance(logging.getLevelName(parserArgs.loglevel.upper()), int):
            rootLogger.setLevel(parserArgs.loglevel.upper())

    # verify configuration options before proceeding
    # deal with unknown options
    if len(unknownArgs) > 0:
        logging.warn('ignoring unknown command line options:')
        for arg in unknownArgs:
            logging.warn('     %s', arg)
        
    # check for unreasonable timeouts
    if parserArgs.timeout > 120:
        logging.warn('timeout values under 120s are recommended: %s', parserArgs.timeout)
    
    # add a trailing '/' to the output path
    if not re.match('.*\/$', parserArgs.outputpath):
        parserArgs.outputpath = str(parserArgs.outputpath) + str('/')   
    
    # expand out any path variables
    parserArgs.outputpath = os.path.expanduser(parserArgs.outputpath)
                                 
    ############### LOAD NON STANDARD MODULES
    loadModules()
        
    ############### READ SHOWS FROM CONFIGURATION FILE
    logging.info('%s searching config file for shows', div(10, '-'))
        
    for section in configParser.sections():
        if section not in reservedSectionNames and '#' not in section:
            logging.info('%s found show: %s', div(5), section)
            show = (showConfig((dict(configParser.items(section)))))
            if show.verifyConfig():
                shows.append(show)
            else:
                logging.error('bad configuration for show "%s", skipping', section)
    if len(shows) <= 0:
        logging.critical('no shows found in configuration file') 
        logging.critical('nothing to do')
        sys.exit()
    
    #pdb.set_trace()
    
    ############### PARSE CONFIGURATION FOR EACH SHOW
    logging.info('%s parsing show information', div(10, '-'))
    for show in shows:
        # create an NPREpisode object and populate
        logging.debug('%s parsing configuration for show: [%s]', div(5), show.showName)
        # FIXME - cookie html header is hard coded here; add option to read from file; config file; etc.
        myEpisode = NPREpisode(name = show.showName, outputBasePath = parserArgs.outputpath, keep = show.programs, htmlheaders = {'Cookie': 'trackingChoice=true; choiceVersion=1'})
        #myEpisode.outputBasePath = parserArgs.outputpath
        myEpisode.programURL = show.url
        if myEpisode.getepisode_HTML():
            downloadEpisodes.append(myEpisode)
        else:
            logging.warning('error fetching show JSON information; see errors above')
    
    ############### DOWNLOAD EACH SHOW
    logging.info('%s downloading episodes', div(10, '-'))
    logging.debug('found %s episodes', len(downloadEpisodes))

    for episode in downloadEpisodes:
                
        logging.info('%s downloading: %s', div(5), episode.name)
        if episode.download(dryrun = parserArgs.dryrun, timeout = parserArgs.timeout,
                        useragent = randomGenerator.choice(parserArgs.useragent.split('|'))):
            
            if not parserArgs.dryrun:
                logging.debug('attempting to write M3U file')
                episode.writeM3U()
                episode.tagSegments()
            logging.info('success!')
            logging.info('%s cleaning up old episodes for %s', div(5), episode.name)
            #logging.debug('keeping a maximum of %s episodes', episode.keep)
            removed = episode.cleanUp()
            logging.debug('removed: %s', removed)

    print 'done'
    
    return(shows)

if __name__ == '__main__':
    main()


[ WARNING 2018-05-27 17:43] ignoring unknown command line options:
[ WARNING 2018-05-27 17:43]      -f
[ WARNING 2018-05-27 17:43]      /run/user/1000/jupyter/kernel-48c930b5-ee73-41f0-8595-f99b046ea50d.json
done
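
Typical invocations, assuming the notebook is exported as a script named podcastdownload.py: -C writes a sample configuration to ~/.config/podcastdownload/config.ini (or to the file given with -c), -d -vv performs a verbose dry run with no downloads, and -o PATH and -t SECONDS override the output path and download timeout from the configuration file.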

In [46]:
#from IPython.core.debugger import Tracer; Tracer()()