In [38]:
import os  
from itertools import groupby
from collections import namedtuple

file_dir = '/Users/whitehat/Downloads/2 - Architecture & Principles Subtitles/'


def convert_srt_files(root_dir):
    """Write a plain-text transcript next to every ``.srt`` file under ``root_dir``.

    Each subtitle file is split into cues (runs of non-blank lines separated
    by blank lines). A cue is expected to hold an index line, a timing line
    and one or more text lines; the text lines of all cues are joined with
    single spaces and written to ``<same name>.txt`` in the same directory.

    Args:
        root_dir: directory that is walked recursively.

    Returns:
        A list with the paths of the ``.txt`` files that were written.
    """
    written = []
    for dirname, dirnames, filenames in os.walk(root_dir):
        for filename in filenames:
            stem, ext = os.path.splitext(filename)
            # Only subtitle files are inputs. Without this check a second run
            # would open the generated .txt files, whose source and target
            # paths are identical, and truncate them.
            if ext.lower() != '.srt':
                continue
            with open(os.path.join(dirname, filename)) as f:
                cues = [list(g) for b, g in groupby(f, lambda x: bool(x.strip())) if b]
            texts = []
            for cue in cues:
                # cue[0] is the index, cue[1] the timing; everything after
                # that is text (a cue may wrap over several lines).
                if len(cue) >= 3:
                    texts.extend(text.rstrip() for text in cue[2:])
            target = os.path.join(dirname, stem + '.txt')
            with open(target, 'w') as f_to_write:
                # Join with spaces so the last word of one cue does not fuse
                # with the first word of the next.
                f_to_write.write(' '.join(texts))
            written.append(target)
    return written


# Assigned (rather than left as a bare expression) so the cell shows no output.
converted_paths = convert_srt_files(file_dir)

In [6]:
import os  
from itertools import groupby
from collections import namedtuple

file_dir = '/Users/whitehat/Downloads/2 - Architecture & Principles Subtitles'
# file_dir = '/Users/whitehat/Downloads/3 - Switching Subtitles'


def merge_srt_files(root_dir, output_name='output.txt'):
    """Merge the text of every ``.srt`` file under ``root_dir`` into one file.

    For each subtitle file the output contains the file name on its own line,
    then the text of all cues on a single line (each cue followed by a
    space), then a blank line.

    Args:
        root_dir: directory that is walked recursively; the merged file is
            also created here.
        output_name: file name of the merged transcript inside ``root_dir``.

    Returns:
        The path of the merged transcript.
    """
    output_path = os.path.join(root_dir, output_name)
    with open(output_path, 'w') as output_f:
        for dirname, dirnames, filenames in os.walk(root_dir):
            # os.walk yields entries in arbitrary order; sort (in place for
            # the directories) so the lessons come out in numbered order.
            dirnames.sort()
            for filename in sorted(filenames):
                # Skip everything that is not a subtitle file. This includes
                # the merged output itself, which lives in the walked
                # directory and would otherwise be read while being written.
                if os.path.splitext(filename)[1].lower() != '.srt':
                    continue
                with open(os.path.join(dirname, filename)) as f:
                    cues = [list(g) for b, g in groupby(f, lambda x: bool(x.strip())) if b]
                output_f.write(filename)
                output_f.write('\n')
                for cue in cues:
                    # cue[0] is the index, cue[1] the timing; the remaining
                    # lines are text (one or more per cue).
                    if len(cue) >= 3:
                        output_f.write(' '.join(text.rstrip() for text in cue[2:]) + ' ')
                output_f.write('\n\n')
    return output_path


# The path above is machine specific; report a missing directory instead of
# aborting the whole notebook run with an IOError.
if os.path.isdir(file_dir):
    merged_path = merge_srt_files(file_dir)
else:
    print('Skipping merge: %s is not a directory' % file_dir)

In [1]:
import os
from itertools import groupby
from collections import OrderedDict
import socket


class LectureParser():
    """ Creates a single ~/Desktop/output.txt file in the following format:
            Lecture1
                Lesson1
                ------------
                Lesson2
                ------------
            =======================
        Assumptions:
            The lecture directories (see get_lectures_dirs) are located
            somewhere below the user's ~/Downloads folder.
            The user executing this script can read ~/Downloads and write
            to ~/Desktop.
    """
    def __init__(self):
        """ Attributes:
                self.home_dir: the user's home directory (via os.path.expanduser).
                self.output_file: path of the generated file, ~/Desktop/output.txt.
        """
        self.home_dir = os.path.expanduser('~')
        self.output_file = os.path.join(self.get_desktop_dir(), 'output.txt')

    def get_downloads_dir(self):
        """ Returns:
                the user's ~/Downloads directory.
        """
        return os.path.join(self.home_dir, 'Downloads')

    def get_desktop_dir(self):
        """ Returns:
                the user's ~/Desktop directory.
        """
        return os.path.join(self.home_dir, 'Desktop')

    def get_lectures_dirs(self):
        """ Generates an ordered dictionary based on the known list of lectures.
            Returns:
                lectures: an ordered dictionary of format:
                            {'lecture_name_1': {}, 'lecture_name_2': {}, ... }
        """
        lecture_names = ['1 - Introduction Subtitles', '2 - Architecture & Principles Subtitles',
                         '3 - Switching Subtitles', '4 - Routing Subtitles',
                         '5 - Naming, Addressing & Forwarding Subtitles', '5. 1 - Router Design Basics Subtitles',
                         '5.2 - DNS Subtitles', '6 - Congestion control & streaming Subtitles',
                         '7 - Rate limiting and traffic shaping Subtitles', '8 - Content distribution Subtitles',
                         '9 - Software Defined Networking Subtitles', '9.1 - Programming SDNs Subtitles',
                         '10 - Traffic Engineering Subtitles', '11 - Network Security Subtitles',
                         '11.1 - Internet Worms Subtitles', '11.2 - Spam Subtitles',
                         '11.3 - Denial of Service Attacks Subtitles']

        # the ordered dictionary keeps the lectures in course order; every
        # lecture starts out with its own empty dictionary
        lectures = OrderedDict()
        for lecture_name in lecture_names:
            lectures.setdefault(lecture_name, {})
        return lectures

    def get_lectures(self):
        """ Continues to build the ordered dictionary by adding full paths and file names.
            Only '.srt' files are collected, and they are sorted by name.
            Returns:
                lectures_all: an ordered dictionary of format:
                            {'lecture_name_1': {'full/path/': [lesson_file_name_1, lesson_file_name_2, ...]}, ...}
        """
        lectures_all = self.get_lectures_dirs()
        # walk through ~/Downloads:
            # dirname - full path of the directory currently visited
            # dirnames - names of its sub-directories
            # filenames - names of the files directly inside it
        for dirname, dirnames, filenames in os.walk(self.get_downloads_dir()):
            # os.walk yields entries in arbitrary order; sorting in place
            # makes the traversal deterministic
            dirnames.sort()
            # only directories whose basename is a known lecture are used,
            # everything else in ~/Downloads is ignored
            dir_basename = os.path.basename(dirname)
            if dir_basename not in lectures_all:
                continue
            lessons = lectures_all[dir_basename].setdefault(dirname, [])
            # keep subtitle files only (skips e.g. hidden system files) and
            # sort them so the lessons come out in numbered order
            lessons.extend(sorted(name for name in filenames
                                  if name.lower().endswith('.srt')))
        return lectures_all

    def _read_cues(self, path):
        """ Splits a subtitle file into cues.
            Args:
                path: full path of the .srt file.
            Returns:
                a list of cues; each cue is the list of right-stripped,
                consecutive non-blank lines found between blank lines
                (index line, timing line, then the text lines).
        """
        with open(path, 'r') as f:
            return [[line.rstrip() for line in group]
                    for has_text, group in groupby(f, lambda x: bool(x.strip()))
                    if has_text]

    def build_transcript(self):
        """ Here the transcripts are parsed and written to the output file.
        """
        with open(self.output_file, 'w') as output_file:
            # .items() (instead of .iteritems()) works on Python 2 and 3
            for lecture, lesson_dirs in self.get_lectures().items():
                # write the name of the lecture
                output_file.writelines(['\n\n', 'Lecture ', lecture, '\n\n'])
                for file_dir, file_name_list in lesson_dirs.items():
                    for file_name in file_name_list:
                        # write the name of the lesson
                        output_file.writelines([file_name, '\n'])
                        cues = self._read_cues(os.path.join(file_dir, file_name))
                        if cues:
                            for cue in cues:
                                # the first two lines are index and timing;
                                # the text may span one or more lines
                                if len(cue) >= 3:
                                    output_file.write(' '.join(cue[2:]) + ' ')
                            output_file.writelines(['\n\n', '---'*10, '\n'])
                        else:
                            output_file.writelines(['Lecture without words :)', '\n\n', '---'*10, '\n'])
                    output_file.write('==='*20)
                    output_file.write('\n\n')

# Instantiate the parser; its constructor resolves the home directory and
# sets the output path to ~/Desktop/output.txt
parser = LectureParser()
# Walk ~/Downloads and write the combined transcript to the output file
parser.build_transcript()

In [ ]: