In [38]:
import os
from itertools import groupby
from collections import namedtuple
# Directory holding one lecture's .srt subtitle files. Built from the current
# user's home directory so the cell is not tied to a single account name.
file_dir = os.path.join(os.path.expanduser('~'), 'Downloads',
                        '2 - Architecture & Principles Subtitles')


def srt_cue_text(srt_lines):
    """Return the text of every subtitle cue found in ``srt_lines``.

    Args:
        srt_lines: any iterable of lines (an open .srt file or a list of strings).

    Returns:
        A list with one string per cue. Blank lines separate cues; inside a cue
        the first line is the index, the second the timestamp, and every
        remaining line is text. Multi-line text is joined with single spaces.
        Cues with no text line are skipped.
    """
    cues = [list(group)
            for has_content, group in groupby(srt_lines, lambda raw: bool(raw.strip()))
            if has_content]
    return [' '.join(part.rstrip() for part in cue[2:])
            for cue in cues if len(cue) >= 3]


# Convert every <name>.srt under file_dir into a sibling <name>.txt.
for dirname, _, filenames in os.walk(file_dir):
    for filename in sorted(filenames):
        stem, extension = os.path.splitext(filename)
        # Only subtitle files are inputs. Without this filter a second run would
        # read the .txt files generated by the first run and overwrite them.
        if extension.lower() != '.srt':
            continue
        with open(os.path.join(dirname, filename)) as f:
            texts = srt_cue_text(f)
        with open(os.path.join(dirname, stem + '.txt'), 'w') as f_to_write:
            # Separate cues with a space so adjacent words do not run together.
            f_to_write.write(' '.join(texts))
In [6]:
import os
from itertools import groupby
from collections import namedtuple
# Lecture directory to convert. Built from the current user's home directory so
# the cell is not tied to a single account name.
file_dir = os.path.join(os.path.expanduser('~'), 'Downloads',
                        '2 - Architecture & Principles Subtitles')
# file_dir = os.path.join(os.path.expanduser('~'), 'Downloads', '3 - Switching Subtitles')


def srt_cue_text(srt_lines):
    """Return the text of every subtitle cue found in ``srt_lines``.

    Args:
        srt_lines: any iterable of lines (an open .srt file or a list of strings).

    Returns:
        A list with one string per cue. Blank lines separate cues; inside a cue
        the first line is the index, the second the timestamp, and every
        remaining line is text. Multi-line text is joined with single spaces.
        Cues with no text line are skipped.
    """
    cues = [list(group)
            for has_content, group in groupby(srt_lines, lambda raw: bool(raw.strip()))
            if has_content]
    return [' '.join(part.rstrip() for part in cue[2:])
            for cue in cues if len(cue) >= 3]


def write_combined_transcript(source_dir, output_path):
    """Write the text of every .srt file under ``source_dir`` into ``output_path``.

    For each subtitle file the output contains the file name on its own line,
    then the cue texts each followed by a space, then a blank line.

    Args:
        source_dir: directory tree to search for .srt files.
        output_path: file to create or overwrite with the combined transcript.
    """
    with open(output_path, 'w') as output_f:
        for dirname, dirnames, filenames in os.walk(source_dir):
            # Sorting in place makes os.walk visit sub-directories in a stable order.
            dirnames.sort()
            for filename in sorted(filenames):
                # Only subtitle files are inputs. This also keeps the output file
                # itself (when it lives inside source_dir) and any other
                # generated .txt files from being read back in.
                if os.path.splitext(filename)[1].lower() != '.srt':
                    continue
                with open(os.path.join(dirname, filename)) as f:
                    texts = srt_cue_text(f)
                output_f.write(filename)
                output_f.write('\n')
                for text in texts:
                    output_f.write(text + ' ')
                output_f.write('\n\n')


if os.path.isdir(file_dir):
    write_combined_transcript(file_dir, os.path.join(file_dir, 'output.txt'))
else:
    # Opening output.txt inside a missing directory would fail; report it instead.
    print('Subtitle directory not found: ' + file_dir)
In [1]:
import os
from itertools import groupby
from collections import OrderedDict
import socket
class LectureParser():
    """ Creates a single ~/Desktop/output.txt file in the following format:
        Lecture <lecture directory name>
            <lesson file name>
            <lesson text>
            ------------
            <lesson file name>
            <lesson text>
            ------------
        =======================
    Assumptions:
        The lecture directories are located somewhere under the ~/Downloads folder.
        The user executing this script can read ~/Downloads and write to ~/Desktop.
    """

    def __init__(self):
        """ Attributes:
            self.home_dir: the user's home directory, resolved via os.path.expanduser.
            self.output_file: path of the output file generated on the user's ~/Desktop.
        """
        self.home_dir = os.path.expanduser('~')
        self.output_file = os.path.join(self.get_desktop_dir(), 'output.txt')

    def get_downloads_dir(self):
        """ Returns:
            downloads_dir: the user's ~/Downloads directory.
        """
        downloads_dir = os.path.join(self.home_dir, 'Downloads')
        return downloads_dir

    def get_desktop_dir(self):
        """ Returns:
            desktop_dir: the user's ~/Desktop directory.
        """
        desktop_dir = os.path.join(self.home_dir, 'Desktop')
        return desktop_dir

    def get_lectures_dirs(self):
        """ Generates an ordered dictionary based on the known list of lectures.
        Returns:
            lectures: an ordered dictionary of format:
                {'lecture_name_1': {}, 'lecture_name_2': {}, ... }
        """
        # NOTE(review): '5. 1 - Router Design Basics Subtitles' contains a space after
        #+ the dot, unlike '5.2'; kept as-is -- confirm it matches the directory on disk.
        lecture_names = ['1 - Introduction Subtitles', '2 - Architecture & Principles Subtitles',
                         '3 - Switching Subtitles', '4 - Routing Subtitles',
                         '5 - Naming, Addressing & Forwarding Subtitles', '5. 1 - Router Design Basics Subtitles',
                         '5.2 - DNS Subtitles', '6 - Congestion control & streaming Subtitles',
                         '7 - Rate limiting and traffic shaping Subtitles', '8 - Content distribution Subtitles',
                         '9 - Software Defined Networking Subtitles', '9.1 - Programming SDNs Subtitles',
                         '10 - Traffic Engineering Subtitles', '11 - Network Security Subtitles',
                         '11.1 - Internet Worms Subtitles', '11.2 - Spam Subtitles',
                         '11.3 - Denial of Service Attacks Subtitles',]
        # the ordered dictionary preserves the lecture order given above
        lectures = OrderedDict()
        # every lecture name becomes a key pointing to its own (still empty) dictionary
        for lecture_name in lecture_names:
            lectures.setdefault(lecture_name, {})
        return lectures

    def get_lectures(self):
        """ Continues to build the ordered dictionary by adding full paths and file names.
        Only files ending in .srt are collected, sorted by name so lessons
        ('01 - ...', '02 - ...') come out in order.
        Returns:
            lectures_all: an ordered dictionary of format:
                {'lecture_name_1': {'full/path/': [lesson_file_name_1, lesson_file_name_2, ...]}, ...}
        """
        # get the ordered dictionary with keys and empty values
        lectures_all = self.get_lectures_dirs()
        # only directories named like a known lecture are of interest; everything
        #+ else found in ~/Downloads is ignored
        valid_dirs = lectures_all.keys()
        # walk ~/Downloads:
        #   dirname   - full path of the directory currently visited
        #   dirnames  - names of the sub-directories inside dirname
        #   filenames - names of the files inside dirname
        for dirname, dirnames, filenames in os.walk(self.get_downloads_dir()):
            # the last path component decides whether this is a lecture directory
            dir_basename = os.path.basename(dirname)
            if dir_basename in valid_dirs:
                # lectures_all -> {'2 - Architecture & Principles Subtitles':
                #                    {
                #                     '/Users/user_name/Downloads/2 - Architecture & Principles Subtitles':
                #                        ['01 - Lesson 2 Intro.srt',
                #                         '02 - A Brief History of the Internet.srt',
                #                         ...
                #                        ],
                #                     ...
                #                    },
                #                  ...
                #                 }
                lesson_files = lectures_all[dir_basename].setdefault(dirname, [])
                # skip anything that is not a subtitle file (.DS_Store, generated
                #+ .txt transcripts, ...) so it is not reported as a lesson
                lesson_files.extend(sorted(name for name in filenames
                                           if name.lower().endswith('.srt')))
        return lectures_all

    @staticmethod
    def _read_cues(srt_path):
        """ Reads an .srt file and groups its lines into subtitle cues.
        Args:
            srt_path: full path of the .srt file.
        Returns:
            a list of lists of lines; runs of non-blank lines separated by blank
            lines form one inner list each. Empty when the file has no content.
        """
        with open(srt_path, 'r') as srt_file:
            return [list(group)
                    for has_content, group in groupby(srt_file, lambda raw: bool(raw.strip()))
                    if has_content]

    def build_transcript(self):
        """ Here the transcripts are parsed and written to the output file.
        """
        with open(self.output_file, 'w') as output_file:
            # .items() (rather than .iteritems()) works on both Python 2 and Python 3
            for lecture, lesson_dirs in self.get_lectures().items():
                # write the name of the lecture
                output_file.writelines(['\n\n', 'Lecture ', lecture, '\n\n'])
                for file_dir, file_name_list in lesson_dirs.items():
                    for file_name in file_name_list:
                        # write the name of the lesson
                        output_file.writelines([file_name, '\n'])
                        cues = self._read_cues(os.path.join(file_dir, file_name))
                        if cues:
                            for cue in cues:
                                # a cue is: index line, timestamp line, then one or more
                                #+ text lines; keep all text lines so multi-line
                                #+ subtitles are not dropped
                                if len(cue) >= 3:
                                    text = ' '.join(part.rstrip() for part in cue[2:])
                                    output_file.write(text + ' ')
                            output_file.writelines(['\n\n', '---'*10, '\n'])
                        else:
                            output_file.writelines(['Lecture without words :)', '\n\n', '---'*10, '\n'])
                # close the lecture section
                output_file.write('==='*20)
                output_file.write('\n\n')
# Instantiate the parser; its output file is output.txt on the user's ~/Desktop
parser = LectureParser()
# Walk ~/Downloads for the known lecture folders and write the combined transcript
parser.build_transcript()
In [ ]: