In [182]:
import sys
reload(sys)
sys.setdefaultencoding('utf8')
import json, re, datetime
import isodate
# Read the whole JSON file into memory. Later cells index `data` by position
# and iterate over it, so it is expected to hold a list of item dicts.
with open('data1.json') as json_file:
    data = json.load(json_file)
In [183]:
# Scratch check on a single item (index 100 is hard-coded).
# Length of that item in minutes. NOTE: a notebook cell only displays its last
# expression, so this value is evaluated and then discarded.
isodate.parse_duration(data[100]['contentDetails']['duration']).total_seconds()/60
# Raw duration string of the same item; this is the value of the cell.
data[100]['contentDetails']['duration']
Out[183]:
In [184]:
class IdleVideo(object):
    """Value holder for one parsed video and its list of topic timecodes.

    ``REGULAR`` and ``SPECIAL`` are the episode kinds intended for ``type``.
    """

    REGULAR = "REGULAR"
    SPECIAL = "SPECIAL"

    def __init__(self, type, title, episode, description, videoId, date, duration, timecodes):
        # Every argument is stored unchanged under an attribute of the same name.
        self.type, self.title, self.episode = type, title, episode
        self.description, self.videoId = description, videoId
        self.date, self.duration = date, duration
        self.timecodes = timecodes

    def validate(self):
        """Placeholder hook: performs no checks and returns None."""
        return None
class IdleTimecode(object):
    """Value holder for one topic segment of a video.

    The class constants are the segment categories intended for ``type``.
    """

    standard = "Standard"
    readerMail = "Reader Mail"
    robotNews = "Robot News"
    intro = "Intro"
    outro = "Outro"

    def __init__(self, type, startTime, endTime, title, comment):
        # Every argument is stored unchanged under an attribute of the same name.
        self.type = type
        self.startTime, self.endTime = startTime, endTime
        self.title, self.comment = title, comment
class IdleVideoError(Exception):
    """Raised when a video title or a timecode line cannot be parsed."""
In [185]:
def get_podcast_title_no_and_type(item):
    """Split a video title into ``(title, episode number, video type)``.

    The title is read from ``item['snippet']['title']``. Recognised shapes,
    checked in this order:

    * ``Idle Thumbs <number> - <title>`` or ``Idle Thumbs <number>: <title>``
      -> regular episode, number taken from the title.
    * ``Idle Thumbs - <title>`` or ``Idle Thumbs: <title>``
      -> special, episode number -1.
    * ``Idle Thumbs XI - <title>`` / ``Idle Thumbs XI-2 - <title>``
      -> regular episodes hard-coded as 11 and 12.

    Returns:
        Tuple ``(title, float(episode), type)`` where ``type`` is
        ``IdleVideo.REGULAR`` or ``IdleVideo.SPECIAL``.

    Raises:
        IdleVideoError: the title matches none of the shapes above.
        ValueError: the matched number is not a valid float (e.g. "1.2.3").
    """
    full_title = item['snippet']['title']

    # With one capture group, re.split yields [before, number, after, ...] on a match.
    regular_parts = re.split(r'Idle Thumbs ([\d\.]+)(?: -|:) ', full_title)
    if len(regular_parts) >= 3:
        return (regular_parts[2], float(regular_parts[1]), IdleVideo.REGULAR)

    special_parts = re.split(r'Idle Thumbs(?: -|:) ', full_title)
    if len(special_parts) >= 2:
        return (special_parts[1], float(-1), IdleVideo.SPECIAL)

    # Roman-numeral episodes that the numeric pattern cannot match.
    for prefix, episode in (("Idle Thumbs XI - ", 11), ("Idle Thumbs XI-2 - ", 12)):
        if prefix in full_title:
            return (full_title.split(prefix)[1], float(episode), IdleVideo.REGULAR)

    # Build the message as text directly: round-tripping the title through
    # str(...).encode('utf-8') only worked thanks to sys.setdefaultencoding and
    # otherwise replaced this error with an unrelated Unicode/Type error.
    raise IdleVideoError(u"Something wrong with parsing this title\n  " + full_title)
In [201]:
def _idle_topic_type(topic_title):
    """Map a topic title onto one of the IdleTimecode category constants."""
    named_types = {
        "Reader Mail": IdleTimecode.readerMail,
        "Outro": IdleTimecode.outro,
        "Intro": IdleTimecode.intro,
        "Robot News": IdleTimecode.robotNews,
    }
    return named_types.get(topic_title, IdleTimecode.standard)


def _idle_timestamp_to_seconds(timestamp, video_title):
    """Convert ``MM:SS`` or ``H:MM:SS`` text to seconds, else raise IdleVideoError."""
    try:
        numbers = [int(field) for field in timestamp.split(":")]
    except ValueError:
        # Non-numeric field: report it through the same IdleVideoError below.
        numbers = []
    if len(numbers) == 2:
        return numbers[0] * 60 + numbers[1]
    if len(numbers) == 3:
        return numbers[0] * 3600 + numbers[1] * 60 + numbers[2]
    raise IdleVideoError(u"".join([
        u"Something wrong with parsing this timecode: <", timestamp,
        u">\n\n Episode:", video_title]))


def process_video_data(item):
    """Build an IdleVideo (with its topic timecodes) from one item dict.

    Reads ``item['snippet']`` (``title``, ``description``, ``publishedAt``),
    ``item['id']`` and, when present, ``item['contentDetails']['duration']``.

    The timecode list is taken to start at the first description line that
    begins with ``MM:SS`` / ``H:MM:SS``; from there on, every line containing an
    em dash is parsed as ``<timestamp> \u2014 <topic>[ - <comment>]``.

    Returns:
        An IdleVideo, or None when the description contains no timecode list.
        ``timecodes`` is ordered last-to-first (reverse of the description).
        Each entry ends where the following topic starts; the final topic ends
        at the video duration in seconds, or -1 when no duration is available.

    Raises:
        IdleVideoError: the title, a timecode line or a timestamp is malformed.
    """
    # Keep the video's kind in its own name: the per-topic category used to be
    # stored in the same variable and leaked into the returned IdleVideo.
    title, epNo, video_type = get_podcast_title_no_and_type(item)
    snippet = item['snippet']

    timecode_match = re.search(r"\n(?:\d+:)?\d\d:\d\d.*", snippet['description'], flags=re.DOTALL)
    if timecode_match is None:
        return None

    duration = None
    raw_duration = (item.get('contentDetails') or {}).get('duration')
    if raw_duration:
        duration = isodate.parse_duration(raw_duration)

    timecodes = []
    # Walk bottom-up so each entry can take its end time from the entry that
    # follows it in the description (already appended as timecodes[-1]).
    for line in reversed(timecode_match.group().split('\n')):
        # e.g. "04:29 \u2014 Heat Signature - Sort of like Hotline Miami ..."
        if u"\u2014" not in line:
            continue

        # maxsplit=1 keeps any further em dashes as part of the topic text.
        parts = line.split(u" \u2014 ", 1)
        if len(parts) < 2:
            raise IdleVideoError(u"Something wrong with parsing this timecode: " + line)

        # Repair ";" typos in the timestamp only, not in the topic text.
        timestamp = parts[0].replace(";", ":")
        # partition keeps later " - " separators inside the comment.
        topic_title, _, topic_comment = parts[1].partition(" - ")

        start_time = _idle_timestamp_to_seconds(timestamp, snippet['title'])

        if timecodes:
            end_time = timecodes[-1].startTime
        elif duration:
            end_time = duration.total_seconds()
        else:
            end_time = -1

        timecodes.append(IdleTimecode(
            type=_idle_topic_type(topic_title),
            startTime=start_time,
            endTime=end_time,
            title=topic_title,
            comment=topic_comment))

    return IdleVideo(
        type=video_type,
        title=title,
        episode=epNo,
        description="",
        videoId=item['id'],
        date=isodate.parse_datetime(snippet['publishedAt']),
        duration=duration,
        timecodes=timecodes)
In [206]:
# Parse every item and keep the ones that produce an IdleVideo. Items for
# which process_video_data returns None are skipped silently.
videos = []
for item in data:
    try:
        video = process_video_data(item)
    except Exception as error:
        # Best-effort run: one bad item must not stop the loop, but report why
        # it was dropped instead of printing only its title.
        print(u"{0} -> {1!r}".format(item['snippet']['title'], error))
        continue
    if video is not None:
        videos.append(video)
print(len(videos))
In [ ]: