notebook.community

Edit and run



In [182]:

    
# Python 2-only workaround: reload(sys) makes sys.setdefaultencoding reachable
# again so the interpreter-wide default codec for implicit str <-> unicode
# conversion can be switched to UTF-8.
# NOTE(review): this affects every implicit conversion in the kernel, and
# neither reload() as a builtin nor sys.setdefaultencoding exists on Python 3.
import sys
reload(sys)
sys.setdefaultencoding('utf8')
import json, re, datetime
# isodate is used by later cells to parse ISO 8601 durations and datetimes.
import isodate
# Load the video metadata. Later cells index `data` by position and read the
# 'snippet' and 'contentDetails' entries of each element.
with open('data1.json', 'r') as f:
    data = json.load(f)



In [183]:

    
# Scratch check on entry 100: its duration converted to minutes. This is not
# the last expression of the cell, so the value is computed but not displayed.
isodate.parse_duration(data[100]['contentDetails']['duration']).total_seconds()/60
# Shown as the cell output: the raw duration string as stored in the data.
data[100]['contentDetails']['duration']









    Out[183]:





u'PT1H27M30S'



In [184]:

    
class IdleVideo(object):
    """Record for one video: its metadata plus the timecodes parsed for it.

    REGULAR and SPECIAL are the labels intended for the ``type`` attribute.
    """

    REGULAR = "REGULAR"
    SPECIAL = "SPECIAL"

    def __init__(self, type, title, episode, description, videoId, date, duration, timecodes):
        # Every constructor argument is stored under an attribute of the same name.
        supplied = (
            ("type", type),
            ("title", title),
            ("episode", episode),
            ("description", description),
            ("videoId", videoId),
            ("date", date),
            ("duration", duration),
            ("timecodes", timecodes),
        )
        for attribute, value in supplied:
            setattr(self, attribute, value)

    def validate(self):
        """Placeholder hook: performs no checks yet and returns None."""
        return None


class IdleTimecode(object):
    """One entry of a video's timecode listing.

    The class constants are the labels used for the ``type`` attribute; the
    four named segments are matched by topic title elsewhere in this
    notebook and everything else is labelled ``standard``.
    """

    standard = "Standard"
    readerMail = "Reader Mail"
    robotNews = "Robot News"
    intro = "Intro"
    outro = "Outro"

    def __init__(self, type, startTime, endTime, title, comment):
        # Pair each attribute name with the argument of the same name and store it.
        names = ("type", "startTime", "endTime", "title", "comment")
        values = (type, startTime, endTime, title, comment)
        for name, value in zip(names, values):
            setattr(self, name, value)

    
class IdleVideoError(Exception):
    """Raised when a video's title or one of its timecode lines cannot be parsed."""



In [185]:

    
def get_podcast_title_no_and_type(item):
    """Split a video's title into (title, episode number, video type).

    Recognised title shapes, in order of precedence (the separator after the
    prefix is either " - " or ": "):

    * ``Idle Thumbs <number><sep><title>`` -> regular episode; the number is
      converted with ``float()``.
    * ``Idle Thumbs<sep><title>`` -> special; the episode number is -1.
    * ``Idle Thumbs XI - <title>`` / ``Idle Thumbs XI-2 - <title>`` -> regular
      episodes 11 and 12 (roman-numeral titles the numeric pattern misses).

    Args:
        item: mapping holding the title at ``item['snippet']['title']``.

    Returns:
        Tuple ``(title, episode_number, video_type)`` where ``episode_number``
        is a float and ``video_type`` is ``IdleVideo.REGULAR`` or
        ``IdleVideo.SPECIAL``.

    Raises:
        IdleVideoError: the title matches none of the shapes above.
        ValueError: the numeric part is not a valid float (e.g. "1.2.3").
    """
    raw_title = item['snippet']['title']

    # The capture group makes re.split return [before, number, after, ...].
    regular_parts = re.split(r'Idle Thumbs ([\d\.]+)(?: -|:) ', raw_title)
    if len(regular_parts) >= 3:
        return (regular_parts[2], float(regular_parts[1]), IdleVideo.REGULAR)

    special_parts = re.split(r'Idle Thumbs(?: -|:) ', raw_title)
    if len(special_parts) >= 2:
        return (special_parts[1], float(-1), IdleVideo.SPECIAL)

    for prefix, episode_number in (("Idle Thumbs XI - ", 11), ("Idle Thumbs XI-2 - ", 12)):
        if prefix in raw_title:
            return (raw_title.split(prefix)[1], float(episode_number), IdleVideo.REGULAR)

    # Join the title as text. Round-tripping it through str().encode() could
    # itself fail on non-ASCII titles and mask the IdleVideoError below.
    raise IdleVideoError(u' '.join([u"Something wrong with parsing this title\n ", raw_title]))



In [201]:

    
def _timestamp_to_seconds(timestamp):
    """Convert "MM:SS" or "H:MM:SS" to whole seconds; return None for any other shape."""
    fields = timestamp.split(":")
    if len(fields) == 2:
        return int(fields[0]) * 60 + int(fields[1])
    if len(fields) == 3:
        return int(fields[0]) * 3600 + int(fields[1]) * 60 + int(fields[2])
    return None


def process_video_data(item):
    """Build an IdleVideo (with its timecodes) from one video metadata item.

    The description is searched for the first line that starts with a
    timestamp. From there to the end, every line containing an em dash is
    parsed as ``<timestamp> \u2014 <topic title>[ - <comment>]``, for example
    ``04:29 \u2014 Heat Signature - Sort of like Hotline Miami``.

    Args:
        item: mapping with ``'id'``, ``'snippet'`` (``'title'``,
            ``'description'``, ``'publishedAt'``) and optionally
            ``'contentDetails'`` (``'duration'`` as an ISO 8601 string).

    Returns:
        An IdleVideo, or None when the description has no timecode section.
        ``timecodes`` is in reverse listing order (last listed topic first),
        because lines are walked backwards so each entry can take the start
        of the following topic as its end. The last listed topic ends at the
        video duration, or at -1 when the duration is unknown.

    Raises:
        IdleVideoError: a timecode line has no " \u2014 " separator or its
            timestamp is not MM:SS / H:MM:SS. Also propagated from the title
            parser.
        ValueError: a timestamp field is not an integer.
    """
    # Keep the video type in its own name. It used to share `type` with the
    # per-timecode type below, so the returned video was tagged with the type
    # of the last processed timecode.
    title, epNo, video_type = get_podcast_title_no_and_type(item)

    timecode_re_search = re.search(r"\n(?:\d+:)?\d\d:\d\d.*", item['snippet']['description'], flags=re.DOTALL)
    if timecode_re_search is None:
        # Deliberately not an error: videos without a timecode list are skipped.
        return None

    # A missing contentDetails or missing duration leaves duration as None
    # instead of failing inside the parser.
    duration = None
    raw_duration = (item.get('contentDetails') or {}).get('duration')
    if raw_duration is not None:
        duration = isodate.parse_duration(raw_duration)

    named_segments = {
        "Reader Mail": IdleTimecode.readerMail,
        "Outro": IdleTimecode.outro,
        "Intro": IdleTimecode.intro,
        "Robot News": IdleTimecode.robotNews,
    }

    timecodes = []
    for line in reversed(timecode_re_search.group().split('\n')):
        if u"\u2014" not in line:
            continue

        # Split once so an em dash inside the topic text stays in the topic.
        pieces = line.split(u" \u2014 ", 1)
        if len(pieces) < 2:
            raise IdleVideoError(u''.join([u"Something wrong with parsing this timecode: ", line]))

        # Some timestamps are typed with ';' instead of ':'. Repair only the
        # timestamp so semicolons in the topic text are preserved.
        timestamp = pieces[0].replace(";", ":")
        # partition keeps any further " - " inside the comment intact.
        topic_title, _, topic_comment = pieces[1].partition(" - ")

        start_time = _timestamp_to_seconds(timestamp)
        if start_time is None:
            raise IdleVideoError(u''.join([
                u"Something wrong with parsing this timecode: <", timestamp,
                u">\n\n Episode:", item['snippet']['title'],
            ]))

        if timecodes:
            # The previously processed entry is the next topic in the listing.
            end_time = timecodes[-1].startTime
        elif duration:
            end_time = duration.total_seconds()
        else:
            end_time = -1

        timecodes.append(IdleTimecode(
            type=named_segments.get(topic_title, IdleTimecode.standard),
            startTime=start_time,
            endTime=end_time,
            title=topic_title,
            comment=topic_comment,
        ))

    return IdleVideo(
        type=video_type,
        title=title,
        episode=epNo,
        description="",
        videoId=item['id'],
        date=isodate.parse_datetime(item['snippet']['publishedAt']),
        duration=duration,
        timecodes=timecodes,
    )



In [206]:

    
# Parse every item. A failure on one video must not stop the batch, but the
# reason is reported next to the title so bad data can be told apart from
# programming errors.
videos = []
for item in data:
    try:
        video = process_video_data(item)
    except Exception as exc:
        # Look the title up defensively so reporting cannot itself raise on an
        # item that lacks 'snippet' or 'title'.
        label = (item.get('snippet') or {}).get('title', u'<untitled>')
        print(u"%s -> %r" % (label, exc))
        continue
    # None means the description had no timecode section; such videos are skipped.
    if video is not None:
        videos.append(video)
print(len(videos))



In [ ]: