notebook.community

Edit and run



In [24]:

    
""" IMDB - Movie Writers and Directors

    IMDB has a wealth of information about movies, TV shows, Video Games, etc.
    I know more actors from my favorite movies than directors, and don't know
    anything about the writers.
    So I want to use IMDB's data to learn about writers and directors.
    First things first, I need to pare down these Giant IMDB dump files.
    The data is available at: http://www.imdb.com/interfaces
"""
import gzip
import pandas as pd


def trim_person_file(file_path, person_type, start_line, end_line):
    """ Trim an IMDB writers/directors dump down to plain movie credits.

        Reads the gzipped IMDB list file at `file_path` and writes a
        gzipped, tab-separated file next to it ("<name>.trimmed.gz") with
        one PERSON / MOVIE_TITLE / YEAR row per credit. The input file
        format is like:
    Coen, Ethan\t\tA Fever in the Blood (2002)  (story "A Fever in the Blood")
    \t\t\tA Serious Man (2009)  (written by)  <1,1,2>
    \t\t\tBarton Fink (1991)  (written by)  <1,1,2>
    
    Barnes, Craig (VII)\t"Mike's Ma's Balls" (2015)

        Args:
            file_path:   path of the gzipped, ISO-8859-1 encoded dump.
            person_type: header label for the first column
                         (e.g. 'WRITER' or 'DIRECTOR').
            start_line:  1-based number of the first line to parse;
                         everything before it is skipped.
            end_line:    1-based number of the last line to parse;
                         reading stops after it.

        Returns:
            None. The result is written to disk.

        Known limitation: the title is everything before the first ' ('
        so a title that itself contains ' (' is truncated and gets an
        empty year.
    """
    person = ''
    rows = [[person_type, 'MOVIE_TITLE', 'YEAR']]

    # Stream the dump line by line instead of loading millions of lines
    # at once; the `with` block closes the file even if parsing fails.
    with gzip.open(file_path, 'rb') as src:
        for line_num, raw in enumerate(src, start=1):
            # only the [start_line, end_line] window holds credit data
            if line_num < start_line:
                continue
            if line_num > end_line:
                break
            if not raw.strip():
                continue

            line = raw.decode('ISO-8859-1')
            fields = line.split('\t')
            # a line that does not start with a tab introduces a new person;
            # continuation lines (leading tabs) belong to the previous one
            if line[0] != '\t':
                person = fields[0]

            credit = fields[-1]
            if not credit.strip():
                # name with no credit after the tab: nothing to record
                continue
            # quoted titles are TV series; (V)/(TV)/(VG) mark video, TV
            # movie and video game entries
            if (credit.startswith('"') or ' (V)' in line
                    or ' (TV)' in line or ' (VG)' in line):
                continue

            parts = credit.strip().split(' (')
            title = parts[0]
            try:
                year = str(int(parts[1][:4]))
            except (IndexError, ValueError):
                # no parenthesised part, or a non-numeric year like '????'
                year = ''
            rows.append([person, title, year])

    # Build the output name by removing a real '.gz' suffix.
    # str.rstrip('gz') strips *characters*, so it would mangle names
    # such as 'names.log'.
    if file_path.endswith('.gz'):
        base = file_path[:-len('.gz')]
    else:
        base = file_path

    # write the tab-separated file (& gzip it)
    with gzip.open(base + '.trimmed.gz', 'wb') as dst:
        for row in rows:
            dst.write(('\t'.join(row) + '\n').encode('ISO-8859-1'))



In [25]:

    
""" Now let's create the trimmed writers and directors CSV files. """

# start_line / end_line bound the part of each dump that gets parsed;
# presumably they match one specific IMDB snapshot -- re-check them
# whenever the .list.gz files are refreshed.
trim_person_file(
    file_path='writers.list.gz',
    person_type='WRITER',
    start_line=303,
    end_line=4298496,
)
trim_person_file(
    file_path='directors.list.gz',
    person_type='DIRECTOR',
    start_line=236,
    end_line=2757403,
)



In [37]:

    
def trim_ratings_file(file_path, start_line, end_line):
    """ Trim the IMDB ratings dump down to one row per movie.

        Reads the gzipped, fixed-width IMDB ratings list at `file_path`
        and writes a gzipped, tab-separated file next to it
        ("<name>.trimmed.gz") with VOTES / RANK / MOVIE_TITLE / YEAR
        columns. The input file format is like:
          1000000102      62   6.2  "#1 Single" (2006)
          1.0..01103      14   7.1  "#7DaysLater" (2013)
          2....0.013      11   6.8  "#Bikerlive" (2014)
          2.01..3.01      13   5.5  "#ByMySide" (2012)
          0000011101      24   6.5  The Big Leap (2013)
          0000001222  491417   8.2  The Big Lebowski (1998)

        Args:
            file_path:  path of the gzipped, ISO-8859-1 encoded dump.
            start_line: 1-based number of the first line to parse;
                        everything before it is skipped.
            end_line:   1-based number of the last line to parse;
                        reading stops after it.

        Returns:
            None. The result is written to disk.

        Raises:
            ValueError: if a row inside the window has a vote count or
                rank that is not numeric. Nothing is written in that case.

        Known limitation: the title is everything before the first ' ('
        so a title that itself contains ' (' is truncated and gets an
        empty year.
    """
    rows = [['VOTES', 'RANK', 'MOVIE_TITLE', 'YEAR']]

    # Stream the dump line by line; the `with` block closes the file even
    # when a malformed row aborts the run.
    with gzip.open(file_path, 'rb') as src:
        for line_num, raw in enumerate(src, start=1):
            # only the [start_line, end_line] window holds rating rows
            if line_num < start_line:
                continue
            if line_num > end_line:
                break

            line = raw.decode('ISO-8859-1')
            # the title starts at column 32, so blank or shorter lines
            # cannot be data rows (and would break the index below)
            if len(line) <= 32:
                continue
            if line[32] == '"':
                # this is a TV show, skip
                continue
            if ' (V)' in line or ' (TV)' in line or ' (VG)' in line:
                # this isn't a real movie (video, TV movie or video game);
                # same filter as trim_person_file uses
                continue

            votes = line[16:25].strip()
            rank = line[25:30].strip()
            try:
                int(votes)
                float(rank)
            except ValueError:
                # fail loudly instead of printing and returning as if the
                # run had succeeded
                raise ValueError(
                    'line {0}: cannot parse votes {1!r} / rank {2!r} in {3!r}'
                    .format(line_num, votes, rank, line))

            movie = line[31:].strip().split(' (')
            movie_title = movie[0]
            try:
                year = str(int(movie[1][:4]))
            except (IndexError, ValueError):
                # no parenthesised part, or a non-numeric year like '????'
                year = ''
            rows.append([votes, rank, movie_title, year])

    # Build the output name by removing a real '.gz' suffix.
    # str.rstrip('gz') strips *characters*, so it would mangle names
    # such as 'ratings.log'.
    if file_path.endswith('.gz'):
        base = file_path[:-len('.gz')]
    else:
        base = file_path

    # write the tab-separated ratings file (& gzip it)
    with gzip.open(base + '.trimmed.gz', 'wb') as dst:
        for row in rows:
            dst.write(('\t'.join(row) + '\n').encode('ISO-8859-1'))



In [38]:

    
""" Now let's create the trimmed movie ratins file. """

# start_line / end_line bound the part of the dump that gets parsed;
# presumably they match one specific IMDB snapshot -- re-check them
# whenever ratings.list.gz is refreshed.
trim_ratings_file(
    file_path='ratings.list.gz',
    start_line=297,
    end_line=667884,
)



In [22]:

    
# Reference text only: the string is bound to `notes`, and none of the
# cells visible here read it back. The quoted formula is a weighted
# average of a movie's own mean X and the overall mean C, with weights
# v/(v+k) and k/(v+k) that sum to 1. The "currently" figures are whatever
# the text said when it was copied -- verify before relying on them.
notes = """ It is probably interesting to note that when IMDB
            comes up with their official "top 250" movies, they
            use this formula:

            TOP 250 MOVIES (25000+ VOTES)
            
            The formula used to calculate the top 250 movies is:
            
            weighted rank = (v/(v+k))*X + (k/(v+k))*C

            where:

            X = average for the movie (mean)
            v = number of votes for the movie
            k = minimum votes required to be listed in the top 250 (currently 25000)
            C = the mean vote across the whole report (currently 6.90)
        """



In [ ]: