In [24]:
""" IMDB - Movie Writers and Directors
IMDB has a wealth of information about movies, TV shows, Video Games, etc.
I know more actors from my favorite movies than directors, and don't know
anything about the writers.
So I want to use IMDB's data to learn about writers and diretors.
First things first, I need to pare down these Giant IMDB dump files.
The data is available at: http://www.imdb.com/interfaces
"""
import gzip
import pandas as pd
def trim_person_file(file_path, person_type, start_line, end_line):
    """ This function reads an IMDB database dump text file and
    writes a tab-separated CSV file with just the writers or
    directors of movies. The input file format is like:
    Coen, Ethan\t\tA Fever in the Blood (2002) (story "A Fever in the Blood")
    \t\t\tA Serious Man (2009) (written by) <1,1,2>
    \t\t\tBarton Fink (1991) (written by) <1,1,2>
    Barnes, Craig (VII)\t"Mike's Ma's Balls" (2015)
    """
    line_num = 0
    person = ''
    csv = [[person_type, 'MOVIE_TITLE', 'YEAR']]
    # read the original IMDB data dump file
    f = gzip.open(file_path, 'rb')
    for line in f.readlines():
        # deal with these awful IMDB headers and footers
        line_num += 1
        if line_num < start_line:
            continue
        elif line_num > end_line:
            break
        elif not line.strip():
            continue
        # finally, parse the data
        line = line.decode('ISO-8859-1')
        ln = line.split('\t')
        if line[0] != '\t':
            # a non-indented line starts a new person's credit list
            person = ln[0]
        if ln[-1][0] == '"' or ' (V)' in line or ' (TV)' in line or ' (VG)' in line:
            # skip TV series (quoted titles), video, TV, and video-game releases
            continue
        data = ln[-1].strip().split(' (')
        title = data[0]
        try:
            year = str(int(data[1][:4]))
        except (IndexError, ValueError):
            year = ''
        csv.append([person, title, year])
    f.close()
    # write the tab-separated CSV file (& gzip it), e.g. writers.list.trimmed.gz
    fout = gzip.open(file_path.rstrip('gz') + 'trimmed.gz', 'wb')
    for line in csv:
        fout.write(('\t'.join(line) + '\n').encode('ISO-8859-1'))
    fout.close()
In [25]:
""" Now let's create the trimmed writers and directors CSV files. """
trim_person_file('writers.list.gz', 'WRITER', 303, 4298496)
trim_person_file('directors.list.gz', 'DIRECTOR', 236, 2757403)
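In [ ]:
""" pandas is imported above but not used yet, so here is a quick sanity
check of the trimmed output (just a sketch -- it assumes the trimmed .gz
files from the cell above, writers.list.trimmed.gz and
directors.list.trimmed.gz, sit next to this notebook). pandas can read
the gzipped TSVs directly. """
writers = pd.read_csv('writers.list.trimmed.gz', sep='\t',
                      compression='gzip', encoding='ISO-8859-1')
directors = pd.read_csv('directors.list.trimmed.gz', sep='\t',
                        compression='gzip', encoding='ISO-8859-1')
print(writers.shape, directors.shape)
writers.head()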
In [37]:
def trim_ratings_file(file_path, start_line, end_line):
    """ This function reads the IMDB database dump text file and
    writes a tab-separated CSV file with just the weighted
    user rating of each movie. The input file format is like:
    1000000102 62 6.2 "#1 Single" (2006)
    1.0..01103 14 7.1 "#7DaysLater" (2013)
    2....0.013 11 6.8 "#Bikerlive" (2014)
    2.01..3.01 13 5.5 "#ByMySide" (2012)
    0000011101 24 6.5 The Big Leap (2013)
    0000001222 491417 8.2 The Big Lebowski (1998)
    """
    line_num = 0
    csv = [['VOTES', 'RANK', 'MOVIE_TITLE', 'YEAR']]
    # read the original IMDB data dump file
    f = gzip.open(file_path, 'rb')
    for line in f.readlines():
        # deal with these awful IMDB headers and footers
        line_num += 1
        if line_num < start_line:
            continue
        elif line_num > end_line:
            break
        # finally, parse the data (the columns are fixed-width)
        line = line.decode('ISO-8859-1')
        if line[32] == '"':
            # this is a TV show, skip it
            continue
        elif ' (V)' in line or ' (TV)' in line:
            # this isn't a real theatrical movie
            continue
        votes = line[16:25].strip()
        rank = line[25:30].strip()
        try:
            # make sure the fixed-width slices really hold numbers
            dummy = int(votes)
            dummy = float(rank)
        except ValueError:
            print(votes, rank)
            print(line)
            return
        movie = line[31:].strip().split(' (')
        movie_title = movie[0]
        try:
            year = str(int(movie[1][:4]))
        except (IndexError, ValueError):
            year = ''
        csv.append([votes, rank, movie_title, year])
    f.close()
    # write the tab-separated CSV file (& gzip it), e.g. ratings.list.trimmed.gz
    fout = gzip.open(file_path.rstrip('gz') + 'trimmed.gz', 'wb')
    for line in csv:
        fout.write(('\t'.join(line) + '\n').encode('ISO-8859-1'))
    fout.close()
In [38]:
""" Now let's create the trimmed movie ratins file. """
trim_ratings_file('ratings.list.gz', 297, 667884)
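In [ ]:
""" Same sanity check for the ratings (a sketch, assuming
ratings.list.trimmed.gz was just written by the cell above). """
ratings = pd.read_csv('ratings.list.trimmed.gz', sep='\t',
                      compression='gzip', encoding='ISO-8859-1')
print(ratings.shape)
ratings.sort_values('VOTES', ascending=False).head()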
In [22]:
notes = """ It is probably interesting to note that when IMDB
comes up with their official "top 250" movies, they
use this formula:
TOP 250 MOVIES (25000+ VOTES)
The formula used to calculate the top 250 movies is:
weighted rank = (v/(v+k))*X + (k/(v+k))*C
where:
X = average for the movie (mean)
v = number of votes for the movie
k = minimum votes required to be listed in the top 250 (currently 25000)
C = the mean vote across the whole report (currently 6.90)
"""
In [ ]: