This notebook contains a Python 3 script that loads IMDb movie data into a PostgreSQL database from IMDb's downloadable datasets. It assumes you have psycopg2, which you can install from conda. I recommend making a new conda environment so your root environment won't get corrupted. For example, the following command in a conda shell creates a new Python 3.6 environment with psycopg2.
$ conda create -n py36-imdb python=3.6.3 psycopg2
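Then activate the environment before running the notebook. Depending on your conda version, the command is one of the following (newer conda uses conda activate, older versions use source activate):
$ conda activate py36-imdb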
psycopg2 is a PostgreSQL database adapter that wraps libpq, the PostgreSQL C client library, and implements the Python DB-API 2.0 specification. This means it uses the same syntax to connect and execute queries as the built-in sqlite3 Python module.
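To see the parity, here is a minimal sketch of the same connect/cursor/execute pattern with sqlite3; the database file name is hypothetical. The one notable difference is the placeholder style: sqlite3 uses ?, while psycopg2 uses %s.
# the same DB-API 2.0 pattern with the built-in sqlite3 module;
# 'example.db' is a hypothetical file name
import sqlite3

conn = sqlite3.connect('example.db')
cur = conn.cursor()
cur.execute('SELECT ?;', (1,))  # sqlite3 placeholder; psycopg2 would use %s
print(cur.fetchone())  # (1,)
conn.close()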
In [1]:
# imports
import csv
import gzip
import os
import urllib.request
import psycopg2, psycopg2.extras
import logging
# logging
logging.basicConfig()
LOGGER = logging.getLogger(__name__)
LOGGER.setLevel(logging.DEBUG)
In [2]:
# meta data about the URL, the data tarballs, and their schema
IMDB_datasets = 'https://datasets.imdbws.com/'
datasets = {
'title_basics': 'title.basics.tsv.gz', # information for titles
'title_crew': 'title.crew.tsv.gz', # the director and writer information for all the titles in IMDb
'title_episode': 'title.episode.tsv.gz', # the tv episode information
'title_principals': 'title.principals.tsv.gz', # the principal cast/crew for titles
'title_ratings': 'title.ratings.tsv.gz', # the IMDb rating and votes information for titles
'name_basics': 'name.basics.tsv.gz' # information for names (people)
}
DETAILS = {
'title_basics': [
('tconst', 'TEXT PRIMARY KEY'), ('titleType', 'TEXT'), ('primaryTitle', 'TEXT'),
('originalTitle', 'TEXT'), ('isAdult', 'INT'), ('startYear', 'INT'),
('endYear', 'INT'), ('runtimeMinutes', 'INT'), ('genres', 'TEXT')
],
}
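Each (column, type) pair in DETAILS is later joined with a space into one column definition for the CREATE TABLE statement, for example:
# one (column, type) pair becomes one DDL fragment
print(' '.join(('tconst', 'TEXT PRIMARY KEY')))  # tconst TEXT PRIMARY KEY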
In [3]:
# info to connect to an online database set up for this tutorial
DBNAME = 'breaking-bytes_imdb'
HOST = 'postgresql-breaking-bytes.alwaysdata.net'
PORT = 5432
USER = os.getenv('USER')
PASSWORD = os.getenv('PASSWORD')
# connect to the database and get a Python DB-API 2.0 "cursor"
conn = psycopg2.connect(dbname=DBNAME, user=USER, password=PASSWORD, host=HOST, port=PORT)
cur = conn.cursor()
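As a quick sanity check that the connection works (this cell is not part of the original script), you can ask the server for its version:
# optional sanity check on the new connection
cur.execute('SELECT version();')
LOGGER.debug(cur.fetchone()[0])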
In [4]:
# create the title_basics table
EXPR = ('CREATE TABLE title_basics(%s);'
        % ', '.join(' '.join(kv) for kv in DETAILS['title_basics']))
cur.execute(EXPR)
conn.commit()
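Building the DDL with the % operator is fine here because the column names come from our own DETAILS constant. If the identifiers ever came from untrusted input, psycopg2 2.7+ ships a psycopg2.sql module for composing statements safely; a sketch of the same CREATE TABLE:
from psycopg2 import sql

# compose the column definitions from identifiers instead of raw strings
columns = sql.SQL(', ').join(
    [sql.SQL(' ').join([sql.Identifier(name), sql.SQL(type_)])
     for name, type_ in DETAILS['title_basics']]
)
stmt = sql.SQL('CREATE TABLE title_basics ({});').format(columns)
# cur.execute(stmt) would run it just like a plain string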
In [5]:
# get the 'title.basics' dataset
with urllib.request.urlopen(IMDB_datasets + datasets['title_basics']) as http_response:
tsv = gzip.decompress(http_response.read())
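Note that this buffers the entire compressed download and then the whole decompressed file in memory. An alternative sketch streams the decompression by wrapping the HTTP response directly:
# stream-decompress the response instead of buffering the gzip first
with urllib.request.urlopen(IMDB_datasets + datasets['title_basics']) as http_response:
    with gzip.GzipFile(fileobj=http_response) as gz:
        tsv = gz.read()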
In [6]:
# split the lines into rows
rows = tsv.decode('utf-8').split('\n')
LOGGER.debug(rows[0].split('\t')) # take a look at the headers
# skip the first row with the headers and the last row with only a newline
rows = rows[1:-1]
# split the rows into columns
records = [row.split('\t') for row in rows]
# replace IMDB's NULL marker, "\\N", with None
records = [tuple(None if r == '\\N' else r for r in record)
for record in records]
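The csv module imported above can do this split as well; the one caveat is that IMDb's TSV fields can contain bare quote characters, so quoting has to be disabled. A sketch of the equivalent parse:
# equivalent parse with csv.reader; QUOTE_NONE because the data has stray quotes
reader = csv.reader(tsv.decode('utf-8').splitlines()[1:],
                    delimiter='\t', quoting=csv.QUOTE_NONE)
records = [tuple(None if r == '\\N' else r for r in record) for record in reader]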
In [7]:
# copy this data to the database in chunks
COUNT = len(records)
LOOPS = 100
CHUNKS = COUNT // LOOPS
idx = 0
for n in range(LOOPS):
    jdx = idx + CHUNKS
    LOGGER.debug('loop: %d, start: %d, stop: %d', n, idx, jdx)
    psycopg2.extras.execute_values(cur, 'INSERT INTO title_basics VALUES %s;', records[idx:jdx])
    conn.commit()
    idx += CHUNKS
# COUNT // LOOPS truncates, so insert the records left over after the last chunk
if idx < COUNT:
    psycopg2.extras.execute_values(cur, 'INSERT INTO title_basics VALUES %s;', records[idx:])
    conn.commit()
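For bulk loads like this, PostgreSQL's COPY is usually faster than batched INSERTs. A sketch using psycopg2's copy_from, which reads the header-stripped rows straight from an in-memory buffer and understands the \N null marker natively (caveat: COPY's text format treats backslashes as escapes, so fields containing backslashes would need escaping first):
import io

# bulk-load the same rows with COPY instead of chunked INSERTs;
# 'rows' is the header-stripped list of lines from the previous cell
buf = io.StringIO('\n'.join(rows))
cur.copy_from(buf, 'title_basics', sep='\t', null='\\N')
conn.commit()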
In [8]:
cur.close()
conn.close()
In [14]:
print(COUNT)
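To verify the load, reopen a connection and count the rows in the table; the result should match COUNT:
# reconnect and check that the table holds all the inserted records
conn = psycopg2.connect(dbname=DBNAME, user=USER, password=PASSWORD, host=HOST, port=PORT)
cur = conn.cursor()
cur.execute('SELECT COUNT(*) FROM title_basics;')
print(cur.fetchone()[0])
cur.close()
conn.close()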