Load and Filter Data

Load movie data from the supplied JSON files. Filter out duplicates and movies with missing information.


In [1]:
import json

In [2]:
# Collect the 'movies' list from each supplied JSON dump into one combined list.
movie_data = []

# JSON text is UTF-8 by specification; pin the encoding so loading does not
# depend on the platform's default locale encoding.
for json_path in ('download/movies.json', 'download/more_movies.json'):
    with open(json_path, encoding='utf-8') as f:
        movie_data.extend(json.load(f)['movies'])

len(movie_data)


Out[2]:
213

In [3]:
# Peek at the field names of one record (iterating a dict yields its keys).
sorted(movie_data[1])


Out[3]:
['abridged_cast',
 'alternate_ids',
 'id',
 'links',
 'mpaa_rating',
 'posters',
 'ratings',
 'release_dates',
 'runtime',
 'synopsis',
 'title',
 'year']

In [4]:
# Fields that must carry a non-empty value for a movie to be kept.
requiredFields = {
    'id',
    'links',
    'mpaa_rating',
    'posters',
    'ratings',
    'runtime',
    'synopsis',
    'title',
    'year',
}

Filter out movies that don't have enough data


In [5]:
def allFilled(movie):
    """Return True when every field in ``requiredFields`` is present and truthy.

    The check walks the required field names rather than the movie's own
    keys, so a required field that is absent from the record altogether is
    rejected just like one that is present but empty.

    Args:
        movie: dict describing one movie.

    Returns:
        bool: False if any required field is missing or has a falsy value
        (None, '', 0, empty container); True otherwise.
    """
    return all(movie.get(field) for field in requiredFields)

# Keep only the movies that pass the completeness check, then show how many remain.
filtered_data = list(filter(allFilled, movie_data))
len(filtered_data)


Out[5]:
192

In [6]:
# Tally the distinct movie ids and the ids that show up more than once.
ids = set()
dupes = set()

for entry in filtered_data:
    movie_id = entry['id']
    # First sighting goes to `ids`; any later sighting marks the id as a duplicate.
    seen_before = movie_id in ids
    if seen_before:
        dupes.add(movie_id)
    else:
        ids.add(movie_id)

print("uniques:\t", len(ids))
print("duplicates:\t", len(dupes))


uniques:	 78
duplicates:	 46

In [7]:
# Index movies by id; when an id repeats, the later record replaces the earlier one.
movie_dict = {}
for movie_entry in filtered_data:
    movie_dict[movie_entry['id']] = movie_entry

Download Hi-Res Posters

Many of the Rotten Tomatoes movies don't come with a high-resolution poster, but we can often find one.

As a last resort, download posters from TMDB. Set your API key in the environment variable TMDB_KEY.


In [8]:
import os
import re
import requests
import time

In [14]:
def posterPath(rt_id):
    """Build the relative path of the poster JPEG for a movie id.

    Args:
        rt_id: movie id as a string; '.jpg' is appended to form the file name.

    Returns:
        Path under ../starter/static/posters, joined with os.path.join.
    """
    poster_dir = os.path.join('..', 'starter', 'static', 'posters')
    file_name = rt_id + '.jpg'
    return os.path.join(poster_dir, file_name)

In [15]:
def getURLFromTMDB(imdb_id):
    """Look up an English-language poster URL for a movie on TMDB.

    Args:
        imdb_id: identifier substituted into the TMDB
            ``/3/movie/{id}/images`` endpoint path.

    Returns:
        The 'original'-size image URL of the first listed poster whose
        ``iso_639_1`` is ``'en'``, or None when the request does not return
        HTTP 200 or no such poster is listed.

    Raises:
        KeyError: if the TMDB_KEY environment variable is not set.
    """
    KEY = os.environ['TMDB_KEY']
    IMG_PATTERN = 'http://api.themoviedb.org/3/movie/{imdbid}/images'
    # Let requests encode the query string; a timeout keeps a stalled
    # connection from hanging the notebook indefinitely.
    r = requests.get(
        IMG_PATTERN.format(imdbid=imdb_id),
        params={'api_key': KEY},
        timeout=10,
    )
    if r.status_code != 200:
        print(">> TMDB error:", r.status_code, imdb_id)
        return None
    api_response = r.json()
    for p in api_response.get('posters') or []:
        # Use .get so an entry lacking either field is skipped instead of raising.
        if p.get('iso_639_1') == 'en' and p.get('file_path'):
            return 'http://image.tmdb.org/t/p/original' + p['file_path']
    return None


def getPosterURL(movie):
    link = m['posters']['original']
    # links with 'resizing' in them only return thumbnails
    if 'resizing' in link:
        # we can find the flixter link
        if 'movie' in link:
            print("> extracted: {} ({})".format(movie['title'], movie['id']))
            path = re.match('^.*?(/movie.*)$', link).group(1)
            return "http://content9.flixster.com" + path
        elif m.get('alternate_ids') and m['alternate_ids'].get('imdb'):
            print("> tmdb: {} ({})".format(movie['title'], movie['id']))
            imdb_id = m['alternate_ids']['imdb']
            return None
        else:
            return None
    print("> direct: {} ({})".format(movie['title'], movie['id']))
    return link

In [16]:
# Download a poster for every movie that does not have one on disk yet.
# Ids whose poster URL answered 404 are collected in `notFound`.
notFound = []

for m in movie_dict.values():
    poster_file = posterPath(m['id'])
    if os.path.exists(poster_file):
        continue  # already downloaded on an earlier run
    print (poster_file)

    url = getPosterURL(m)
    if not url:
        continue  # no known source for this poster

    response = requests.get(url)
    status = response.status_code
    if status == 200:
        with open(poster_file, 'wb') as f:
            f.write(response.content)
    elif status == 404:
        print(">> Not found:", m['id'], url)
        notFound.append(m['id'])
    else:
        print(">> Error:", status, m['id'], url)
    time.sleep(0.1)  # respect our API limits!


../starter/static/posters/771418294.jpg
> tmdb: Dough (771418294)
../starter/static/posters/771306118.jpg
> tmdb: Inside Out (771306118)
../starter/static/posters/290370817.jpg
> tmdb: Dad's Army (290370817)
../starter/static/posters/771416410.jpg
> tmdb: Risen (771416410)
../starter/static/posters/771422075.jpg
> direct: The Iron Giant: Signature Edition (771422075)
../starter/static/posters/771385342.jpg
> direct: Jungle Shuffle (771385342)
../starter/static/posters/771420263.jpg
> tmdb: Rams (Hrutar) (771420263)
../starter/static/posters/771321699.jpg
> tmdb: Star Wars: Episode VII - The Force Awakens (771321699)
../starter/static/posters/771355933.jpg
> direct: The Lunchbox (771355933)
../starter/static/posters/771374538.jpg
> tmdb: Race (771374538)
../starter/static/posters/771413218.jpg
../starter/static/posters/771428953.jpg

Filter out movies with no poster:


In [18]:
# Drop every movie whose poster file is not on disk, reporting the size before and after.
print("Before:", len(movie_dict))
# Collect the ids first so the dict is not resized while it is being iterated.
missing_posters = [
    movie_id for movie_id in movie_dict
    if not os.path.exists(posterPath(movie_id))
]
for rt_id in missing_posters:
    del movie_dict[rt_id]
print("After:", len(movie_dict))


Before: 69
After: 66

Generate Thumbnail Sprites


In [26]:
from PIL import Image
import thumbnails

In [27]:
# Size of a single thumbnail.
THUMB_SIZE = (200, 300)
# Number of thumbnails along each side of one sprite sheet.
MAX_SPRITE_SIZE = 5
# Size of a full sprite sheet: each thumbnail dimension scaled by the grid side.
SPRITE_SIZE = tuple(side * MAX_SPRITE_SIZE for side in THUMB_SIZE)

In [29]:
# Pack one thumbnail per movie into sprite sheets, filling each sheet row by row.
thumb_indices = {}  # Each index is a 3-tuple: file_number, row, column
sprites = []

cells_per_sheet = MAX_SPRITE_SIZE ** 2
geometry = "{}x{}".format(*THUMB_SIZE)

for position, movie in enumerate(movie_dict.values()):
    # Split the running position into sheet number, then row/column within that sheet.
    sheet_no, slot = divmod(position, cells_per_sheet)
    row, col = divmod(slot, MAX_SPRITE_SIZE)

    # Start a fresh blank sheet whenever the previous one is full.
    if len(sprites) == sheet_no:
        sprites.append(Image.new(mode='RGB', size=SPRITE_SIZE))

    poster_file = os.path.abspath(posterPath(movie['id']))
    thumb = thumbnails.get_thumbnail(poster_file, geometry, crop='center')
    offset = (col * THUMB_SIZE[0], row * THUMB_SIZE[1])
    sprites[-1].paste(thumb.image, offset)

    thumb_indices[movie['id']] = (sheet_no, row, col)

In [31]:
# Write each sprite sheet to disk, then record the layout metadata as JSON.
SPRITE_DIR = os.path.join('..', 'starter', 'static', 'thumb_sprites')

for sheet_no, sheet in enumerate(sprites):
    sheet_name = "sprite-{}.jpg".format(sheet_no)
    sheet.save(os.path.join(SPRITE_DIR, sheet_name))

# Thumbnail size, per-movie (sheet, row, column) positions, sheet count and grid side.
spriteInfo = dict(
    thumb_size=THUMB_SIZE,
    thumb_indices=thumb_indices,
    n_sheets=len(sprites),
    sheet_size=MAX_SPRITE_SIZE,
)

sprite_info_path = os.path.join('..', 'starter', 'templates', "sprites.json")
with open(sprite_info_path, 'w') as f:
    json.dump(spriteInfo, f)

Generate Django Data Fixture


In [32]:
def generateFixture(movie):
    """Convert one movie dict into a fixture entry for the 'starter.Movie' model.

    Args:
        movie: movie dict. 'id', 'links' and 'ratings' must be present;
            'alternate_ids' and 'abridged_cast' are optional.

    Returns:
        dict with 'model', 'pk' (the movie id) and 'fields'. 'actors' is a
        JSON-encoded list of cast member names ('[]' when there is no cast),
        and 'imdb_id' is '' whenever no IMDb id is available.

    Raises:
        KeyError: if 'id', 'links' or 'ratings' is missing, or a cast member
            has no 'name'.
    """
    links = movie['links']
    ratings = movie['ratings']
    # Both of these may be absent or None; fall back to empty containers so
    # the lookups below cannot fail on None.
    alternate_ids = movie.get('alternate_ids') or {}
    cast = movie.get('abridged_cast') or []

    return {
        'model': 'starter.Movie',
        'pk': movie['id'],
        'fields': {
            'rt_link': links.get('alternate'),
            'runtime': movie.get('runtime'),
            'audience_score': ratings.get('audience_score'),
            'critics_score': ratings.get('critics_score'),
            'title': movie.get('title'),
            'year': movie.get('year'),
            'synopsis': movie.get('synopsis'),
            'mpaa_rating': movie.get('mpaa_rating'),
            # Use '' for every "no IMDb id" case instead of a mix of None and ''.
            'imdb_id': alternate_ids.get('imdb', ''),
            'actors': json.dumps([member['name'] for member in cast]),
            'audience_rating': ratings.get('audience_rating', ''),
            'critics_rating': ratings.get('critics_rating', ''),
        }
    }

In [34]:
# Build one fixture entry per remaining movie and write them all to the fixture file.
fixtures = list(map(generateFixture, movie_dict.values()))

fixture_path = os.path.join('..', 'starter', 'fixtures', 'movie_fixture.json')
with open(fixture_path, 'w') as f:
    json.dump(fixtures, f, indent=2)

In [ ]: