In [1]:
import json
In [2]:
# Collect the 'movies' list from every downloaded dump into one flat list.
movie_data = []
for dump_path in ('download/movies.json', 'download/more_movies.json'):
    # Decode explicitly as UTF-8 instead of relying on the platform's default
    # text encoding, which differs between operating systems.
    with open(dump_path, encoding='utf-8') as f:
        movie_data.extend(json.load(f)['movies'])
len(movie_data)
Out[2]:
In [3]:
# Show the field names of one sample record (index 1), alphabetically.
sorted(movie_data[1])
Out[3]:
In [4]:
# Field names that allFilled() insists on finding non-empty in a movie record.
requiredFields = {
    'id', 'links', 'mpaa_rating',
    'posters', 'ratings', 'runtime',
    'synopsis', 'title', 'year',
}
Filter out movies that are missing any of the required fields:
In [5]:
def allFilled(movie, required=None):
    """Return True when every required field of `movie` is present and truthy.

    Parameters:
        movie: dict describing one movie record.
        required: optional iterable of field names to check; defaults to the
            module-level `requiredFields`.

    A field counts as unfilled when it is absent from the record or when its
    value is falsy (None, '', 0, empty list/dict). Fields that are not
    required are ignored.
    """
    if required is None:
        required = requiredFields
    # Iterate over the *required* names rather than the record's own keys so
    # that a key which is missing altogether is also reported as unfilled.
    return all(movie.get(field) for field in required)
# Keep only the records that allFilled() accepts, then show how many remain.
filtered_data = list(filter(allFilled, movie_data))
len(filtered_data)
Out[5]:
In [6]:
# Count distinct ids and the ids that occur more than once in filtered_data.
ids, dupes = set(), set()
for m in filtered_data:
    # An id seen before goes to `dupes`; a new one is recorded in `ids`.
    bucket = dupes if m['id'] in ids else ids
    bucket.add(m['id'])

print("uniques:\t", len(ids))
print("duplicates:\t", len(dupes))
In [7]:
# Index the filtered records by their 'id'; when an id repeats, the later
# record replaces the earlier one.
movie_dict = dict((m['id'], m) for m in filtered_data)
Many of the Rotten Tomatoes movies don't have high-resolution poster images, but we can find them.
As a last resort, download posters from TMDB. Set your API key in the environment variable `TMDB_KEY`.
In [8]:
import os
import re
import requests
import time
In [14]:
def posterPath(rt_id):
    """Return the relative path of the poster JPEG for the given id.

    `rt_id` is concatenated with '.jpg', so it must be a string.
    """
    poster_dir = ('..', 'starter', 'static', 'posters')
    file_name = rt_id + '.jpg'
    return os.path.join(*poster_dir, file_name)
In [15]:
def getURLFromTMDB(imdb_id):
    """Ask TMDB for the images of a movie and return an English poster URL.

    Parameters:
        imdb_id: identifier substituted into the TMDB images endpoint.

    Returns:
        The 'original'-size URL of the first poster whose 'iso_639_1' is
        'en', or None when the response lists no such poster.

    Raises:
        KeyError: if the TMDB_KEY environment variable is not set.
        requests.RequestException: on network failure or timeout.
    """
    KEY = os.environ['TMDB_KEY']
    IMG_PATTERN = 'http://api.themoviedb.org/3/movie/{imdbid}/images?api_key={key}'
    # A timeout keeps one unresponsive request from stalling the notebook.
    r = requests.get(IMG_PATTERN.format(imdbid=imdb_id, key=KEY), timeout=10)
    api_response = r.json()
    # 'posters' may be missing or empty (e.g. unknown id); treat both as "none".
    for p in api_response.get('posters') or []:
        if p.get('iso_639_1') == 'en':
            return 'http://image.tmdb.org/t/p/original' + p['file_path']
    return None
def getPosterURL(movie):
    """Return a URL for the full-size poster of `movie`, or None.

    Parameters:
        movie: dict with 'id', 'title' and 'posters' (containing 'original'),
            and optionally 'alternate_ids'.

    Returns:
        - the 'original' poster link unchanged when it is not a resizing link;
        - a content9.flixster.com URL rebuilt from the '/movie...' tail of a
          resizing link;
        - None when no full-size URL can be derived.

    Prints one progress line per movie for which a source was identified.
    """
    # Read from the `movie` argument, not from a module-level variable, so the
    # function works regardless of what the caller's loop variable is named.
    link = movie['posters']['original']

    # links with 'resizing' in them only return thumbnails
    if 'resizing' not in link:
        print("> direct: {} ({})".format(movie['title'], movie['id']))
        return link

    # we can find the flixter link
    if 'movie' in link:
        found = re.match('^.*?(/movie.*)$', link)
        # 'movie' can occur without a leading '/', in which case there is no
        # path to extract; fall through instead of failing on None.group().
        if found:
            print("> extracted: {} ({})".format(movie['title'], movie['id']))
            return "http://content9.flixster.com" + found.group(1)

    alternate_ids = movie.get('alternate_ids')
    if alternate_ids and alternate_ids.get('imdb'):
        print("> tmdb: {} ({})".format(movie['title'], movie['id']))
        # NOTE(review): no TMDB lookup is performed here; this branch only
        # logs the candidate and gives up.
        return None

    return None
In [16]:
# Download a poster for every movie that does not have one on disk yet.
# Ids whose poster URL answered 404 are collected in `notFound`.
notFound = []
for m in movie_dict.values():
    poster_file = posterPath(m['id'])
    if os.path.exists(poster_file):
        continue  # already on disk
    print (poster_file)
    url = getPosterURL(m)
    if url:
        try:
            # Without a timeout a single stalled connection blocks the loop.
            response = requests.get(url, timeout=30)
        except requests.RequestException as err:
            # Log and move on so one network failure does not abort the batch.
            print(">> Error:", err, m['id'], url)
        else:
            if response.status_code == 200:
                with open(poster_file, 'wb') as f:
                    f.write(response.content)
            elif response.status_code == 404:
                print(">> Not found:", m['id'], url)
                notFound.append(m['id'])
            else:
                print(">> Error:", response.status_code, m['id'], url)
    time.sleep(0.1) # respect our API limits!
Filter out movies with no poster:
In [18]:
# Drop every movie whose poster file is absent, reporting the size of
# movie_dict before and after.
print("Before:", len(movie_dict))
# Iterate over a snapshot of the keys because entries are removed on the way.
for rt_id in list(movie_dict):
    if os.path.exists(posterPath(rt_id)):
        continue
    movie_dict.pop(rt_id)
print("After:", len(movie_dict))
In [26]:
from PIL import Image
import thumbnails
In [27]:
# Size of one thumbnail as a 2-tuple.
THUMB_SIZE = (200, 300)
# Number of thumbnails along each side of a sprite sheet.
MAX_SPRITE_SIZE = 5
# Size of a whole sheet: each thumbnail dimension times the per-side count.
SPRITE_SIZE = tuple(side * MAX_SPRITE_SIZE for side in THUMB_SIZE)
In [29]:
# Paste a thumbnail of every poster onto sprite sheets, filling each sheet
# row by row and opening a new sheet once the current one is full.
thumb_indices = {} # Each index is a 3-tuple: file_number, row, column
sprites = []
thumb_width, thumb_height = THUMB_SIZE
thumb_geometry = "{}x{}".format(*THUMB_SIZE)
for i, movie in enumerate(movie_dict.values()):
    # Split the running index into sheet number and position within the sheet.
    sheet_no, slot = divmod(i, MAX_SPRITE_SIZE ** 2)
    row, col = divmod(slot, MAX_SPRITE_SIZE)
    if len(sprites) == sheet_no:
        sprites.append(Image.new(mode='RGB', size=SPRITE_SIZE))
    poster_file = os.path.abspath(posterPath(movie['id']))
    thumb = thumbnails.get_thumbnail(poster_file, thumb_geometry, crop='center')
    offset = (col * thumb_width, row * thumb_height)
    sprites[-1].paste(thumb.image, offset)
    thumb_indices[movie['id']] = (sheet_no, row, col)
In [31]:
# Write each sprite sheet to SPRITE_DIR as sprite-<n>.jpg, then store the
# lookup data for the sheets in sprites.json.
SPRITE_DIR = os.path.join('..', 'starter', 'static', 'thumb_sprites')
for sheet_no, sheet in enumerate(sprites):
    sheet_file = "sprite-{}.jpg".format(sheet_no)
    sheet.save(os.path.join(SPRITE_DIR, sheet_file))

spriteInfo = dict(
    thumb_size=THUMB_SIZE,
    thumb_indices=thumb_indices,
    n_sheets=len(sprites),
    sheet_size=MAX_SPRITE_SIZE,
)
sprite_info_path = os.path.join('..', 'starter', 'templates', "sprites.json")
with open(sprite_info_path, 'w') as f:
    json.dump(spriteInfo, f)
In [32]:
def generateFixture(movie):
    """Convert one movie record into a fixture entry for model 'starter.Movie'.

    Parameters:
        movie: dict with at least 'id', 'links' and 'ratings'; every other
            field is optional.

    Returns:
        A dict with 'model', 'pk' (the movie id) and 'fields'. In 'fields',
        'actors' is a JSON-encoded list of cast member names, and 'imdb_id',
        'audience_rating' and 'critics_rating' fall back to ''.

    Raises:
        KeyError: if 'id', 'links' or 'ratings' is missing.
    """
    links = movie['links']
    ratings = movie['ratings']
    # Both of these are optional in a record; substitute empty containers so
    # a missing cast or missing alternate ids do not raise.
    alternate_ids = movie.get('alternate_ids') or {}
    cast = movie.get('abridged_cast') or []
    return {
        'model': 'starter.Movie',
        'pk': movie['id'],
        'fields': {
            'rt_link': links.get('alternate'),
            'runtime': movie.get('runtime'),
            'audience_score': ratings.get('audience_score'),
            'critics_score': ratings.get('critics_score'),
            'title': movie.get('title'),
            'year': movie.get('year'),
            'synopsis': movie.get('synopsis'),
            'mpaa_rating': movie.get('mpaa_rating'),
            # Always a string: '' both when there are no alternate ids and
            # when they lack an 'imdb' entry.
            'imdb_id': alternate_ids.get('imdb') or '',
            'actors': json.dumps([member['name'] for member in cast]),
            'audience_rating': ratings.get('audience_rating', ''),
            'critics_rating': ratings.get('critics_rating', ''),
        }
    }
In [34]:
# Build one fixture entry per remaining movie and write them out as JSON.
fixture_path = os.path.join('..', 'starter', 'fixtures', 'movie_fixture.json')
fixtures = list(map(generateFixture, movie_dict.values()))
with open(fixture_path, 'w') as f:
    json.dump(fixtures, f, indent=2)
In [ ]: